diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000..176a458f94
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+* text=auto
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index d53176133b..56f72c49e9 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -2,16 +2,21 @@
# Level Zero adapter
source/adapters/level_zero @oneapi-src/unified-runtime-level-zero-write
+test/adapters/level_zero @oneapi-src/unified-runtime-level-zero-write
# CUDA and HIP adapters
source/adapters/cuda @oneapi-src/unified-runtime-cuda-write
+test/adapters/cuda @oneapi-src/unified-runtime-cuda-write
source/adapters/hip @oneapi-src/unified-runtime-hip-write
+test/adapters/hip @oneapi-src/unified-runtime-hip-write
# OpenCL adapter
source/adapters/opencl @oneapi-src/unified-runtime-opencl-write
+test/adapters/opencl @oneapi-src/unified-runtime-opencl-write
# Native CPU adapter
source/adapters/native_cpu @oneapi-src/unified-runtime-native-cpu-write
+test/adapters/native_cpu @oneapi-src/unified-runtime-native-cpu-write
# Command-buffer experimental feature
source/adapters/**/command_buffer.* @oneapi-src/unified-runtime-command-buffer-write
@@ -20,6 +25,7 @@ scripts/core/exp-command-buffer.yml @oneapi-src/unified-runtime-command-buff
test/conformance/exp_command_buffer** @oneapi-src/unified-runtime-command-buffer-write
# Bindless Images experimental feature
-scripts/core/EXP-BINDLESS-IMAGES.rst @oneapi-src/unified-runtime-bindless-images-write
-scripts/core/exp-bindless-images.yml @oneapi-src/unified-runtime-bindless-images-write
-source/adapters/**/image.* @oneapi-src/unified-runtime-bindless-images-write
+source/adapters/**/image.* @oneapi-src/unified-runtime-bindless-images-write
+scripts/core/EXP-BINDLESS-IMAGES.rst @oneapi-src/unified-runtime-bindless-images-write
+scripts/core/exp-bindless-images.yml @oneapi-src/unified-runtime-bindless-images-write
+test/conformance/exp_bindless_images** @oneapi-src/unified-runtime-bindless-images-write
diff --git a/.github/docker/ubuntu-22.04.Dockerfile b/.github/docker/ubuntu-22.04.Dockerfile
index e8e88caa72..09eaab03a8 100644
--- a/.github/docker/ubuntu-22.04.Dockerfile
+++ b/.github/docker/ubuntu-22.04.Dockerfile
@@ -67,7 +67,8 @@ RUN /opt/install_dpcpp.sh
COPY install_libbacktrace.sh /opt/install_libbacktrace.sh
RUN /opt/install_libbacktrace.sh
-# Add a new (non-root) 'user'
-ENV USER user
+# Add a new (non-root) 'test_user' and switch to it
+ENV USER test_user
ENV USERPASS pass
RUN useradd -m "${USER}" -g sudo -p "$(mkpasswd ${USERPASS})"
+USER test_user
diff --git a/.github/scripts/compute_benchmarks.py b/.github/scripts/compute_benchmarks.py
new file mode 100755
index 0000000000..01f1b90c40
--- /dev/null
+++ b/.github/scripts/compute_benchmarks.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2024 Intel Corporation
+# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+# See LICENSE.TXT
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import os
+import subprocess # nosec B404
+import csv
+import argparse
+import io
+import json
+from pathlib import Path
+
+# Function to run the benchmark with the given parameters and environment variables
+def run_benchmark(directory, ioq, env_vars):
+ env = os.environ.copy()
+ env.update(env_vars)
+ command = [
+ f"{directory}/api_overhead_benchmark_sycl",
+ "--test=SubmitKernel",
+ f"--Ioq={ioq}",
+ "--DiscardEvents=0",
+ "--MeasureCompletion=0",
+ "--iterations=10000",
+ "--Profiling=0",
+ "--NumKernels=10",
+ "--KernelExecTime=1",
+ "--csv",
+ "--noHeaders"
+ ]
+ result = subprocess.run(command, capture_output=True, text=True, env=env) # nosec B603
+ return command, result.stdout
+
+# Function to parse the CSV output and extract the mean execution time
+def parse_output(output):
+ # Use StringIO to turn the string output into a file-like object for the csv reader
+ csv_file = io.StringIO(output)
+ reader = csv.reader(csv_file)
+
+ # Skip the header row
+ next(reader, None)
+ data_row = next(reader, None)
+ if data_row is None:
+ raise ValueError("Benchmark output does not contain data.")
+ try:
+ name = data_row[0] # Name of the benchmark is the first value
+ mean = float(data_row[1]) # Mean is the second value
+ return (name, mean)
+ except ValueError:
+ raise ValueError(f"Could not convert mean execution time to float: '{data_row[1]}'")
+ except IndexError:
+ raise ValueError("Data row does not contain enough values.")
+
+# Function to generate the mermaid bar chart script
+def generate_mermaid_script(labels, chart_data):
+    mermaid_script = f"""
+---
+config:
+ gantt:
+ rightPadding: 10
+ leftPadding: 120
+ sectionFontSize: 10
+ numberSectionStyles: 2
+---
+gantt
+ title api_overhead_benchmark_sycl, mean execution time per 10 kernels (μs)
+ todayMarker off
+ dateFormat X
+ axisFormat %s
+"""
+ for label in labels:
+ nbars = 0
+        print_label = label.replace(" ", "<br>")
+ mermaid_script += f"""
+ section {print_label}
+"""
+ for (name, data) in chart_data:
+ if data is not None:
+ if label in data:
+ nbars += 1
+ mean = data[label]
+ crit = "crit," if name == "This PR" else ""
+ mermaid_script += f"""
+ {name} ({mean} us) : {crit} 0, {int(mean)}
+"""
+ padding = 4 - nbars
+ if padding > 0:
+ for _ in range(padding):
+ mermaid_script += f"""
+ - : 0, 0
+"""
+
+ return mermaid_script
+
+# Function to generate the markdown collapsible sections for each variant
+def generate_markdown_details(variant_details):
+ markdown_sections = []
+ for label, command, env_vars, output in variant_details:
+ env_vars_str = '\n'.join(f"{key}={value}" for key, value in env_vars.items())
+ markdown_sections.append(f"""
+<details>
+<summary>{label}</summary>
+
+#### Environment Variables:
+{env_vars_str}
+
+#### Command:
+{' '.join(command)}
+
+#### Output:
+{output}
+
+</details>
+""")
+ return "\n".join(markdown_sections)
+
+# Function to generate the full markdown
+def generate_markdown_with_mermaid_chart(mermaid_script, variant_details):
+ return f"""
+# Benchmark Results
+```mermaid
+{mermaid_script}
+```
+## Details
+{generate_markdown_details(variant_details)}
+"""
+
+def save_benchmark_results(save_name, benchmark_data):
+ benchmarks_dir = Path.home() / 'benchmarks'
+ benchmarks_dir.mkdir(exist_ok=True)
+ file_path = benchmarks_dir / f"{save_name}.json"
+ with file_path.open('w') as file:
+ json.dump(benchmark_data, file, indent=4)
+ print(f"Benchmark results saved to {file_path}")
+
+def load_benchmark_results(compare_name):
+ benchmarks_dir = Path.home() / 'benchmarks'
+ file_path = benchmarks_dir / f"{compare_name}.json"
+ if file_path.exists():
+ with file_path.open('r') as file:
+ return json.load(file)
+ else:
+ return None
+
+def main(directory, additional_env_vars, save_name, compare_names):
+ variants = [
+ (1, {'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '0'}, "Imm-CmdLists-OFF"),
+ (0, {'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '0'}, "Imm-CmdLists-OFF"),
+ (1, {'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '1'}, ""),
+ (0, {'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '1'}, ""),
+ ]
+
+ # Run benchmarks and collect means, labels, and variant details
+ means = []
+ labels = []
+ variant_details = []
+ for ioq, env_vars, extra_label in variants:
+ merged_env_vars = {**env_vars, **additional_env_vars}
+ command, output = run_benchmark(directory, ioq, merged_env_vars)
+ (label, mean) = parse_output(output)
+ label += f" {extra_label}"
+ means.append(mean)
+ labels.append(label)
+ variant_details.append((label, command, merged_env_vars, output))
+
+ benchmark_data = {label: mean for label, mean in zip(labels, means)}
+
+ chart_data = [("This PR", benchmark_data)]
+ for name in compare_names:
+ chart_data.append((name, load_benchmark_results(name)))
+
+ if save_name:
+ save_benchmark_results(save_name, benchmark_data)
+
+ mermaid_script = generate_mermaid_script(labels, chart_data)
+
+ markdown_content = generate_markdown_with_mermaid_chart(mermaid_script, variant_details)
+
+ with open('benchmark_results.md', 'w') as file:
+ file.write(markdown_content)
+
+ print("Markdown with benchmark results has been written to benchmark_results.md")
+
+def validate_and_parse_env_args(env_args):
+ env_vars = {}
+ for arg in env_args:
+ if '=' not in arg:
+ raise ValueError(f"Environment variable argument '{arg}' is not in the form Variable=Value.")
+ key, value = arg.split('=', 1)
+ env_vars[key] = value
+ return env_vars
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description='Run benchmarks and generate a Mermaid bar chart script.')
+ parser.add_argument('benchmark_directory', type=str, help='The directory where the benchmarks are located.')
+ parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[])
+ parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.')
+ parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=["baseline"])
+
+ args = parser.parse_args()
+
+ additional_env_vars = validate_and_parse_env_args(args.env)
+
+ main(args.benchmark_directory, additional_env_vars, args.save, args.compare)
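For reviewers, a minimal sketch of how the script's pieces fit together, assuming the functions above are imported from compute_benchmarks.py; the CSV line, label, and values below are hypothetical stand-ins for real `api_overhead_benchmark_sycl --csv` output:

```python
# Hypothetical walkthrough of compute_benchmarks.py's data flow.
from compute_benchmarks import (parse_output, save_benchmark_results,
                                load_benchmark_results, generate_mermaid_script)

# parse_output() skips one leading row, then reads (name, mean) from the next.
sample = "skipped header row\nSubmitKernel In Order,12.34\n"
name, mean = parse_output(sample)  # ("SubmitKernel In Order", 12.34)

# Results keyed by label can be saved under ~/benchmarks/<name>.json ...
label = f"{name} Imm-CmdLists-OFF"
save_benchmark_results("baseline", {label: mean})

# ... and later reloaded for comparison. "This PR" plus each saved run becomes
# a bar in the mermaid gantt chart; absent runs (None) are simply skipped.
chart_data = [("This PR", {label: mean}),
              ("baseline", load_benchmark_results("baseline"))]
print(generate_mermaid_script([label], chart_data))
```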
diff --git a/.github/scripts/get_system_info.sh b/.github/scripts/get_system_info.sh
index 6ca38a33ef..8301c99099 100755
--- a/.github/scripts/get_system_info.sh
+++ b/.github/scripts/get_system_info.sh
@@ -53,7 +53,7 @@ function system_info {
echo "**********/proc/meminfo**********"
cat /proc/meminfo
echo "**********build/bin/urinfo**********"
- $(dirname "$(readlink -f "$0")")/../../build/bin/urinfo || true
+ $(dirname "$(readlink -f "$0")")/../../build/bin/urinfo --no-linear-ids --verbose || true
echo "******OpenCL*******"
# The driver version of OpenCL Graphics is the compute-runtime version
clinfo || echo "OpenCL not installed"
diff --git a/.github/workflows/benchmarks_core.yml b/.github/workflows/benchmarks_core.yml
new file mode 100644
index 0000000000..34ffb7134e
--- /dev/null
+++ b/.github/workflows/benchmarks_core.yml
@@ -0,0 +1,199 @@
+name: Compute Benchmarks
+
+on:
+  # this workflow can only be triggered by other workflows,
+  # for example by benchmarks_level_zero.yml
+ workflow_call:
+ # acceptable input from adapter-specific workflows
+ inputs:
+ name:
+ description: Adapter name
+ type: string
+ required: true
+ str_name:
+ description: Formatted adapter name
+ type: string
+ required: true
+ config:
+ description: Params for sycl configuration
+ type: string
+ required: true
+ unit:
+ description: Test unit (cpu/gpu)
+ type: string
+ required: true
+ runner_tag:
+ description: Tag defined for the runner
+ type: string
+ required: true
+ trigger:
+ description: Type of workflow trigger
+ type: string
+ required: true
+ comment:
+ description: Text if triggered by a comment
+ type: string
+ required: false
+
+permissions:
+ contents: read
+ pull-requests: write
+
+jobs:
+ e2e-build-hw:
+ if: github.repository == 'oneapi-src/unified-runtime' # run only on upstream; forks will not have the HW
+ name: Build SYCL, UR, run Compute Benchmarks
+ strategy:
+ matrix:
+ adapter: [
+ {name: "${{inputs.name}}",
+ str_name: "${{inputs.str_name}}",
+ config: "${{inputs.config}}",
+ unit: "${{inputs.unit}}"}
+ ]
+ build_type: [Release]
+ compiler: [{c: clang, cxx: clang++}]
+
+ runs-on: ${{inputs.runner_tag}}
+
+ steps:
+      # Workspace on self-hosted runners is not cleaned automatically.
+      # We have to manually delete any files created outside of actions.
+ - name: Cleanup self-hosted workspace
+ if: always()
+ run: |
+ ls -la ./
+ rm -rf ./* || true
+
+ - name: Add comment to PR
+ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ if: ${{ always() && inputs.trigger != 'schedule' }}
+ with:
+ script: |
+ const adapter = '${{ matrix.adapter.name }}';
+ const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}';
+ const body = `Compute Benchmarks ${adapter} run: \n${url}`;
+
+ github.rest.issues.createComment({
+ issue_number: context.issue.number,
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ body: body
+ })
+
+ - name: Checkout UR
+ uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ with:
+ path: ur-repo
+
+      # On an issue_comment trigger (for PRs) we need to fetch a special ref to get
+      # the PR's proper merge commit. Note that this ref may be absent if the PR is already merged.
+ - name: Fetch PR's merge commit
+ if: ${{ inputs.trigger != 'schedule' }}
+ working-directory: ${{github.workspace}}/ur-repo
+ env:
+ PR_NO: ${{github.event.issue.number}}
+ run: |
+ git fetch -- https://github.com/${{github.repository}} +refs/pull/${PR_NO}/*:refs/remotes/origin/pr/${PR_NO}/*
+ git checkout origin/pr/${PR_NO}/merge
+ git rev-parse origin/pr/${PR_NO}/merge
+
+ - name: Checkout SYCL
+ uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ with:
+ repository: intel/llvm
+ ref: refs/heads/sycl
+ path: sycl-repo
+ fetch-depth: 1
+ fetch-tags: false
+
+ - name: Set CUDA env vars
+ if: matrix.adapter.name == 'CUDA'
+ run: |
+ echo "CUDA_LIB_PATH=/usr/local/cuda/lib64/stubs" >> $GITHUB_ENV
+ echo "LD_LIBRARY_PATH=/usr/local/cuda/compat/:/usr/local/cuda/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
+
+ - name: Configure SYCL
+ run: >
+ python3 sycl-repo/buildbot/configure.py
+ -t ${{matrix.build_type}}
+ -o ${{github.workspace}}/sycl_build
+ --cmake-gen "Ninja"
+ --ci-defaults ${{matrix.adapter.config}}
+ --cmake-opt="-DLLVM_INSTALL_UTILS=ON"
+ --cmake-opt="-DSYCL_PI_TESTS=OFF"
+ --cmake-opt="-DSYCL_PI_UR_USE_FETCH_CONTENT=OFF"
+ --cmake-opt="-DSYCL_PI_UR_SOURCE_DIR=${{github.workspace}}/ur-repo/"
+ --cmake-opt=-DCMAKE_C_COMPILER_LAUNCHER=ccache
+ --cmake-opt=-DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+
+ - name: Build SYCL
+ run: cmake --build ${{github.workspace}}/sycl_build -j
+
+ - name: Set additional env. vars
+ run: |
+ echo "${{github.workspace}}/sycl_build/bin" >> $GITHUB_PATH
+ echo "LD_LIBRARY_PATH=${{github.workspace}}/sycl_build/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV
+
+ # Running (newly built) sycl-ls sets up some extra variables
+ - name: Setup SYCL variables
+ run: |
+ which clang++ sycl-ls
+ SYCL_PI_TRACE=-1 sycl-ls
+
+ - name: Checkout Compute Benchmarks
+ uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ with:
+ repository: intel/compute-benchmarks
+ path: compute-benchmarks-repo
+ submodules: recursive
+
+ - name: Configure Compute Benchmarks
+ run: >
+ cmake
+ -B ${{github.workspace}}/compute-benchmarks-build/
+ -S ${{github.workspace}}/compute-benchmarks-repo/
+ -DCMAKE_BUILD_TYPE=Release
+ -DBUILD_SYCL=ON
+ -DSYCL_COMPILER_ROOT=${{github.workspace}}/sycl_build
+ -DALLOW_WARNINGS=ON
+
+ - name: Build Compute Benchmarks
+ run: cmake --build ${{github.workspace}}/compute-benchmarks-build/ -j
+
+ - name: Set oneAPI Device Selector
+ run: |
+ echo "ONEAPI_DEVICE_SELECTOR=${{ matrix.adapter.str_name }}:${{ matrix.adapter.unit }}" >> $GITHUB_ENV
+
+ - name: Extract arguments from comment
+ id: args
+ run: echo "ARGS=$(echo '${{ inputs.comment }}' | sed -n 's/.*\/benchmarks-[^ ]* \(.*\)/\1/p')" >> $GITHUB_ENV
+
+ - name: Run SYCL API Overhead benchmark
+ id: benchmarks
+ run: ${{github.workspace}}/ur-repo/.github/scripts/compute_benchmarks.py ${{github.workspace}}/compute-benchmarks-build/bin/ $ARGS
+
+ - name: Add comment to PR
+ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ if: ${{ always() && inputs.trigger != 'schedule' }}
+ with:
+ script: |
+ let markdown = ""
+ try {
+ const fs = require('fs');
+ markdown = fs.readFileSync('benchmark_results.md', 'utf8');
+ } catch(err) {
+ }
+
+ const adapter = '${{ matrix.adapter.name }}';
+ const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}';
+ const test_status = '${{ steps.benchmarks.outcome }}';
+ const job_status = '${{ job.status }}';
+ const body = `Compute Benchmarks ${adapter} run:\n${url}\nJob status: ${job_status}. Test status: ${test_status}.\n ${markdown}`;
+
+ github.rest.issues.createComment({
+ issue_number: context.issue.number,
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ body: body
+ })
diff --git a/.github/workflows/benchmarks_level_zero.yml b/.github/workflows/benchmarks_level_zero.yml
new file mode 100644
index 0000000000..6b38aaeedd
--- /dev/null
+++ b/.github/workflows/benchmarks_level_zero.yml
@@ -0,0 +1,26 @@
+name: Compute Benchmarks Level Zero
+
+on:
+ issue_comment:
+ types: [created, edited]
+
+permissions:
+ contents: read
+ pull-requests: write
+
+jobs:
+ e2e-build-hw:
+    # trigger only if the PR comment contains "/benchmarks-level-zero"
+ if: ${{ (github.event.issue.pull_request && contains(github.event.comment.body, '/benchmarks-level-zero'))}}
+ name: Start benchmarks job
+    # use the core flow, run it with L0-specific parameters
+ uses: ./.github/workflows/benchmarks_core.yml
+ # parameters that we pass to the core flow
+ with:
+ name: "L0"
+ runner_tag: "L0_PERF"
+ str_name: "level_zero"
+ config: ""
+ unit: "gpu"
+ trigger: "${{github.event_name}}"
+ comment: ${{github.event.comment.body}}
diff --git a/.github/workflows/build-hw-reusable.yml b/.github/workflows/build-hw-reusable.yml
index a7945e2ac2..8c8f28a868 100644
--- a/.github/workflows/build-hw-reusable.yml
+++ b/.github/workflows/build-hw-reusable.yml
@@ -15,6 +15,13 @@ on:
permissions:
contents: read
+env:
+ UR_LOG_CUDA: "level:error;flush:error"
+ UR_LOG_HIP: "level:error;flush:error"
+ UR_LOG_LEVEL_ZERO: "level:error;flush:error"
+ UR_LOG_NATIVE_CPU: "level:error;flush:error"
+ UR_LOG_OPENCL: "level:error;flush:error"
+
jobs:
adapter-build-hw:
name: Build & Test HW
diff --git a/.github/workflows/e2e_core.yml b/.github/workflows/e2e_core.yml
index 8554c5c476..7f2050d83d 100644
--- a/.github/workflows/e2e_core.yml
+++ b/.github/workflows/e2e_core.yml
@@ -34,6 +34,22 @@ on:
description: Type of workflow trigger
type: string
required: true
+ xfail:
+        description: Tests expected to fail
+ type: string
+ required: false
+ xfail_not:
+        description: Tests to exclude from the xfail list
+ type: string
+ required: false
+ filter_out:
+ description: Tests to filter out completely
+ type: string
+ required: false
+ extra_lit_flags:
+ description: Additional llvm-lit flags to use
+ type: string
+ required: false
permissions:
contents: read
@@ -50,7 +66,8 @@ jobs:
str_name: "${{inputs.str_name}}",
prefix: "${{inputs.prefix}}",
config: "${{inputs.config}}",
- unit: "${{inputs.unit}}"}
+ unit: "${{inputs.unit}}",
+ extra_lit_flags: "${{inputs.extra_lit_flags}}"},
]
build_type: [Release]
compiler: [{c: clang, cxx: clang++}]
@@ -103,7 +120,7 @@ jobs:
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
with:
repository: intel/llvm
- ref: sycl
+ ref: refs/heads/sycl
path: sycl-repo
- name: Set CUDA env vars
@@ -112,11 +129,6 @@ jobs:
echo "CUDA_LIB_PATH=/usr/local/cuda/lib64/stubs" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=/usr/local/cuda/compat/:/usr/local/cuda/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
- - name: Run pre setup
- run: |
- source /opt/intel/oneapi/setvars.sh --force
- sycl-ls
-
- name: Configure SYCL
run: >
python3 sycl-repo/buildbot/configure.py
@@ -134,11 +146,15 @@ jobs:
- name: Build SYCL
run: cmake --build ${{github.workspace}}/sycl_build -j
+ - name: Set extra llvm-lit options
+ if: matrix.adapter.extra_lit_flags != ''
+ run: echo "LIT_OPTS=${{matrix.adapter.extra_lit_flags}}" >> $GITHUB_ENV
+
- name: Run check-sycl
# Remove after fixing SYCL test :: abi/layout_handler.cpp
# This issue does not affect further execution of e2e with UR.
continue-on-error: true
- run: cmake --build ${{github.workspace}}/sycl_build --target check-sycl -j6
+ run: cmake --build ${{github.workspace}}/sycl_build --target check-sycl
- name: Set additional env. vars
run: |
@@ -161,6 +177,13 @@ jobs:
-DCMAKE_CXX_COMPILER="$(which clang++)"
-DLLVM_LIT="${{github.workspace}}/sycl-repo/llvm/utils/lit/lit.py"
+ - name: Set test filters for L0
+ if: matrix.adapter.name == 'L0'
+ run: |
+ echo "LIT_XFAIL_NOT=${{inputs.xfail_not}}" >> $GITHUB_ENV
+ echo "LIT_XFAIL=${{inputs.xfail}}" >> $GITHUB_ENV
+ echo "LIT_FILTER_OUT=${{inputs.filter_out}}" >> $GITHUB_ENV
+
- name: Run e2e tests
id: tests
run: ninja -C build-e2e check-sycl-e2e
diff --git a/.github/workflows/e2e_level_zero.yml b/.github/workflows/e2e_level_zero.yml
new file mode 100644
index 0000000000..66c23715ea
--- /dev/null
+++ b/.github/workflows/e2e_level_zero.yml
@@ -0,0 +1,36 @@
+name: E2E Level Zero
+
+on:
+ schedule:
+ # Run every day at 23:00 UTC
+ - cron: '0 23 * * *'
+ issue_comment:
+ types: [created, edited]
+
+permissions:
+ contents: read
+ pull-requests: write
+
+jobs:
+ e2e-build-hw:
+    # trigger on a schedule, or if a PR comment contains "/e2e-level-zero"
+ if: ${{ (github.event.issue.pull_request && contains(github.event.comment.body, '/e2e-level-zero')) || (github.event_name == 'schedule') }}
+ name: Start e2e job
+    # use the core flow, run it with L0-specific parameters
+ uses: ./.github/workflows/e2e_core.yml
+ # parameters that we pass to the core flow
+ with:
+ name: "L0"
+ runner_tag: "L0_E2E"
+ str_name: "level_zero"
+ prefix: "ext_oneapi_"
+ config: ""
+ unit: "gpu"
+ trigger: "${{github.event_name}}"
+ # Failing tests
+ xfail: "ESIMD/preemption.cpp;syclcompat/atomic/atomic_class.cpp;ProgramManager/uneven_kernel_split.cpp;Plugin/level_zero_ext_intel_queue_index.cpp;Plugin/level_zero_ext_intel_cslice.cpp;Matrix/joint_matrix_rowmajorA_rowmajorB.cpp;Matrix/element_wise_ops.cpp;Matrix/element_wise_all_ops.cpp;Matrix/SG32/element_wise_all_ops.cpp"
+ # Flaky tests
+ filter_out: "GroupAlgorithm/root_group.cpp|Basic/exceptions-SYCL-2020.cpp|Graph/UnsupportedDevice/device_query.cpp|Graph/RecordReplay/exception_inconsistent_contexts.cpp"
+    # These runners by default spawn upwards of 260 workers. That's too many for the GPU.
+    # We also add a timeout in case some test hangs.
+ extra_lit_flags: "-sv -j 50 --max-time 600"
diff --git a/.github/workflows/hadolint.yml b/.github/workflows/hadolint.yml
deleted file mode 100644
index a1063aefe8..0000000000
--- a/.github/workflows/hadolint.yml
+++ /dev/null
@@ -1,34 +0,0 @@
-# Runs linter for Docker files
-name: Hadolint
-
-on:
- workflow_dispatch:
- push:
- pull_request:
- paths:
- - '.github/docker/*Dockerfile'
- - '.github/workflows/hadolint.yml'
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- linux:
- name: Hadolint
- runs-on: ubuntu-latest
-
- steps:
- - name: Clone the git repo
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-
- - name: Run Hadolint
- uses: hadolint/hadolint-action@54c9adbab1582c2ef04b2016b760714a4bfde3cf # v3.1.0
- with:
- recursive: true
- dockerfile: ".github/docker/*Dockerfile"
- # ignore pinning apt packages to versions
- ignore: DL3008
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
index 97362d4f1a..d0cb335d96 100644
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -10,6 +10,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
+permissions:
+ contents: read
+
jobs:
labeler:
permissions:
@@ -17,4 +20,4 @@ jobs:
pull-requests: write
runs-on: ubuntu-latest
steps:
- - uses: actions/labeler@v5
+ - uses: actions/labeler@8558fd74291d67161a8a78ce36a881fa63b766a9 # v5.0.0
diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml
new file mode 100644
index 0000000000..2efb04c86a
--- /dev/null
+++ b/.github/workflows/trivy.yml
@@ -0,0 +1,50 @@
+# Runs linter for Docker files
+name: Trivy
+
+on:
+ workflow_dispatch:
+ push:
+ pull_request:
+ paths:
+ - '.github/docker/*Dockerfile'
+ - '.github/workflows/trivy.yml'
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+
+jobs:
+ linux:
+ name: Trivy
+ runs-on: ubuntu-latest
+ permissions:
+ security-events: write
+
+ steps:
+ - name: Clone repo
+ uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+
+ - name: Run Trivy
+ uses: aquasecurity/trivy-action@84384bd6e777ef152729993b8145ea352e9dd3ef # v0.17.0
+ with:
+ scan-type: 'config'
+ hide-progress: false
+ format: 'sarif'
+ output: 'trivy-results.sarif'
+ exit-code: 1 # Fail if issue found
+ # file with suppressions: .trivyignore (in root dir)
+
+ - name: Print report and trivyignore file
+ run: |
+ echo "### Trivy ignore content:"
+ cat .trivyignore
+ echo "### Trivy report:"
+ cat trivy-results.sarif
+
+ - name: Upload results
+ uses: github/codeql-action/upload-sarif@e8893c57a1f3a2b659b6b55564fdfdbbd2982911 # v3.24.0
+ with:
+ sarif_file: 'trivy-results.sarif'
diff --git a/.trivyignore b/.trivyignore
new file mode 100644
index 0000000000..63749166bc
--- /dev/null
+++ b/.trivyignore
@@ -0,0 +1,6 @@
+# Docs: https://aquasecurity.github.io/trivy/latest/docs/configuration/filtering/#trivyignore
+
+# In Dockerfiles:
+# HEALTHCHECK is not required for development, nor in CI (failed docker = failed CI).
+# We're not hosting any application in these Docker images.
+AVD-DS-0026
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4fcd74e729..cfd8752657 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,8 +3,8 @@
# See LICENSE.TXT
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-cmake_minimum_required(VERSION 3.14.0 FATAL_ERROR)
-project(unified-runtime VERSION 0.9.0)
+cmake_minimum_required(VERSION 3.20.0 FATAL_ERROR)
+project(unified-runtime VERSION 0.10.0)
include(GNUInstallDirs)
include(CheckCXXSourceCompiles)
@@ -47,12 +47,15 @@ option(UR_BUILD_ADAPTER_ALL "Build all currently supported adapters" OFF)
option(UR_BUILD_EXAMPLE_CODEGEN "Build the codegen example." OFF)
option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace for linux" OFF)
option(UR_ENABLE_ASSERTIONS "Enable assertions for all build types" OFF)
+option(UR_BUILD_XPTI_LIBS "Build the XPTI libraries when tracing is enabled" ON)
set(UR_DPCXX "" CACHE FILEPATH "Path of the DPC++ compiler executable")
+set(UR_DPCXX_BUILD_FLAGS "" CACHE STRING "Build flags to pass to DPC++ when compiling device programs")
set(UR_SYCL_LIBRARY_DIR "" CACHE PATH
"Path of the SYCL runtime library directory")
set(UR_CONFORMANCE_TARGET_TRIPLES "" CACHE STRING
"List of sycl targets to build CTS device binaries for")
set(UR_CONFORMANCE_AMD_ARCH "" CACHE STRING "AMD device target ID to build CTS binaries for")
+option(UR_CONFORMANCE_ENABLE_MATCH_FILES "Enable CTS match files" ON)
set(UR_ADAPTER_LEVEL_ZERO_SOURCE_DIR "" CACHE PATH
"Path to external 'level_zero' adapter source dir")
set(UR_ADAPTER_OPENCL_SOURCE_DIR "" CACHE PATH
@@ -107,31 +110,34 @@ endif()
if(UR_ENABLE_TRACING)
add_compile_definitions(UR_ENABLE_TRACING)
- # fetch xpti proxy library for the tracing layer
- FetchContentSparse_Declare(xpti https://github.com/intel/llvm.git "sycl-nightly/20230703" "xpti")
- FetchContent_MakeAvailable(xpti)
+ if (UR_BUILD_XPTI_LIBS)
+ # fetch xpti proxy library for the tracing layer
+ FetchContentSparse_Declare(xpti https://github.com/intel/llvm.git "sycl-nightly/20230703" "xpti")
+ FetchContent_MakeAvailable(xpti)
- # set -fPIC for xpti since we are linking it with a shared library
- set_target_properties(xpti PROPERTIES POSITION_INDEPENDENT_CODE ON)
+ # set -fPIC for xpti since we are linking it with a shared library
+ set_target_properties(xpti PROPERTIES POSITION_INDEPENDENT_CODE ON)
- # fetch the xptifw dispatcher, mostly used for testing
- # these variables need to be set for xptifw to compile
- set(XPTI_SOURCE_DIR ${xpti_SOURCE_DIR})
- set(XPTI_DIR ${xpti_SOURCE_DIR})
- set(XPTI_ENABLE_TESTS OFF CACHE INTERNAL "Turn off xptifw tests")
+ # fetch the xptifw dispatcher, mostly used for testing
+ # these variables need to be set for xptifw to compile
+ set(XPTI_SOURCE_DIR ${xpti_SOURCE_DIR})
+ set(XPTI_DIR ${xpti_SOURCE_DIR})
+ set(XPTI_ENABLE_TESTS OFF CACHE INTERNAL "Turn off xptifw tests")
- FetchContentSparse_Declare(xptifw https://github.com/intel/llvm.git "sycl-nightly/20230703" "xptifw")
+ FetchContentSparse_Declare(xptifw https://github.com/intel/llvm.git "sycl-nightly/20230703" "xptifw")
- FetchContent_MakeAvailable(xptifw)
+ FetchContent_MakeAvailable(xptifw)
- check_cxx_compiler_flag("-Wno-error=maybe-uninitialized" HAS_MAYBE_UNINIT)
- if (HAS_MAYBE_UNINIT)
- target_compile_options(xptifw PRIVATE -Wno-error=maybe-uninitialized)
+ check_cxx_compiler_flag("-Wno-error=maybe-uninitialized" HAS_MAYBE_UNINIT)
+ if (HAS_MAYBE_UNINIT)
+ target_compile_options(xptifw PRIVATE -Wno-error=maybe-uninitialized)
+ endif()
+
+ set_target_properties(xptifw PROPERTIES
+ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
+ )
endif()
- set_target_properties(xptifw PROPERTIES
- LIBRARY_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
- )
if (MSVC)
        set(TARGET_XPTI $<IF:$<CONFIG:Release>,xpti,xptid>)
else()
diff --git a/README.md b/README.md
index cb43c380b9..3d53d2117e 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,7 @@
[](https://github.com/oneapi-src/unified-runtime/actions/workflows/cmake.yml)
[](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_cuda.yml)
[](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_opencl.yml)
+[](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_level_zero.yml)
[](https://github.com/oneapi-src/unified-runtime/actions/workflows/codeql.yml)
[](https://github.com/oneapi-src/unified-runtime/actions/workflows/bandit.yml)
[](https://scan.coverity.com/projects/oneapi-src-unified-runtime)
@@ -13,7 +14,6 @@
## Table of contents
- [Unified Runtime](#unified-runtime)
- - [Adapters](#adapters)
- [Table of contents](#table-of-contents)
- [Contents of the repo](#contents-of-the-repo)
- [Integration](#integration)
@@ -29,7 +29,7 @@
- [Adapter naming convention](#adapter-naming-convention)
- [Source code generation](#source-code-generation)
- [Documentation](#documentation)
-6. [Release Process](#release-process)
+ - [Release Process](#release-process)
## Contents of the repo
@@ -88,7 +88,7 @@ for more detailed instructions on the correct setup.
Required packages:
- C++ compiler with C++17 support
-- [CMake](https://cmake.org/) >= 3.14.0
+- [CMake](https://cmake.org/) >= 3.20.0
- Python v3.6.6 or later
### Windows
@@ -131,6 +131,7 @@ List of options provided by CMake:
| UR_ENABLE_SANITIZER | Enable device sanitizer layer | ON/OFF | ON |
| UR_CONFORMANCE_TARGET_TRIPLES | SYCL triples to build CTS device binaries for | Comma-separated list | spir64 |
| UR_CONFORMANCE_AMD_ARCH | AMD device target ID to build CTS binaries for | string | `""` |
+| UR_CONFORMANCE_ENABLE_MATCH_FILES | Enable CTS match files | ON/OFF | ON |
| UR_BUILD_ADAPTER_L0 | Build the Level-Zero adapter | ON/OFF | OFF |
| UR_BUILD_ADAPTER_OPENCL | Build the OpenCL adapter | ON/OFF | OFF |
| UR_BUILD_ADAPTER_CUDA | Build the CUDA adapter | ON/OFF | OFF |
@@ -140,6 +141,8 @@ List of options provided by CMake:
| UR_HIP_PLATFORM | Build HIP adapter for AMD or NVIDIA platform | AMD/NVIDIA | AMD |
| UR_ENABLE_COMGR | Enable comgr lib usage | AMD/NVIDIA | AMD |
| UR_DPCXX | Path of the DPC++ compiler executable to build CTS device binaries | File path | `""` |
+| UR_DEVICE_CODE_EXTRACTOR | Path of the `clang-offload-extract` executable from the DPC++ package, required for CTS device binaries | File path | `"${dirname(UR_DPCXX)}/clang-offload-extract"` |
+| UR_DPCXX_BUILD_FLAGS | Build flags to pass to DPC++ when compiling device programs | Space-separated options list | `""` |
| UR_SYCL_LIBRARY_DIR | Path of the SYCL runtime library directory to build CTS device binaries | Directory path | `""` |
| UR_HIP_ROCM_DIR | Path of the default ROCm HIP installation | Directory path | `/opt/rocm` |
| UR_HIP_INCLUDE_DIR | Path of the ROCm HIP include directory | Directory path | `${UR_HIP_ROCM_DIR}/include` |
diff --git a/include/ur_api.h b/include/ur_api.h
index 8680dbeffb..96a58e04ea 100644
--- a/include/ur_api.h
+++ b/include/ur_api.h
@@ -7,7 +7,7 @@
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
* @file ur_api.h
- * @version v0.9-r0
+ * @version v0.10-r0
*
*/
#ifndef UR_API_H_INCLUDED
@@ -222,6 +222,7 @@ typedef enum ur_function_t {
UR_FUNCTION_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_EXP = 220, ///< Enumerator for ::urCommandBufferUpdateKernelLaunchExp
UR_FUNCTION_COMMAND_BUFFER_GET_INFO_EXP = 221, ///< Enumerator for ::urCommandBufferGetInfoExp
UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP = 222, ///< Enumerator for ::urCommandBufferCommandGetInfoExp
+ UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP = 223, ///< Enumerator for ::urEnqueueTimestampRecordingExp
/// @cond
UR_FUNCTION_FORCE_UINT32 = 0x7fffffff
/// @endcond
@@ -1097,11 +1098,12 @@ urPlatformGetInfo(
/// - API versions contain major and minor attributes, use
/// ::UR_MAJOR_VERSION and ::UR_MINOR_VERSION
typedef enum ur_api_version_t {
- UR_API_VERSION_0_6 = UR_MAKE_VERSION(0, 6), ///< version 0.6
- UR_API_VERSION_0_7 = UR_MAKE_VERSION(0, 7), ///< version 0.7
- UR_API_VERSION_0_8 = UR_MAKE_VERSION(0, 8), ///< version 0.8
- UR_API_VERSION_0_9 = UR_MAKE_VERSION(0, 9), ///< version 0.9
- UR_API_VERSION_CURRENT = UR_MAKE_VERSION(0, 9), ///< latest known version
+ UR_API_VERSION_0_6 = UR_MAKE_VERSION(0, 6), ///< version 0.6
+ UR_API_VERSION_0_7 = UR_MAKE_VERSION(0, 7), ///< version 0.7
+ UR_API_VERSION_0_8 = UR_MAKE_VERSION(0, 8), ///< version 0.8
+ UR_API_VERSION_0_9 = UR_MAKE_VERSION(0, 9), ///< version 0.9
+ UR_API_VERSION_0_10 = UR_MAKE_VERSION(0, 10), ///< version 0.10
+ UR_API_VERSION_CURRENT = UR_MAKE_VERSION(0, 10), ///< latest known version
/// @cond
UR_API_VERSION_FORCE_UINT32 = 0x7fffffff
/// @endcond
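For context, the version enumerators above rely on UR_MAKE_VERSION packing major and minor into one 32-bit value, which is why versions compare numerically; a small Python sketch of the assumed packing (the macro's actual definition lives elsewhere in ur_api.h, outside this hunk):

```python
# Assumed packing for UR_MAKE_VERSION: major in the high 16 bits,
# minor in the low 16 bits. Illustrative only; see ur_api.h for the macro.
def ur_make_version(major: int, minor: int) -> int:
    return (major << 16) | (minor & 0xFFFF)

assert ur_make_version(0, 10) == 10                    # UR_API_VERSION_0_10
assert ur_make_version(0, 9) < ur_make_version(0, 10)  # 0.9 precedes 0.10
```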
@@ -1434,201 +1436,214 @@ urDeviceGetSelected(
///////////////////////////////////////////////////////////////////////////////
/// @brief Supported device info
typedef enum ur_device_info_t {
- UR_DEVICE_INFO_TYPE = 0, ///< [::ur_device_type_t] type of the device
- UR_DEVICE_INFO_VENDOR_ID = 1, ///< [uint32_t] vendor Id of the device
- UR_DEVICE_INFO_DEVICE_ID = 2, ///< [uint32_t] Id of the device
- UR_DEVICE_INFO_MAX_COMPUTE_UNITS = 3, ///< [uint32_t] the number of compute units
- UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS = 4, ///< [uint32_t] max work item dimensions
- UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES = 5, ///< [size_t[]] return an array of max work item sizes
- UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE = 6, ///< [size_t] max work group size
- UR_DEVICE_INFO_SINGLE_FP_CONFIG = 7, ///< [::ur_device_fp_capability_flags_t] single precision floating point
- ///< capability
- UR_DEVICE_INFO_HALF_FP_CONFIG = 8, ///< [::ur_device_fp_capability_flags_t] half precision floating point
- ///< capability
- UR_DEVICE_INFO_DOUBLE_FP_CONFIG = 9, ///< [::ur_device_fp_capability_flags_t] double precision floating point
- ///< capability
- UR_DEVICE_INFO_QUEUE_PROPERTIES = 10, ///< [::ur_queue_flags_t] command queue properties supported by the device
- UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR = 11, ///< [uint32_t] preferred vector width for char
- UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT = 12, ///< [uint32_t] preferred vector width for short
- UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT = 13, ///< [uint32_t] preferred vector width for int
- UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG = 14, ///< [uint32_t] preferred vector width for long
- UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT = 15, ///< [uint32_t] preferred vector width for float
- UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE = 16, ///< [uint32_t] preferred vector width for double
- UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF = 17, ///< [uint32_t] preferred vector width for half float
- UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR = 18, ///< [uint32_t] native vector width for char
- UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT = 19, ///< [uint32_t] native vector width for short
- UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT = 20, ///< [uint32_t] native vector width for int
- UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG = 21, ///< [uint32_t] native vector width for long
- UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT = 22, ///< [uint32_t] native vector width for float
- UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE = 23, ///< [uint32_t] native vector width for double
- UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF = 24, ///< [uint32_t] native vector width for half float
- UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY = 25, ///< [uint32_t] max clock frequency in MHz
- UR_DEVICE_INFO_MEMORY_CLOCK_RATE = 26, ///< [uint32_t] memory clock frequency in MHz
- UR_DEVICE_INFO_ADDRESS_BITS = 27, ///< [uint32_t] address bits
- UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE = 28, ///< [uint64_t] max memory allocation size
- UR_DEVICE_INFO_IMAGE_SUPPORTED = 29, ///< [::ur_bool_t] images are supported
- UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS = 30, ///< [uint32_t] max number of image objects arguments of a kernel declared
- ///< with the read_only qualifier
- UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS = 31, ///< [uint32_t] max number of image objects arguments of a kernel declared
- ///< with the write_only qualifier
- UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS = 32, ///< [uint32_t] max number of image objects arguments of a kernel declared
- ///< with the read_write qualifier
- UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH = 33, ///< [size_t] max width of Image2D object
- UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT = 34, ///< [size_t] max height of Image2D object
- UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH = 35, ///< [size_t] max width of Image3D object
- UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT = 36, ///< [size_t] max height of Image3D object
- UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH = 37, ///< [size_t] max depth of Image3D object
- UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE = 38, ///< [size_t] max image buffer size
- UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE = 39, ///< [size_t] max image array size
- UR_DEVICE_INFO_MAX_SAMPLERS = 40, ///< [uint32_t] max number of samplers that can be used in a kernel
- UR_DEVICE_INFO_MAX_PARAMETER_SIZE = 41, ///< [size_t] max size in bytes of all arguments passed to a kernel
- UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN = 42, ///< [uint32_t] memory base address alignment
- UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE = 43, ///< [::ur_device_mem_cache_type_t] global memory cache type
- UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE = 44, ///< [uint32_t] global memory cache line size in bytes
- UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE = 45, ///< [uint64_t] size of global memory cache in bytes
- UR_DEVICE_INFO_GLOBAL_MEM_SIZE = 46, ///< [uint64_t] size of global memory in bytes
- UR_DEVICE_INFO_GLOBAL_MEM_FREE = 47, ///< [uint64_t] size of global memory which is free in bytes
- UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE = 48, ///< [uint64_t] max constant buffer size in bytes
- UR_DEVICE_INFO_MAX_CONSTANT_ARGS = 49, ///< [uint32_t] max number of __const declared arguments in a kernel
- UR_DEVICE_INFO_LOCAL_MEM_TYPE = 50, ///< [::ur_device_local_mem_type_t] local memory type
- UR_DEVICE_INFO_LOCAL_MEM_SIZE = 51, ///< [uint64_t] local memory size in bytes
- UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT = 52, ///< [::ur_bool_t] support error correction to global and local memory
- UR_DEVICE_INFO_HOST_UNIFIED_MEMORY = 53, ///< [::ur_bool_t] unified host device memory
- UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION = 54, ///< [size_t] profiling timer resolution in nanoseconds
- UR_DEVICE_INFO_ENDIAN_LITTLE = 55, ///< [::ur_bool_t] little endian byte order
- UR_DEVICE_INFO_AVAILABLE = 56, ///< [::ur_bool_t] device is available
- UR_DEVICE_INFO_COMPILER_AVAILABLE = 57, ///< [::ur_bool_t] device compiler is available
- UR_DEVICE_INFO_LINKER_AVAILABLE = 58, ///< [::ur_bool_t] device linker is available
- UR_DEVICE_INFO_EXECUTION_CAPABILITIES = 59, ///< [::ur_device_exec_capability_flags_t] device kernel execution
- ///< capability bit-field
- UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES = 60, ///< [::ur_queue_flags_t] device command queue property bit-field
- UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES = 61, ///< [::ur_queue_flags_t] host queue property bit-field
- UR_DEVICE_INFO_BUILT_IN_KERNELS = 62, ///< [char[]] a semi-colon separated list of built-in kernels
- UR_DEVICE_INFO_PLATFORM = 63, ///< [::ur_platform_handle_t] the platform associated with the device
- UR_DEVICE_INFO_REFERENCE_COUNT = 64, ///< [uint32_t] Reference count of the device object.
- ///< The reference count returned should be considered immediately stale.
- ///< It is unsuitable for general use in applications. This feature is
- ///< provided for identifying memory leaks.
- UR_DEVICE_INFO_IL_VERSION = 65, ///< [char[]] IL version
- UR_DEVICE_INFO_NAME = 66, ///< [char[]] Device name
- UR_DEVICE_INFO_VENDOR = 67, ///< [char[]] Device vendor
- UR_DEVICE_INFO_DRIVER_VERSION = 68, ///< [char[]] Driver version
- UR_DEVICE_INFO_PROFILE = 69, ///< [char[]] Device profile
- UR_DEVICE_INFO_VERSION = 70, ///< [char[]] Device version
- UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION = 71, ///< [char[]] Version of backend runtime
- UR_DEVICE_INFO_EXTENSIONS = 72, ///< [char[]] Return a space separated list of extension names
- UR_DEVICE_INFO_PRINTF_BUFFER_SIZE = 73, ///< [size_t] Maximum size in bytes of internal printf buffer
- UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC = 74, ///< [::ur_bool_t] prefer user synchronization when sharing object with
- ///< other API
- UR_DEVICE_INFO_PARENT_DEVICE = 75, ///< [::ur_device_handle_t] return parent device handle
- UR_DEVICE_INFO_SUPPORTED_PARTITIONS = 76, ///< [::ur_device_partition_t[]] Returns an array of partition types
- ///< supported by the device
- UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES = 77, ///< [uint32_t] maximum number of sub-devices when the device is
- ///< partitioned
- UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN = 78, ///< [::ur_device_affinity_domain_flags_t] Returns a bit-field of the
- ///< supported affinity domains for partitioning.
- ///< If the device does not support any affinity domains, then 0 will be returned.
- UR_DEVICE_INFO_PARTITION_TYPE = 79, ///< [::ur_device_partition_property_t[]] return an array of
- ///< ::ur_device_partition_property_t for properties specified in
- ///< ::urDevicePartition
- UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS = 80, ///< [uint32_t] max number of sub groups
- UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS = 81, ///< [::ur_bool_t] support sub group independent forward progress
- UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL = 82, ///< [uint32_t[]] return an array of sub group sizes supported on Intel
- ///< device
- UR_DEVICE_INFO_USM_HOST_SUPPORT = 83, ///< [::ur_device_usm_access_capability_flags_t] support USM host memory
- ///< access
- UR_DEVICE_INFO_USM_DEVICE_SUPPORT = 84, ///< [::ur_device_usm_access_capability_flags_t] support USM device memory
- ///< access
- UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT = 85, ///< [::ur_device_usm_access_capability_flags_t] support USM single device
- ///< shared memory access
- UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT = 86, ///< [::ur_device_usm_access_capability_flags_t] support USM cross device
- ///< shared memory access
- UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT = 87, ///< [::ur_device_usm_access_capability_flags_t] support USM system wide
- ///< shared memory access
- UR_DEVICE_INFO_UUID = 88, ///< [uint8_t[]] return device UUID
- UR_DEVICE_INFO_PCI_ADDRESS = 89, ///< [char[]] return device PCI address
- UR_DEVICE_INFO_GPU_EU_COUNT = 90, ///< [uint32_t] return Intel GPU EU count
- UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH = 91, ///< [uint32_t] return Intel GPU EU SIMD width
- UR_DEVICE_INFO_GPU_EU_SLICES = 92, ///< [uint32_t] return Intel GPU number of slices
- UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 93, ///< [uint32_t] return Intel GPU EU count per subslice
- UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 94, ///< [uint32_t] return Intel GPU number of subslices per slice
- UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU = 95, ///< [uint32_t] return Intel GPU number of threads per EU
- UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH = 96, ///< [uint32_t] return max memory bandwidth in Mb/s
- UR_DEVICE_INFO_IMAGE_SRGB = 97, ///< [::ur_bool_t] device supports sRGB images
- UR_DEVICE_INFO_BUILD_ON_SUBDEVICE = 98, ///< [::ur_bool_t] Return true if sub-device should do its own program
- ///< build
- UR_DEVICE_INFO_ATOMIC_64 = 99, ///< [::ur_bool_t] support 64 bit atomics
- UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 100, ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic
- ///< memory order capabilities
- UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 101, ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic
- ///< memory scope capabilities
- UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES = 102, ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic
- ///< memory fence order capabilities
- UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES = 103, ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic
- ///< memory fence scope capabilities
- UR_DEVICE_INFO_BFLOAT16 = 104, ///< [::ur_bool_t] support for bfloat16
- UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES = 105, ///< [uint32_t] Returns 1 if the device doesn't have a notion of a
- ///< queue index. Otherwise, returns the number of queue indices that are
- ///< available for this device.
- UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS = 106, ///< [::ur_bool_t] support the ::urKernelSetSpecializationConstants entry
- ///< point
- UR_DEVICE_INFO_MEMORY_BUS_WIDTH = 107, ///< [uint32_t] return the width in bits of the memory bus interface of the
- ///< device.
- UR_DEVICE_INFO_MAX_WORK_GROUPS_3D = 108, ///< [size_t[3]] return max 3D work groups
- UR_DEVICE_INFO_ASYNC_BARRIER = 109, ///< [::ur_bool_t] return true if Async Barrier is supported
- UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT = 110, ///< [::ur_bool_t] return true if specifying memory channels is supported
- UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED = 111, ///< [::ur_bool_t] Return true if the device supports enqueueing commands
- ///< to read and write pipes from the host.
- UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP = 112, ///< [uint32_t] The maximum number of registers available per block.
- UR_DEVICE_INFO_IP_VERSION = 113, ///< [uint32_t] The device IP version. The meaning of the device IP version
- ///< is implementation-defined, but newer devices should have a higher
- ///< version than older devices.
- UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT = 114, ///< [::ur_bool_t] return true if the device supports virtual memory.
- UR_DEVICE_INFO_ESIMD_SUPPORT = 115, ///< [::ur_bool_t] return true if the device supports ESIMD.
- UR_DEVICE_INFO_COMPONENT_DEVICES = 116, ///< [::ur_device_handle_t[]] The set of component devices contained by
- ///< this composite device.
- UR_DEVICE_INFO_COMPOSITE_DEVICE = 117, ///< [::ur_device_handle_t] The composite device containing this component
- ///< device.
- UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP = 0x1000, ///< [::ur_bool_t] Returns true if the device supports the use of
- ///< command-buffers.
- UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP = 0x1001, ///< [::ur_bool_t] Returns true if the device supports updating the kernel
- ///< commands in a command-buffer.
- UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP = 0x2000, ///< [::ur_bool_t] returns true if the device supports the creation of
- ///< bindless images
- UR_DEVICE_INFO_BINDLESS_IMAGES_SHARED_USM_SUPPORT_EXP = 0x2001, ///< [::ur_bool_t] returns true if the device supports the creation of
- ///< bindless images backed by shared USM
- UR_DEVICE_INFO_BINDLESS_IMAGES_1D_USM_SUPPORT_EXP = 0x2002, ///< [::ur_bool_t] returns true if the device supports the creation of 1D
- ///< bindless images backed by USM
- UR_DEVICE_INFO_BINDLESS_IMAGES_2D_USM_SUPPORT_EXP = 0x2003, ///< [::ur_bool_t] returns true if the device supports the creation of 2D
- ///< bindless images backed by USM
- UR_DEVICE_INFO_IMAGE_PITCH_ALIGN_EXP = 0x2004, ///< [uint32_t] returns the required alignment of the pitch between two
- ///< rows of an image in bytes
- UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP = 0x2005, ///< [size_t] returns the maximum linear width allowed for images allocated
- ///< using USM
- UR_DEVICE_INFO_MAX_IMAGE_LINEAR_HEIGHT_EXP = 0x2006, ///< [size_t] returns the maximum linear height allowed for images
- ///< allocated using USM
- UR_DEVICE_INFO_MAX_IMAGE_LINEAR_PITCH_EXP = 0x2007, ///< [size_t] returns the maximum linear pitch allowed for images allocated
- ///< using USM
- UR_DEVICE_INFO_MIPMAP_SUPPORT_EXP = 0x2008, ///< [::ur_bool_t] returns true if the device supports allocating mipmap
- ///< resources
- UR_DEVICE_INFO_MIPMAP_ANISOTROPY_SUPPORT_EXP = 0x2009, ///< [::ur_bool_t] returns true if the device supports sampling mipmap
- ///< images with anisotropic filtering
- UR_DEVICE_INFO_MIPMAP_MAX_ANISOTROPY_EXP = 0x200A, ///< [uint32_t] returns the maximum anisotropic ratio supported by the
- ///< device
- UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP = 0x200B, ///< [::ur_bool_t] returns true if the device supports using images created
- ///< from individual mipmap levels
- UR_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP = 0x200C, ///< [::ur_bool_t] returns true if the device supports importing external
- ///< memory resources
- UR_DEVICE_INFO_INTEROP_MEMORY_EXPORT_SUPPORT_EXP = 0x200D, ///< [::ur_bool_t] returns true if the device supports exporting internal
- ///< memory resources
- UR_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP = 0x200E, ///< [::ur_bool_t] returns true if the device supports importing external
- ///< semaphore resources
- UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP = 0x200F, ///< [::ur_bool_t] returns true if the device supports exporting internal
- ///< event resources
- UR_DEVICE_INFO_CUBEMAP_SUPPORT_EXP = 0x2010, ///< [::ur_bool_t] returns true if the device supports allocating and
- ///< accessing cubemap resources
- UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP = 0x2011, ///< [::ur_bool_t] returns true if the device supports sampling cubemapped
- ///< images across face boundaries
+ UR_DEVICE_INFO_TYPE = 0, ///< [::ur_device_type_t] type of the device
+ UR_DEVICE_INFO_VENDOR_ID = 1, ///< [uint32_t] vendor Id of the device
+ UR_DEVICE_INFO_DEVICE_ID = 2, ///< [uint32_t] Id of the device
+ UR_DEVICE_INFO_MAX_COMPUTE_UNITS = 3, ///< [uint32_t] the number of compute units
+ UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS = 4, ///< [uint32_t] max work item dimensions
+ UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES = 5, ///< [size_t[]] return an array of max work item sizes
+ UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE = 6, ///< [size_t] max work group size
+ UR_DEVICE_INFO_SINGLE_FP_CONFIG = 7, ///< [::ur_device_fp_capability_flags_t] single precision floating point
+ ///< capability
+ UR_DEVICE_INFO_HALF_FP_CONFIG = 8, ///< [::ur_device_fp_capability_flags_t] half precision floating point
+ ///< capability
+ UR_DEVICE_INFO_DOUBLE_FP_CONFIG = 9, ///< [::ur_device_fp_capability_flags_t] double precision floating point
+ ///< capability
+ UR_DEVICE_INFO_QUEUE_PROPERTIES = 10, ///< [::ur_queue_flags_t] command queue properties supported by the device
+ UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR = 11, ///< [uint32_t] preferred vector width for char
+ UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT = 12, ///< [uint32_t] preferred vector width for short
+ UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT = 13, ///< [uint32_t] preferred vector width for int
+ UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG = 14, ///< [uint32_t] preferred vector width for long
+ UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT = 15, ///< [uint32_t] preferred vector width for float
+ UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE = 16, ///< [uint32_t] preferred vector width for double
+ UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF = 17, ///< [uint32_t] preferred vector width for half float
+ UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR = 18, ///< [uint32_t] native vector width for char
+ UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT = 19, ///< [uint32_t] native vector width for short
+ UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT = 20, ///< [uint32_t] native vector width for int
+ UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG = 21, ///< [uint32_t] native vector width for long
+ UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT = 22, ///< [uint32_t] native vector width for float
+ UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE = 23, ///< [uint32_t] native vector width for double
+ UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF = 24, ///< [uint32_t] native vector width for half float
+ UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY = 25, ///< [uint32_t] max clock frequency in MHz
+ UR_DEVICE_INFO_MEMORY_CLOCK_RATE = 26, ///< [uint32_t] memory clock frequency in MHz
+ UR_DEVICE_INFO_ADDRESS_BITS = 27, ///< [uint32_t] address bits
+ UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE = 28, ///< [uint64_t] max memory allocation size
+ UR_DEVICE_INFO_IMAGE_SUPPORTED = 29, ///< [::ur_bool_t] images are supported
+ UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS = 30, ///< [uint32_t] max number of image objects arguments of a kernel declared
+ ///< with the read_only qualifier
+ UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS = 31, ///< [uint32_t] max number of image objects arguments of a kernel declared
+ ///< with the write_only qualifier
+ UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS = 32, ///< [uint32_t] max number of image objects arguments of a kernel declared
+ ///< with the read_write qualifier
+ UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH = 33, ///< [size_t] max width of Image2D object
+ UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT = 34, ///< [size_t] max height of Image2D object
+ UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH = 35, ///< [size_t] max width of Image3D object
+ UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT = 36, ///< [size_t] max height of Image3D object
+ UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH = 37, ///< [size_t] max depth of Image3D object
+ UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE = 38, ///< [size_t] max image buffer size
+ UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE = 39, ///< [size_t] max image array size
+ UR_DEVICE_INFO_MAX_SAMPLERS = 40, ///< [uint32_t] max number of samplers that can be used in a kernel
+ UR_DEVICE_INFO_MAX_PARAMETER_SIZE = 41, ///< [size_t] max size in bytes of all arguments passed to a kernel
+ UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN = 42, ///< [uint32_t] memory base address alignment
+ UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE = 43, ///< [::ur_device_mem_cache_type_t] global memory cache type
+ UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE = 44, ///< [uint32_t] global memory cache line size in bytes
+ UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE = 45, ///< [uint64_t] size of global memory cache in bytes
+ UR_DEVICE_INFO_GLOBAL_MEM_SIZE = 46, ///< [uint64_t] size of global memory in bytes
+ UR_DEVICE_INFO_GLOBAL_MEM_FREE = 47, ///< [uint64_t] size of global memory which is free in bytes
+ UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE = 48, ///< [uint64_t] max constant buffer size in bytes
+ UR_DEVICE_INFO_MAX_CONSTANT_ARGS = 49, ///< [uint32_t] max number of __const declared arguments in a kernel
+ UR_DEVICE_INFO_LOCAL_MEM_TYPE = 50, ///< [::ur_device_local_mem_type_t] local memory type
+ UR_DEVICE_INFO_LOCAL_MEM_SIZE = 51, ///< [uint64_t] local memory size in bytes
+ UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT = 52, ///< [::ur_bool_t] support error correction to global and local memory
+ UR_DEVICE_INFO_HOST_UNIFIED_MEMORY = 53, ///< [::ur_bool_t] unified host device memory
+ UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION = 54, ///< [size_t] profiling timer resolution in nanoseconds
+ UR_DEVICE_INFO_ENDIAN_LITTLE = 55, ///< [::ur_bool_t] little endian byte order
+ UR_DEVICE_INFO_AVAILABLE = 56, ///< [::ur_bool_t] device is available
+ UR_DEVICE_INFO_COMPILER_AVAILABLE = 57, ///< [::ur_bool_t] device compiler is available
+ UR_DEVICE_INFO_LINKER_AVAILABLE = 58, ///< [::ur_bool_t] device linker is available
+ UR_DEVICE_INFO_EXECUTION_CAPABILITIES = 59, ///< [::ur_device_exec_capability_flags_t] device kernel execution
+ ///< capability bit-field
+ UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES = 60, ///< [::ur_queue_flags_t] device command queue property bit-field
+ UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES = 61, ///< [::ur_queue_flags_t] host queue property bit-field
+ UR_DEVICE_INFO_BUILT_IN_KERNELS = 62, ///< [char[]] a semi-colon separated list of built-in kernels
+ UR_DEVICE_INFO_PLATFORM = 63, ///< [::ur_platform_handle_t] the platform associated with the device
+ UR_DEVICE_INFO_REFERENCE_COUNT = 64, ///< [uint32_t] Reference count of the device object.
+ ///< The reference count returned should be considered immediately stale.
+ ///< It is unsuitable for general use in applications. This feature is
+ ///< provided for identifying memory leaks.
+ UR_DEVICE_INFO_IL_VERSION = 65, ///< [char[]] IL version
+ UR_DEVICE_INFO_NAME = 66, ///< [char[]] Device name
+ UR_DEVICE_INFO_VENDOR = 67, ///< [char[]] Device vendor
+ UR_DEVICE_INFO_DRIVER_VERSION = 68, ///< [char[]] Driver version
+ UR_DEVICE_INFO_PROFILE = 69, ///< [char[]] Device profile
+ UR_DEVICE_INFO_VERSION = 70, ///< [char[]] Device version
+ UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION = 71, ///< [char[]] Version of backend runtime
+ UR_DEVICE_INFO_EXTENSIONS = 72, ///< [char[]] Return a space separated list of extension names
+ UR_DEVICE_INFO_PRINTF_BUFFER_SIZE = 73, ///< [size_t] Maximum size in bytes of internal printf buffer
+ UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC = 74, ///< [::ur_bool_t] prefer user synchronization when sharing object with
+ ///< other API
+ UR_DEVICE_INFO_PARENT_DEVICE = 75, ///< [::ur_device_handle_t] return parent device handle
+ UR_DEVICE_INFO_SUPPORTED_PARTITIONS = 76, ///< [::ur_device_partition_t[]] Returns an array of partition types
+ ///< supported by the device
+ UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES = 77, ///< [uint32_t] maximum number of sub-devices when the device is
+ ///< partitioned
+ UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN = 78, ///< [::ur_device_affinity_domain_flags_t] Returns a bit-field of the
+ ///< supported affinity domains for partitioning.
+ ///< If the device does not support any affinity domains, then 0 will be returned.
+ UR_DEVICE_INFO_PARTITION_TYPE = 79, ///< [::ur_device_partition_property_t[]] return an array of
+ ///< ::ur_device_partition_property_t for properties specified in
+ ///< ::urDevicePartition
+ UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS = 80, ///< [uint32_t] max number of sub groups
+ UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS = 81, ///< [::ur_bool_t] support sub group independent forward progress
+ UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL = 82, ///< [uint32_t[]] return an array of sub group sizes supported on Intel
+ ///< device
+ UR_DEVICE_INFO_USM_HOST_SUPPORT = 83, ///< [::ur_device_usm_access_capability_flags_t] support USM host memory
+ ///< access
+ UR_DEVICE_INFO_USM_DEVICE_SUPPORT = 84, ///< [::ur_device_usm_access_capability_flags_t] support USM device memory
+ ///< access
+ UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT = 85, ///< [::ur_device_usm_access_capability_flags_t] support USM single device
+ ///< shared memory access
+ UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT = 86, ///< [::ur_device_usm_access_capability_flags_t] support USM cross device
+ ///< shared memory access
+ UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT = 87, ///< [::ur_device_usm_access_capability_flags_t] support USM system wide
+ ///< shared memory access
+ UR_DEVICE_INFO_UUID = 88, ///< [uint8_t[]] return device UUID
+ UR_DEVICE_INFO_PCI_ADDRESS = 89, ///< [char[]] return device PCI address
+ UR_DEVICE_INFO_GPU_EU_COUNT = 90, ///< [uint32_t] return Intel GPU EU count
+ UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH = 91, ///< [uint32_t] return Intel GPU EU SIMD width
+ UR_DEVICE_INFO_GPU_EU_SLICES = 92, ///< [uint32_t] return Intel GPU number of slices
+ UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 93, ///< [uint32_t] return Intel GPU EU count per subslice
+ UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 94, ///< [uint32_t] return Intel GPU number of subslices per slice
+ UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU = 95, ///< [uint32_t] return Intel GPU number of threads per EU
+ UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH = 96, ///< [uint32_t] return max memory bandwidth in Mb/s
+ UR_DEVICE_INFO_IMAGE_SRGB = 97, ///< [::ur_bool_t] device supports sRGB images
+ UR_DEVICE_INFO_BUILD_ON_SUBDEVICE = 98, ///< [::ur_bool_t] Return true if sub-device should do its own program
+ ///< build
+ UR_DEVICE_INFO_ATOMIC_64 = 99, ///< [::ur_bool_t] support 64 bit atomics
+ UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 100, ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic
+ ///< memory order capabilities
+ UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 101, ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic
+ ///< memory scope capabilities
+ UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES = 102, ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic
+ ///< memory fence order capabilities
+ UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES = 103, ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic
+ ///< memory fence scope capabilities
+ UR_DEVICE_INFO_BFLOAT16 = 104, ///< [::ur_bool_t] support for bfloat16
+ UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES = 105, ///< [uint32_t] Returns 1 if the device doesn't have a notion of a
+ ///< queue index. Otherwise, returns the number of queue indices that are
+ ///< available for this device.
+ UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS = 106, ///< [::ur_bool_t] support the ::urKernelSetSpecializationConstants entry
+ ///< point
+ UR_DEVICE_INFO_MEMORY_BUS_WIDTH = 107, ///< [uint32_t] return the width in bits of the memory bus interface of the
+ ///< device.
+ UR_DEVICE_INFO_MAX_WORK_GROUPS_3D = 108, ///< [size_t[3]] return max 3D work groups
+ UR_DEVICE_INFO_ASYNC_BARRIER = 109, ///< [::ur_bool_t] return true if Async Barrier is supported
+ UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT = 110, ///< [::ur_bool_t] return true if specifying memory channels is supported
+ UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED = 111, ///< [::ur_bool_t] Return true if the device supports enqueueing commands
+ ///< to read and write pipes from the host.
+ UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP = 112, ///< [uint32_t] The maximum number of registers available per block.
+ UR_DEVICE_INFO_IP_VERSION = 113, ///< [uint32_t] The device IP version. The meaning of the device IP version
+ ///< is implementation-defined, but newer devices should have a higher
+ ///< version than older devices.
+ UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT = 114, ///< [::ur_bool_t] return true if the device supports virtual memory.
+ UR_DEVICE_INFO_ESIMD_SUPPORT = 115, ///< [::ur_bool_t] return true if the device supports ESIMD.
+ UR_DEVICE_INFO_COMPONENT_DEVICES = 116, ///< [::ur_device_handle_t[]] The set of component devices contained by
+ ///< this composite device.
+ UR_DEVICE_INFO_COMPOSITE_DEVICE = 117, ///< [::ur_device_handle_t] The composite device containing this component
+ ///< device.
+ UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP = 0x1000, ///< [::ur_bool_t] Returns true if the device supports the use of
+ ///< command-buffers.
+ UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP = 0x1001, ///< [::ur_bool_t] Returns true if the device supports updating the kernel
+ ///< commands in a command-buffer.
+ UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP = 0x2000, ///< [::ur_bool_t] returns true if the device supports the creation of
+ ///< bindless images
+ UR_DEVICE_INFO_BINDLESS_IMAGES_SHARED_USM_SUPPORT_EXP = 0x2001, ///< [::ur_bool_t] returns true if the device supports the creation of
+ ///< bindless images backed by shared USM
+ UR_DEVICE_INFO_BINDLESS_IMAGES_1D_USM_SUPPORT_EXP = 0x2002, ///< [::ur_bool_t] returns true if the device supports the creation of 1D
+ ///< bindless images backed by USM
+ UR_DEVICE_INFO_BINDLESS_IMAGES_2D_USM_SUPPORT_EXP = 0x2003, ///< [::ur_bool_t] returns true if the device supports the creation of 2D
+ ///< bindless images backed by USM
+ UR_DEVICE_INFO_IMAGE_PITCH_ALIGN_EXP = 0x2004, ///< [uint32_t] returns the required alignment of the pitch between two
+ ///< rows of an image in bytes
+ UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP = 0x2005, ///< [size_t] returns the maximum linear width allowed for images allocated
+ ///< using USM
+ UR_DEVICE_INFO_MAX_IMAGE_LINEAR_HEIGHT_EXP = 0x2006, ///< [size_t] returns the maximum linear height allowed for images
+ ///< allocated using USM
+ UR_DEVICE_INFO_MAX_IMAGE_LINEAR_PITCH_EXP = 0x2007, ///< [size_t] returns the maximum linear pitch allowed for images allocated
+ ///< using USM
+ UR_DEVICE_INFO_MIPMAP_SUPPORT_EXP = 0x2008, ///< [::ur_bool_t] returns true if the device supports allocating mipmap
+ ///< resources
+ UR_DEVICE_INFO_MIPMAP_ANISOTROPY_SUPPORT_EXP = 0x2009, ///< [::ur_bool_t] returns true if the device supports sampling mipmap
+ ///< images with anisotropic filtering
+ UR_DEVICE_INFO_MIPMAP_MAX_ANISOTROPY_EXP = 0x200A, ///< [uint32_t] returns the maximum anisotropic ratio supported by the
+ ///< device
+ UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP = 0x200B, ///< [::ur_bool_t] returns true if the device supports using images created
+ ///< from individual mipmap levels
+ UR_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP = 0x200C, ///< [::ur_bool_t] returns true if the device supports importing external
+ ///< memory resources
+ UR_DEVICE_INFO_INTEROP_MEMORY_EXPORT_SUPPORT_EXP = 0x200D, ///< [::ur_bool_t] returns true if the device supports exporting internal
+ ///< memory resources
+ UR_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP = 0x200E, ///< [::ur_bool_t] returns true if the device supports importing external
+ ///< semaphore resources
+ UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP = 0x200F, ///< [::ur_bool_t] returns true if the device supports exporting internal
+ ///< semaphore resources
+ UR_DEVICE_INFO_CUBEMAP_SUPPORT_EXP = 0x2010, ///< [::ur_bool_t] returns true if the device supports allocating and
+ ///< accessing cubemap resources
+ UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP = 0x2011, ///< [::ur_bool_t] returns true if the device supports sampling cubemapped
+ ///< images across face boundaries
+ UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP = 0x2012, ///< [::ur_bool_t] returns true if the device is capable of fetching USM
+ ///< backed 1D sampled image data.
+ UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP = 0x2013, ///< [::ur_bool_t] returns true if the device is capable of fetching
+ ///< non-USM backed 1D sampled image data.
+ UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP = 0x2014, ///< [::ur_bool_t] returns true if the device is capable of fetching USM
+ ///< backed 2D sampled image data.
+ UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP = 0x2015, ///< [::ur_bool_t] returns true if the device is capable of fetching
+ ///< non-USM backed 2D sampled image data.
+ UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP = 0x2016, ///< [::ur_bool_t] returns true if the device is capable of fetching USM
+ ///< backed 3D sampled image data.
+ UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP = 0x2017, ///< [::ur_bool_t] returns true if the device is capable of fetching
+ ///< non-USM backed 3D sampled image data.
+ UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP = 0x2018, ///< [::ur_bool_t] returns true if the device supports timestamp recording
/// @cond
UR_DEVICE_INFO_FORCE_UINT32 = 0x7fffffff
/// @endcond
@@ -1654,7 +1669,7 @@ typedef enum ur_device_info_t {
/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
/// + `NULL == hDevice`
/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION
-/// + `::UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP < propName`
+/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName`
/// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION
/// + If `propName` is not supported by the adapter.
/// - ::UR_RESULT_ERROR_INVALID_SIZE
@@ -4757,7 +4772,8 @@ typedef enum ur_kernel_group_info_t {
UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE = 0, ///< [size_t[3]] Return Work Group maximum global size
UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE = 1, ///< [size_t] Return maximum Work Group size
UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE = 2, ///< [size_t[3]] Return Work Group size required by the source code, such
- ///< as __attribute__((required_work_group_size(X,Y,Z))
+ ///< as __attribute__((required_work_group_size(X,Y,Z))), or (0, 0, 0) if
+ ///< unspecified
UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE = 3, ///< [size_t] Return local memory required by the Kernel
UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE = 4, ///< [size_t] Return preferred multiple of Work Group size for launch
UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE = 5, ///< [size_t] Return minimum amount of private memory in bytes used by each
@@ -4773,7 +4789,8 @@ typedef enum ur_kernel_group_info_t {
typedef enum ur_kernel_sub_group_info_t {
UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE = 0, ///< [uint32_t] Return maximum SubGroup size
UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS = 1, ///< [uint32_t] Return maximum number of SubGroup
- UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS = 2, ///< [uint32_t] Return number of SubGroup required by the source code
+ UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS = 2, ///< [uint32_t] Return number of SubGroup required by the source code or 0
+ ///< if unspecified
UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL = 3, ///< [uint32_t] Return SubGroup size required by Intel
/// @cond
UR_KERNEL_SUB_GROUP_INFO_FORCE_UINT32 = 0x7fffffff
@@ -5606,6 +5623,7 @@ typedef enum ur_command_t {
UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP = 0x1000, ///< Event created by ::urCommandBufferEnqueueExp
UR_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP = 0x2000, ///< Event created by ::urBindlessImagesWaitExternalSemaphoreExp
UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP = 0x2001, ///< Event created by ::urBindlessImagesSignalExternalSemaphoreExp
+ UR_COMMAND_TIMESTAMP_RECORDING_EXP = 0x2002, ///< Event created by ::urEnqueueTimestampRecordingExp
/// @cond
UR_COMMAND_FORCE_UINT32 = 0x7fffffff
/// @endcond
@@ -5973,6 +5991,7 @@ urEventSetCallback(
/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION
/// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE
/// - ::UR_RESULT_ERROR_INVALID_VALUE
+/// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGS - "The kernel argument values have not been specified."
/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY
/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES
UR_APIEXPORT ur_result_t UR_APICALL
@@ -7535,7 +7554,6 @@ urBindlessImagesImageFreeExp(
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
/// + `NULL == pImageFormat`
/// + `NULL == pImageDesc`
-/// + `NULL == phMem`
/// + `NULL == phImage`
/// - ::UR_RESULT_ERROR_INVALID_CONTEXT
/// - ::UR_RESULT_ERROR_INVALID_VALUE
@@ -7550,7 +7568,6 @@ urBindlessImagesUnsampledImageCreateExp(
ur_exp_image_mem_handle_t hImageMem, ///< [in] handle to memory from which to create the image
const ur_image_format_t *pImageFormat, ///< [in] pointer to image format specification
const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description
- ur_mem_handle_t *phMem, ///< [out] pointer to handle of image object created
ur_exp_image_handle_t *phImage ///< [out] pointer to handle of image object created
);
@@ -7574,7 +7591,6 @@ urBindlessImagesUnsampledImageCreateExp(
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
/// + `NULL == pImageFormat`
/// + `NULL == pImageDesc`
-/// + `NULL == phMem`
/// + `NULL == phImage`
/// - ::UR_RESULT_ERROR_INVALID_CONTEXT
/// - ::UR_RESULT_ERROR_INVALID_VALUE
@@ -7591,7 +7607,6 @@ urBindlessImagesSampledImageCreateExp(
const ur_image_format_t *pImageFormat, ///< [in] pointer to image format specification
const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description
ur_sampler_handle_t hSampler, ///< [in] sampler to be used
- ur_mem_handle_t *phMem, ///< [out] pointer to handle of image object created
ur_exp_image_handle_t *phImage ///< [out] pointer to handle of image object created
);
@@ -8878,6 +8893,48 @@ urKernelSuggestMaxCooperativeGroupCountExp(
uint32_t *pGroupCountRet ///< [out] pointer to maximum number of groups
);
+#if !defined(__GNUC__)
+#pragma endregion
+#endif
+// Intel 'oneAPI' Unified Runtime Experimental APIs for enqueuing timestamp recordings
+#if !defined(__GNUC__)
+#pragma region enqueue timestamp recording(experimental)
+#endif
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Enqueue a command for recording the device timestamp
+///
+/// @returns
+/// - ::UR_RESULT_SUCCESS
+/// - ::UR_RESULT_ERROR_UNINITIALIZED
+/// - ::UR_RESULT_ERROR_DEVICE_LOST
+/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+/// + `NULL == hQueue`
+/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+/// + `NULL == phEvent`
+/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST
+/// + `phEventWaitList == NULL && numEventsInWaitList > 0`
+/// + `phEventWaitList != NULL && numEventsInWaitList == 0`
+UR_APIEXPORT ur_result_t UR_APICALL
+urEnqueueTimestampRecordingExp(
+ ur_queue_handle_t hQueue, ///< [in] handle of the queue object
+ bool blocking, ///< [in] indicates whether the call to this function should block until
+ ///< the device timestamp recording command has executed on the
+ ///< device.
+ uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+ const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+ ///< events that must be complete before this command can execute.
+ ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait
+ ///< events.
+ ur_event_handle_t *phEvent ///< [in,out] return an event object that identifies this particular
+ ///< timestamp recording command. Profiling information can be queried
+ ///< from this event as if `hQueue` had profiling enabled. Querying
+ ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT`
+ ///< reports the timestamp at the time of the call to this function.
+ ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END`
+ ///< reports the timestamp recorded when the command is executed on the device.
+);
+
#if !defined(__GNUC__)
#pragma endregion
#endif
@@ -10588,6 +10645,18 @@ typedef struct ur_enqueue_cooperative_kernel_launch_exp_params_t {
ur_event_handle_t **pphEvent;
} ur_enqueue_cooperative_kernel_launch_exp_params_t;
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for urEnqueueTimestampRecordingExp
+/// @details Each entry is a pointer to the parameter passed to the function;
+/// allowing the callback the ability to modify the parameter's value
+typedef struct ur_enqueue_timestamp_recording_exp_params_t {
+ ur_queue_handle_t *phQueue;
+ bool *pblocking;
+ uint32_t *pnumEventsInWaitList;
+ const ur_event_handle_t **pphEventWaitList;
+ ur_event_handle_t **pphEvent;
+} ur_enqueue_timestamp_recording_exp_params_t;
+
///////////////////////////////////////////////////////////////////////////////
/// @brief Function parameters for urBindlessImagesUnsampledImageHandleDestroyExp
/// @details Each entry is a pointer to the parameter passed to the function;
@@ -10640,7 +10709,6 @@ typedef struct ur_bindless_images_unsampled_image_create_exp_params_t {
ur_exp_image_mem_handle_t *phImageMem;
const ur_image_format_t **ppImageFormat;
const ur_image_desc_t **ppImageDesc;
- ur_mem_handle_t **pphMem;
ur_exp_image_handle_t **pphImage;
} ur_bindless_images_unsampled_image_create_exp_params_t;
@@ -10655,7 +10723,6 @@ typedef struct ur_bindless_images_sampled_image_create_exp_params_t {
const ur_image_format_t **ppImageFormat;
const ur_image_desc_t **ppImageDesc;
ur_sampler_handle_t *phSampler;
- ur_mem_handle_t **pphMem;
ur_exp_image_handle_t **pphImage;
} ur_bindless_images_sampled_image_create_exp_params_t;
diff --git a/include/ur_ddi.h b/include/ur_ddi.h
index 5a069f0881..52ae226c2a 100644
--- a/include/ur_ddi.h
+++ b/include/ur_ddi.h
@@ -7,7 +7,7 @@
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
* @file ur_ddi.h
- * @version v0.9-r0
+ * @version v0.10-r0
*
*/
#ifndef UR_DDI_H_INCLUDED
@@ -1448,10 +1448,20 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueCooperativeKernelLaunchExp_t)(
const ur_event_handle_t *,
ur_event_handle_t *);
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function-pointer for urEnqueueTimestampRecordingExp
+typedef ur_result_t(UR_APICALL *ur_pfnEnqueueTimestampRecordingExp_t)(
+ ur_queue_handle_t,
+ bool,
+ uint32_t,
+ const ur_event_handle_t *,
+ ur_event_handle_t *);
+
///////////////////////////////////////////////////////////////////////////////
/// @brief Table of EnqueueExp functions pointers
typedef struct ur_enqueue_exp_dditable_t {
ur_pfnEnqueueCooperativeKernelLaunchExp_t pfnCooperativeKernelLaunchExp;
+ ur_pfnEnqueueTimestampRecordingExp_t pfnTimestampRecordingExp;
} ur_enqueue_exp_dditable_t;
///////////////////////////////////////////////////////////////////////////////
@@ -1513,7 +1523,6 @@ typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesUnsampledImageCreateExp_t)(
ur_exp_image_mem_handle_t,
const ur_image_format_t *,
const ur_image_desc_t *,
- ur_mem_handle_t *,
ur_exp_image_handle_t *);
///////////////////////////////////////////////////////////////////////////////
@@ -1525,7 +1534,6 @@ typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesSampledImageCreateExp_t)(
const ur_image_format_t *,
const ur_image_desc_t *,
ur_sampler_handle_t,
- ur_mem_handle_t *,
ur_exp_image_handle_t *);
///////////////////////////////////////////////////////////////////////////////
diff --git a/include/ur_print.h b/include/ur_print.h
index 3126c1714b..3377980ce7 100644
--- a/include/ur_print.h
+++ b/include/ur_print.h
@@ -1954,6 +1954,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueWriteHostPipeParams(const stru
/// - `buff_size < out_size`
UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueCooperativeKernelLaunchExpParams(const struct ur_enqueue_cooperative_kernel_launch_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size);
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print ur_enqueue_timestamp_recording_exp_params_t struct
+/// @returns
+/// - ::UR_RESULT_SUCCESS
+/// - ::UR_RESULT_ERROR_INVALID_SIZE
+/// - `buff_size < out_size`
+UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueTimestampRecordingExpParams(const struct ur_enqueue_timestamp_recording_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size);
+
///////////////////////////////////////////////////////////////////////////////
/// @brief Print ur_bindless_images_unsampled_image_handle_destroy_exp_params_t struct
/// @returns
diff --git a/include/ur_print.hpp b/include/ur_print.hpp
index a5074c5da1..bed5f01670 100644
--- a/include/ur_print.hpp
+++ b/include/ur_print.hpp
@@ -7,7 +7,7 @@
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
* @file ur_print.hpp
- * @version v0.9-r0
+ * @version v0.10-r0
*
*/
#ifndef UR_PRINT_HPP
@@ -916,6 +916,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) {
case UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP:
os << "UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP";
break;
+ case UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP:
+ os << "UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP";
+ break;
default:
os << "unknown enumerator";
break;
@@ -2553,6 +2556,27 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) {
case UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP:
os << "UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP";
break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP:
+ os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP";
+ break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP:
+ os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP";
+ break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP:
+ os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP";
+ break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP:
+ os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP";
+ break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP:
+ os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP";
+ break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP:
+ os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP";
+ break;
+ case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP:
+ os << "UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP";
+ break;
default:
os << "unknown enumerator";
break;
@@ -4190,6 +4214,90 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_device_info
os << ")";
} break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP: {
+ const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+ if (sizeof(ur_bool_t) > size) {
+ os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+ return UR_RESULT_ERROR_INVALID_SIZE;
+ }
+ os << (const void *)(tptr) << " (";
+
+ os << *tptr;
+
+ os << ")";
+ } break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP: {
+ const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+ if (sizeof(ur_bool_t) > size) {
+ os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+ return UR_RESULT_ERROR_INVALID_SIZE;
+ }
+ os << (const void *)(tptr) << " (";
+
+ os << *tptr;
+
+ os << ")";
+ } break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP: {
+ const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+ if (sizeof(ur_bool_t) > size) {
+ os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+ return UR_RESULT_ERROR_INVALID_SIZE;
+ }
+ os << (const void *)(tptr) << " (";
+
+ os << *tptr;
+
+ os << ")";
+ } break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP: {
+ const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+ if (sizeof(ur_bool_t) > size) {
+ os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+ return UR_RESULT_ERROR_INVALID_SIZE;
+ }
+ os << (const void *)(tptr) << " (";
+
+ os << *tptr;
+
+ os << ")";
+ } break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP: {
+ const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+ if (sizeof(ur_bool_t) > size) {
+ os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+ return UR_RESULT_ERROR_INVALID_SIZE;
+ }
+ os << (const void *)(tptr) << " (";
+
+ os << *tptr;
+
+ os << ")";
+ } break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP: {
+ const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+ if (sizeof(ur_bool_t) > size) {
+ os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+ return UR_RESULT_ERROR_INVALID_SIZE;
+ }
+ os << (const void *)(tptr) << " (";
+
+ os << *tptr;
+
+ os << ")";
+ } break;
+ case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: {
+ const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+ if (sizeof(ur_bool_t) > size) {
+ os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+ return UR_RESULT_ERROR_INVALID_SIZE;
+ }
+ os << (const void *)(tptr) << " (";
+
+ os << *tptr;
+
+ os << ")";
+ } break;
default:
os << "unknown enumerator";
return UR_RESULT_ERROR_INVALID_ENUMERATION;
@@ -8698,6 +8806,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_command_t value) {
case UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP:
os << "UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP";
break;
+ case UR_COMMAND_TIMESTAMP_RECORDING_EXP:
+ os << "UR_COMMAND_TIMESTAMP_RECORDING_EXP";
+ break;
default:
os << "unknown enumerator";
break;
@@ -14014,6 +14125,48 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
return os;
}
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print operator for the ur_enqueue_timestamp_recording_exp_params_t type
+/// @returns
+/// std::ostream &
+inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_enqueue_timestamp_recording_exp_params_t *params) {
+
+ os << ".hQueue = ";
+
+ ur::details::printPtr(os,
+ *(params->phQueue));
+
+ os << ", ";
+ os << ".blocking = ";
+
+ os << *(params->pblocking);
+
+ os << ", ";
+ os << ".numEventsInWaitList = ";
+
+ os << *(params->pnumEventsInWaitList);
+
+ os << ", ";
+ os << ".phEventWaitList = {";
+ for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) {
+ if (i != 0) {
+ os << ", ";
+ }
+
+ ur::details::printPtr(os,
+ (*(params->pphEventWaitList))[i]);
+ }
+ os << "}";
+
+ os << ", ";
+ os << ".phEvent = ";
+
+ ur::details::printPtr(os,
+ *(params->pphEvent));
+
+ return os;
+}
+
///////////////////////////////////////////////////////////////////////////////
/// @brief Print operator for the ur_bindless_images_unsampled_image_handle_destroy_exp_params_t type
/// @returns
@@ -14165,12 +14318,6 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
ur::details::printPtr(os,
*(params->ppImageDesc));
- os << ", ";
- os << ".phMem = ";
-
- ur::details::printPtr(os,
- *(params->pphMem));
-
os << ", ";
os << ".phImage = ";
@@ -14221,12 +14368,6 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
ur::details::printPtr(os,
*(params->phSampler));
- os << ", ";
- os << ".phMem = ";
-
- ur::details::printPtr(os,
- *(params->pphMem));
-
os << ", ";
os << ".phImage = ";
@@ -17036,6 +17177,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_
case UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP: {
os << (const struct ur_enqueue_cooperative_kernel_launch_exp_params_t *)params;
} break;
+ case UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP: {
+ os << (const struct ur_enqueue_timestamp_recording_exp_params_t *)params;
+ } break;
case UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_HANDLE_DESTROY_EXP: {
os << (const struct ur_bindless_images_unsampled_image_handle_destroy_exp_params_t *)params;
} break;
diff --git a/scripts/Doxyfile b/scripts/Doxyfile
index 0329aeffff..64f0ac58a6 100644
--- a/scripts/Doxyfile
+++ b/scripts/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME = "Intel One API Unified Runtime API"
# could be handy for archiving the generated documentation or if some version
# control system is used.
-PROJECT_NUMBER = v0.9
+PROJECT_NUMBER = v0.10
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
diff --git a/scripts/core/EXP-BINDLESS-IMAGES.rst b/scripts/core/EXP-BINDLESS-IMAGES.rst
index af90c1ea0f..ee54c69291 100644
--- a/scripts/core/EXP-BINDLESS-IMAGES.rst
+++ b/scripts/core/EXP-BINDLESS-IMAGES.rst
@@ -91,6 +91,12 @@ Enums
* ${X}_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP
* ${X}_DEVICE_INFO_CUBEMAP_SUPPORT_EXP
* ${X}_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP
+ * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP
+ * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP
+ * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP
+ * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP
+ * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP
+ * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP
* ${x}_command_t
* ${X}_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP
@@ -198,6 +204,8 @@ Changelog
+----------+-------------------------------------------------------------+
| 10.0 | Added cubemap image type, sampling properties, and device |
| | queries. |
++----------+-------------------------------------------------------------+
+| 11.0 | Added device queries for sampled image fetch capabilities. |
+----------+-------------------------------------------------------------+
Contributors
diff --git a/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst b/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst
new file mode 100644
index 0000000000..3cf46095b2
--- /dev/null
+++ b/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst
@@ -0,0 +1,70 @@
+<%
+ OneApi=tags['$OneApi']
+ x=tags['$x']
+ X=x.upper()
+%>
+
+.. _experimental-enqueue-timestamp-recording:
+
+================================================================================
+Enqueue Timestamp Recording
+================================================================================
+
+.. warning::
+
+ Experimental features:
+
+ * May be replaced, updated, or removed at any time.
+ * Do not require maintaining API/ABI stability of their own additions over
+ time.
+ * Do not require conformance testing of their own additions.
+
+
+Motivation
+--------------------------------------------------------------------------------
+Currently, the only way to get timestamp information is to enable profiling on
+a queue and retrieve the information from events associated with commands
+submitted to that queue. However, not all systems give the programmer who
+wants the profiling information full control over queue construction. To
+address this, this extension adds the ability to enqueue a timestamp recording
+on any queue, with or without profiling enabled. The resulting event can in
+turn be queried for the usual profiling information.
+
+
+API
+--------------------------------------------------------------------------------
+
+Enums
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* ${x}_device_info_t
+ * ${X}_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP
+
+* ${x}_command_t
+ * ${X}_COMMAND_TIMESTAMP_RECORDING_EXP
+
+Functions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+* ${x}EnqueueTimestampRecordingExp
+
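+Example
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following sketch is illustrative rather than normative: it assumes a
+device handle ``hDevice`` and a queue ``hQueue`` created elsewhere, and it
+omits error handling.
+
+.. code-block:: cpp
+
+    // Feature support must be queried before the entry point is used.
+    ur_bool_t supported = false;
+    ${x}DeviceGetInfo(hDevice, ${X}_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP,
+                      sizeof(supported), &supported, nullptr);
+
+    if (supported) {
+        // Record a device timestamp; hQueue does not need to have been
+        // created with ${X}_QUEUE_FLAG_PROFILING_ENABLE.
+        ur_event_handle_t hEvent = nullptr;
+        ${x}EnqueueTimestampRecordingExp(hQueue, /*blocking*/ true, 0, nullptr,
+                                         &hEvent);
+
+        // The event behaves as if the queue had profiling enabled.
+        uint64_t start = 0;
+        ${x}EventGetProfilingInfo(hEvent, ${X}_PROFILING_INFO_COMMAND_START,
+                                  sizeof(start), &start, nullptr);
+        ${x}EventRelease(hEvent);
+    }
+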
+Changelog
+--------------------------------------------------------------------------------
+
++-----------+------------------------+
+| Revision | Changes |
++===========+========================+
+| 1.0 | Initial Draft |
++-----------+------------------------+
+
+
+Support
+--------------------------------------------------------------------------------
+
+Adapters which support this experimental feature *must* return true for the new
+`${X}_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP` device info query.
+
+
+Contributors
+--------------------------------------------------------------------------------
+
+* Steffen Larsen `steffen.larsen@intel.com <steffen.larsen@intel.com>`_
diff --git a/scripts/core/INTRO.rst b/scripts/core/INTRO.rst
index 52e78153ae..a81c282070 100644
--- a/scripts/core/INTRO.rst
+++ b/scripts/core/INTRO.rst
@@ -182,6 +182,8 @@ Unified Runtime loader implements tracing support through the `XPTI framework
+ - "`phEventWaitList == NULL && numEventsInWaitList > 0`"
+ - "`phEventWaitList != NULL && numEventsInWaitList == 0`"
diff --git a/scripts/core/kernel.yml b/scripts/core/kernel.yml
index 4ce4f9c70a..4a0bf0bab1 100644
--- a/scripts/core/kernel.yml
+++ b/scripts/core/kernel.yml
@@ -135,7 +135,7 @@ etors:
- name: WORK_GROUP_SIZE
desc: "[size_t] Return maximum Work Group size"
- name: COMPILE_WORK_GROUP_SIZE
- desc: "[size_t[3]] Return Work Group size required by the source code, such as __attribute__((required_work_group_size(X,Y,Z))"
+ desc: "[size_t[3]] Return Work Group size required by the source code, such as __attribute__((required_work_group_size(X,Y,Z)), or (0, 0, 0) if unspecified"
- name: LOCAL_MEM_SIZE
desc: "[size_t] Return local memory required by the Kernel"
- name: PREFERRED_WORK_GROUP_SIZE_MULTIPLE
@@ -154,7 +154,7 @@ etors:
- name: MAX_NUM_SUB_GROUPS
desc: "[uint32_t] Return maximum number of SubGroup"
- name: COMPILE_NUM_SUB_GROUPS
- desc: "[uint32_t] Return number of SubGroup required by the source code"
+ desc: "[uint32_t] Return number of SubGroup required by the source code or 0 if unspecified"
- name: SUB_GROUP_SIZE_INTEL
desc: "[uint32_t] Return SubGroup size required by Intel"
--- #--------------------------------------------------------------------------
diff --git a/scripts/core/platform.yml b/scripts/core/platform.yml
index 510a7e16b3..c35b71622d 100644
--- a/scripts/core/platform.yml
+++ b/scripts/core/platform.yml
@@ -140,6 +140,9 @@ etors:
- name: "0_9"
value: "$X_MAKE_VERSION( 0, 9 )"
desc: "version 0.9"
+ - name: "0_10"
+ value: "$X_MAKE_VERSION( 0, 10 )"
+ desc: "version 0.10"
--- #--------------------------------------------------------------------------
type: function
desc: "Returns the API version supported by the specified platform"
diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml
index 5ae70f7bd1..6a551d5821 100644
--- a/scripts/core/registry.yml
+++ b/scripts/core/registry.yml
@@ -580,6 +580,9 @@ etors:
- name: COMMAND_BUFFER_COMMAND_GET_INFO_EXP
desc: Enumerator for $xCommandBufferCommandGetInfoExp
value: '222'
+- name: ENQUEUE_TIMESTAMP_RECORDING_EXP
+ desc: Enumerator for $xEnqueueTimestampRecordingExp
+ value: '223'
---
type: enum
desc: Defines structure types
diff --git a/scripts/ctest_parser.py b/scripts/ctest_parser.py
index bd1a84f3ee..5074a5c9b9 100755
--- a/scripts/ctest_parser.py
+++ b/scripts/ctest_parser.py
@@ -24,7 +24,7 @@ def get_cts_test_suite_names(working_directory):
]
def percent(amount, total):
- return round((amount / total) * 100, 2)
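+ # Avoid a ZeroDivisionError when no tests were run.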
+ return round((amount / (total or 1)) * 100, 2)
def summarize_results(results):
total = results['Total']
diff --git a/scripts/generate_docs.py b/scripts/generate_docs.py
index 5c14305898..9b29123e40 100644
--- a/scripts/generate_docs.py
+++ b/scripts/generate_docs.py
@@ -10,6 +10,7 @@
import subprocess
import util
import re
+from parse_specs import Version
RE_ENABLE = r"^\#\#\s*\-\-validate\s*\=\s*on$"
RE_DISABLE = r"^\#\#\s*\-\-validate\s*\=\s*off$"
@@ -87,7 +88,7 @@ def _make_ref(symbol, symbol_type, meta):
generate a valid reStructuredText file
"""
def _generate_valid_rst(fin, fout, namespace, tags, ver, rev, meta, fast_mode):
- ver=float(ver)
+ ver = Version(ver)
enable = True
code_block = False
diff --git a/scripts/generate_kernel_header.py b/scripts/generate_kernel_header.py
index f765189334..46e78d6ed7 100755
--- a/scripts/generate_kernel_header.py
+++ b/scripts/generate_kernel_header.py
@@ -12,9 +12,7 @@
import subprocess
import sys
-from mako.template import Template
-
-HEADER_TEMPLATE = Template("""/*
+HEADER_TEMPLATE = """/*
*
* Copyright (C) 2023 Intel Corporation
*
@@ -22,7 +20,7 @@
* See LICENSE.TXT
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
- * @file ${file_name}.h
+ * @file %s.h
*
*/
@@ -33,66 +31,63 @@
namespace uur {
namespace device_binaries {
std::map<std::string, std::vector<std::string>> program_kernel_map = {
-% for program, entry_points in kernel_name_dict.items():
- {"${program}", {
- % for entry_point in entry_points:
- "${entry_point}",
- % endfor
- }},
-% endfor
+%s
};
}
}
-""")
+"""
+PROGRAM_TEMPLATE = """\
+ {"%s", {
+%s
+ }},
+"""
+
+ENTRY_POINT_TEMPLATE = """\
+ "%s",
+"""
def generate_header(output_file, kernel_name_dict):
"""Render the template and write it to the output file."""
file_name = os.path.basename(output_file)
- rendered = HEADER_TEMPLATE.render(file_name=file_name,
- kernel_name_dict=kernel_name_dict)
+ device_binaries = ""
+ for program, entry_points in kernel_name_dict.items():
+ content = ""
+ for entry_point in entry_points:
+ content += ENTRY_POINT_TEMPLATE % entry_point
+ device_binaries += PROGRAM_TEMPLATE % (program, content)
+ rendered = HEADER_TEMPLATE % (file_name, device_binaries)
rendered = re.sub(r"\r\n", r"\n", rendered)
-
with open(output_file, "w") as fout:
fout.write(rendered)
-def get_mangled_names(dpcxx_path, source_file, output_header):
+def get_mangled_names(source_file, output_header):
"""Return a list of all the entry point names from a given sycl source file.
Filters out wrapper and offset handler entry points.
"""
output_dir = os.path.dirname(output_header)
- il_file = os.path.join(output_dir, os.path.basename(source_file) + ".ll")
- generate_il_command = f"""\
- {dpcxx_path} -S -fsycl -fsycl-device-code-split=off \
- -fsycl-device-only -o {il_file} {source_file}"""
- subprocess.run(generate_il_command, shell=True)
- kernel_line_regex = re.compile("define.*spir_kernel")
- definition_lines = []
- with open(il_file) as f:
+ name = os.path.splitext(os.path.basename(source_file))[0]
+ ih_file = os.path.join(output_dir, name, name + ".ih")
+ definitions = []
+ writing = False
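+ # The generated integration header declares every kernel name in a
+ # `const char* const kernel_names[] = { ... };` array; collect the quoted
+ # names between that line and the array's closing brace.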
+ with open(ih_file) as f:
lines = f.readlines()
for line in lines:
- if kernel_line_regex.search(line) is not None:
- definition_lines.append(line)
-
- entry_point_names = []
- kernel_name_regex = re.compile(r"@(.*?)\(")
- for line in definition_lines:
- if kernel_name_regex.search(line) is None:
- continue
- kernel_name = kernel_name_regex.search(line).group(1)
- if "kernel_wrapper" not in kernel_name and "with_offset" not in kernel_name:
- entry_point_names.append(kernel_name)
+ if "}" in line and writing:
+ break
+ # __pf_kernel_wrapper seems to be an internal function used by dpcpp
+ if writing and "19__pf_kernel_wrapper" not in line:
+ definitions.append(line.replace(",", "").strip()[1:-1])
+ if "const char* const kernel_names[] = {" in line:
+ writing = True
- os.remove(il_file)
- return entry_point_names
+ return definitions
def main():
parser = argparse.ArgumentParser()
- parser.add_argument("--dpcxx_path",
- help="Full path to dpc++ compiler executable.")
parser.add_argument(
"-o",
"--output",
@@ -105,7 +100,7 @@ def main():
for source_file in args.source_files:
program_name = os.path.splitext(os.path.basename(source_file))[0]
mangled_names[program_name] = get_mangled_names(
- args.dpcxx_path, source_file, args.output)
+ source_file, args.output)
generate_header(args.output, mangled_names)
diff --git a/scripts/parse_specs.py b/scripts/parse_specs.py
index 1bff8887f4..c54e0adbf8 100644
--- a/scripts/parse_specs.py
+++ b/scripts/parse_specs.py
@@ -6,20 +6,23 @@
SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""
+
import os
import generate_ids
import util
import re
import hashlib
import json
-import yaml
import copy
from templates.helper import param_traits, type_traits, value_traits
import ctypes
import itertools
+from typing import Dict, List, Optional
+from version import Version
+
-default_version = "0.9"
-all_versions = ["0.6", "0.7", "0.8", "0.9"]
+default_version = Version("0.10")
+all_versions = [Version(ver) for ver in ["0.6", "0.7", "0.8", "0.9", "0.10"]]
"""
preprocess object
@@ -105,14 +108,14 @@ def __validate_version(d, prefix="", base_version=default_version):
raise Exception(prefix+"'version' must be a string: '%s'"%type(d['version']))
try:
- version = str(float(d['version']))
+ version = str(d['version'])
except:
version = None
if version != d['version']:
raise Exception(prefix+"'version' invalid value: '%s'"%d['version'])
- return float(d.get('version', base_version))
+ return Version(d.get('version', base_version))
def __validate_tag(d, key, tags, case):
for x in tags:
@@ -127,7 +130,7 @@ def __validate_desc(desc):
raise Exception(prefix+"'version' must be a string: '%s'"%type(k))
try:
- version = str(float(k))
+ version = str(k)
except:
version = None
@@ -212,8 +215,8 @@ def __validate_etors(d, tags):
typed = d.get('typed_etors', False)
value = -1
- d_ver = d.get('version', default_version)
- max_ver = float(d_ver)
+ d_ver = Version(d.get('version', default_version))
+ max_ver = d_ver
for i, item in enumerate(d['etors']):
prefix="'etors'[%s] "%i
if not isinstance(item, dict):
@@ -300,8 +303,8 @@ def __validate_members(d, tags, meta):
if not isinstance(d['members'], list):
raise Exception("'members' must be a sequence: '%s'"%type(d['members']))
- d_ver = d.get('version', default_version)
- max_ver = float(d_ver)
+ d_ver = Version(d.get('version', default_version))
+ max_ver = d_ver
for i, item in enumerate(d['members']):
prefix="'members'[%s] "%i
if not isinstance(item, dict):
@@ -342,8 +345,8 @@ def __validate_params(d, tags, meta):
if not isinstance(d['params'], list):
raise Exception("'params' must be a sequence: '%s'"%type(d['params']))
- d_ver = d.get('version', default_version)
- max_ver = float(d_ver)
+ d_ver = Version(d.get('version', default_version))
+ max_ver = d_ver
min = {'[in]': None, '[out]': None, '[in,out]': None}
for i, item in enumerate(d['params']):
prefix="'params'[%s] "%i
@@ -501,24 +504,23 @@ def __validate_union_tag(d):
"""
filters object by version
"""
-def _filter_version(d, max_ver):
- ver = float(d.get('version', default_version))
+def _filter_version(d, max_ver: Version) -> Optional[Dict]:
+ ver = Version(d.get('version', default_version))
if ver > max_ver:
return None
- def __filter_desc(d):
+ def __filter_desc(d) -> dict:
if 'desc' in d and isinstance(d['desc'], dict):
for k, v in d['desc'].items():
- if float(k) <= max_ver:
- desc = v
- d['desc'] = desc
+ if Version(k) <= max_ver:
+ d['desc'] = v
return d
flt = []
type = d['type']
if 'enum' == type:
for e in d['etors']:
- ver = float(e.get('version', default_version))
+ ver = Version(e.get('version', default_version))
if ver <= max_ver:
flt.append(__filter_desc(e))
if d['name'].endswith('version_t'):
@@ -531,14 +533,14 @@ def __filter_desc(d):
elif 'function' == type:
for p in d['params']:
- ver = float(p.get('version', default_version))
+ ver = Version(p.get('version', default_version))
if ver <= max_ver:
flt.append(__filter_desc(p))
d['params'] = flt
elif 'struct' == type or 'union' == type or 'class' == type:
for m in d.get('members',[]):
- ver = float(m.get('version', default_version))
+ ver = Version(m.get('version', default_version))
if ver <= max_ver:
flt.append(__filter_desc(m))
d['members'] = flt
@@ -548,15 +550,15 @@ def __filter_desc(d):
"""
creates docs per version
"""
-def _make_versions(d, max_ver):
+def _make_versions(d, max_ver : Version) -> List[Version]:
docs = []
type = d['type']
if 'function' == type or 'struct' == type:
for ver in all_versions:
- if float(ver) > max_ver:
+ if ver > max_ver:
break
- dv = _filter_version(copy.deepcopy(d), float(ver))
+ dv = _filter_version(copy.deepcopy(d), ver)
if not dv:
continue
@@ -936,7 +938,7 @@ def parse(section, version, tags, meta, ref):
if not _validate_doc(f, d, tags, line_nums[i], meta):
continue
- d = _filter_version(d, float(version))
+ d = _filter_version(d, version)
if not d:
continue
@@ -948,7 +950,10 @@ def parse(section, version, tags, meta, ref):
# extract header from objects
if re.match(r"header", d['type']):
header = d
- header['ordinal'] = int(int(header.get('ordinal',"1000")) * float(header.get('version',"1.0")))
+ header["ordinal"] = int(
+ int(header.get("ordinal", "1000"))
+ * Version(header.get("version", "1.0")).major
+ )
header['ordinal'] *= 1000 if re.match(r"extension", header.get('desc',"").lower()) else 1
header['ordinal'] *= 1000 if re.match(r"experimental", header.get('desc',"").lower()) else 1
basename = os.path.splitext(os.path.basename(f))[0]
@@ -961,7 +966,7 @@ def parse(section, version, tags, meta, ref):
for c in '_-':
name = name.replace(c, ' ')
elif header:
- for d in _make_versions(d, float(version)):
+ for d in _make_versions(d, version):
objects.append(d)
meta = _generate_meta(d, header['ordinal'], meta)
diff --git a/scripts/run.py b/scripts/run.py
index b5a9ccc8b8..5d9b475007 100755
--- a/scripts/run.py
+++ b/scripts/run.py
@@ -13,9 +13,12 @@
import parse_specs
import generate_code
import generate_docs
-import os, sys, platform
+import os
+import sys
+import platform
import time
import subprocess
+from version import Version
"""
helper for adding mutually-exclusive boolean arguments "--name" and "--!name"
@@ -95,7 +98,7 @@ def get_version_from_cmakelists():
for line in cmakelists_file.readlines():
line = line.strip()
if line.startswith('project('):
- return re.findall(r'\d+\.\d+', line)[0]
+ return Version(re.findall(r'\d+\.\d+', line)[0])
raise Exception(f'unable to read project version from {cmakelists_path}')
@@ -117,8 +120,13 @@ def main():
add_argument(parser, "pdf", "generation of PDF file.")
add_argument(parser, "rst", "generation of reStructuredText files.", True)
parser.add_argument("--update_spec", type=str, help="root of integrated spec directory to update")
- parser.add_argument("--ver", type=str, default=get_version_from_cmakelists(),
- required=False, help="specification version to generate.")
+ parser.add_argument(
+ "--ver",
+ type=parse_specs.Version,
+ default=get_version_from_cmakelists(),
+ required=False,
+ help="specification version to generate.",
+ )
parser.add_argument("--api-json", type=str, default="unified_runtime.json", required=False, help="json output file for the spec")
parser.add_argument("--clang-format", type=str, default="clang-format", required=False, help="path to clang-format executable")
parser.add_argument('--fast-mode', action='store_true', help='Disable sections which are slow to render')
@@ -136,82 +144,75 @@ def main():
'meta' : {},
'ref' : {}
}
-
- try:
- for section in configParser.sections():
- input['configs'].append({
- 'name' : section,
- 'namespace': configParser.get(section,'namespace'),
- 'tags' : {'$'+key : configParser.get(section,key) for key in configParser.get(section,'tags').split(",")},
- })
-
- # phase 2: parse specs
- for config in input['configs']:
- specs, input['meta'], input['ref'] = parse_specs.parse(config['name'], args['ver'], config['tags'], input['meta'], input['ref'])
- input['specs'].append(specs)
+ for section in configParser.sections():
+ input['configs'].append({
+ 'name' : section,
+ 'namespace': configParser.get(section,'namespace'),
+ 'tags' : {'$'+key : configParser.get(section,key) for key in configParser.get(section,'tags').split(",")},
+ })
- util.jsonWrite(args['api_json'], input)
+ # phase 2: parse specs
+ for config in input['configs']:
+ specs, input['meta'], input['ref'] = parse_specs.parse(config['name'], args['ver'], config['tags'], input['meta'], input['ref'])
+ input['specs'].append(specs)
- # phase 3: generate files
- if args['clean']:
- clean()
+ util.jsonWrite(args['api_json'], input)
- incpath = os.path.join("../include/")
- srcpath = os.path.join("../source/")
- docpath = os.path.join("../docs/")
+ # phase 3: generate files
+ if args['clean']:
+ clean()
- generate_docs.prepare(docpath, args['rst'], args['html'], args['ver'])
+ incpath = os.path.join("../include/")
+ srcpath = os.path.join("../source/")
+ docpath = os.path.join("../docs/")
- for idx, specs in enumerate(input['specs']):
- config = input['configs'][idx]
- if args[config['name']]:
+ generate_docs.prepare(docpath, args['rst'], args['html'], args['ver'])
- generate_code.generate_api(incpath, srcpath, config['namespace'], config['tags'], args['ver'], args['rev'], specs, input['meta'])
+ for idx, specs in enumerate(input['specs']):
+ config = input['configs'][idx]
+ if args[config['name']]:
- # clang-format ur_api.h
- proc = subprocess.run([args['clang_format'], "--style=file", "-i" , "ur_api.h"], stderr=subprocess.PIPE, cwd=incpath)
- if proc.returncode != 0:
- print("-- clang-format failed with non-zero return code. --")
- print(proc.stderr.decode())
- raise Exception("Failed to format ur_api.h")
+ generate_code.generate_api(incpath, srcpath, config['namespace'], config['tags'], args['ver'], args['rev'], specs, input['meta'])
- if args['rst']:
- generate_docs.generate_rst(docpath, config['name'], config['namespace'], config['tags'], args['ver'], args['rev'], specs, input['meta'], args['fast_mode'])
+ # clang-format ur_api.h
+ proc = subprocess.run([args['clang_format'], "--style=file", "-i" , "ur_api.h"], stderr=subprocess.PIPE, cwd=incpath)
+ if proc.returncode != 0:
+ print("-- clang-format failed with non-zero return code. --")
+ print(proc.stderr.decode())
+ raise Exception("Failed to format ur_api.h")
- if util.makeErrorCount():
- print("\n%s Errors found during generation, stopping execution!"%util.makeErrorCount())
- return
+ if args['rst']:
+ generate_docs.generate_rst(docpath, config['name'], config['namespace'], config['tags'], args['ver'], args['rev'], specs, input['meta'], args['fast_mode'])
- if args['debug']:
- util.makoFileListWrite("generated.json")
+ if util.makeErrorCount():
+ print("\n%s Errors found during generation, stopping execution!"%util.makeErrorCount())
+ return
- # phase 4: build code
- if args['build']:
- if not build():
- print("\nBuild failed, stopping execution!")
- return
+ if args['debug']:
+ util.makoFileListWrite("generated.json")
- # phase 5: prep for publication of html or pdf
- if args['html'] or args['pdf']:
- generate_docs.generate_common(docpath, configParser.sections(), args['ver'], args['rev'])
+ # phase 4: build code
+ if args['build']:
+ if not build():
+ print("\nBuild failed, stopping execution!")
+ return
- # phase 5: publish documentation
- if args['html']:
- generate_docs.generate_html(docpath)
+ # phase 5: prep for publication of html or pdf
+ if args['html'] or args['pdf']:
+ generate_docs.generate_common(docpath, configParser.sections(), args['ver'], args['rev'])
- if args['pdf']:
- generate_docs.generate_pdf(docpath)
+ # phase 5: publish documentation
+ if args['html']:
+ generate_docs.generate_html(docpath)
- if args['update_spec']:
- update_spec(args['update_spec'])
+ if args['pdf']:
+ generate_docs.generate_pdf(docpath)
- print("\nCompleted in %.1f seconds!"%(time.time() - start))
+ if args['update_spec']:
+ update_spec(args['update_spec'])
- except BaseException as e:
- print("Failed to generate specification.")
- print(e)
- return sys.exit(1)
+ print("\nCompleted in %.1f seconds!"%(time.time() - start))
if __name__ == '__main__':
diff --git a/scripts/templates/helper.py b/scripts/templates/helper.py
index 3b8e5693b7..1d539d70fe 100644
--- a/scripts/templates/helper.py
+++ b/scripts/templates/helper.py
@@ -12,6 +12,7 @@
# allow imports from top-level scripts directory
sys.path.append("..")
+from version import Version
"""
Extracts traits from a spec object
@@ -1081,15 +1082,20 @@ def make_param_checks(namespace, tags, obj, cpp=False, meta=None):
def get_class_function_objs(specs, cname, version = None):
objects = []
for s in specs:
- for obj in s['objects']:
+ for obj in s["objects"]:
is_function = obj_traits.is_function(obj)
match_cls = cname == obj_traits.class_name(obj)
if is_function and match_cls:
if version is None:
objects.append(obj)
- elif float(obj.get('version',"1.0")) <= version:
+ elif Version(obj.get("version", "1.0")) <= version:
objects.append(obj)
- return sorted(objects, key=lambda obj: (float(obj.get('version',"1.0"))*10000) + int(obj.get('ordinal',"100")))
+ return sorted(
+ objects,
+ key=lambda obj: (Version(obj.get("version", "1.0")).major * 10000)
+ + int(obj.get("ordinal", "100")),
+ )
+
"""
Public:
@@ -1107,8 +1113,16 @@ def get_class_function_objs_exp(specs, cname):
exp_objects.append(obj)
else:
objects.append(obj)
- objects = sorted(objects, key=lambda obj: (float(obj.get('version',"1.0"))*10000) + int(obj.get('ordinal',"100")))
- exp_objects = sorted(exp_objects, key=lambda obj: (float(obj.get('version',"1.0"))*10000) + int(obj.get('ordinal',"100")))
+ objects = sorted(
+ objects,
+ key=lambda obj: (Version(obj.get("version", "1.0")).major * 10000)
+ + int(obj.get("ordinal", "100")),
+ )
+ exp_objects = sorted(
+ exp_objects,
+ key=lambda obj: (Version(obj.get("version", "1.0")).major * 10000)
+ + int(obj.get("ordinal", "100")),
+ )
return objects, exp_objects
"""
@@ -1222,7 +1236,7 @@ def _filter_param_list(params, filters1=["[in]", "[in,out]", "[out]"], filters2=
def get_pfncbtables(specs, meta, namespace, tags):
tables = []
for cname in sorted(meta['class'], key=lambda x: meta['class'][x]['ordinal']):
- objs = get_class_function_objs(specs, cname, 1.0)
+ objs = get_class_function_objs(specs, cname, Version('1.0'))
if len(objs) > 0:
name = get_table_name(namespace, tags, {'class': cname})
print(name)
diff --git a/scripts/templates/trcddi.cpp.mako b/scripts/templates/trcddi.cpp.mako
index 2ace43072b..6f6579d5ac 100644
--- a/scripts/templates/trcddi.cpp.mako
+++ b/scripts/templates/trcddi.cpp.mako
@@ -45,10 +45,16 @@ namespace ur_tracing_layer
${th.make_pfncb_param_type(n, tags, obj)} params = { &${",&".join(th.make_param_lines(n, tags, obj, format=["name"]))} };
uint64_t instance = context.notify_begin(${th.make_func_etor(n, tags, obj)}, "${th.make_func_name(n, tags, obj)}", &params);
+ context.logger.info("---> ${th.make_func_name(n, tags, obj)}");
+
${x}_result_t result = ${th.make_pfn_name(n, tags, obj)}( ${", ".join(th.make_param_lines(n, tags, obj, format=["name"]))} );
context.notify_end(${th.make_func_etor(n, tags, obj)}, "${th.make_func_name(n, tags, obj)}", &params, &result, instance);
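+ // Pretty-print the intercepted parameters and result for the tracing log.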
+ std::ostringstream args_str;
+ ur::extras::printFunctionParams(args_str, ${th.make_func_etor(n, tags, obj)}, &params);
+ context.logger.info("({}) -> {};\n", args_str.str(), result);
+
return result;
}
%if 'condition' in obj:
@@ -112,6 +118,10 @@ namespace ur_tracing_layer
return result;
}
+ // Recreate the logger in case env variables have been modified between
+ // program launch and the call to `urLoaderInit`
+ logger = logger::create_logger("tracing", true, true);
+
ur_tracing_layer::context.codelocData = codelocData;
%for tbl in th.get_pfntables(specs, meta, n, tags):
diff --git a/scripts/version.py b/scripts/version.py
new file mode 100644
index 0000000000..8fa72d65a7
--- /dev/null
+++ b/scripts/version.py
@@ -0,0 +1,41 @@
+"""
+Copyright (C) 2024 Intel Corporation
+
+Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+See LICENSE.TXT
+SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""
+
+import functools
+import re
+
+
+@functools.total_ordering
+class Version:
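+ """A "major.minor" version string with numeric comparison semantics."""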
+ def __init__(self, version: str):
+ assert isinstance(version, (str, Version))
+ if isinstance(version, str):
+ self.str = version
+ match = re.match(r"^(\d+)\.(\d+)$", self.str)
+ assert isinstance(match, re.Match)
+ self.major = int(match.groups()[0])
+ self.minor = int(match.groups()[1])
+ else:
+ self.str = version.str
+ self.major = version.major
+ self.minor = version.minor
+
+ def __eq__(self, other) -> bool:
+ assert isinstance(other, Version)
+ return self.major == other.major and self.minor == other.minor
+
+ def __lt__(self, other) -> bool:
+ assert isinstance(other, Version)
+ return self.major < other.major or (
+ self.major == other.major and self.minor < other.minor
+ )
+
+ def __str__(self) -> str:
+ return self.str
diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp
index 8f1ede3010..8d21a93c75 100644
--- a/source/adapters/cuda/command_buffer.cpp
+++ b/source/adapters/cuda/command_buffer.cpp
@@ -203,9 +203,10 @@ static ur_result_t enqueueCommandBufferFillHelper(
}
}
- UR_CHECK_ERROR(cuGraphAddMemsetNode(
- &GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
- DepsList.size(), &NodeParams, CommandBuffer->Device->getContext()));
+ UR_CHECK_ERROR(
+ cuGraphAddMemsetNode(&GraphNode, CommandBuffer->CudaGraph,
+ DepsList.data(), DepsList.size(), &NodeParams,
+ CommandBuffer->Device->getNativeContext()));
// Get sync point and register the cuNode with it.
*SyncPoint =
@@ -237,7 +238,7 @@ static ur_result_t enqueueCommandBufferFillHelper(
UR_CHECK_ERROR(cuGraphAddMemsetNode(
&GraphNodeFirst, CommandBuffer->CudaGraph, DepsList.data(),
DepsList.size(), &NodeParamsStepFirst,
- CommandBuffer->Device->getContext()));
+ CommandBuffer->Device->getNativeContext()));
// Get sync point and register the cuNode with it.
*SyncPoint = CommandBuffer->addSyncPoint(
@@ -269,7 +270,7 @@ static ur_result_t enqueueCommandBufferFillHelper(
UR_CHECK_ERROR(cuGraphAddMemsetNode(
&GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
DepsList.size(), &NodeParamsStep,
- CommandBuffer->Device->getContext()));
+ CommandBuffer->Device->getNativeContext()));
GraphNodePtr = std::make_shared<CUgraphNode>(GraphNode);
// Get sync point and register the cuNode with it.
@@ -478,7 +479,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp(
UR_CHECK_ERROR(cuGraphAddMemcpyNode(
&GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(),
- &NodeParams, hCommandBuffer->Device->getContext()));
+ &NodeParams, hCommandBuffer->Device->getNativeContext()));
// Get sync point and register the cuNode with it.
*pSyncPoint =
@@ -513,8 +514,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp(
}
try {
- auto Src = std::get<BufferMem>(hSrcMem->Mem).get() + srcOffset;
- auto Dst = std::get<BufferMem>(hDstMem->Mem).get() + dstOffset;
+ auto Src = std::get<BufferMem>(hSrcMem->Mem)
+ .getPtrWithOffset(hCommandBuffer->Device, srcOffset);
+ auto Dst = std::get<BufferMem>(hDstMem->Mem)
+ .getPtrWithOffset(hCommandBuffer->Device, dstOffset);
CUDA_MEMCPY3D NodeParams = {};
setCopyParams(&Src, CU_MEMORYTYPE_DEVICE, &Dst, CU_MEMORYTYPE_DEVICE, size,
@@ -522,7 +525,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp(
UR_CHECK_ERROR(cuGraphAddMemcpyNode(
&GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(),
- &NodeParams, hCommandBuffer->Device->getContext()));
+ &NodeParams, hCommandBuffer->Device->getNativeContext()));
// Get sync point and register the cuNode with it.
*pSyncPoint =
@@ -553,8 +556,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp(
}
try {
- CUdeviceptr SrcPtr = std::get<BufferMem>(hSrcMem->Mem).get();
- CUdeviceptr DstPtr = std::get<BufferMem>(hDstMem->Mem).get();
+ auto SrcPtr =
+ std::get<BufferMem>(hSrcMem->Mem).getPtr(hCommandBuffer->Device);
+ auto DstPtr =
+ std::get<BufferMem>(hDstMem->Mem).getPtr(hCommandBuffer->Device);
CUDA_MEMCPY3D NodeParams = {};
setCopyRectParams(region, &SrcPtr, CU_MEMORYTYPE_DEVICE, srcOrigin,
@@ -563,7 +568,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp(
UR_CHECK_ERROR(cuGraphAddMemcpyNode(
&GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(),
- &NodeParams, hCommandBuffer->Device->getContext()));
+ &NodeParams, hCommandBuffer->Device->getNativeContext()));
// Get sync point and register the cuNode with it.
*pSyncPoint =
@@ -593,7 +598,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp(
}
try {
- auto Dst = std::get<BufferMem>(hBuffer->Mem).get() + offset;
+ auto Dst = std::get<BufferMem>(hBuffer->Mem)
+ .getPtrWithOffset(hCommandBuffer->Device, offset);
CUDA_MEMCPY3D NodeParams = {};
setCopyParams(pSrc, CU_MEMORYTYPE_HOST, &Dst, CU_MEMORYTYPE_DEVICE, size,
@@ -601,7 +607,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp(
UR_CHECK_ERROR(cuGraphAddMemcpyNode(
&GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(),
- &NodeParams, hCommandBuffer->Device->getContext()));
+ &NodeParams, hCommandBuffer->Device->getNativeContext()));
// Get sync point and register the cuNode with it.
*pSyncPoint =
@@ -630,7 +636,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp(
}
try {
- auto Src = std::get<BufferMem>(hBuffer->Mem).get() + offset;
+ auto Src = std::get<BufferMem>(hBuffer->Mem)
+ .getPtrWithOffset(hCommandBuffer->Device, offset);
CUDA_MEMCPY3D NodeParams = {};
setCopyParams(&Src, CU_MEMORYTYPE_DEVICE, pDst, CU_MEMORYTYPE_HOST, size,
@@ -638,7 +645,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp(
UR_CHECK_ERROR(cuGraphAddMemcpyNode(
&GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(),
- &NodeParams, hCommandBuffer->Device->getContext()));
+ &NodeParams, hCommandBuffer->Device->getNativeContext()));
// Get sync point and register the cuNode with it.
*pSyncPoint =
@@ -670,7 +677,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp(
}
try {
- CUdeviceptr DstPtr = std::get<BufferMem>(hBuffer->Mem).get();
+ auto DstPtr =
+ std::get<BufferMem>(hBuffer->Mem).getPtr(hCommandBuffer->Device);
CUDA_MEMCPY3D NodeParams = {};
setCopyRectParams(region, pSrc, CU_MEMORYTYPE_HOST, hostOffset,
@@ -680,7 +688,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp(
UR_CHECK_ERROR(cuGraphAddMemcpyNode(
&GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(),
- &NodeParams, hCommandBuffer->Device->getContext()));
+ &NodeParams, hCommandBuffer->Device->getNativeContext()));
// Get sync point and register the cuNode with it.
*pSyncPoint =
@@ -712,7 +720,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp(
}
try {
- CUdeviceptr SrcPtr = std::get<BufferMem>(hBuffer->Mem).get();
+ auto SrcPtr =
+ std::get<BufferMem>(hBuffer->Mem).getPtr(hCommandBuffer->Device);
CUDA_MEMCPY3D NodeParams = {};
setCopyRectParams(region, &SrcPtr, CU_MEMORYTYPE_DEVICE, bufferOffset,
@@ -722,7 +731,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp(
UR_CHECK_ERROR(cuGraphAddMemcpyNode(
&GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(),
- &NodeParams, hCommandBuffer->Device->getContext()));
+ &NodeParams, hCommandBuffer->Device->getNativeContext()));
// Get sync point and register the cuNode with it.
*pSyncPoint =
@@ -821,7 +830,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp(
PatternSizeIsValid,
UR_RESULT_ERROR_INVALID_SIZE);
- auto DstDevice = std::get<BufferMem>(hBuffer->Mem).get() + offset;
+ auto DstDevice = std::get<BufferMem>(hBuffer->Mem)
+ .getPtrWithOffset(hCommandBuffer->Device, offset);
return enqueueCommandBufferFillHelper(
hCommandBuffer, &DstDevice, CU_MEMORYTYPE_DEVICE, pPattern, patternSize,
@@ -854,7 +864,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
try {
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
uint32_t StreamToken;
ur_stream_guard_ Guard;
CUstream CuStream = hQueue->getNextComputeStream(
@@ -972,7 +982,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
if (ArgValue == nullptr) {
Kernel->setKernelArg(ArgIndex, 0, nullptr);
} else {
- CUdeviceptr CuPtr = std::get<BufferMem>(ArgValue->Mem).get();
+ CUdeviceptr CuPtr =
+ std::get<BufferMem>(ArgValue->Mem).getPtr(CommandBuffer->Device);
Kernel->setKernelArg(ArgIndex, sizeof(CUdeviceptr), (void *)&CuPtr);
}
} catch (ur_result_t Err) {
diff --git a/source/adapters/cuda/command_buffer.hpp b/source/adapters/cuda/command_buffer.hpp
index 84a9e0405b..d83269f2ae 100644
--- a/source/adapters/cuda/command_buffer.hpp
+++ b/source/adapters/cuda/command_buffer.hpp
@@ -10,155 +10,13 @@
#include <ur/ur.hpp>
#include <ur_api.h>
+#include <ur_print.hpp>
#include "context.hpp"
#include "logger/ur_logger.hpp"
#include <cuda.h>
#include <memory>
-static inline const char *getUrResultString(ur_result_t Result) {
- switch (Result) {
- case UR_RESULT_SUCCESS:
- return "UR_RESULT_SUCCESS";
- case UR_RESULT_ERROR_INVALID_OPERATION:
- return "UR_RESULT_ERROR_INVALID_OPERATION";
- case UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES:
- return "UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES";
- case UR_RESULT_ERROR_INVALID_QUEUE:
- return "UR_RESULT_ERROR_INVALID_QUEUE";
- case UR_RESULT_ERROR_INVALID_VALUE:
- return "UR_RESULT_ERROR_INVALID_VALUE";
- case UR_RESULT_ERROR_INVALID_CONTEXT:
- return "UR_RESULT_ERROR_INVALID_CONTEXT";
- case UR_RESULT_ERROR_INVALID_PLATFORM:
- return "UR_RESULT_ERROR_INVALID_PLATFORM";
- case UR_RESULT_ERROR_INVALID_BINARY:
- return "UR_RESULT_ERROR_INVALID_BINARY";
- case UR_RESULT_ERROR_INVALID_PROGRAM:
- return "UR_RESULT_ERROR_INVALID_PROGRAM";
- case UR_RESULT_ERROR_INVALID_SAMPLER:
- return "UR_RESULT_ERROR_INVALID_SAMPLER";
- case UR_RESULT_ERROR_INVALID_BUFFER_SIZE:
- return "UR_RESULT_ERROR_INVALID_BUFFER_SIZE";
- case UR_RESULT_ERROR_INVALID_MEM_OBJECT:
- return "UR_RESULT_ERROR_INVALID_MEM_OBJECT";
- case UR_RESULT_ERROR_INVALID_EVENT:
- return "UR_RESULT_ERROR_INVALID_EVENT";
- case UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST:
- return "UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST";
- case UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET:
- return "UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET";
- case UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE:
- return "UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE";
- case UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE:
- return "UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE";
- case UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE:
- return "UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE";
- case UR_RESULT_ERROR_DEVICE_NOT_FOUND:
- return "UR_RESULT_ERROR_DEVICE_NOT_FOUND";
- case UR_RESULT_ERROR_INVALID_DEVICE:
- return "UR_RESULT_ERROR_INVALID_DEVICE";
- case UR_RESULT_ERROR_DEVICE_LOST:
- return "UR_RESULT_ERROR_DEVICE_LOST";
- case UR_RESULT_ERROR_DEVICE_REQUIRES_RESET:
- return "UR_RESULT_ERROR_DEVICE_REQUIRES_RESET";
- case UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE:
- return "UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE";
- case UR_RESULT_ERROR_DEVICE_PARTITION_FAILED:
- return "UR_RESULT_ERROR_DEVICE_PARTITION_FAILED";
- case UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT:
- return "UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT";
- case UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE:
- return "UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE";
- case UR_RESULT_ERROR_INVALID_WORK_DIMENSION:
- return "UR_RESULT_ERROR_INVALID_WORK_DIMENSION";
- case UR_RESULT_ERROR_INVALID_KERNEL_ARGS:
- return "UR_RESULT_ERROR_INVALID_KERNEL_ARGS";
- case UR_RESULT_ERROR_INVALID_KERNEL:
- return "UR_RESULT_ERROR_INVALID_KERNEL";
- case UR_RESULT_ERROR_INVALID_KERNEL_NAME:
- return "UR_RESULT_ERROR_INVALID_KERNEL_NAME";
- case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX:
- return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX";
- case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE:
- return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE";
- case UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE:
- return "UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE";
- case UR_RESULT_ERROR_INVALID_IMAGE_SIZE:
- return "UR_RESULT_ERROR_INVALID_IMAGE_SIZE";
- case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR:
- return "UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR";
- case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED:
- return "UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED";
- case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE:
- return "UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE";
- case UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE:
- return "UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE";
- case UR_RESULT_ERROR_UNINITIALIZED:
- return "UR_RESULT_ERROR_UNINITIALIZED";
- case UR_RESULT_ERROR_OUT_OF_HOST_MEMORY:
- return "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY";
- case UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY:
- return "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY";
- case UR_RESULT_ERROR_OUT_OF_RESOURCES:
- return "UR_RESULT_ERROR_OUT_OF_RESOURCES";
- case UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE:
- return "UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE";
- case UR_RESULT_ERROR_PROGRAM_LINK_FAILURE:
- return "UR_RESULT_ERROR_PROGRAM_LINK_FAILURE";
- case UR_RESULT_ERROR_UNSUPPORTED_VERSION:
- return "UR_RESULT_ERROR_UNSUPPORTED_VERSION";
- case UR_RESULT_ERROR_UNSUPPORTED_FEATURE:
- return "UR_RESULT_ERROR_UNSUPPORTED_FEATURE";
- case UR_RESULT_ERROR_INVALID_ARGUMENT:
- return "UR_RESULT_ERROR_INVALID_ARGUMENT";
- case UR_RESULT_ERROR_INVALID_NULL_HANDLE:
- return "UR_RESULT_ERROR_INVALID_NULL_HANDLE";
- case UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE:
- return "UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE";
- case UR_RESULT_ERROR_INVALID_NULL_POINTER:
- return "UR_RESULT_ERROR_INVALID_NULL_POINTER";
- case UR_RESULT_ERROR_INVALID_SIZE:
- return "UR_RESULT_ERROR_INVALID_SIZE";
- case UR_RESULT_ERROR_UNSUPPORTED_SIZE:
- return "UR_RESULT_ERROR_UNSUPPORTED_SIZE";
- case UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT:
- return "UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT";
- case UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT:
- return "UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT";
- case UR_RESULT_ERROR_INVALID_ENUMERATION:
- return "UR_RESULT_ERROR_INVALID_ENUMERATION";
- case UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION:
- return "UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION";
- case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT:
- return "UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT";
- case UR_RESULT_ERROR_INVALID_NATIVE_BINARY:
- return "UR_RESULT_ERROR_INVALID_NATIVE_BINARY";
- case UR_RESULT_ERROR_INVALID_GLOBAL_NAME:
- return "UR_RESULT_ERROR_INVALID_GLOBAL_NAME";
- case UR_RESULT_ERROR_INVALID_FUNCTION_NAME:
- return "UR_RESULT_ERROR_INVALID_FUNCTION_NAME";
- case UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION:
- return "UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION";
- case UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION:
- return "UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION";
- case UR_RESULT_ERROR_PROGRAM_UNLINKED:
- return "UR_RESULT_ERROR_PROGRAM_UNLINKED";
- case UR_RESULT_ERROR_OVERLAPPING_REGIONS:
- return "UR_RESULT_ERROR_OVERLAPPING_REGIONS";
- case UR_RESULT_ERROR_INVALID_HOST_PTR:
- return "UR_RESULT_ERROR_INVALID_HOST_PTR";
- case UR_RESULT_ERROR_INVALID_USM_SIZE:
- return "UR_RESULT_ERROR_INVALID_USM_SIZE";
- case UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE:
- return "UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE";
- case UR_RESULT_ERROR_ADAPTER_SPECIFIC:
- return "UR_RESULT_ERROR_ADAPTER_SPECIFIC";
- default:
- return "UR_RESULT_ERROR_UNKNOWN";
- }
-}
-
// Trace an internal UR call
#define UR_TRACE(Call) \
{ \
@@ -173,7 +31,7 @@ static inline const char *getUrResultString(ur_result_t Result) {
logger::always("UR ---> {}", #Call); \
Result = (Call); \
if (PrintTrace) \
- logger::always("UR <--- {}({})", #Call, getUrResultString(Result)); \
+ logger::always("UR <--- {}({})", #Call, Result); \
}
// Handle to a kernel command.
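
The simplified UR_TRACE above leans on the stream operator that ur_print.hpp declares for UR enums, so the hand-maintained getUrResultString switch is no longer needed. A minimal sketch of that mechanism, assuming only the public ur_print.hpp header:

    #include <iostream>
    #include <sstream>
    #include <ur_api.h>
    #include <ur_print.hpp> // declares operator<< for UR enums such as ur_result_t

    int main() {
        std::ostringstream OS;
        OS << UR_RESULT_ERROR_INVALID_DEVICE;
        // Prints the enumerator name, e.g. "UR_RESULT_ERROR_INVALID_DEVICE".
        std::cout << OS.str() << "\n";
        return 0;
    }
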
diff --git a/source/adapters/cuda/context.cpp b/source/adapters/cuda/context.cpp
index 40ae0ce4ad..0d08dab76c 100644
--- a/source/adapters/cuda/context.cpp
+++ b/source/adapters/cuda/context.cpp
@@ -46,23 +46,19 @@ UR_APIEXPORT ur_result_t UR_APICALL
urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices,
const ur_context_properties_t *pProperties,
ur_context_handle_t *phContext) {
- std::ignore = DeviceCount;
std::ignore = pProperties;
- assert(DeviceCount == 1);
- ur_result_t RetErr = UR_RESULT_SUCCESS;
-
std::unique_ptr<ur_context_handle_t_> ContextPtr{nullptr};
try {
ContextPtr = std::unique_ptr<ur_context_handle_t_>(
- new ur_context_handle_t_{*phDevices});
+ new ur_context_handle_t_{phDevices, DeviceCount});
*phContext = ContextPtr.release();
} catch (ur_result_t Err) {
- RetErr = Err;
+ return Err;
} catch (...) {
- RetErr = UR_RESULT_ERROR_OUT_OF_RESOURCES;
+ return UR_RESULT_ERROR_OUT_OF_RESOURCES;
}
- return RetErr;
+ return UR_RESULT_SUCCESS;
}
UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo(
@@ -72,9 +68,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo(
switch (static_cast<ur_context_info_t>(ContextInfoType)) {
case UR_CONTEXT_INFO_NUM_DEVICES:
- return ReturnValue(1);
+ return ReturnValue(static_cast<uint32_t>(hContext->getDevices().size()));
case UR_CONTEXT_INFO_DEVICES:
- return ReturnValue(hContext->getDevice());
+ return ReturnValue(hContext->getDevices().data(),
+ hContext->getDevices().size());
case UR_CONTEXT_INFO_REFERENCE_COUNT:
return ReturnValue(hContext->getReferenceCount());
case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
@@ -88,7 +85,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo(
int Major = 0;
UR_CHECK_ERROR(cuDeviceGetAttribute(
&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
- hContext->getDevice()->get()));
+ hContext->getDevices()[0]->get()));
uint32_t Capabilities =
(Major >= 7) ? UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM |
UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP |
@@ -137,7 +134,10 @@ urContextRetain(ur_context_handle_t hContext) {
UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle(
ur_context_handle_t hContext, ur_native_handle_t *phNativeContext) {
- *phNativeContext = reinterpret_cast<ur_native_handle_t>(hContext->get());
+ // FIXME: this entry point has been deprecated in the SYCL RT and should be
+ // changed to unsupported once the deprecation period has elapsed.
+ *phNativeContext = reinterpret_cast<ur_native_handle_t>(
+ hContext->getDevices()[0]->getNativeContext());
return UR_RESULT_SUCCESS;
}
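
With this change a context is created over the full device list rather than a single device. A hedged usage sketch (error handling elided; `Devices` is assumed to have been populated by urDeviceGet on this platform):

    ur_context_handle_t Ctx = nullptr;
    urContextCreate(static_cast<uint32_t>(Devices.size()), Devices.data(),
                    /*pProperties=*/nullptr, &Ctx);

    uint32_t NumDevices = 0;
    urContextGetInfo(Ctx, UR_CONTEXT_INFO_NUM_DEVICES, sizeof(NumDevices),
                     &NumDevices, /*pPropSizeRet=*/nullptr);
    // NumDevices now reports every device passed in, not a hard-coded 1.
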
diff --git a/source/adapters/cuda/context.hpp b/source/adapters/cuda/context.hpp
index 4ddcaf3a87..f28e58afe7 100644
--- a/source/adapters/cuda/context.hpp
+++ b/source/adapters/cuda/context.hpp
@@ -33,27 +33,26 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data);
///
/// One of the main differences between the UR API and the CUDA driver API is
/// that the second modifies the state of the threads by assigning
-/// `CUcontext` objects to threads. `CUcontext` objects store data associated
+/// \c CUcontext objects to threads. \c CUcontext objects store data associated
/// with a given device and control access to said device from the user side.
/// UR API context are objects that are passed to functions, and not bound
/// to threads.
-/// The ur_context_handle_t_ object doesn't implement this behavior. It only
-/// holds the CUDA context data. The RAII object \ref ScopedContext implements
-/// the active context behavior.
///
-/// Primary vs User-defined context
+/// Since the \c ur_context_handle_t can contain multiple devices, and a \c
+/// CUcontext refers to only a single device, the \c CUcontext is more tightly
+/// coupled to a \c ur_device_handle_t than a \c ur_context_handle_t. In order
+/// to remove some ambiguities about the different semantics of
+/// \c ur_context_handle_t and native \c CUcontext, we access the native \c
+/// CUcontext solely through the \c ur_device_handle_t class, by using the
+/// object \ref ScopedContext, which sets the active device (by setting the
+/// active native \c CUcontext).
///
-/// CUDA has two different types of context, the Primary context,
-/// which is usable by all threads on a given process for a given device, and
-/// the aforementioned custom contexts.
-/// The CUDA documentation, confirmed with performance analysis, suggest using
-/// the Primary context whenever possible.
-/// The Primary context is also used by the CUDA Runtime API.
-/// For UR applications to interop with CUDA Runtime API, they have to use
-/// the primary context - and make that active in the thread.
-/// The `ur_context_handle_t_` object can be constructed with a `kind` parameter
-/// that allows to construct a Primary or `user-defined` context, so that
-/// the UR object interface is always the same.
+/// Primary vs User-defined \c CUcontext
+///
+/// CUDA has two different types of \c CUcontext, the Primary context, which is
+/// usable by all threads on a given process for a given device, and the
+/// aforementioned custom \c CUcontext objects. The CUDA documentation,
+/// confirmed with performance analysis, suggests using the Primary context
+/// whenever possible.
///
/// Destructor callback
///
@@ -63,6 +62,18 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data);
/// See proposal for details.
/// https://github.com/codeplaysoftware/standards-proposals/blob/master/extended-context-destruction/index.md
///
+///
+/// <b> Memory Management for Devices in a Context </b>
+///
+/// A \c ur_mem_handle_t is associated with a \c ur_context_handle_t_, which
+/// may refer to multiple devices. Therefore the \c ur_mem_handle_t must
+/// handle a native allocation for each device in the context. UR is
+/// responsible for automatically handling event dependencies for kernels
+/// writing to or reading from the same \c ur_mem_handle_t and migrating memory
+/// between native allocations for devices in the same \c ur_context_handle_t_
+/// if necessary.
+///
+///
struct ur_context_handle_t_ {
struct deleter_data {
@@ -72,18 +83,21 @@ struct ur_context_handle_t_ {
void operator()() { Function(UserData); }
};
- using native_type = CUcontext;
-
- native_type CUContext;
- ur_device_handle_t DeviceID;
+ std::vector<ur_device_handle_t> Devices;
std::atomic_uint32_t RefCount;
- ur_context_handle_t_(ur_device_handle_t_ *DevID)
- : CUContext{DevID->getContext()}, DeviceID{DevID}, RefCount{1} {
- urDeviceRetain(DeviceID);
+ ur_context_handle_t_(const ur_device_handle_t *Devs, uint32_t NumDevices)
+ : Devices{Devs, Devs + NumDevices}, RefCount{1} {
+ for (auto &Dev : Devices) {
+ urDeviceRetain(Dev);
+ }
};
- ~ur_context_handle_t_() { urDeviceRelease(DeviceID); }
+ ~ur_context_handle_t_() {
+ for (auto &Dev : Devices) {
+ urDeviceRelease(Dev);
+ }
+ }
void invokeExtendedDeleters() {
std::lock_guard<std::mutex> Guard(Mutex);
@@ -98,9 +112,9 @@ struct ur_context_handle_t_ {
ExtendedDeleters.emplace_back(deleter_data{Function, UserData});
}
- ur_device_handle_t getDevice() const noexcept { return DeviceID; }
-
- native_type get() const noexcept { return CUContext; }
+ const std::vector<ur_device_handle_t> &getDevices() const noexcept {
+ return Devices;
+ }
uint32_t incrementReferenceCount() noexcept { return ++RefCount; }
@@ -123,12 +137,11 @@ struct ur_context_handle_t_ {
namespace {
class ScopedContext {
public:
- ScopedContext(ur_context_handle_t Context) {
- if (!Context) {
- throw UR_RESULT_ERROR_INVALID_CONTEXT;
+ ScopedContext(ur_device_handle_t Device) {
+ if (!Device) {
+ throw UR_RESULT_ERROR_INVALID_DEVICE;
}
-
- setContext(Context->get());
+ setContext(Device->getNativeContext());
}
ScopedContext(CUcontext NativeContext) { setContext(NativeContext); }
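
Activation is now expressed against a device rather than a UR context. A minimal sketch of the intended pattern inside adapter entry points, assuming a valid `Device` handle:

    void doDriverWork(ur_device_handle_t Device) {
        // Binds Device's native CUcontext to this thread (via cuCtxSetCurrent)
        // if it is not already current.
        ScopedContext Active(Device);
        // ... CUDA driver API calls issued here target Device ...
    }
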
diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp
index da11f3f1bf..3a94587d1f 100644
--- a/source/adapters/cuda/device.cpp
+++ b/source/adapters/cuda/device.cpp
@@ -47,7 +47,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
static constexpr uint32_t MaxWorkItemDimensions = 3u;
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
switch ((uint32_t)propName) {
case UR_DEVICE_INFO_TYPE: {
@@ -926,6 +926,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
// CUDA supports cubemap seamless filtering.
return ReturnValue(true);
}
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP: {
+ // CUDA does support fetching 1D USM sampled image data.
+ return ReturnValue(true);
+ }
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP: {
+ // CUDA does not support fetching 1D non-USM sampled image data.
+ return ReturnValue(false);
+ }
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP: {
+ // CUDA does support fetching 2D USM sampled image data.
+ return ReturnValue(true);
+ }
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP: {
+ // CUDA does support fetching 2D non-USM sampled image data.
+ return ReturnValue(true);
+ }
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP: {
+ // CUDA does not support fetching 3D USM sampled image data.
+ return ReturnValue(false);
+ }
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP: {
+ // CUDA does support fetching 3D non-USM sampled image data.
+ return ReturnValue(true);
+ }
+ case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: {
+ // CUDA supports recording timestamp events.
+ return ReturnValue(true);
+ }
case UR_DEVICE_INFO_DEVICE_ID: {
int Value = 0;
UR_CHECK_ERROR(cuDeviceGetAttribute(
@@ -1206,7 +1234,7 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice,
uint64_t *pDeviceTimestamp,
uint64_t *pHostTimestamp) {
CUevent Event;
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
if (pDeviceTimestamp) {
UR_CHECK_ERROR(cuEventCreate(&Event, CU_EVENT_DEFAULT));
diff --git a/source/adapters/cuda/device.hpp b/source/adapters/cuda/device.hpp
index 373c8a8ab9..0a40329026 100644
--- a/source/adapters/cuda/device.hpp
+++ b/source/adapters/cuda/device.hpp
@@ -22,6 +22,7 @@ struct ur_device_handle_t_ {
CUevent EvBase; // CUDA event used as base counter
std::atomic_uint32_t RefCount;
ur_platform_handle_t Platform;
+ uint32_t DeviceIndex;
static constexpr uint32_t MaxWorkItemDimensions = 3u;
size_t MaxWorkItemSizes[MaxWorkItemDimensions];
@@ -34,9 +35,9 @@ struct ur_device_handle_t_ {
public:
ur_device_handle_t_(native_type cuDevice, CUcontext cuContext, CUevent evBase,
- ur_platform_handle_t platform)
+ ur_platform_handle_t platform, uint32_t DevIndex)
: CuDevice(cuDevice), CuContext(cuContext), EvBase(evBase), RefCount{1},
- Platform(platform) {
+ Platform(platform), DeviceIndex{DevIndex} {
UR_CHECK_ERROR(cuDeviceGetAttribute(
&MaxRegsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
@@ -79,12 +80,16 @@ struct ur_device_handle_t_ {
native_type get() const noexcept { return CuDevice; };
- CUcontext getContext() const noexcept { return CuContext; };
+ CUcontext getNativeContext() const noexcept { return CuContext; };
uint32_t getReferenceCount() const noexcept { return RefCount; }
ur_platform_handle_t getPlatform() const noexcept { return Platform; };
+ // Returns the index of the device relative to the other devices in the same
+ // platform
+ uint32_t getIndex() const noexcept { return DeviceIndex; }
+
uint64_t getElapsedTime(CUevent) const;
size_t getMaxWorkItemSizes(int index) const noexcept {
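
The new platform-relative index is what lets a mem object pick the right native allocation for a device. A hypothetical sketch (not the adapter's verbatim code) of how such a lookup can be keyed:

    struct MultiDeviceAllocs {
        // One native allocation per device in the context, in platform order.
        std::vector<CUdeviceptr> Ptrs;
        CUdeviceptr getPtr(ur_device_handle_t Device) const {
            return Ptrs[Device->getIndex()];
        }
    };
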
diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 6f99941095..9627fc6da2 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -26,7 +26,7 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
UR_ASSERT(EventWaitList, UR_RESULT_SUCCESS);
try {
- ScopedContext Active(CommandQueue->getContext());
+ ScopedContext Active(CommandQueue->getDevice());
auto Result = forLatestEvents(
EventWaitList, NumEventsInWaitList,
@@ -188,7 +188,7 @@ bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device,
// @param [out] ThreadsPerBlock Number of threads per block we should run
// @param [out] BlocksPerGrid Number of blocks per grid we should run
ur_result_t
-setKernelParams(const ur_context_handle_t Context,
+setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
const ur_device_handle_t Device, const uint32_t WorkDim,
const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
const size_t *LocalWorkSize, ur_kernel_handle_t &Kernel,
@@ -201,7 +201,7 @@ setKernelParams(const ur_context_handle_t Context,
try {
// Set the active context here as guessLocalWorkSize needs an active context
- ScopedContext Active(Context);
+ ScopedContext Active(Device);
{
size_t *ReqdThreadsPerBlock = Kernel->ReqdThreadsPerBlock;
MaxWorkGroupSize = Device->getMaxWorkGroupSize();
@@ -271,7 +271,6 @@ setKernelParams(const ur_context_handle_t Context,
CudaImplicitOffset);
}
- auto Device = Context->getDevice();
if (LocalSize > static_cast(Device->getMaxCapacityLocalMem())) {
setErrorMessage("Excessive allocation of local memory on the device",
UR_RESULT_ERROR_ADAPTER_SPECIFIC);
@@ -329,7 +328,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
// This function makes one stream work on the previous work (or work
// represented by input events) and then all future work waits on that stream.
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
uint32_t StreamToken;
ur_stream_guard_ Guard;
CUstream CuStream = hQueue->getNextComputeStream(
@@ -410,11 +409,43 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
// Preconditions
- UR_ASSERT(hQueue->getContext() == hKernel->getContext(),
+ UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(),
UR_RESULT_ERROR_INVALID_KERNEL);
UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+ std::vector<ur_event_handle_t> DepEvents(
+ phEventWaitList, phEventWaitList + numEventsInWaitList);
+ std::vector<std::pair<ur_mem_handle_t, ur_lock>> MemMigrationLocks;
+
+ // phEventWaitList only contains events that are handed to UR by the SYCL
+ // runtime. However since UR handles memory dependencies within a context
+ // we may need to add more events to our dependent events list if the UR
+ // context contains multiple devices
+ if (hQueue->getContext()->Devices.size() > 1) {
+ MemMigrationLocks.reserve(hKernel->Args.MemObjArgs.size());
+ for (auto &MemArg : hKernel->Args.MemObjArgs) {
+ bool PushBack = false;
+ if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj;
+ MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(),
+ MemDepEvent) == DepEvents.end()) {
+ DepEvents.push_back(MemDepEvent);
+ PushBack = true;
+ }
+ if ((MemArg.AccessFlags &
+ (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) ||
+ PushBack) {
+ if (std::find_if(MemMigrationLocks.begin(), MemMigrationLocks.end(),
+ [MemArg](auto &Lock) {
+ return Lock.first == MemArg.Mem;
+ }) == MemMigrationLocks.end())
+ MemMigrationLocks.emplace_back(
+ std::pair{MemArg.Mem, ur_lock{MemArg.Mem->MemoryMigrationMutex}});
+ }
+ }
+ }
+
+ // Early exit for zero size kernel
if (*pGlobalWorkSize == 0) {
return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
phEventWaitList, phEvent);
@@ -426,26 +457,37 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
uint32_t LocalSize = hKernel->getLocalSize();
- ur_result_t Result = UR_RESULT_SUCCESS;
CUfunction CuFunc = hKernel->get();
- Result = setKernelParams(hQueue->getContext(), hQueue->Device, workDim,
- pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
- hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
- if (Result != UR_RESULT_SUCCESS) {
- return Result;
- }
+ // This might return UR_RESULT_ERROR_ADAPTER_SPECIFIC, which cannot be handled
+ // using the standard UR_CHECK_ERROR
+ if (ur_result_t Ret =
+ setKernelParams(hQueue->getContext(), hQueue->Device, workDim,
+ pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
+ hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
+ Ret != UR_RESULT_SUCCESS)
+ return Ret;
try {
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
+ ScopedContext Active(hQueue->getDevice());
uint32_t StreamToken;
ur_stream_guard_ Guard;
CUstream CuStream = hQueue->getNextComputeStream(
numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
- Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
- phEventWaitList);
+ if (DepEvents.size()) {
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, DepEvents.size(),
+ DepEvents.data()));
+ }
+
+ // For memory migration across devices in the same context
+ if (hQueue->getContext()->Devices.size() > 1) {
+ for (auto &MemArg : hKernel->Args.MemObjArgs) {
+ migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice());
+ }
+ }
if (phEvent) {
RetImplEvent =
@@ -454,6 +496,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
UR_CHECK_ERROR(RetImplEvent->start());
}
+ // Once the event has been started we can unlock the MemoryMigrationMutexes
+ if (hQueue->getContext()->Devices.size() > 1) {
+ for (auto &MemArg : hKernel->Args.MemObjArgs) {
+ // Tell the ur_mem_handle_t that it will need to wait on this kernel's
+ // event, since the kernel may have written to it
+ if (phEvent && (MemArg.AccessFlags &
+ (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))) {
+ MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get());
+ }
+ }
+ // We can release the MemoryMigrationMutexes now
+ MemMigrationLocks.clear();
+ }
+
auto &ArgIndices = hKernel->getArgIndices();
UR_CHECK_ERROR(cuLaunchKernel(
CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2],
@@ -469,9 +525,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
}
} catch (ur_result_t Err) {
- Result = Err;
+ return Err;
}
- return Result;
+ return UR_RESULT_SUCCESS;
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
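
The extra bookkeeping in urEnqueueKernelLaunch condenses to a simple last-writer protocol. A hedged sketch of the idea (simplified; the real code above also takes the MemoryMigrationMutex locks and only runs for multi-device contexts):

    // Before launch: depend on the last event that wrote each mem argument,
    // and migrate its allocation to this queue's device if needed.
    for (auto &MemArg : hKernel->Args.MemObjArgs) {
        if (auto E = MemArg.Mem->LastEventWritingToMemObj)
            DepEvents.push_back(E);
        migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice());
    }
    // After the launch event is recorded: writers become the new last event.
    for (auto &MemArg : hKernel->Args.MemObjArgs)
        if (MemArg.AccessFlags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))
            MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get());
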
@@ -565,35 +621,52 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect(
size_t hostRowPitch, size_t hostSlicePitch, void *pDst,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {
- ur_result_t Result = UR_RESULT_SUCCESS;
- CUdeviceptr DevPtr = std::get(hBuffer->Mem).get();
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
- try {
- ScopedContext Active(hQueue->getContext());
- CUstream CuStream = hQueue->getNextTransferStream();
+ ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex};
+ auto Device = hQueue->getDevice();
+ ScopedContext Active(Device);
+ CUstream Stream = hQueue->getNextTransferStream();
- Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
- phEventWaitList);
+ try {
+ // Note that this entry point may be called on a queue that may not be the
+ // last queue to write to the MemBuffer, meaning we must perform the copy
+ // from a different device
+ if (hBuffer->LastEventWritingToMemObj &&
+ hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hQueue->getDevice()) {
+ hQueue = hBuffer->LastEventWritingToMemObj->getQueue();
+ Device = hQueue->getDevice();
+ ScopedContext Active(Device);
+ Stream = CUstream{0}; // Default stream for different device
+ // We may have to wait for an event on another queue if it is the last
+ // event writing to mem obj
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1,
+ &hBuffer->LastEventWritingToMemObj));
+ }
+
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList,
+ phEventWaitList));
if (phEvent) {
RetImplEvent =
std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
- UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, CuStream));
+ UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, Stream));
UR_CHECK_ERROR(RetImplEvent->start());
}
- Result = commonEnqueueMemBufferCopyRect(
- CuStream, region, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin,
+ auto DevPtr = std::get<BufferMem>(hBuffer->Mem).getPtr(Device);
+ UR_CHECK_ERROR(commonEnqueueMemBufferCopyRect(
+ Stream, region, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin,
bufferRowPitch, bufferSlicePitch, pDst, CU_MEMORYTYPE_HOST, hostOrigin,
- hostRowPitch, hostSlicePitch);
+ hostRowPitch, hostSlicePitch));
if (phEvent) {
UR_CHECK_ERROR(RetImplEvent->record());
}
if (blockingRead) {
- UR_CHECK_ERROR(cuStreamSynchronize(CuStream));
+ UR_CHECK_ERROR(cuStreamSynchronize(Stream));
}
if (phEvent) {
@@ -601,9 +674,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect(
}
} catch (ur_result_t Err) {
- Result = Err;
+ return Err;
}
- return Result;
+ return UR_RESULT_SUCCESS;
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect(
@@ -613,15 +686,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect(
size_t hostRowPitch, size_t hostSlicePitch, void *pSrc,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {
- ur_result_t Result = UR_RESULT_SUCCESS;
- CUdeviceptr DevPtr = std::get<BufferMem>(hBuffer->Mem).get();
+ CUdeviceptr DevPtr =
+ std::get<BufferMem>(hBuffer->Mem).getPtr(hQueue->getDevice());
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
try {
- ScopedContext active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream cuStream = hQueue->getNextTransferStream();
- Result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList,
- phEventWaitList);
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, cuStream, numEventsInWaitList,
+ phEventWaitList));
if (phEvent) {
RetImplEvent =
@@ -630,10 +703,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect(
UR_CHECK_ERROR(RetImplEvent->start());
}
- Result = commonEnqueueMemBufferCopyRect(
+ UR_CHECK_ERROR(commonEnqueueMemBufferCopyRect(
cuStream, region, pSrc, CU_MEMORYTYPE_HOST, hostOrigin, hostRowPitch,
hostSlicePitch, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin,
- bufferRowPitch, bufferSlicePitch);
+ bufferRowPitch, bufferSlicePitch));
if (phEvent) {
UR_CHECK_ERROR(RetImplEvent->record());
@@ -648,9 +721,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect(
}
} catch (ur_result_t Err) {
- Result = Err;
+ return Err;
}
- return Result;
+ return UR_RESULT_SUCCESS;
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy(
@@ -666,7 +739,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy(
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
ur_result_t Result = UR_RESULT_SUCCESS;
auto Stream = hQueue->getNextTransferStream();
@@ -680,8 +753,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy(
UR_CHECK_ERROR(RetImplEvent->start());
}
- auto Src = std::get<BufferMem>(hBufferSrc->Mem).get() + srcOffset;
- auto Dst = std::get<BufferMem>(hBufferDst->Mem).get() + dstOffset;
+ auto Src = std::get<BufferMem>(hBufferSrc->Mem)
+ .getPtrWithOffset(hQueue->getDevice(), srcOffset);
+ auto Dst = std::get<BufferMem>(hBufferDst->Mem)
+ .getPtrWithOffset(hQueue->getDevice(), dstOffset);
UR_CHECK_ERROR(cuMemcpyDtoDAsync(Dst, Src, size, Stream));
@@ -706,12 +781,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect(
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {
ur_result_t Result = UR_RESULT_SUCCESS;
- CUdeviceptr SrcPtr = std::get<BufferMem>(hBufferSrc->Mem).get();
- CUdeviceptr DstPtr = std::get<BufferMem>(hBufferDst->Mem).get();
+ CUdeviceptr SrcPtr =
+ std::get<BufferMem>(hBufferSrc->Mem).getPtr(hQueue->getDevice());
+ CUdeviceptr DstPtr =
+ std::get<BufferMem>(hBufferDst->Mem).getPtr(hQueue->getDevice());
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream CuStream = hQueue->getNextTransferStream();
Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
phEventWaitList);
@@ -784,7 +861,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
auto Stream = hQueue->getNextTransferStream();
ur_result_t Result =
@@ -797,7 +874,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
UR_CHECK_ERROR(RetImplEvent->start());
}
- auto DstDevice = std::get<BufferMem>(hBuffer->Mem).get() + offset;
+ auto DstDevice = std::get<BufferMem>(hBuffer->Mem)
+ .getPtrWithOffset(hQueue->getDevice(), offset);
auto N = size / patternSize;
// pattern size in bytes
@@ -933,18 +1011,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
std::ignore = rowPitch;
std::ignore = slicePitch;
- UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface,
- UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+ UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
- ur_result_t Result = UR_RESULT_SUCCESS;
+ ur_lock MemoryMigrationLock{hImage->MemoryMigrationMutex};
+ auto Device = hQueue->getDevice();
+ CUstream Stream = hQueue->getNextTransferStream();
try {
- ScopedContext Active(hQueue->getContext());
- CUstream CuStream = hQueue->getNextTransferStream();
- Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
- phEventWaitList);
+ // Note that this entry point may be called on a queue that may not be the
+ // last queue to write to the Image, meaning we must perform the copy
+ // from a different device
+ if (hImage->LastEventWritingToMemObj &&
+ hImage->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hQueue->getDevice()) {
+ hQueue = hImage->LastEventWritingToMemObj->getQueue();
+ Device = hQueue->getDevice();
+ ScopedContext Active(Device);
+ Stream = CUstream{0}; // Default stream for different device
+ // We may have to wait for an event on another queue if it is the last
+ // event writing to mem obj
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1,
+ &hImage->LastEventWritingToMemObj));
+ }
+
+ ScopedContext Active(Device);
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList,
+ phEventWaitList));
- CUarray Array = std::get<SurfaceMem>(hImage->Mem).getArray();
+ CUarray Array = std::get<SurfaceMem>(hImage->Mem).getArray(Device);
CUDA_ARRAY_DESCRIPTOR ArrayDesc;
UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array));
@@ -954,29 +1048,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
size_t ByteOffsetX = origin.x * ElementByteSize * ArrayDesc.NumChannels;
size_t BytesToCopy = ElementByteSize * ArrayDesc.NumChannels * region.width;
- ur_mem_type_t ImgType = std::get<SurfaceMem>(hImage->Mem).getImageType();
+ ur_mem_type_t ImgType = std::get<SurfaceMem>(hImage->Mem).getType();
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
if (phEvent) {
RetImplEvent =
std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
- UR_COMMAND_MEM_IMAGE_READ, hQueue, CuStream));
+ UR_COMMAND_MEM_IMAGE_READ, hQueue, Stream));
UR_CHECK_ERROR(RetImplEvent->start());
}
if (ImgType == UR_MEM_TYPE_IMAGE1D) {
UR_CHECK_ERROR(
- cuMemcpyAtoHAsync(pDst, Array, ByteOffsetX, BytesToCopy, CuStream));
+ cuMemcpyAtoHAsync(pDst, Array, ByteOffsetX, BytesToCopy, Stream));
} else {
ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height,
region.depth};
ur_rect_offset_t SrcOffset = {ByteOffsetX, origin.y, origin.z};
- Result = commonEnqueueMemImageNDCopy(
- CuStream, ImgType, AdjustedRegion, &Array, CU_MEMORYTYPE_ARRAY,
- SrcOffset, pDst, CU_MEMORYTYPE_HOST, ur_rect_offset_t{});
- if (Result != UR_RESULT_SUCCESS) {
- return Result;
- }
+ UR_CHECK_ERROR(commonEnqueueMemImageNDCopy(
+ Stream, ImgType, AdjustedRegion, &Array, CU_MEMORYTYPE_ARRAY,
+ SrcOffset, pDst, CU_MEMORYTYPE_HOST, ur_rect_offset_t{}));
}
if (phEvent) {
@@ -985,7 +1076,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
}
if (blockingRead) {
- UR_CHECK_ERROR(cuStreamSynchronize(CuStream));
+ UR_CHECK_ERROR(cuStreamSynchronize(Stream));
}
} catch (ur_result_t Err) {
return Err;
@@ -993,7 +1084,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
return UR_RESULT_ERROR_UNKNOWN;
}
- return Result;
+ return UR_RESULT_SUCCESS;
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite(
@@ -1005,18 +1096,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite(
std::ignore = rowPitch;
std::ignore = slicePitch;
- UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface,
- UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+ UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+ auto &Image = std::get<SurfaceMem>(hImage->Mem);
+ // FIXME: We are assuming that the host pointer outlives the image
+ if (!Image.HostPtr)
+ Image.HostPtr = pSrc;
ur_result_t Result = UR_RESULT_SUCCESS;
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream CuStream = hQueue->getNextTransferStream();
Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
phEventWaitList);
- CUarray Array = std::get<SurfaceMem>(hImage->Mem).getArray();
+ CUarray Array = Image.getArray(hQueue->getDevice());
CUDA_ARRAY_DESCRIPTOR ArrayDesc;
UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array));
@@ -1034,7 +1129,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite(
UR_CHECK_ERROR(RetImplEvent->start());
}
- ur_mem_type_t ImgType = std::get<SurfaceMem>(hImage->Mem).getImageType();
+ ur_mem_type_t ImgType = Image.getType();
if (ImgType == UR_MEM_TYPE_IMAGE1D) {
UR_CHECK_ERROR(
cuMemcpyHtoAAsync(Array, ByteOffsetX, pSrc, BytesToCopy, CuStream));
@@ -1071,24 +1166,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy(
ur_rect_offset_t dstOrigin, ur_rect_region_t region,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {
- UR_ASSERT(hImageSrc->MemType == ur_mem_handle_t_::Type::Surface,
- UR_RESULT_ERROR_INVALID_MEM_OBJECT);
- UR_ASSERT(hImageDst->MemType == ur_mem_handle_t_::Type::Surface,
- UR_RESULT_ERROR_INVALID_MEM_OBJECT);
- UR_ASSERT(std::get<SurfaceMem>(hImageSrc->Mem).getImageType() ==
- std::get<SurfaceMem>(hImageDst->Mem).getImageType(),
+ UR_ASSERT(hImageSrc->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+ UR_ASSERT(hImageDst->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+ UR_ASSERT(std::get<SurfaceMem>(hImageSrc->Mem).getType() ==
+ std::get<SurfaceMem>(hImageDst->Mem).getType(),
UR_RESULT_ERROR_INVALID_MEM_OBJECT);
ur_result_t Result = UR_RESULT_SUCCESS;
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream CuStream = hQueue->getNextTransferStream();
Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
phEventWaitList);
- CUarray SrcArray = std::get<SurfaceMem>(hImageSrc->Mem).getArray();
- CUarray DstArray = std::get<SurfaceMem>(hImageDst->Mem).getArray();
+ CUarray SrcArray =
+ std::get<SurfaceMem>(hImageSrc->Mem).getArray(hQueue->getDevice());
+ CUarray DstArray =
+ std::get<SurfaceMem>(hImageDst->Mem).getArray(hQueue->getDevice());
CUDA_ARRAY_DESCRIPTOR SrcArrayDesc;
UR_CHECK_ERROR(cuArrayGetDescriptor(&SrcArrayDesc, SrcArray));
@@ -1117,7 +1212,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy(
UR_CHECK_ERROR(RetImplEvent->start());
}
- ur_mem_type_t ImgType = std::get<SurfaceMem>(hImageSrc->Mem).getImageType();
+ ur_mem_type_t ImgType = std::get<SurfaceMem>(hImageSrc->Mem).getType();
ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height,
region.depth};
@@ -1154,8 +1249,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
ur_map_flags_t mapFlags, size_t offset, size_t size,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent, void **ppRetMap) {
- UR_ASSERT(hBuffer->MemType == ur_mem_handle_t_::Type::Buffer,
- UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+ UR_ASSERT(hBuffer->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
UR_ASSERT(offset + size <= std::get<BufferMem>(hBuffer->Mem).getSize(),
UR_RESULT_ERROR_INVALID_SIZE);
@@ -1177,7 +1271,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
MapPtr, numEventsInWaitList,
phEventWaitList, phEvent);
} else {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
if (IsPinned) {
Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList,
@@ -1208,8 +1302,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
ur_queue_handle_t hQueue, ur_mem_handle_t hMem, void *pMappedPtr,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {
- UR_ASSERT(hMem->MemType == ur_mem_handle_t_::Type::Buffer,
- UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+ UR_ASSERT(hMem->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
auto &BufferImpl = std::get<BufferMem>(hMem->Mem);
auto *Map = BufferImpl.getMapDetails(pMappedPtr);
@@ -1225,7 +1318,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
hQueue, hMem, true, Map->getMapOffset(), Map->getMapSize(), pMappedPtr,
numEventsInWaitList, phEventWaitList, phEvent);
} else {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
if (IsPinned) {
Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList,
@@ -1256,7 +1349,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
uint32_t StreamToken;
ur_stream_guard_ Guard;
CUstream CuStream = hQueue->getNextComputeStream(
@@ -1310,7 +1403,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy(
std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream CuStream = hQueue->getNextTransferStream();
Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
phEventWaitList);
@@ -1347,7 +1440,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
UR_CHECK_ERROR(cuPointerGetAttribute(
&PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem));
UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);
- ur_device_handle_t Device = hQueue->getContext()->getDevice();
+ ur_device_handle_t Device = hQueue->getDevice();
// Certain cuda devices and Windows do not have support for some Unified
// Memory features. cuMemPrefetchAsync requires concurrent memory access
@@ -1373,7 +1466,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream CuStream = hQueue->getNextTransferStream();
Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
phEventWaitList);
@@ -1414,7 +1507,7 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
(advice & UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE) ||
(advice & UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE) ||
(advice & UR_USM_ADVICE_FLAG_DEFAULT)) {
- ur_device_handle_t Device = hQueue->getContext()->getDevice();
+ ur_device_handle_t Device = hQueue->getDevice();
if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
setErrorMessage("Mem advise ignored as device does not support "
"concurrent managed access",
@@ -1441,7 +1534,7 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
if (phEvent) {
EventPtr =
@@ -1453,16 +1546,16 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
if (advice & UR_USM_ADVICE_FLAG_DEFAULT) {
UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size,
CU_MEM_ADVISE_UNSET_READ_MOSTLY,
- hQueue->getContext()->getDevice()->get()));
+ hQueue->getDevice()->get()));
UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size,
CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION,
- hQueue->getContext()->getDevice()->get()));
+ hQueue->getDevice()->get()));
UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size,
CU_MEM_ADVISE_UNSET_ACCESSED_BY,
- hQueue->getContext()->getDevice()->get()));
+ hQueue->getDevice()->get()));
} else {
Result = setCuMemAdvise((CUdeviceptr)pMem, size, advice,
- hQueue->getContext()->getDevice()->get());
+ hQueue->getDevice()->get());
}
if (phEvent) {
@@ -1493,7 +1586,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D(
ur_result_t result = UR_RESULT_SUCCESS;
try {
- ScopedContext active(hQueue->getContext());
+ ScopedContext active(hQueue->getDevice());
CUstream cuStream = hQueue->getNextTransferStream();
result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList,
phEventWaitList);
@@ -1543,33 +1636,50 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
UR_ASSERT(!hBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
UR_ASSERT(offset + size <= std::get<BufferMem>(hBuffer->Mem).Size,
UR_RESULT_ERROR_INVALID_SIZE);
-
- ur_result_t Result = UR_RESULT_SUCCESS;
- CUdeviceptr DevPtr = std::get<BufferMem>(hBuffer->Mem).get();
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
+ ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex};
+ auto Device = hQueue->getDevice();
+ ScopedContext Active(Device);
+ CUstream Stream = hQueue->getNextTransferStream();
try {
- ScopedContext Active(hQueue->getContext());
- CUstream CuStream = hQueue->getNextTransferStream();
-
- Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
- phEventWaitList);
+ // Note that this entry point may be called on a queue that may not be the
+ // last queue to write to the MemBuffer, meaning we must perform the copy
+ // from a different device
+ if (hBuffer->LastEventWritingToMemObj &&
+ hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hQueue->getDevice()) {
+ hQueue = hBuffer->LastEventWritingToMemObj->getQueue();
+ Device = hQueue->getDevice();
+ ScopedContext Active(Device);
+ Stream = CUstream{0}; // Default stream for different device
+ // We may have to wait for an event on another queue if it is the last
+ // event writing to mem obj
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1,
+ &hBuffer->LastEventWritingToMemObj));
+ }
+
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList,
+ phEventWaitList));
if (phEvent) {
RetImplEvent =
std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
- UR_COMMAND_MEM_BUFFER_READ, hQueue, CuStream));
+ UR_COMMAND_MEM_BUFFER_READ, hQueue, Stream));
UR_CHECK_ERROR(RetImplEvent->start());
}
- UR_CHECK_ERROR(cuMemcpyDtoHAsync(pDst, DevPtr + offset, size, CuStream));
+ UR_CHECK_ERROR(cuMemcpyDtoHAsync(
+ pDst,
+ std::get<BufferMem>(hBuffer->Mem).getPtrWithOffset(Device, offset),
+ size, Stream));
if (phEvent) {
UR_CHECK_ERROR(RetImplEvent->record());
}
if (blockingRead) {
- UR_CHECK_ERROR(cuStreamSynchronize(CuStream));
+ UR_CHECK_ERROR(cuStreamSynchronize(Stream));
}
if (phEvent) {
@@ -1577,10 +1687,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
}
} catch (ur_result_t Err) {
- Result = Err;
+ return Err;
}
-
- return Result;
+ return UR_RESULT_SUCCESS;
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite(
@@ -1592,11 +1701,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite(
UR_RESULT_ERROR_INVALID_SIZE);
ur_result_t Result = UR_RESULT_SUCCESS;
- CUdeviceptr DevPtr = std::get<BufferMem>(hBuffer->Mem).get();
+ CUdeviceptr DevPtr =
+ std::get<BufferMem>(hBuffer->Mem).getPtr(hQueue->getDevice());
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream CuStream = hQueue->getNextTransferStream();
Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
@@ -1709,3 +1819,33 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe(
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
+ ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList,
+ const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+
+ ur_result_t Result = UR_RESULT_SUCCESS;
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
+ try {
+ ScopedContext Active(hQueue->getDevice());
+ CUstream CuStream = hQueue->getNextComputeStream();
+
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
+ phEventWaitList));
+
+ RetImplEvent =
std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
+ UR_COMMAND_TIMESTAMP_RECORDING_EXP, hQueue, CuStream));
+ UR_CHECK_ERROR(RetImplEvent->start());
+ UR_CHECK_ERROR(RetImplEvent->record());
+
+ if (blocking) {
+ UR_CHECK_ERROR(cuStreamSynchronize(CuStream));
+ }
+
+ *phEvent = RetImplEvent.release();
+ } catch (ur_result_t Err) {
+ Result = Err;
+ }
+ return Result;
+}
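
At the API level the new entry point pairs with the existing profiling query. A hedged usage sketch (error codes ignored; UR_PROFILING_INFO_COMMAND_START is assumed here to be the property read back for these events):

    ur_event_handle_t TsEvent = nullptr;
    urEnqueueTimestampRecordingExp(Queue, /*blocking=*/false,
                                   /*numEventsInWaitList=*/0, nullptr,
                                   &TsEvent);
    urEventWait(1, &TsEvent);

    uint64_t Timestamp = 0;
    urEventGetProfilingInfo(TsEvent, UR_PROFILING_INFO_COMMAND_START,
                            sizeof(Timestamp), &Timestamp, nullptr);
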
diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp
index ac66bf479e..f9889a3f46 100644
--- a/source/adapters/cuda/event.cpp
+++ b/source/adapters/cuda/event.cpp
@@ -54,7 +54,7 @@ ur_result_t ur_event_handle_t_::start() {
ur_result_t Result = UR_RESULT_SUCCESS;
try {
- if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) {
+ if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) {
// NOTE: This relies on the default stream to be unused.
UR_CHECK_ERROR(cuEventRecord(EvQueued, 0));
UR_CHECK_ERROR(cuEventRecord(EvStart, Stream));
@@ -149,7 +149,7 @@ ur_result_t ur_event_handle_t_::release() {
UR_CHECK_ERROR(cuEventDestroy(EvEnd));
- if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) {
+ if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) {
UR_CHECK_ERROR(cuEventDestroy(EvQueued));
UR_CHECK_ERROR(cuEventDestroy(EvStart));
}
@@ -190,7 +190,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(
UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet);
ur_queue_handle_t Queue = hEvent->getQueue();
- if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE)) {
+ if (Queue == nullptr || (!(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) &&
+ !hEvent->isTimestampEvent())) {
return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE;
}
@@ -220,7 +221,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t,
UR_APIEXPORT ur_result_t UR_APICALL
urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) {
try {
- ScopedContext Active(phEventWaitList[0]->getContext());
+ ScopedContext Active(phEventWaitList[0]->getQueue()->getDevice());
auto WaitFunc = [](ur_event_handle_t Event) -> ur_result_t {
UR_ASSERT(Event, UR_RESULT_ERROR_INVALID_EVENT);
@@ -255,7 +256,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) {
std::unique_ptr<ur_event_handle_t_> event_ptr{hEvent};
ur_result_t Result = UR_RESULT_ERROR_INVALID_EVENT;
try {
- ScopedContext Active(hEvent->getContext());
Result = hEvent->release();
} catch (...) {
Result = UR_RESULT_ERROR_OUT_OF_RESOURCES;
diff --git a/source/adapters/cuda/event.hpp b/source/adapters/cuda/event.hpp
index 390fd7833a..5ed68f0f25 100644
--- a/source/adapters/cuda/event.hpp
+++ b/source/adapters/cuda/event.hpp
@@ -57,6 +57,10 @@ struct ur_event_handle_t_ {
return UR_EVENT_STATUS_COMPLETE;
}
+ bool isTimestampEvent() const noexcept {
+ return getCommandType() == UR_COMMAND_TIMESTAMP_RECORDING_EXP;
+ }
+
ur_context_handle_t getContext() const noexcept { return Context; };
uint32_t incrementReferenceCount() { return ++RefCount; }
@@ -83,13 +87,14 @@ struct ur_event_handle_t_ {
static ur_event_handle_t
makeNative(ur_command_t Type, ur_queue_handle_t Queue, CUstream Stream,
uint32_t StreamToken = std::numeric_limits<uint32_t>::max()) {
- const bool ProfilingEnabled =
- Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE;
+ const bool RequiresTimings =
+ Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE ||
+ Type == UR_COMMAND_TIMESTAMP_RECORDING_EXP;
native_type EvEnd = nullptr, EvQueued = nullptr, EvStart = nullptr;
UR_CHECK_ERROR(cuEventCreate(
- &EvEnd, ProfilingEnabled ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING));
+ &EvEnd, RequiresTimings ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING));
- if (ProfilingEnabled) {
+ if (RequiresTimings) {
UR_CHECK_ERROR(cuEventCreate(&EvQueued, CU_EVENT_DEFAULT));
UR_CHECK_ERROR(cuEventCreate(&EvStart, CU_EVENT_DEFAULT));
}
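
Timing-capable CUDA events are more expensive to record, so EvQueued and
EvStart are only created when the queue enables profiling or the event is a
timestamp recording; everything else keeps CU_EVENT_DISABLE_TIMING. A hedged
sketch of how a duration could be derived from the events created above
(assumes the EvStart/EvEnd members of ur_event_handle_t_):

    float Ms = 0.0f;
    UR_CHECK_ERROR(cuEventSynchronize(EvEnd));
    UR_CHECK_ERROR(cuEventElapsedTime(&Ms, EvStart, EvEnd));
    uint64_t Ns = static_cast<uint64_t>(Ms * 1.0e6f);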
diff --git a/source/adapters/cuda/image.cpp b/source/adapters/cuda/image.cpp
index 3dd1a7f34b..95dc2e258e 100644
--- a/source/adapters/cuda/image.cpp
+++ b/source/adapters/cuda/image.cpp
@@ -331,7 +331,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPitchedAllocExp(
const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool,
size_t widthInBytes, size_t height, size_t elementSizeBytes, void **ppMem,
size_t *pResultPitch) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
std::ignore = pUSMDesc;
std::ignore = pool;
@@ -350,7 +352,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPitchedAllocExp(
UR_RESULT_ERROR_INVALID_VALUE);
ur_result_t Result = UR_RESULT_SUCCESS;
try {
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
UR_CHECK_ERROR(cuMemAllocPitch((CUdeviceptr *)ppMem, pResultPitch,
widthInBytes, height, elementSizeBytes));
} catch (ur_result_t error) {
@@ -366,7 +368,9 @@ UR_APIEXPORT ur_result_t UR_APICALL
urBindlessImagesUnsampledImageHandleDestroyExp(ur_context_handle_t hContext,
ur_device_handle_t hDevice,
ur_exp_image_handle_t hImage) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
UR_CHECK_ERROR(cuSurfObjectDestroy((CUsurfObject)hImage));
@@ -377,7 +381,9 @@ UR_APIEXPORT ur_result_t UR_APICALL
urBindlessImagesSampledImageHandleDestroyExp(ur_context_handle_t hContext,
ur_device_handle_t hDevice,
ur_exp_image_handle_t hImage) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
UR_CHECK_ERROR(cuTexObjectDestroy((CUtexObject)hImage));
@@ -388,7 +394,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc,
ur_exp_image_mem_handle_t *phImageMem) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
// Populate descriptor
@@ -435,7 +443,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp(
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
// Allocate a cuArray
if (pImageDesc->numMipLevel == 1) {
@@ -475,10 +483,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp(
UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageFreeExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_image_mem_handle_t hImageMem) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
try {
UR_CHECK_ERROR(cuArrayDestroy((CUarray)hImageMem));
} catch (ur_result_t Err) {
@@ -492,9 +502,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageFreeExp(
UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_image_mem_handle_t hImageMem, const ur_image_format_t *pImageFormat,
- const ur_image_desc_t *pImageDesc, ur_mem_handle_t *phMem,
+ [[maybe_unused]] const ur_image_desc_t *pImageDesc,
ur_exp_image_handle_t *phImage) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
unsigned int NumChannels = 0;
@@ -509,7 +521,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
try {
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
CUDA_RESOURCE_DESC image_res_desc = {};
@@ -523,15 +535,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
UR_CHECK_ERROR(cuSurfObjectCreate(&surface, &image_res_desc));
*phImage = (ur_exp_image_handle_t)surface;
- auto urMemObj = std::unique_ptr<ur_mem_handle_t_>(new ur_mem_handle_t_{
- hContext, (CUarray)hImageMem, surface, pImageDesc->type});
-
- if (urMemObj == nullptr) {
- return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
- }
-
- *phMem = urMemObj.release();
-
} catch (ur_result_t Err) {
return Err;
} catch (...) {
@@ -545,11 +548,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_image_mem_handle_t hImageMem, const ur_image_format_t *pImageFormat,
const ur_image_desc_t *pImageDesc, ur_sampler_handle_t hSampler,
- ur_mem_handle_t *phMem, ur_exp_image_handle_t *phImage) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ ur_exp_image_handle_t *phImage) {
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
unsigned int NumChannels = 0;
UR_CHECK_ERROR(
@@ -611,15 +616,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp(
UR_CHECK_ERROR(
urTextureCreate(hSampler, pImageDesc, image_res_desc, phImage));
- auto urMemObj = std::unique_ptr<ur_mem_handle_t_>(new ur_mem_handle_t_{
- hContext, (CUarray)hImageMem, (CUtexObject)*phImage, hSampler,
- pImageDesc->type});
-
- if (urMemObj == nullptr) {
- return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
- }
-
- *phMem = urMemObj.release();
} catch (ur_result_t Err) {
return Err;
} catch (...) {
@@ -654,7 +650,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
&PixelSizeBytes));
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream Stream = hQueue->getNextTransferStream();
enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList);
@@ -986,11 +982,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapGetLevelExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_image_mem_handle_t hImageMem, uint32_t mipmapLevel,
ur_exp_image_mem_handle_t *phImageMem) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
try {
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
CUarray ImageArray;
UR_CHECK_ERROR(cuMipmappedArrayGetLevel(
&ImageArray, (CUmipmappedArray)hImageMem, mipmapLevel));
@@ -1007,10 +1005,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapGetLevelExp(
UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapFreeExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_image_mem_handle_t hMem) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
try {
UR_CHECK_ERROR(cuMipmappedArrayDestroy((CUmipmappedArray)hMem));
} catch (ur_result_t Err) {
@@ -1025,11 +1025,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportOpaqueFDExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size,
ur_exp_interop_mem_desc_t *pInteropMemDesc,
ur_exp_interop_mem_handle_t *phInteropMem) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
try {
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc = {};
extMemDesc.size = size;
@@ -1068,7 +1070,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp(
const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc,
ur_exp_interop_mem_handle_t hInteropMem,
ur_exp_image_mem_handle_t *phImageMem) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
unsigned int NumChannels = 0;
@@ -1080,7 +1084,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp(
pImageFormat->channelType, pImageFormat->channelOrder, &format, nullptr));
try {
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
CUDA_ARRAY3D_DESCRIPTOR ArrayDesc = {};
ArrayDesc.Width = pImageDesc->width;
@@ -1119,11 +1123,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp(
UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_interop_mem_handle_t hInteropMem) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
try {
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
UR_CHECK_ERROR(cuDestroyExternalMemory((CUexternalMemory)hInteropMem));
} catch (ur_result_t Err) {
return Err;
@@ -1138,11 +1144,13 @@ urBindlessImagesImportExternalSemaphoreOpaqueFDExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_interop_semaphore_desc_t *pInteropSemaphoreDesc,
ur_exp_interop_semaphore_handle_t *phInteropSemaphoreHandle) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
try {
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC extSemDesc = {};
@@ -1177,11 +1185,13 @@ urBindlessImagesImportExternalSemaphoreOpaqueFDExp(
UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesDestroyExternalSemaphoreExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_interop_semaphore_handle_t hInteropSemaphore) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
try {
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
UR_CHECK_ERROR(
cuDestroyExternalSemaphore((CUexternalSemaphore)hInteropSemaphore));
} catch (ur_result_t Err) {
@@ -1198,7 +1208,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp(
ur_event_handle_t *phEvent) {
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream Stream = hQueue->getNextTransferStream();
enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList);
@@ -1230,7 +1240,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp(
ur_event_handle_t *phEvent) {
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream Stream = hQueue->getNextTransferStream();
enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList);
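
Every bindless-images entry point above now checks that hDevice belongs to
the context instead of comparing against a single device. The repeated
std::find could be factored into a helper; a hypothetical sketch, not part of
this patch (assumes getDevices() returns a std::vector<ur_device_handle_t>):

    inline bool contextHasDevice(ur_context_handle_t hContext,
                                 ur_device_handle_t hDevice) {
      const auto &Devs = hContext->getDevices();
      return std::find(Devs.begin(), Devs.end(), hDevice) != Devs.end();
    }
    // UR_ASSERT(contextHasDevice(hContext, hDevice),
    //           UR_RESULT_ERROR_INVALID_CONTEXT);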
diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp
index c9334add15..675fdbe0a3 100644
--- a/source/adapters/cuda/kernel.cpp
+++ b/source/adapters/cuda/kernel.cpp
@@ -19,7 +19,7 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName,
std::unique_ptr<ur_kernel_handle_t_> Kernel{nullptr};
try {
- ScopedContext Active(hProgram->getContext());
+ ScopedContext Active(hProgram->getDevice());
CUfunction CuFunc;
CUresult FunctionResult =
@@ -293,8 +293,6 @@ UR_APIEXPORT ur_result_t UR_APICALL
urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex,
const ur_kernel_arg_mem_obj_properties_t *Properties,
ur_mem_handle_t hArgValue) {
- std::ignore = Properties;
-
// Below sets kernel arg when zero-sized buffers are handled.
// In such case the corresponding memory is null.
if (hArgValue == nullptr) {
@@ -304,10 +302,14 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex,
ur_result_t Result = UR_RESULT_SUCCESS;
try {
- if (hArgValue->MemType == ur_mem_handle_t_::Type::Surface) {
+ auto Device = hKernel->getProgram()->getDevice();
+ ur_mem_flags_t MemAccess =
+ Properties ? Properties->memoryAccess : UR_MEM_FLAG_READ_WRITE;
+ hKernel->Args.addMemObjArg(argIndex, hArgValue, MemAccess);
+ if (hArgValue->isImage()) {
CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
UR_CHECK_ERROR(cuArray3DGetDescriptor(
- &arrayDesc, std::get<SurfaceMem>(hArgValue->Mem).getArray()));
+ &arrayDesc, std::get<SurfaceMem>(hArgValue->Mem).getArray(Device)));
if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 &&
arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 &&
arrayDesc.Format != CU_AD_FORMAT_HALF &&
@@ -317,10 +319,11 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex,
UR_RESULT_ERROR_ADAPTER_SPECIFIC);
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
}
- CUsurfObject CuSurf = std::get<SurfaceMem>(hArgValue->Mem).getSurface();
+ CUsurfObject CuSurf =
+ std::get<SurfaceMem>(hArgValue->Mem).getSurface(Device);
hKernel->setKernelArg(argIndex, sizeof(CuSurf), (void *)&CuSurf);
} else {
- CUdeviceptr CuPtr = std::get<BufferMem>(hArgValue->Mem).get();
+ CUdeviceptr CuPtr = std::get<BufferMem>(hArgValue->Mem).getPtr(Device);
hKernel->setKernelArg(argIndex, sizeof(CUdeviceptr), (void *)&CuPtr);
}
} catch (ur_result_t Err) {
diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp
index 9e65066ba7..b7a7358b27 100644
--- a/source/adapters/cuda/kernel.hpp
+++ b/source/adapters/cuda/kernel.hpp
@@ -63,6 +63,14 @@ struct ur_kernel_handle_t_ {
args_size_t ParamSizes;
args_index_t Indices;
args_size_t OffsetPerIndex;
+ // A struct to keep track of memargs so that we can do dependency analysis
+ // at urEnqueueKernelLaunch
+ struct mem_obj_arg {
+ ur_mem_handle_t_ *Mem;
+ int Index;
+ ur_mem_flags_t AccessFlags;
+ };
+ std::vector<mem_obj_arg> MemObjArgs;
std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0};
@@ -116,6 +124,20 @@ struct ur_kernel_handle_t_ {
Size + (AlignedLocalOffset - LocalOffset));
}
+ void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) {
+ assert(hMem && "Invalid mem handle");
+ // To avoid wasted space, MemObjArgs is not indexed by argument index;
+ // search for an existing entry with this index and overwrite it.
+ for (auto &Arg : MemObjArgs) {
+ if (Arg.Index == Index) {
+ // Overwrite the mem obj with the same index
+ Arg = arguments::mem_obj_arg{hMem, Index, Flags};
+ return;
+ }
+ }
+ MemObjArgs.push_back(arguments::mem_obj_arg{hMem, Index, Flags});
+ }
+
void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) {
assert(Size == sizeof(std::uint32_t) * 3);
std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size);
@@ -142,7 +164,7 @@ struct ur_kernel_handle_t_ {
urContextRetain(Context);
/// Note: this code assumes that there is only one device per context
ur_result_t RetError = urKernelGetGroupInfo(
- this, Context->getDevice(),
+ this, Program->getDevice(),
UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
sizeof(ReqdThreadsPerBlock), ReqdThreadsPerBlock, nullptr);
(void)RetError;
@@ -165,6 +187,7 @@ struct ur_kernel_handle_t_ {
uint32_t getReferenceCount() const noexcept { return RefCount; }
native_type get() const noexcept { return Function; };
+ ur_program_handle_t getProgram() const noexcept { return Program; };
native_type get_with_offset_parameter() const noexcept {
return FunctionWithOffsetParam;
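
MemObjArgs records each buffer or image argument together with its access
flags so the launch path can allocate, migrate, and mark writes per device.
The enqueue.cpp changes are outside this section; a hedged sketch of how a
launch might consume the vector (Device and RetEvent are hypothetical locals
of the launch routine):

    for (const auto &Arg : hKernel->Args.MemObjArgs) {
      ur_lock Lock(Arg.Mem->MemoryMigrationMutex);
      UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(Arg.Mem, Device));
      if (Arg.AccessFlags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))
        // Only valid for multi-device contexts, per the assert in
        // setLastEventWritingToMemObj.
        Arg.Mem->setLastEventWritingToMemObj(RetEvent);
    }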
diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp
index f097d2474e..c28ce98748 100644
--- a/source/adapters/cuda/memory.cpp
+++ b/source/adapters/cuda/memory.cpp
@@ -36,59 +36,50 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate(
const bool PerformInitialCopy =
(flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) ||
((flags & UR_MEM_FLAG_USE_HOST_POINTER) && !EnableUseHostPtr);
- ur_result_t Result = UR_RESULT_SUCCESS;
ur_mem_handle_t MemObj = nullptr;
try {
- ScopedContext Active(hContext);
- CUdeviceptr Ptr = 0;
auto HostPtr = pProperties ? pProperties->pHost : nullptr;
-
BufferMem::AllocMode AllocMode = BufferMem::AllocMode::Classic;
if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && EnableUseHostPtr) {
UR_CHECK_ERROR(
cuMemHostRegister(HostPtr, size, CU_MEMHOSTREGISTER_DEVICEMAP));
- UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0));
AllocMode = BufferMem::AllocMode::UseHostPtr;
} else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) {
UR_CHECK_ERROR(cuMemAllocHost(&HostPtr, size));
- UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0));
AllocMode = BufferMem::AllocMode::AllocHostPtr;
- } else {
- UR_CHECK_ERROR(cuMemAlloc(&Ptr, size));
- if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) {
- AllocMode = BufferMem::AllocMode::CopyIn;
- }
+ } else if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) {
+ AllocMode = BufferMem::AllocMode::CopyIn;
}
- ur_mem_handle_t parentBuffer = nullptr;
+ auto URMemObj = std::unique_ptr<ur_mem_handle_t_>(
+ new ur_mem_handle_t_{hContext, flags, AllocMode, HostPtr, size});
+ if (URMemObj == nullptr) {
+ return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+ }
- auto URMemObj = std::unique_ptr<ur_mem_handle_t_>(new ur_mem_handle_t_{
- hContext, parentBuffer, flags, AllocMode, Ptr, HostPtr, size});
- if (URMemObj != nullptr) {
- MemObj = URMemObj.release();
- if (PerformInitialCopy) {
- // Operates on the default stream of the current CUDA context.
+ // When an initial copy is required, device allocations are made here;
+ // otherwise they are deferred until first use on a device
+ if (PerformInitialCopy && HostPtr) {
+ // Perform initial copy to every device in context
+ for (auto &Device : hContext->getDevices()) {
+ ScopedContext Active(Device);
+ // getPtr may allocate mem if not already allocated
+ const auto &Ptr = std::get<BufferMem>(URMemObj->Mem).getPtr(Device);
UR_CHECK_ERROR(cuMemcpyHtoD(Ptr, HostPtr, size));
- // Synchronize with default stream implicitly used by cuMemcpyHtoD
- // to make buffer data available on device before any other UR call
- // uses it.
- CUstream defaultStream = 0;
- UR_CHECK_ERROR(cuStreamSynchronize(defaultStream));
}
- } else {
- Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
}
+ MemObj = URMemObj.release();
} catch (ur_result_t Err) {
- Result = Err;
+ return Err;
} catch (...) {
- Result = UR_RESULT_ERROR_OUT_OF_RESOURCES;
+ return UR_RESULT_ERROR_OUT_OF_RESOURCES;
}
*phBuffer = MemObj;
- return Result;
+ return UR_RESULT_SUCCESS;
}
UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) {
@@ -117,26 +108,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) {
return UR_RESULT_SUCCESS;
}
- ScopedContext Active(MemObjPtr->getContext());
-
- if (hMem->MemType == ur_mem_handle_t_::Type::Buffer) {
- auto &BufferImpl = std::get<BufferMem>(MemObjPtr->Mem);
- switch (BufferImpl.MemAllocMode) {
- case BufferMem::AllocMode::CopyIn:
- case BufferMem::AllocMode::Classic:
- UR_CHECK_ERROR(cuMemFree(BufferImpl.Ptr));
- break;
- case BufferMem::AllocMode::UseHostPtr:
- UR_CHECK_ERROR(cuMemHostUnregister(BufferImpl.HostPtr));
- break;
- case BufferMem::AllocMode::AllocHostPtr:
- UR_CHECK_ERROR(cuMemFreeHost(BufferImpl.HostPtr));
- };
- } else if (hMem->MemType == ur_mem_handle_t_::Type::Surface) {
- auto &SurfaceImpl = std::get<SurfaceMem>(MemObjPtr->Mem);
- UR_CHECK_ERROR(cuSurfObjectDestroy(SurfaceImpl.getSurface()));
- UR_CHECK_ERROR(cuArrayDestroy(SurfaceImpl.getArray()));
- }
+ UR_CHECK_ERROR(hMem->clear());
} catch (ur_result_t Err) {
Result = Err;
@@ -161,10 +133,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) {
/// \param[out] phNativeMem Set to the native handle of the UR mem object.
///
/// \return UR_RESULT_SUCCESS
-UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle(
- ur_mem_handle_t hMem, ur_device_handle_t, ur_native_handle_t *phNativeMem) {
- *phNativeMem = reinterpret_cast<ur_native_handle_t>(
- std::get(hMem->Mem).get());
+UR_APIEXPORT ur_result_t UR_APICALL
+urMemGetNativeHandle(ur_mem_handle_t hMem, ur_device_handle_t Device,
+ ur_native_handle_t *phNativeMem) {
+ try {
+ *phNativeMem = reinterpret_cast<ur_native_handle_t>(
+ std::get(hMem->Mem).getPtr(Device));
+ } catch (ur_result_t Err) {
+ return Err;
+ } catch (...) {
+ return UR_RESULT_ERROR_UNKNOWN;
+ }
return UR_RESULT_SUCCESS;
}
@@ -177,14 +156,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory,
UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet);
- ScopedContext Active(hMemory->getContext());
+ // Any device in context will do
+ auto Device = hMemory->getContext()->getDevices()[0];
+ ScopedContext Active(Device);
switch (MemInfoType) {
case UR_MEM_INFO_SIZE: {
try {
size_t AllocSize = 0;
UR_CHECK_ERROR(cuMemGetAddressRange(
- nullptr, &AllocSize, std::get<BufferMem>(hMemory->Mem).Ptr));
+ nullptr, &AllocSize,
+ std::get<BufferMem>(hMemory->Mem).getPtr(Device)));
return ReturnValue(AllocSize);
} catch (ur_result_t Err) {
return Err;
@@ -242,160 +224,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate(
UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
}
- ur_result_t Result = UR_RESULT_SUCCESS;
-
// We only support RGBA channel order
// TODO: check SYCL CTS and spec. May also have to support BGRA
UR_ASSERT(pImageFormat->channelOrder == UR_IMAGE_CHANNEL_ORDER_RGBA,
UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION);
- // We have to use cuArray3DCreate, which has some caveats. The height and
- // depth parameters must be set to 0 produce 1D or 2D arrays. pImageDesc gives
- // a minimum value of 1, so we need to convert the answer.
- CUDA_ARRAY3D_DESCRIPTOR ArrayDesc;
- ArrayDesc.NumChannels = 4; // Only support 4 channel image
- ArrayDesc.Flags = 0; // No flags required
- ArrayDesc.Width = pImageDesc->width;
- if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) {
- ArrayDesc.Height = 0;
- ArrayDesc.Depth = 0;
- } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
- ArrayDesc.Height = pImageDesc->height;
- ArrayDesc.Depth = 0;
- } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) {
- ArrayDesc.Height = pImageDesc->height;
- ArrayDesc.Depth = pImageDesc->depth;
- }
-
- // We need to get this now in bytes for calculating the total image size later
- size_t PixelTypeSizeBytes;
-
- switch (pImageFormat->channelType) {
- case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8:
- case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8:
- ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
- PixelTypeSizeBytes = 1;
- break;
- case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8:
- ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT8;
- PixelTypeSizeBytes = 1;
- break;
- case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16:
- case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16:
- ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT16;
- PixelTypeSizeBytes = 2;
- break;
- case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16:
- ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT16;
- PixelTypeSizeBytes = 2;
- break;
- case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT:
- ArrayDesc.Format = CU_AD_FORMAT_HALF;
- PixelTypeSizeBytes = 2;
- break;
- case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32:
- ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT32;
- PixelTypeSizeBytes = 4;
- break;
- case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32:
- ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT32;
- PixelTypeSizeBytes = 4;
- break;
- case UR_IMAGE_CHANNEL_TYPE_FLOAT:
- ArrayDesc.Format = CU_AD_FORMAT_FLOAT;
- PixelTypeSizeBytes = 4;
- break;
- default:
- detail::ur::die(
- "urMemImageCreate given unsupported image_channel_data_type");
- }
-
- // When a dimension isn't used pImageDesc has the size set to 1
- size_t PixelSizeBytes =
- PixelTypeSizeBytes * 4; // 4 is the only number of channels we support
- size_t ImageSizeBytes = PixelSizeBytes * pImageDesc->width *
- pImageDesc->height * pImageDesc->depth;
-
- ScopedContext Active(hContext);
- CUarray ImageArray = nullptr;
- try {
- UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &ArrayDesc));
- } catch (ur_result_t Err) {
- if (Err == UR_RESULT_ERROR_INVALID_VALUE) {
- return UR_RESULT_ERROR_INVALID_IMAGE_SIZE;
- }
- return Err;
- } catch (...) {
- return UR_RESULT_ERROR_UNKNOWN;
- }
+ auto URMemObj = std::unique_ptr<ur_mem_handle_t_>(
+ new ur_mem_handle_t_{hContext, flags, *pImageFormat, *pImageDesc, pHost});
try {
if (PerformInitialCopy) {
- // We have to use a different copy function for each image dimensionality
- if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) {
- UR_CHECK_ERROR(cuMemcpyHtoA(ImageArray, 0, pHost, ImageSizeBytes));
- } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
- CUDA_MEMCPY2D CpyDesc;
- memset(&CpyDesc, 0, sizeof(CpyDesc));
- CpyDesc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
- CpyDesc.srcHost = pHost;
- CpyDesc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
- CpyDesc.dstArray = ImageArray;
- CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width;
- CpyDesc.Height = pImageDesc->height;
- UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc));
- } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) {
- CUDA_MEMCPY3D CpyDesc;
- memset(&CpyDesc, 0, sizeof(CpyDesc));
- CpyDesc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
- CpyDesc.srcHost = pHost;
- CpyDesc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
- CpyDesc.dstArray = ImageArray;
- CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width;
- CpyDesc.Height = pImageDesc->height;
- CpyDesc.Depth = pImageDesc->depth;
- UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc));
+ for (const auto &Device : hContext->getDevices()) {
+ UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(URMemObj.get(), Device));
}
}
- // CUDA_RESOURCE_DESC is a union of different structs, shown here
- // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html
- // We need to fill it as described here to use it for a surface or texture
- // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html
- // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and
- // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA array
- // handle.
- // CUDA_RESOURCE_DESC::flags must be set to zero
-
- CUDA_RESOURCE_DESC ImageResDesc;
- ImageResDesc.res.array.hArray = ImageArray;
- ImageResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
- ImageResDesc.flags = 0;
-
- CUsurfObject Surface;
- UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc));
-
- auto MemObj = std::unique_ptr<ur_mem_handle_t_>(new ur_mem_handle_t_(
- hContext, ImageArray, Surface, flags, pImageDesc->type, phMem));
-
- if (MemObj == nullptr) {
+ if (URMemObj == nullptr) {
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
}
- *phMem = MemObj.release();
+ *phMem = URMemObj.release();
} catch (ur_result_t Err) {
- if (ImageArray) {
- cuArrayDestroy(ImageArray);
- }
+ (*phMem)->clear();
return Err;
} catch (...) {
- if (ImageArray) {
- cuArrayDestroy(ImageArray);
- }
+ (*phMem)->clear();
return UR_RESULT_ERROR_UNKNOWN;
}
-
- return Result;
+ return UR_RESULT_SUCCESS;
}
UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t hMemory,
@@ -407,14 +263,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t hMemory,
auto Context = hMemory->getContext();
- ScopedContext Active(Context);
+ // Any device will do
+ auto Device = Context->getDevices()[0];
+ ScopedContext Active(Device);
UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
try {
CUDA_ARRAY3D_DESCRIPTOR ArrayInfo;
UR_CHECK_ERROR(cuArray3DGetDescriptor(
- &ArrayInfo, std::get<SurfaceMem>(hMemory->Mem).getArray()));
+ &ArrayInfo, std::get<SurfaceMem>(hMemory->Mem).getArray(Device)));
const auto cuda2urFormat = [](CUarray_format CUFormat,
ur_image_channel_type_t *ChannelType) {
@@ -544,27 +402,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition(
UR_ASSERT(pRegion->size != 0u, UR_RESULT_ERROR_INVALID_BUFFER_SIZE);
auto &BufferImpl = std::get<BufferMem>(hBuffer->Mem);
-
- assert((pRegion->origin <= (pRegion->origin + pRegion->size)) && "Overflow");
UR_ASSERT(((pRegion->origin + pRegion->size) <= BufferImpl.getSize()),
UR_RESULT_ERROR_INVALID_BUFFER_SIZE);
- // Retained indirectly due to retaining parent buffer below.
- ur_context_handle_t Context = hBuffer->Context;
-
- BufferMem::AllocMode AllocMode = BufferMem::AllocMode::Classic;
- assert(BufferImpl.Ptr != BufferMem::native_type{0});
- BufferMem::native_type Ptr = BufferImpl.Ptr + pRegion->origin;
-
- void *HostPtr = nullptr;
- if (BufferImpl.HostPtr) {
- HostPtr = static_cast<uint8_t *>(BufferImpl.HostPtr) + pRegion->origin;
- }
-
- std::unique_ptr<ur_mem_handle_t_> MemObj{nullptr};
+ std::unique_ptr<ur_mem_handle_t_> RetMemObj{nullptr};
try {
- MemObj = std::unique_ptr<ur_mem_handle_t_>{new ur_mem_handle_t_{
- Context, hBuffer, flags, AllocMode, Ptr, HostPtr, pRegion->size}};
+ for (auto Device : hBuffer->Context->getDevices()) {
+ // getPtr lazily allocates the device pointer; the parent's
+ // allocations must exist before the sub-buffer can alias them.
+ BufferImpl.getPtr(Device);
+ }
+ RetMemObj = std::unique_ptr<ur_mem_handle_t_>{
+ new ur_mem_handle_t_{hBuffer, pRegion->origin}};
} catch (ur_result_t Err) {
*phMem = nullptr;
return Err;
@@ -573,6 +422,189 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition(
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
}
- *phMem = MemObj.release();
+ *phMem = RetMemObj.release();
+ return UR_RESULT_SUCCESS;
+}
+
+ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
+ const ur_device_handle_t hDevice) {
+ ScopedContext Active(hDevice);
+ ur_lock LockGuard(Mem->MemoryAllocationMutex);
+
+ if (Mem->isBuffer()) {
+ auto &Buffer = std::get<BufferMem>(Mem->Mem);
+ auto &DevPtr = Buffer.Ptrs[hDevice->getIndex() % Buffer.Ptrs.size()];
+
+ // Allocation has already been made
+ if (DevPtr != BufferMem::native_type{0}) {
+ return UR_RESULT_SUCCESS;
+ }
+
+ if (Buffer.MemAllocMode == BufferMem::AllocMode::AllocHostPtr) {
+ // Host allocation has already been made
+ UR_CHECK_ERROR(cuMemHostGetDevicePointer(&DevPtr, Buffer.HostPtr, 0));
+ } else if (Buffer.MemAllocMode == BufferMem::AllocMode::UseHostPtr) {
+ UR_CHECK_ERROR(cuMemHostRegister(Buffer.HostPtr, Buffer.Size,
+ CU_MEMHOSTALLOC_DEVICEMAP));
+ UR_CHECK_ERROR(cuMemHostGetDevicePointer(&DevPtr, Buffer.HostPtr, 0));
+ } else {
+ UR_CHECK_ERROR(cuMemAlloc(&DevPtr, Buffer.Size));
+ }
+ } else {
+ CUarray ImageArray;
+ CUsurfObject Surface;
+ try {
+ auto &Image = std::get<SurfaceMem>(Mem->Mem);
+ // Allocation has already been made
+ if (Image.Arrays[hDevice->getIndex() % Image.Arrays.size()]) {
+ return UR_RESULT_SUCCESS;
+ }
+ UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &Image.ArrayDesc));
+ Image.Arrays[hDevice->getIndex() % Image.Arrays.size()] = ImageArray;
+
+ // CUDA_RESOURCE_DESC is a union of different structs, shown here
+ // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html
+ // We need to fill it as described here to use it for a surface or texture
+ // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html
+ // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and
+ // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA
+ // array handle. CUDA_RESOURCE_DESC::flags must be set to zero
+ CUDA_RESOURCE_DESC ImageResDesc;
+ ImageResDesc.res.array.hArray = ImageArray;
+ ImageResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+ ImageResDesc.flags = 0;
+
+ UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc));
+ Image.SurfObjs[hDevice->getIndex() % Image.SurfObjs.size()] = Surface;
+ } catch (ur_result_t Err) {
+ if (ImageArray) {
+ UR_CHECK_ERROR(cuArrayDestroy(ImageArray));
+ }
+ return Err;
+ } catch (...) {
+ if (ImageArray) {
+ UR_CHECK_ERROR(cuArrayDestroy(ImageArray));
+ }
+ return UR_RESULT_ERROR_UNKNOWN;
+ }
+ }
+ return UR_RESULT_SUCCESS;
+}
+
+namespace {
+ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem,
+ ur_device_handle_t hDevice) {
+ auto &Buffer = std::get<BufferMem>(Mem->Mem);
+ if (Mem->LastEventWritingToMemObj == nullptr) {
+ // Device allocation being initialized from host for the first time
+ if (Buffer.HostPtr) {
+ UR_CHECK_ERROR(
+ cuMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size));
+ }
+ } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hDevice) {
+ UR_CHECK_ERROR(cuMemcpyDtoD(
+ Buffer.getPtr(hDevice),
+ Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
+ Buffer.Size));
+ }
+ return UR_RESULT_SUCCESS;
+}
+
+ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
+ ur_device_handle_t hDevice) {
+ auto &Image = std::get<SurfaceMem>(Mem->Mem);
+ // When a dimension isn't used image_desc has the size set to 1
+ size_t PixelSizeBytes = Image.PixelTypeSizeBytes *
+ 4; // 4 is the only number of channels we support
+ size_t ImageSizeBytes = PixelSizeBytes * Image.ImageDesc.width *
+ Image.ImageDesc.height * Image.ImageDesc.depth;
+
+ CUarray ImageArray = Image.getArray(hDevice);
+
+ CUDA_MEMCPY2D CpyDesc2D;
+ CUDA_MEMCPY3D CpyDesc3D;
+ // We have to use a different copy function for each image
+ // dimensionality
+ if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
+ memset(&CpyDesc2D, 0, sizeof(CpyDesc2D));
+ CpyDesc2D.srcHost = Image.HostPtr;
+ CpyDesc2D.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
+ CpyDesc2D.dstArray = ImageArray;
+ CpyDesc2D.WidthInBytes = PixelSizeBytes * Image.ImageDesc.width;
+ CpyDesc2D.Height = Image.ImageDesc.height;
+ } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
+ memset(&CpyDesc3D, 0, sizeof(CpyDesc3D));
+ CpyDesc3D.srcHost = Image.HostPtr;
+ CpyDesc3D.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
+ CpyDesc3D.dstArray = ImageArray;
+ CpyDesc3D.WidthInBytes = PixelSizeBytes * Image.ImageDesc.width;
+ CpyDesc3D.Height = Image.ImageDesc.height;
+ CpyDesc3D.Depth = Image.ImageDesc.depth;
+ }
+
+ if (Mem->LastEventWritingToMemObj == nullptr) {
+ if (Image.HostPtr) {
+ if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
+ UR_CHECK_ERROR(
+ cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
+ } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
+ CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
+ CpyDesc2D.srcHost = Image.HostPtr;
+ UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D));
+ } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
+ CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
+ CpyDesc3D.srcHost = Image.HostPtr;
+ UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D));
+ }
+ }
+ } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hDevice) {
+ if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
+ // FIXME: 1D memcpy from DtoD going through the host.
+ UR_CHECK_ERROR(cuMemcpyAtoH(
+ Image.HostPtr,
+ Image.getArray(
+ Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
+ 0 /*srcOffset*/, ImageSizeBytes));
+ UR_CHECK_ERROR(
+ cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
+ } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
+ CpyDesc2D.srcArray = Image.getArray(
+ Mem->LastEventWritingToMemObj->getQueue()->getDevice());
+ UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D));
+ } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
+ CpyDesc3D.srcArray = Image.getArray(
+ Mem->LastEventWritingToMemObj->getQueue()->getDevice());
+ UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D));
+ }
+ }
+ return UR_RESULT_SUCCESS;
+}
+} // namespace
+
+// Callers of this entry point must hold the MemoryMigrationMutex of the
+// given ur_mem_handle_t beforehand
+ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem,
+ const ur_device_handle_t hDevice) {
+ UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+ // Device allocation has already been initialized with the most
+ // up-to-date data in the buffer
+ if (Mem->HaveMigratedToDeviceSinceLastWrite
+ [hDevice->getIndex() %
+ Mem->HaveMigratedToDeviceSinceLastWrite.size()]) {
+ return UR_RESULT_SUCCESS;
+ }
+
+ ScopedContext Active(hDevice);
+ if (Mem->isBuffer()) {
+ UR_CHECK_ERROR(migrateBufferToDevice(Mem, hDevice));
+ } else {
+ UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice));
+ }
+
+ Mem->HaveMigratedToDeviceSinceLastWrite
+ [hDevice->getIndex() % Mem->HaveMigratedToDeviceSinceLastWrite.size()] =
+ true;
return UR_RESULT_SUCCESS;
}
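
Enqueue paths that read a buffer are expected to wait on the last write and
migrate before touching the device allocation. A hedged sketch of that
pattern for a buffer read (the real enqueue.cpp hunks live elsewhere in this
patch; pDst, offset, size and CuStream are hypothetical locals):

    ur_device_handle_t Device = hQueue->getDevice();
    {
      ur_lock Lock(hBuffer->MemoryMigrationMutex);
      if (hBuffer->LastEventWritingToMemObj != nullptr)
        UR_CHECK_ERROR(urEventWait(1, &hBuffer->LastEventWritingToMemObj));
      UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(hBuffer, Device));
    }
    CUdeviceptr Src =
        std::get<BufferMem>(hBuffer->Mem).getPtrWithOffset(Device, offset);
    UR_CHECK_ERROR(cuMemcpyDtoHAsync(pDst, Src, size, CuStream));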
diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp
index 3f09552038..6b7e9d0156 100644
--- a/source/adapters/cuda/memory.hpp
+++ b/source/adapters/cuda/memory.hpp
@@ -11,10 +11,19 @@
#include <cassert>
#include <cuda.h>
+#include <memory>
#include <variant>
#include <vector>
#include "common.hpp"
+#include "context.hpp"
+#include "device.hpp"
+#include "event.hpp"
+
+ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t,
+ const ur_device_handle_t);
+ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
+ const ur_device_handle_t);
// Handler for plain, pointer-based CUDA allocations
struct BufferMem {
@@ -27,7 +36,7 @@ struct BufferMem {
/// Original flags for the mapped region
ur_map_flags_t MapFlags;
/// Allocated host memory used exclusively for this map.
- std::unique_ptr<unsigned char[]> MapMem;
+ std::shared_ptr<unsigned char[]> MapMem;
BufferMap(size_t MapSize, size_t MapOffset, ur_map_flags_t MapFlags)
: MapSize(MapSize), MapOffset(MapOffset), MapFlags(MapFlags),
@@ -61,11 +70,16 @@ struct BufferMem {
using native_type = CUdeviceptr;
+private:
+ /// CUDA handler for the pointer
+ std::vector<native_type> Ptrs;
+
+public:
/// If this allocation is a sub-buffer (i.e., a view on an existing
/// allocation), this is the pointer to the parent handler structure
- ur_mem_handle_t Parent;
- /// CUDA handler for the pointer
- native_type Ptr;
+ ur_mem_handle_t Parent = nullptr;
+ /// Outer UR mem holding this BufferMem in variant
+ ur_mem_handle_t OuterMemStruct;
/// Pointer associated with this device on the host
void *HostPtr;
/// Size of the allocation in bytes
@@ -75,12 +89,34 @@ struct BufferMem {
AllocMode MemAllocMode;
- BufferMem(ur_mem_handle_t Parent, BufferMem::AllocMode Mode, CUdeviceptr Ptr,
- void *HostPtr, size_t Size)
- : Parent{Parent}, Ptr{Ptr}, HostPtr{HostPtr}, Size{Size},
- PtrToBufferMap{}, MemAllocMode{Mode} {};
+ BufferMem(ur_context_handle_t Context, ur_mem_handle_t OuterMemStruct,
+ AllocMode Mode, void *HostPtr, size_t Size)
+ : Ptrs(Context->getDevices().size(), native_type{0}),
+ OuterMemStruct{OuterMemStruct}, HostPtr{HostPtr}, Size{Size},
+ MemAllocMode{Mode} {};
- native_type get() const noexcept { return Ptr; }
+ BufferMem(const BufferMem &Buffer) = default;
+
+ native_type getPtrWithOffset(const ur_device_handle_t Device, size_t Offset) {
+ if (ur_result_t Err =
+ allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
+ Err != UR_RESULT_SUCCESS) {
+ throw Err;
+ }
+ return reinterpret_cast<native_type>(
+ reinterpret_cast<uint8_t *>(Ptrs[Device->getIndex() % Ptrs.size()]) +
+ Offset);
+ }
+
+ native_type getPtr(const ur_device_handle_t Device) {
+ return getPtrWithOffset(Device, 0);
+ }
+
+ void *getVoid(const ur_device_handle_t Device) {
+ return reinterpret_cast<void *>(getPtrWithOffset(Device, 0));
+ }
+
+ bool isSubBuffer() const noexcept { return Parent != nullptr; }
size_t getSize() const noexcept { return Size; }
@@ -120,104 +156,315 @@ struct BufferMem {
assert(MapPtr != nullptr);
PtrToBufferMap.erase(MapPtr);
}
-};
-// Handler data for surface object (i.e. Images)
-struct SurfaceMem {
- CUarray Array;
- CUsurfObject SurfObj;
- ur_mem_type_t ImageType;
+ ur_result_t clear() {
+ if (Parent != nullptr) {
+ return UR_RESULT_SUCCESS;
+ }
- SurfaceMem(CUarray Array, CUsurfObject Surf, ur_mem_type_t ImageType,
- void *HostPtr)
- : Array{Array}, SurfObj{Surf}, ImageType{ImageType} {
- (void)HostPtr;
+ switch (MemAllocMode) {
+ case AllocMode::CopyIn:
+ case AllocMode::Classic:
+ for (auto &DevPtr : Ptrs) {
+ if (DevPtr != native_type{0}) {
+ UR_CHECK_ERROR(cuMemFree(DevPtr));
+ }
+ }
+ break;
+ case AllocMode::UseHostPtr:
+ UR_CHECK_ERROR(cuMemHostUnregister(HostPtr));
+ break;
+ case AllocMode::AllocHostPtr:
+ UR_CHECK_ERROR(cuMemFreeHost(HostPtr));
+ }
+ return UR_RESULT_SUCCESS;
}
- CUarray getArray() const noexcept { return Array; }
+ friend struct ur_mem_handle_t_;
+ friend ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t,
+ const ur_device_handle_t);
+};
- CUsurfObject getSurface() const noexcept { return SurfObj; }
+// Handler data for surface object (i.e. Images)
+struct SurfaceMem {
+private:
+ std::vector<CUarray> Arrays;
+ std::vector<CUsurfObject> SurfObjs;
- ur_mem_type_t getImageType() const noexcept { return ImageType; }
-};
+public:
+ ur_mem_handle_t OuterMemStruct;
-// For sampled/unsampled images
-struct ImageMem {
- CUarray Array;
- void *Handle;
- ur_mem_type_t ImageType;
- ur_sampler_handle_t Sampler;
+ ur_image_format_t ImageFormat;
+ ur_image_desc_t ImageDesc;
+ CUDA_ARRAY3D_DESCRIPTOR ArrayDesc;
+ size_t PixelTypeSizeBytes;
+ void *HostPtr;
- ImageMem(CUarray Array, void *Handle, ur_mem_type_t ImageType,
- ur_sampler_handle_t Sampler)
- : Array{Array}, Handle{Handle}, ImageType{ImageType}, Sampler{Sampler} {};
+ SurfaceMem(ur_context_handle_t Context, ur_mem_handle_t OuterMemStruct,
+ ur_image_format_t ImageFormat, ur_image_desc_t ImageDesc,
+ void *HostPtr)
+ : Arrays(Context->Devices.size(), CUarray{0}),
+ SurfObjs(Context->Devices.size(), CUsurfObject{0}),
+ OuterMemStruct{OuterMemStruct},
+ ImageFormat{ImageFormat}, ImageDesc{ImageDesc}, HostPtr{HostPtr} {
+ // We have to use cuArray3DCreate, which has some caveats. The height and
+ // depth parameters must be set to 0 to produce 1D or 2D arrays. image_desc
+ // gives a minimum value of 1, so we need to convert the answer.
+ ArrayDesc.NumChannels = 4; // Only support 4 channel image
+ ArrayDesc.Flags = 0; // No flags required
+ ArrayDesc.Width = ImageDesc.width;
+ if (ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
+ ArrayDesc.Height = 0;
+ ArrayDesc.Depth = 0;
+ } else if (ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
+ ArrayDesc.Height = ImageDesc.height;
+ ArrayDesc.Depth = 0;
+ } else if (ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
+ ArrayDesc.Height = ImageDesc.height;
+ ArrayDesc.Depth = ImageDesc.depth;
+ }
- CUarray get_array() const noexcept { return Array; }
+ // We need to get PixelTypeSizeBytes for calculating the total image size
+ // later
+ switch (ImageFormat.channelType) {
+
+ case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8:
+ case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8:
+ ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
+ PixelTypeSizeBytes = 1;
+ break;
+ case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8:
+ ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT8;
+ PixelTypeSizeBytes = 1;
+ break;
+ case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16:
+ case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16:
+ ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT16;
+ PixelTypeSizeBytes = 2;
+ break;
+ case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16:
+ ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT16;
+ PixelTypeSizeBytes = 2;
+ break;
+ case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT:
+ ArrayDesc.Format = CU_AD_FORMAT_HALF;
+ PixelTypeSizeBytes = 2;
+ break;
+ case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32:
+ ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT32;
+ PixelTypeSizeBytes = 4;
+ break;
+ case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32:
+ ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT32;
+ PixelTypeSizeBytes = 4;
+ break;
+ case UR_IMAGE_CHANNEL_TYPE_FLOAT:
+ ArrayDesc.Format = CU_AD_FORMAT_FLOAT;
+ PixelTypeSizeBytes = 4;
+ break;
+ default:
+ detail::ur::die(
+ "urMemImageCreate given unsupported image_channel_data_type");
+ }
+ }
- void *get_handle() const noexcept { return Handle; }
+ // Will allocate a new array on device if not already allocated
+ CUarray getArray(const ur_device_handle_t Device) {
+ if (ur_result_t Err =
+ allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
+ Err != UR_RESULT_SUCCESS) {
+ throw Err;
+ }
+ return Arrays[Device->getIndex() % Arrays.size()];
+ }
+ // Will allocate a new surface on device if not already allocated
+ CUsurfObject getSurface(const ur_device_handle_t Device) {
+ if (ur_result_t Err =
+ allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
+ Err != UR_RESULT_SUCCESS) {
+ throw Err;
+ }
+ return SurfObjs[Device->getIndex() % SurfObjs.size()];
+ }
- ur_mem_type_t get_image_type() const noexcept { return ImageType; }
+ ur_mem_type_t getType() { return ImageDesc.type; }
- ur_sampler_handle_t get_sampler() const noexcept { return Sampler; }
+ ur_result_t clear() {
+ for (auto Array : Arrays) {
+ if (Array) {
+ UR_CHECK_ERROR(cuArrayDestroy(Array));
+ }
+ }
+ for (auto Surf : SurfObjs) {
+ if (Surf != CUsurfObject{0}) {
+ UR_CHECK_ERROR(cuSurfObjectDestroy(Surf));
+ }
+ }
+ return UR_RESULT_SUCCESS;
+ }
+ friend ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t,
+ const ur_device_handle_t);
};
/// UR Mem mapping to CUDA memory allocations, both data and texture/surface.
/// \brief Represents non-SVM allocations on the CUDA backend.
/// Keeps track of all mapped regions used for Map/Unmap calls.
/// Only one region can be active at the same time per allocation.
+///
+/// The ur_mem_handle_t is responsible for memory allocation and migration
+/// across devices in the same ur_context_handle_t. If a kernel writes to a
+/// ur_mem_handle_t, the event for that write is stored in
+/// LastEventWritingToMemObj, and all subsequent operations that read from
+/// the ur_mem_handle_t must wait on that event.
+///
+/// Since urMemBufferCreate/urMemImageCreate do not take a queue or device
+/// object, only a ur_context_handle_t, at mem obj creation we don't know which
+/// device we must make a native image/allocation on. Therefore no allocations
+/// are made at urMemBufferCreate/urMemImageCreate. Instead device
+/// images/allocations are made lazily. These allocations are made implicitly
+/// with a call to getPtr/getArray which will allocate a new allocation/image on
+/// device if need be.
+///
+/// Memory migration between native allocations for devices in the same
+/// ur_context_handle_t will occur at:
+///
+/// 1. urEnqueueKernelLaunch
+/// 2. urEnqueueMem(Buffer|Image)Read(Rect)
+///
+/// Migrations will occur in both cases if the most recent version of data
+/// is on a different device, marked by
+/// LastEventWritingToMemObj->getQueue()->getDevice()
+///
+/// Example trace:
+/// ~~~~~~~~~~~~~~
+///
+/// =====> urContextCreate([device0, device1], ...) // associated with [q0, q1]
+/// -> OUT: hContext
+///
+/// =====> urMemBufferCreate(hContext,...);
+/// -> No native allocations made
+/// -> OUT: hBuffer
+///
+/// =====> urEnqueueMemBufferWrite(q0, hBuffer,...);
+/// -> Allocation made on q0 ie device0
+/// -> New allocation initialized with host data.
+///
+/// =====> urKernelSetArgMemObj(hKernel0, hBuffer, ...);
+/// -> ur_kernel_handle_t associated with a ur_program_handle_t,
+/// which is in turn unique to a device. So we can set the kernel
+/// arg with the ptr of the device specific allocation.
+/// -> hKernel0->getProgram()->getDevice() == device0
+/// -> allocateMemObjOnDeviceIfNeeded(device0);
+/// -> Native allocation already made on device0, continue.
+///
+/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...);
+/// -> Suppose that hKernel0 writes to hBuffer.
+/// -> Call hBuffer->setLastEventWritingToMemObj with return event
+/// from this operation
+/// -> Enqueue native kernel launch
+///
+/// =====> urKernelSetArgMemObj(hKernel1, hBuffer, ...);
+/// -> hKernel1->getProgram()->getDevice() == device1
+/// -> New allocation will be made on device1 when calling
+/// getPtr(device1)
+/// -> No native allocation on device1
+/// -> Make native allocation on device1
+///
+/// =====> urEnqueueKernelLaunch(q1, hKernel1, ...);
+/// -> Suppose hKernel1 wants to read from hBuffer and not write.
+/// -> migrateMemoryToDeviceIfNeeded(device1);
+/// -> hBuffer->LastEventWritingToMemObj is not nullptr
+/// -> Check if memory has been migrated to device1 since the
+/// last write
+/// -> Hasn't been migrated
+/// -> Wait on LastEventWritingToMemObj.
+/// -> Migrate memory from device0's native allocation to
+/// device1's native allocation.
+/// -> Enqueue native kernel launch
+///
+/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...);
+/// -> migrateMemoryToDeviceIfNeeded(device0);
+/// -> hBuffer->LastEventWritingToMemObj refers to an event
+/// from q0
+/// -> Migration not necessary
+/// -> Enqueue native kernel launch
+///
struct ur_mem_handle_t_ {
// Context where the memory object is accessible
ur_context_handle_t Context;
/// Reference counting of the handler
std::atomic_uint32_t RefCount;
- enum class Type { Buffer, Surface, Texture } MemType;
// Original mem flags passed
ur_mem_flags_t MemFlags;
+ // If we make a ur_mem_handle_t_ from a native allocation, it can be useful to
+ // associate it with the device that holds the native allocation.
+ ur_device_handle_t DeviceWithNativeAllocation{nullptr};
+
+ // Has the memory been migrated to a device since the last write?
+ std::vector<bool> HaveMigratedToDeviceSinceLastWrite;
+
+ // We should wait on this event prior to migrating memory across allocations
+ // in this ur_mem_handle_t_
+ ur_event_handle_t LastEventWritingToMemObj{nullptr};
+
+ // Enumerates all possible types of accesses.
+ enum access_mode_t { unknown, read_write, read_only, write_only };
+
+ ur_mutex MemoryAllocationMutex; // A mutex for allocations
+ ur_mutex MemoryMigrationMutex; // A mutex for memory transfers
+
/// A UR Memory object represents either plain memory allocations ("Buffers"
/// in OpenCL) or typed allocations ("Images" in OpenCL).
/// In CUDA their API handlers are different. Whereas "Buffers" are allocated
/// as pointer-like structs, "Images" are stored in Textures or Surfaces.
- /// This union allows implementation to use either from the same handler.
- std::variant<BufferMem, SurfaceMem, ImageMem> Mem;
+ /// This variant allows implementation to use either from the same handler.
+ std::variant<BufferMem, SurfaceMem> Mem;
/// Constructs the UR mem handler for a non-typed allocation ("buffer")
- ur_mem_handle_t_(ur_context_handle_t Context, ur_mem_handle_t Parent,
- ur_mem_flags_t MemFlags, BufferMem::AllocMode Mode,
- CUdeviceptr Ptr, void *HostPtr, size_t Size)
- : Context{Context}, RefCount{1}, MemType{Type::Buffer},
- MemFlags{MemFlags}, Mem{BufferMem{Parent, Mode, Ptr, HostPtr, Size}} {
- if (isSubBuffer()) {
- urMemRetain(std::get(Mem).Parent);
- } else {
- urContextRetain(Context);
- }
- };
-
- /// Constructs the UR allocation for an Image object (surface in CUDA)
- ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array,
- CUsurfObject Surf, ur_mem_flags_t MemFlags,
- ur_mem_type_t ImageType, void *HostPtr)
- : Context{Context}, RefCount{1}, MemType{Type::Surface},
- MemFlags{MemFlags}, Mem{SurfaceMem{Array, Surf, ImageType, HostPtr}} {
+ ur_mem_handle_t_(ur_context_handle_t Ctxt, ur_mem_flags_t MemFlags,
+ BufferMem::AllocMode Mode, void *HostPtr, size_t Size)
+ : Context{Ctxt}, RefCount{1}, MemFlags{MemFlags},
+ HaveMigratedToDeviceSinceLastWrite(Context->Devices.size(), false),
+ Mem{std::in_place_type<BufferMem>, Ctxt, this, Mode, HostPtr, Size} {
urContextRetain(Context);
- }
+ };
- /// Constructs the UR allocation for an unsampled image object
- ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array,
- CUsurfObject Surf, ur_mem_type_t ImageType)
- : Context{Context}, RefCount{1}, MemType{Type::Surface}, MemFlags{0},
- Mem{ImageMem{Array, (void *)Surf, ImageType, nullptr}} {
- urContextRetain(Context);
- }
+ // Subbuffer constructor
+ ur_mem_handle_t_(ur_mem_handle_t Parent, size_t SubBufferOffset)
+ : Context{Parent->Context}, RefCount{1}, MemFlags{Parent->MemFlags},
+ HaveMigratedToDeviceSinceLastWrite(Parent->Context->Devices.size(),
+ false),
+ Mem{BufferMem{std::get<BufferMem>(Parent->Mem)}} {
+ auto &SubBuffer = std::get(Mem);
+ SubBuffer.Parent = Parent;
+ SubBuffer.OuterMemStruct = this;
+ if (SubBuffer.HostPtr) {
+ SubBuffer.HostPtr =
+ static_cast<uint8_t *>(SubBuffer.HostPtr) + SubBufferOffset;
+ }
+ for (auto &DevPtr : SubBuffer.Ptrs) {
+ if (DevPtr) {
+ DevPtr += SubBufferOffset;
+ }
+ }
+ urMemRetain(Parent);
+ };
- /// Constructs the UR allocation for a sampled image object
- ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array, CUtexObject Tex,
- ur_sampler_handle_t Sampler, ur_mem_type_t ImageType)
- : Context{Context}, RefCount{1}, MemType{Type::Texture}, MemFlags{0},
- Mem{ImageMem{Array, (void *)Tex, ImageType, Sampler}} {
+ /// Constructs the UR mem handler for an Image object
+ ur_mem_handle_t_(ur_context_handle_t Ctxt, ur_mem_flags_t MemFlags,
+ ur_image_format_t ImageFormat, ur_image_desc_t ImageDesc,
+ void *HostPtr)
+ : Context{Ctxt}, RefCount{1}, MemFlags{MemFlags},
+ HaveMigratedToDeviceSinceLastWrite(Context->Devices.size(), false),
+ Mem{std::in_place_type<SurfaceMem>,
+ Ctxt,
+ this,
+ ImageFormat,
+ ImageDesc,
+ HostPtr} {
urContextRetain(Context);
}
@@ -229,13 +476,24 @@ struct ur_mem_handle_t_ {
urContextRelease(Context);
}
- bool isBuffer() const noexcept { return MemType == Type::Buffer; }
+ bool isBuffer() const noexcept {
+ return std::holds_alternative<BufferMem>(Mem);
+ }
bool isSubBuffer() const noexcept {
return (isBuffer() && (std::get<BufferMem>(Mem).Parent != nullptr));
}
- bool isImage() const noexcept { return MemType == Type::Surface; }
+ bool isImage() const noexcept {
+ return std::holds_alternative<SurfaceMem>(Mem);
+ }
+
+ ur_result_t clear() {
+ if (isBuffer()) {
+ return std::get<BufferMem>(Mem).clear();
+ }
+ return std::get<SurfaceMem>(Mem).clear();
+ }
ur_context_handle_t getContext() const noexcept { return Context; }
@@ -244,4 +502,22 @@ struct ur_mem_handle_t_ {
uint32_t decrementReferenceCount() noexcept { return --RefCount; }
uint32_t getReferenceCount() const noexcept { return RefCount; }
+
+ void setLastEventWritingToMemObj(ur_event_handle_t NewEvent) {
+ assert(NewEvent && "Invalid event!");
+ // This entry point should only ever be called for multi-device contexts
+ assert(Context->Devices.size() > 1);
+ urEventRetain(NewEvent);
+ if (LastEventWritingToMemObj != nullptr) {
+ urEventRelease(LastEventWritingToMemObj);
+ }
+ LastEventWritingToMemObj = NewEvent;
+ for (const auto &Device : Context->getDevices()) {
+ // This event is never an interop event so will always have an associated
+ // queue
+ HaveMigratedToDeviceSinceLastWrite
+ [Device->getIndex() % HaveMigratedToDeviceSinceLastWrite.size()] =
+ Device == NewEvent->getQueue()->getDevice();
+ }
+ }
};
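
Since the sub-buffer constructor copies the parent's BufferMem and offsets
any device pointers that already exist, a view never owns memory and its
clear() returns early. A small hypothetical usage sketch through the public
API (hBuffer is an existing buffer of at least 1280 bytes):

    ur_buffer_region_t Region{UR_STRUCTURE_TYPE_BUFFER_REGION, nullptr,
                              /*origin=*/1024, /*size=*/256};
    ur_mem_handle_t SubBuf = nullptr;
    UR_CHECK_ERROR(urMemBufferPartition(hBuffer, UR_MEM_FLAG_READ_WRITE,
                                        UR_BUFFER_CREATE_TYPE_REGION,
                                        &Region, &SubBuf));
    // SubBuf aliases hBuffer's per-device allocations at +1024 bytes.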
diff --git a/source/adapters/cuda/physical_mem.cpp b/source/adapters/cuda/physical_mem.cpp
index e2c46696a0..c8d4adf0ef 100644
--- a/source/adapters/cuda/physical_mem.cpp
+++ b/source/adapters/cuda/physical_mem.cpp
@@ -23,7 +23,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urPhysicalMemCreate(
CUmemAllocationProp AllocProps = {};
AllocProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
AllocProps.type = CU_MEM_ALLOCATION_TYPE_PINNED;
- UR_CHECK_ERROR(GetDeviceOrdinal(hDevice, AllocProps.location.id));
+ AllocProps.location.id = hDevice->getIndex();
CUmemGenericAllocationHandle ResHandle;
switch (auto Result = cuMemCreate(&ResHandle, size, &AllocProps, 0)) {
@@ -32,7 +32,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urPhysicalMemCreate(
default:
UR_CHECK_ERROR(Result);
}
- *phPhysicalMem = new ur_physical_mem_handle_t_(ResHandle, hContext);
+ *phPhysicalMem = new ur_physical_mem_handle_t_(ResHandle, hContext, hDevice);
return UR_RESULT_SUCCESS;
}
@@ -51,7 +51,7 @@ urPhysicalMemRelease(ur_physical_mem_handle_t hPhysicalMem) {
try {
std::unique_ptr PhysicalMemGuard(hPhysicalMem);
- ScopedContext Active(hPhysicalMem->getContext());
+ ScopedContext Active(hPhysicalMem->getDevice());
UR_CHECK_ERROR(cuMemRelease(hPhysicalMem->get()));
return UR_RESULT_SUCCESS;
} catch (ur_result_t err) {
diff --git a/source/adapters/cuda/physical_mem.hpp b/source/adapters/cuda/physical_mem.hpp
index 0ce332e112..c77ad0d547 100644
--- a/source/adapters/cuda/physical_mem.hpp
+++ b/source/adapters/cuda/physical_mem.hpp
@@ -26,41 +26,29 @@ struct ur_physical_mem_handle_t_ {
std::atomic_uint32_t RefCount;
native_type PhysicalMem;
ur_context_handle_t_ *Context;
+ ur_device_handle_t Device;
- ur_physical_mem_handle_t_(native_type PhysMem, ur_context_handle_t_ *Ctx)
- : RefCount(1), PhysicalMem(PhysMem), Context(Ctx) {
+ ur_physical_mem_handle_t_(native_type PhysMem, ur_context_handle_t_ *Ctx,
+ ur_device_handle_t Device)
+ : RefCount(1), PhysicalMem(PhysMem), Context(Ctx), Device(Device) {
urContextRetain(Context);
+ urDeviceRetain(Device);
}
- ~ur_physical_mem_handle_t_() { urContextRelease(Context); }
+ ~ur_physical_mem_handle_t_() {
+ urContextRelease(Context);
+ urDeviceRelease(Device);
+ }
native_type get() const noexcept { return PhysicalMem; }
ur_context_handle_t_ *getContext() const noexcept { return Context; }
+ ur_device_handle_t_ *getDevice() const noexcept { return Device; }
+
uint32_t incrementReferenceCount() noexcept { return ++RefCount; }
uint32_t decrementReferenceCount() noexcept { return --RefCount; }
uint32_t getReferenceCount() const noexcept { return RefCount; }
};
-
-// Find a device ordinal of a device.
-inline ur_result_t GetDeviceOrdinal(ur_device_handle_t Device, int &Ordinal) {
- ur_adapter_handle_t AdapterHandle = &adapter;
- // Get list of platforms
- uint32_t NumPlatforms;
- UR_CHECK_ERROR(urPlatformGet(&AdapterHandle, 1, 0, nullptr, &NumPlatforms));
- UR_ASSERT(NumPlatforms, UR_RESULT_ERROR_UNKNOWN);
-
-  std::vector<ur_platform_handle_t> Platforms{NumPlatforms};
- UR_CHECK_ERROR(urPlatformGet(&AdapterHandle, 1, NumPlatforms,
- Platforms.data(), nullptr));
-
- // Ordinal corresponds to the platform ID as each device has its own platform.
- CUdevice NativeDevice = Device->get();
- for (Ordinal = 0; size_t(Ordinal) < Platforms.size(); ++Ordinal)
- if (Platforms[Ordinal]->Devices[0]->get() == NativeDevice)
- return UR_RESULT_SUCCESS;
- return UR_RESULT_ERROR_INVALID_DEVICE;
-}
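
The removed GetDeviceOrdinal helper scanned every platform to recover a device's ordinal; with a single platform the ordinal is stored on the device at enumeration time and read back via getIndex(). The handle also co-owns its device now. A generic sketch of the retain-in-constructor/release-in-destructor pairing, using the illustrative name OwningHandle (the real handle is reference counted rather than non-copyable):

    #include <ur_api.h>

    class OwningHandle {
      ur_context_handle_t Ctx;
      ur_device_handle_t Dev;

    public:
      OwningHandle(ur_context_handle_t C, ur_device_handle_t D) : Ctx(C), Dev(D) {
        urContextRetain(Ctx); // keep the context alive while we exist
        urDeviceRetain(Dev);  // likewise for the device
      }
      ~OwningHandle() {
        urContextRelease(Ctx);
        urDeviceRelease(Dev);
      }
      // Copying would double-release; forbid it in this sketch.
      OwningHandle(const OwningHandle &) = delete;
      OwningHandle &operator=(const OwningHandle &) = delete;
    };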
diff --git a/source/adapters/cuda/platform.cpp b/source/adapters/cuda/platform.cpp
index 4c730f997a..27b94f756f 100644
--- a/source/adapters/cuda/platform.cpp
+++ b/source/adapters/cuda/platform.cpp
@@ -52,9 +52,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo(
/// There is only one CUDA platform, and contains all devices on the system.
/// Triggers the CUDA Driver initialization (cuInit) the first time, so this
/// must be the first PI API called.
-///
-/// However because multiple devices in a context is not currently supported,
-/// place each device in a separate platform.
UR_APIEXPORT ur_result_t UR_APICALL
urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries,
ur_platform_handle_t *phPlatforms, uint32_t *pNumPlatforms) {
@@ -62,7 +59,7 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries,
try {
static std::once_flag InitFlag;
static uint32_t NumPlatforms = 1;
-    static std::vector<ur_platform_handle_t_> Platforms;
+ static ur_platform_handle_t_ Platform;
UR_ASSERT(phPlatforms || pNumPlatforms, UR_RESULT_ERROR_INVALID_VALUE);
UR_ASSERT(!phPlatforms || NumEntries > 0, UR_RESULT_ERROR_INVALID_SIZE);
@@ -76,39 +73,34 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries,
int NumDevices = 0;
UR_CHECK_ERROR(cuDeviceGetCount(&NumDevices));
try {
- // make one platform per device
- NumPlatforms = NumDevices;
- Platforms.resize(NumDevices);
-
for (int i = 0; i < NumDevices; ++i) {
CUdevice Device;
UR_CHECK_ERROR(cuDeviceGet(&Device, i));
CUcontext Context;
UR_CHECK_ERROR(cuDevicePrimaryCtxRetain(&Context, Device));
- ScopedContext active(Context);
+ ScopedContext Active(Context); // Set native ctx as active
CUevent EvBase;
UR_CHECK_ERROR(cuEventCreate(&EvBase, CU_EVENT_DEFAULT));
// Use default stream to record base event counter
UR_CHECK_ERROR(cuEventRecord(EvBase, 0));
- Platforms[i].Devices.emplace_back(new ur_device_handle_t_{
- Device, Context, EvBase, &Platforms[i]});
+ Platform.Devices.emplace_back(
+ new ur_device_handle_t_{Device, Context, EvBase, &Platform,
+                                   static_cast<uint32_t>(i)});
}
} catch (const std::bad_alloc &) {
// Signal out-of-memory situation
for (int i = 0; i < NumDevices; ++i) {
- Platforms[i].Devices.clear();
+ Platform.Devices.clear();
}
- Platforms.clear();
Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
} catch (ur_result_t Err) {
// Clear and rethrow to allow retry
for (int i = 0; i < NumDevices; ++i) {
- Platforms[i].Devices.clear();
+ Platform.Devices.clear();
}
- Platforms.clear();
Result = Err;
throw Err;
} catch (...) {
@@ -123,9 +115,7 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries,
}
if (phPlatforms != nullptr) {
- for (unsigned i = 0; i < std::min(NumEntries, NumPlatforms); ++i) {
- phPlatforms[i] = &Platforms[i];
- }
+ *phPlatforms = &Platform;
}
return Result;
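
From the client's perspective, the adapter now reports one CUDA platform containing every device instead of one platform per device. A hypothetical enumeration sequence under that assumption; getAllCudaDevices is an illustrative helper and error handling is elided.

    #include <ur_api.h>
    #include <vector>

    std::vector<ur_device_handle_t> getAllCudaDevices(ur_adapter_handle_t Adapter) {
      uint32_t NumPlatforms = 0;
      urPlatformGet(&Adapter, 1, 0, nullptr, &NumPlatforms); // expected: 1
      if (NumPlatforms == 0)
        return {};
      std::vector<ur_platform_handle_t> Platforms(NumPlatforms);
      urPlatformGet(&Adapter, 1, NumPlatforms, Platforms.data(), nullptr);

      // All devices now hang off the single platform.
      uint32_t NumDevices = 0;
      urDeviceGet(Platforms[0], UR_DEVICE_TYPE_GPU, 0, nullptr, &NumDevices);
      std::vector<ur_device_handle_t> Devices(NumDevices);
      urDeviceGet(Platforms[0], UR_DEVICE_TYPE_GPU, NumDevices, Devices.data(),
                  nullptr);
      return Devices;
    }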
diff --git a/source/adapters/cuda/program.cpp b/source/adapters/cuda/program.cpp
index 390e96b34c..c5cb763ada 100644
--- a/source/adapters/cuda/program.cpp
+++ b/source/adapters/cuda/program.cpp
@@ -9,6 +9,7 @@
//===----------------------------------------------------------------------===//
#include "program.hpp"
+#include "ur_util.hpp"
bool getMaxRegistersJitOptionValue(const std::string &BuildOptions,
unsigned int &Value) {
@@ -44,23 +45,6 @@ bool getMaxRegistersJitOptionValue(const std::string &BuildOptions,
return true;
}
-ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Context)
- : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1},
- Context{Context}, KernelReqdWorkGroupSizeMD{} {
- urContextRetain(Context);
-}
-
-ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(Context); }
-
-std::pair<std::string, std::string>
-splitMetadataName(const std::string &metadataName) {
- size_t splitPos = metadataName.rfind('@');
- if (splitPos == std::string::npos)
- return std::make_pair(metadataName, std::string{});
- return std::make_pair(metadataName.substr(0, splitPos),
- metadataName.substr(splitPos, metadataName.length()));
-}
-
ur_result_t
ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata,
size_t Length) {
@@ -197,12 +181,14 @@ ur_result_t createProgram(ur_context_handle_t hContext,
const uint8_t *pBinary,
const ur_program_properties_t *pProperties,
ur_program_handle_t *phProgram) {
- UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE);
   std::unique_ptr<ur_program_handle_t_> RetProgram{
- new ur_program_handle_t_{hContext}};
+ new ur_program_handle_t_{hContext, hDevice}};
if (pProperties) {
if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
@@ -222,18 +208,12 @@ ur_result_t createProgram(ur_context_handle_t hContext,
return UR_RESULT_SUCCESS;
}
-/// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object.
-/// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in
-/// terms of CUDA adapter. See \ref urProgramCreateWithBinary.
+// A program is unique to a device, so this entry point cannot be supported
+// with a multi-device context.
UR_APIEXPORT ur_result_t UR_APICALL
-urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
- size_t length, const ur_program_properties_t *pProperties,
- ur_program_handle_t *phProgram) {
- ur_device_handle_t hDevice = hContext->getDevice();
-  auto pBinary = reinterpret_cast<const uint8_t *>(pIL);
-
- return createProgram(hContext, hDevice, length, pBinary, pProperties,
- phProgram);
+urProgramCreateWithIL(ur_context_handle_t, const void *, size_t,
+ const ur_program_properties_t *, ur_program_handle_t *) {
+ return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
/// CUDA will handle the PTX/CUBIN binaries internally through a call to
@@ -272,7 +252,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext,
ur_result_t Result = UR_RESULT_SUCCESS;
try {
- ScopedContext Active(hProgram->getContext());
+ ScopedContext Active(hProgram->getDevice());
hProgram->buildProgram(pOptions);
hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE;
@@ -297,13 +277,17 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count,
const ur_program_handle_t *phPrograms, const char *pOptions,
ur_program_handle_t *phProgram) {
ur_result_t Result = UR_RESULT_SUCCESS;
+ // All programs must be associated with the same device
+ for (auto i = 1u; i < count; ++i)
+ UR_ASSERT(phPrograms[i]->getDevice() == phPrograms[0]->getDevice(),
+ UR_RESULT_ERROR_INVALID_DEVICE);
try {
- ScopedContext Active(hContext);
+ ScopedContext Active(phPrograms[0]->getDevice());
CUlinkState State;
     std::unique_ptr<ur_program_handle_t_> RetProgram{
- new ur_program_handle_t_{hContext}};
+ new ur_program_handle_t_{hContext, phPrograms[0]->getDevice()}};
UR_CHECK_ERROR(cuLinkCreate(0, nullptr, nullptr, &State));
try {
@@ -390,7 +374,7 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName,
case UR_PROGRAM_INFO_NUM_DEVICES:
return ReturnValue(1u);
case UR_PROGRAM_INFO_DEVICES:
- return ReturnValue(&hProgram->Context->DeviceID, 1);
+ return ReturnValue(&hProgram->Device, 1);
case UR_PROGRAM_INFO_SOURCE:
return ReturnValue(hProgram->Binary);
case UR_PROGRAM_INFO_BINARY_SIZES:
@@ -434,7 +418,7 @@ urProgramRelease(ur_program_handle_t hProgram) {
ur_result_t Result = UR_RESULT_ERROR_INVALID_PROGRAM;
try {
- ScopedContext Active(hProgram->getContext());
+ ScopedContext Active(hProgram->getDevice());
auto cuModule = hProgram->get();
// "0" is a valid handle for a cuModule, so the best way to check if we
// actually loaded a module and need to unload it is to look at the build
@@ -491,8 +475,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer(
ur_device_handle_t hDevice, ur_program_handle_t hProgram,
const char *pFunctionName, void **ppFunctionPointer) {
// Check if device passed is the same the device bound to the context
- UR_ASSERT(hDevice == hProgram->getContext()->getDevice(),
- UR_RESULT_ERROR_INVALID_DEVICE);
+ UR_ASSERT(hDevice == hProgram->getDevice(), UR_RESULT_ERROR_INVALID_DEVICE);
CUfunction Func;
CUresult Ret = cuModuleGetFunction(&Func, hProgram->get(), pFunctionName);
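
With urProgramCreateWithIL now returning UR_RESULT_ERROR_UNSUPPORTED_FEATURE, a caller targeting this adapter binds each program to one device of the context explicitly. A hedged sketch of that flow; buildForDevice is an illustrative helper and the binary is assumed to be PTX or CUBIN for the chosen device.

    #include <cstddef>
    #include <cstdint>
    #include <ur_api.h>

    ur_program_handle_t buildForDevice(ur_context_handle_t Ctx,
                                       ur_device_handle_t Dev,
                                       const uint8_t *Binary, size_t Size) {
      ur_program_handle_t Program = nullptr;
      if (urProgramCreateWithBinary(Ctx, Dev, Size, Binary,
                                    /*pProperties=*/nullptr,
                                    &Program) != UR_RESULT_SUCCESS)
        return nullptr;
      if (urProgramBuild(Ctx, Program, /*pOptions=*/nullptr) !=
          UR_RESULT_SUCCESS) {
        urProgramRelease(Program);
        return nullptr;
      }
      return Program;
    }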
diff --git a/source/adapters/cuda/program.hpp b/source/adapters/cuda/program.hpp
index feb3a09f31..5d41374d34 100644
--- a/source/adapters/cuda/program.hpp
+++ b/source/adapters/cuda/program.hpp
@@ -24,6 +24,7 @@ struct ur_program_handle_t_ {
size_t BinarySizeInBytes;
std::atomic_uint32_t RefCount;
ur_context_handle_t Context;
+ ur_device_handle_t Device;
/* The ur_program_binary_type_t property is defined individually for every
* device in a program. However, since the CUDA adapter only has 1 device per
@@ -42,8 +43,17 @@ struct ur_program_handle_t_ {
std::string BuildOptions;
ur_program_build_status_t BuildStatus = UR_PROGRAM_BUILD_STATUS_NONE;
- ur_program_handle_t_(ur_context_handle_t Context);
- ~ur_program_handle_t_();
+ ur_program_handle_t_(ur_context_handle_t Context, ur_device_handle_t Device)
+ : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1},
+ Context{Context}, Device{Device}, KernelReqdWorkGroupSizeMD{} {
+ urContextRetain(Context);
+ urDeviceRetain(Device);
+ }
+
+ ~ur_program_handle_t_() {
+ urContextRelease(Context);
+ urDeviceRelease(Device);
+ }
ur_result_t setMetadata(const ur_program_metadata_t *Metadata, size_t Length);
@@ -51,6 +61,7 @@ struct ur_program_handle_t_ {
ur_result_t buildProgram(const char *BuildOptions);
ur_context_handle_t getContext() const { return Context; };
+ ur_device_handle_t getDevice() const noexcept { return Device; };
native_type get() const noexcept { return Module; };
diff --git a/source/adapters/cuda/queue.cpp b/source/adapters/cuda/queue.cpp
index 120d665524..773126f4d5 100644
--- a/source/adapters/cuda/queue.cpp
+++ b/source/adapters/cuda/queue.cpp
@@ -123,7 +123,8 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice,
try {
     std::unique_ptr<ur_queue_handle_t_> Queue{nullptr};
- if (hContext->getDevice() != hDevice) {
+ if (std::find(hContext->getDevices().begin(), hContext->getDevices().end(),
+ hDevice) == hContext->getDevices().end()) {
*phQueue = nullptr;
return UR_RESULT_ERROR_INVALID_DEVICE;
}
@@ -145,10 +146,10 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice,
IsOutOfOrder = true;
}
if (URFlags & UR_QUEUE_FLAG_PRIORITY_HIGH) {
- ScopedContext Active(hContext);
+ ScopedContext Active(hDevice);
UR_CHECK_ERROR(cuCtxGetStreamPriorityRange(nullptr, &Priority));
} else if (URFlags & UR_QUEUE_FLAG_PRIORITY_LOW) {
- ScopedContext Active(hContext);
+ ScopedContext Active(hDevice);
UR_CHECK_ERROR(cuCtxGetStreamPriorityRange(&Priority, nullptr));
}
}
@@ -193,7 +194,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) {
if (!hQueue->backendHasOwnership())
return UR_RESULT_SUCCESS;
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
hQueue->forEachStream([](CUstream S) {
UR_CHECK_ERROR(cuStreamSynchronize(S));
@@ -212,7 +213,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) {
ur_result_t Result = UR_RESULT_SUCCESS;
try {
- ScopedContext active(hQueue->getContext());
+ ScopedContext active(hQueue->getDevice());
     hQueue->syncStreams</*ResetUsed=*/true>(
[](CUstream s) { UR_CHECK_ERROR(cuStreamSynchronize(s)); });
@@ -242,7 +243,7 @@ urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc,
ur_native_handle_t *phNativeQueue) {
std::ignore = pDesc;
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
*phNativeQueue =
       reinterpret_cast<ur_native_handle_t>(hQueue->getNextComputeStream());
return UR_RESULT_SUCCESS;
@@ -276,7 +277,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle(
new ur_queue_handle_t_{std::move(ComputeCuStreams),
std::move(TransferCuStreams),
hContext,
- hContext->getDevice(),
+ hDevice,
CuFlags,
Flags,
/*priority*/ 0,
diff --git a/source/adapters/cuda/queue.hpp b/source/adapters/cuda/queue.hpp
index c79ca18a9b..46e9968fa9 100644
--- a/source/adapters/cuda/queue.hpp
+++ b/source/adapters/cuda/queue.hpp
@@ -97,6 +97,7 @@ struct ur_queue_handle_t_ {
uint32_t *StreamToken = nullptr);
native_type getNextTransferStream();
native_type get() { return getNextComputeStream(); };
+ ur_device_handle_t getDevice() const noexcept { return Device; };
bool hasBeenSynchronized(uint32_t StreamToken) {
// stream token not associated with one of the compute streams
diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp
index 2ffc0755ee..80fd211863 100644
--- a/source/adapters/cuda/ur_interface_loader.cpp
+++ b/source/adapters/cuda/ur_interface_loader.cpp
@@ -407,6 +407,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
pDdiTable->pfnCooperativeKernelLaunchExp =
urEnqueueCooperativeKernelLaunchExp;
+ pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp;
return UR_RESULT_SUCCESS;
}
diff --git a/source/adapters/cuda/usm.cpp b/source/adapters/cuda/usm.cpp
index 59d3ba70fd..4e6c6898d5 100644
--- a/source/adapters/cuda/usm.cpp
+++ b/source/adapters/cuda/usm.cpp
@@ -33,7 +33,7 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc,
UR_RESULT_ERROR_INVALID_VALUE);
if (!hPool) {
- return USMHostAllocImpl(ppMem, hContext, nullptr, size, alignment);
+ return USMHostAllocImpl(ppMem, hContext, /* flags */ 0, size, alignment);
}
auto UMFPool = hPool->HostMemPool.get();
@@ -57,7 +57,7 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
UR_RESULT_ERROR_INVALID_VALUE);
if (!hPool) {
- return USMDeviceAllocImpl(ppMem, hContext, hDevice, nullptr, size,
+ return USMDeviceAllocImpl(ppMem, hContext, hDevice, /* flags */ 0, size,
alignment);
}
@@ -82,8 +82,8 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
UR_RESULT_ERROR_INVALID_VALUE);
if (!hPool) {
- return USMSharedAllocImpl(ppMem, hContext, hDevice, nullptr, nullptr, size,
- alignment);
+ return USMSharedAllocImpl(ppMem, hContext, hDevice, /*host flags*/ 0,
+ /*device flags*/ 0, size, alignment);
}
auto UMFPool = hPool->SharedMemPool.get();
@@ -95,11 +95,10 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
return UR_RESULT_SUCCESS;
}
-ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Pointer) {
+ur_result_t USMFreeImpl(ur_context_handle_t, void *Pointer) {
ur_result_t Result = UR_RESULT_SUCCESS;
try {
- ScopedContext Active(Context);
- bool IsManaged;
+ unsigned int IsManaged;
unsigned int Type;
void *AttributeValues[2] = {&IsManaged, &Type};
CUpointer_attribute Attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED,
@@ -131,11 +130,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext,
return USMFreeImpl(hContext, pMem);
}
-ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context,
- ur_device_handle_t, ur_usm_device_mem_flags_t *,
- size_t Size, uint32_t Alignment) {
+ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t,
+ ur_device_handle_t Device,
+ ur_usm_device_mem_flags_t, size_t Size,
+ uint32_t Alignment) {
try {
- ScopedContext Active(Context);
+ ScopedContext Active(Device);
UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)ResultPtr, Size));
} catch (ur_result_t Err) {
return Err;
@@ -150,12 +150,13 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context,
return UR_RESULT_SUCCESS;
}
-ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context,
- ur_device_handle_t, ur_usm_host_mem_flags_t *,
- ur_usm_device_mem_flags_t *, size_t Size,
+ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t,
+ ur_device_handle_t Device,
+ ur_usm_host_mem_flags_t,
+ ur_usm_device_mem_flags_t, size_t Size,
uint32_t Alignment) {
try {
- ScopedContext Active(Context);
+ ScopedContext Active(Device);
UR_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)ResultPtr, Size,
CU_MEM_ATTACH_GLOBAL));
} catch (ur_result_t Err) {
@@ -171,11 +172,10 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context,
return UR_RESULT_SUCCESS;
}
-ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context,
- ur_usm_host_mem_flags_t *, size_t Size,
+ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t,
+ ur_usm_host_mem_flags_t, size_t Size,
uint32_t Alignment) {
try {
- ScopedContext Active(Context);
UR_CHECK_ERROR(cuMemAllocHost(ResultPtr, Size));
} catch (ur_result_t Err) {
return Err;
@@ -199,7 +199,6 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem,
UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet);
try {
- ScopedContext Active(hContext);
switch (propName) {
case UR_USM_ALLOC_INFO_TYPE: {
unsigned int Value;
@@ -358,24 +357,24 @@ umf_result_t USMMemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) {
ur_result_t USMSharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size,
uint32_t Alignment) {
- return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size,
- Alignment);
+ return USMSharedAllocImpl(ResultPtr, Context, Device, /*host flags*/ 0,
+ /*device flags*/ 0, Size, Alignment);
}
ur_result_t USMDeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size,
uint32_t Alignment) {
- return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size,
+ return USMDeviceAllocImpl(ResultPtr, Context, Device, /* flags */ 0, Size,
Alignment);
}
ur_result_t USMHostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size,
uint32_t Alignment) {
- return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment);
+ return USMHostAllocImpl(ResultPtr, Context, /* flags */ 0, Size, Alignment);
}
ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context,
ur_usm_pool_desc_t *PoolDesc)
- : Context(Context) {
+ : Context{Context} {
const void *pNext = PoolDesc->pNext;
while (pNext != nullptr) {
     const ur_base_desc_t *BaseDesc = static_cast<const ur_base_desc_t *>(pNext);
@@ -406,25 +405,25 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context,
&this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Host])
.second;
- auto Device = Context->DeviceID;
- MemProvider =
-      umf::memoryProviderMakeUnique<USMDeviceMemoryProvider>(Context, Device)
- .second;
- DeviceMemPool =
- umf::poolMakeUniqueFromOps(
- &UMF_DISJOINT_POOL_OPS, std::move(MemProvider),
- &this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Device])
- .second;
-
- MemProvider =
-      umf::memoryProviderMakeUnique<USMSharedMemoryProvider>(Context, Device)
- .second;
- SharedMemPool =
- umf::poolMakeUniqueFromOps(
- &UMF_DISJOINT_POOL_OPS, std::move(MemProvider),
- &this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Shared])
- .second;
- Context->addPool(this);
+ for (const auto &Device : Context->getDevices()) {
+ MemProvider =
+        umf::memoryProviderMakeUnique<USMDeviceMemoryProvider>(Context, Device)
+ .second;
+ DeviceMemPool = umf::poolMakeUniqueFromOps(
+ &UMF_DISJOINT_POOL_OPS, std::move(MemProvider),
+ &this->DisjointPoolConfigs
+ .Configs[usm::DisjointPoolMemType::Device])
+ .second;
+ MemProvider =
+        umf::memoryProviderMakeUnique<USMSharedMemoryProvider>(Context, Device)
+ .second;
+ SharedMemPool = umf::poolMakeUniqueFromOps(
+ &UMF_DISJOINT_POOL_OPS, std::move(MemProvider),
+ &this->DisjointPoolConfigs
+ .Configs[usm::DisjointPoolMemType::Shared])
+ .second;
+ Context->addPool(this);
+ }
}
bool ur_usm_pool_handle_t_::hasUMFPool(umf_memory_pool_t *umf_pool) {
diff --git a/source/adapters/cuda/usm.hpp b/source/adapters/cuda/usm.hpp
index 2ec3df150f..e5d1f7fbaa 100644
--- a/source/adapters/cuda/usm.hpp
+++ b/source/adapters/cuda/usm.hpp
@@ -118,15 +118,15 @@ class USMHostMemoryProvider final : public USMMemoryProvider {
ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context,
ur_device_handle_t Device,
- ur_usm_device_mem_flags_t *Flags, size_t Size,
+ ur_usm_device_mem_flags_t Flags, size_t Size,
uint32_t Alignment);
ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context,
ur_device_handle_t Device,
- ur_usm_host_mem_flags_t *,
- ur_usm_device_mem_flags_t *, size_t Size,
+ ur_usm_host_mem_flags_t,
+ ur_usm_device_mem_flags_t, size_t Size,
uint32_t Alignment);
ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context,
- ur_usm_host_mem_flags_t *Flags, size_t Size,
+ ur_usm_host_mem_flags_t Flags, size_t Size,
uint32_t Alignment);
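
The USM flags parameters above changed from nullable pointers to plain values, so a poolless allocation now reaches the *AllocImpl functions with flags of 0 rather than nullptr. A hypothetical call site; deviceAlloc is an illustrative helper.

    #include <cstddef>
    #include <ur_api.h>

    // A poolless device allocation: with no pool handle, the adapter forwards
    // /*flags*/ 0 straight to USMDeviceAllocImpl.
    void *deviceAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev,
                      size_t Bytes) {
      void *Ptr = nullptr;
      if (urUSMDeviceAlloc(Ctx, Dev, /*pUSMDesc=*/nullptr, /*pool=*/nullptr,
                           Bytes, &Ptr) != UR_RESULT_SUCCESS)
        return nullptr;
      return Ptr;
    }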
diff --git a/source/adapters/cuda/usm_p2p.cpp b/source/adapters/cuda/usm_p2p.cpp
index 810a11ef84..e09220d730 100644
--- a/source/adapters/cuda/usm_p2p.cpp
+++ b/source/adapters/cuda/usm_p2p.cpp
@@ -14,8 +14,8 @@
UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PEnablePeerAccessExp(
ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) {
try {
- ScopedContext active(commandDevice->getContext());
- UR_CHECK_ERROR(cuCtxEnablePeerAccess(peerDevice->getContext(), 0));
+ ScopedContext active(commandDevice);
+ UR_CHECK_ERROR(cuCtxEnablePeerAccess(peerDevice->getNativeContext(), 0));
} catch (ur_result_t err) {
return err;
}
@@ -25,8 +25,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PEnablePeerAccessExp(
UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PDisablePeerAccessExp(
ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) {
try {
- ScopedContext active(commandDevice->getContext());
- UR_CHECK_ERROR(cuCtxDisablePeerAccess(peerDevice->getContext()));
+ ScopedContext active(commandDevice);
+ UR_CHECK_ERROR(cuCtxDisablePeerAccess(peerDevice->getNativeContext()));
} catch (ur_result_t err) {
return err;
}
@@ -43,7 +43,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp(
int value;
CUdevice_P2PAttribute cuAttr;
try {
- ScopedContext active(commandDevice->getContext());
+ ScopedContext active(commandDevice);
switch (propName) {
case UR_EXP_PEER_INFO_UR_PEER_ACCESS_SUPPORTED: {
cuAttr = CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED;
diff --git a/source/adapters/cuda/virtual_mem.cpp b/source/adapters/cuda/virtual_mem.cpp
index 9c37dda4fb..29908ad1d4 100644
--- a/source/adapters/cuda/virtual_mem.cpp
+++ b/source/adapters/cuda/virtual_mem.cpp
@@ -17,12 +17,12 @@
 #include <cuda.h>
UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo(
- ur_context_handle_t hContext, ur_device_handle_t hDevice,
+ ur_context_handle_t, ur_device_handle_t hDevice,
ur_virtual_mem_granularity_info_t propName, size_t propSize,
void *pPropValue, size_t *pPropSizeRet) {
UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
- ScopedContext Active(hContext);
+ ScopedContext Active(hDevice);
switch (propName) {
case UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM:
case UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED: {
@@ -33,7 +33,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo(
CUmemAllocationProp AllocProps = {};
AllocProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
AllocProps.type = CU_MEM_ALLOCATION_TYPE_PINNED;
- UR_CHECK_ERROR(GetDeviceOrdinal(hDevice, AllocProps.location.id));
+ AllocProps.location.id = hDevice->getIndex();
size_t Granularity;
UR_CHECK_ERROR(
@@ -50,15 +50,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo(
UR_APIEXPORT ur_result_t UR_APICALL
urVirtualMemReserve(ur_context_handle_t hContext, const void *pStart,
size_t size, void **ppStart) {
- ScopedContext Active(hContext);
+  // Reserve the virtual mem. This only needs doing once; any device in the
+  // context can supply the active native context.
+ ScopedContext Active(hContext->getDevices()[0]);
UR_CHECK_ERROR(cuMemAddressReserve((CUdeviceptr *)ppStart, size, 0,
(CUdeviceptr)pStart, 0));
return UR_RESULT_SUCCESS;
}
-UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemFree(
- ur_context_handle_t hContext, const void *pStart, size_t size) {
- ScopedContext Active(hContext);
+UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemFree(ur_context_handle_t,
+ const void *pStart,
+ size_t size) {
UR_CHECK_ERROR(cuMemAddressFree((CUdeviceptr)pStart, size));
return UR_RESULT_SUCCESS;
}
@@ -66,22 +67,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemFree(
UR_APIEXPORT ur_result_t UR_APICALL
urVirtualMemSetAccess(ur_context_handle_t hContext, const void *pStart,
size_t size, ur_virtual_mem_access_flags_t flags) {
- CUmemAccessDesc AccessDesc = {};
- if (flags & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE)
- AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
- else if (flags & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_ONLY)
- AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READ;
- else
- AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_NONE;
- AccessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
- // TODO: When contexts support multiple devices, we should create a descriptor
- // for each. We may also introduce a variant of this function with a
- // specific device.
- UR_CHECK_ERROR(
- GetDeviceOrdinal(hContext->getDevice(), AccessDesc.location.id));
-
- ScopedContext Active(hContext);
- UR_CHECK_ERROR(cuMemSetAccess((CUdeviceptr)pStart, size, &AccessDesc, 1));
+ // Set access for every device in the context
+ for (auto &Device : hContext->getDevices()) {
+ CUmemAccessDesc AccessDesc = {};
+ if (flags & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE)
+ AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+ else if (flags & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_ONLY)
+ AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READ;
+ else
+ AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_NONE;
+ AccessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+ AccessDesc.location.id = Device->getIndex();
+ ScopedContext Active(Device);
+ UR_CHECK_ERROR(cuMemSetAccess((CUdeviceptr)pStart, size, &AccessDesc, 1));
+ }
return UR_RESULT_SUCCESS;
}
@@ -89,7 +88,8 @@ UR_APIEXPORT ur_result_t UR_APICALL
urVirtualMemMap(ur_context_handle_t hContext, const void *pStart, size_t size,
ur_physical_mem_handle_t hPhysicalMem, size_t offset,
ur_virtual_mem_access_flags_t flags) {
- ScopedContext Active(hContext);
+  // Map the virtual mem. This only needs doing once; any device in the
+  // context can supply the active native context.
+ ScopedContext Active(hContext->getDevices()[0]);
UR_CHECK_ERROR(
cuMemMap((CUdeviceptr)pStart, size, offset, hPhysicalMem->get(), 0));
if (flags)
@@ -99,7 +99,8 @@ urVirtualMemMap(ur_context_handle_t hContext, const void *pStart, size_t size,
UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemUnmap(
ur_context_handle_t hContext, const void *pStart, size_t size) {
- ScopedContext Active(hContext);
+  // Unmap the virtual mem. This only needs doing once; any device in the
+  // context can supply the active native context.
+ ScopedContext Active(hContext->getDevices()[0]);
UR_CHECK_ERROR(cuMemUnmap((CUdeviceptr)pStart, size));
return UR_RESULT_SUCCESS;
}
@@ -110,12 +111,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGetInfo(
size_t propSize, void *pPropValue, size_t *pPropSizeRet) {
UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
- ScopedContext Active(hContext);
+  // Any device in the context can supply the active native context here
+ ScopedContext Active(hContext->getDevices()[0]);
switch (propName) {
case UR_VIRTUAL_MEM_INFO_ACCESS_MODE: {
CUmemLocation MemLocation = {};
MemLocation.type = CU_MEM_LOCATION_TYPE_DEVICE;
- UR_CHECK_ERROR(GetDeviceOrdinal(hContext->getDevice(), MemLocation.id));
+ MemLocation.id = hContext->getDevices()[0]->getIndex();
unsigned long long CuAccessFlags;
UR_CHECK_ERROR(
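
Taken together, the virtual-memory entry points above support this sequence: reserve an address range once, back it with physical memory on a device, map it once, and let the flags passed to urVirtualMemMap grant access on every device in the context via urVirtualMemSetAccess. A hedged sketch under the assumption that Size is already granularity-aligned; mapVirtual is an illustrative helper and cleanup on failure is elided.

    #include <cstddef>
    #include <ur_api.h>

    ur_result_t mapVirtual(ur_context_handle_t Ctx, ur_device_handle_t Dev,
                           size_t Size, void **OutPtr) {
      void *Start = nullptr;
      ur_result_t Res = urVirtualMemReserve(Ctx, /*pStart=*/nullptr, Size, &Start);
      if (Res != UR_RESULT_SUCCESS)
        return Res;

      ur_physical_mem_handle_t Phys = nullptr;
      Res = urPhysicalMemCreate(Ctx, Dev, Size, /*pProperties=*/nullptr, &Phys);
      if (Res != UR_RESULT_SUCCESS)
        return Res;

      // Non-zero access flags here make the adapter call urVirtualMemSetAccess,
      // which now installs one access descriptor per device in the context.
      Res = urVirtualMemMap(Ctx, Start, Size, Phys, /*offset=*/0,
                            UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE);
      if (Res != UR_RESULT_SUCCESS)
        return Res;

      *OutPtr = Start;
      return UR_RESULT_SUCCESS;
    }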
diff --git a/source/adapters/hip/command_buffer.hpp b/source/adapters/hip/command_buffer.hpp
index 50fddc5448..751fde3720 100644
--- a/source/adapters/hip/command_buffer.hpp
+++ b/source/adapters/hip/command_buffer.hpp
@@ -10,154 +10,12 @@
 #include <ur/ur.hpp>
 #include <ur_api.h>
+#include <ur_print.hpp>
#include "context.hpp"
 #include <hip/hip_runtime.h>
 #include <memory>
-static inline const char *getUrResultString(ur_result_t Result) {
- switch (Result) {
- case UR_RESULT_SUCCESS:
- return "UR_RESULT_SUCCESS";
- case UR_RESULT_ERROR_INVALID_OPERATION:
- return "UR_RESULT_ERROR_INVALID_OPERATION";
- case UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES:
- return "UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES";
- case UR_RESULT_ERROR_INVALID_QUEUE:
- return "UR_RESULT_ERROR_INVALID_QUEUE";
- case UR_RESULT_ERROR_INVALID_VALUE:
- return "UR_RESULT_ERROR_INVALID_VALUE";
- case UR_RESULT_ERROR_INVALID_CONTEXT:
- return "UR_RESULT_ERROR_INVALID_CONTEXT";
- case UR_RESULT_ERROR_INVALID_PLATFORM:
- return "UR_RESULT_ERROR_INVALID_PLATFORM";
- case UR_RESULT_ERROR_INVALID_BINARY:
- return "UR_RESULT_ERROR_INVALID_BINARY";
- case UR_RESULT_ERROR_INVALID_PROGRAM:
- return "UR_RESULT_ERROR_INVALID_PROGRAM";
- case UR_RESULT_ERROR_INVALID_SAMPLER:
- return "UR_RESULT_ERROR_INVALID_SAMPLER";
- case UR_RESULT_ERROR_INVALID_BUFFER_SIZE:
- return "UR_RESULT_ERROR_INVALID_BUFFER_SIZE";
- case UR_RESULT_ERROR_INVALID_MEM_OBJECT:
- return "UR_RESULT_ERROR_INVALID_MEM_OBJECT";
- case UR_RESULT_ERROR_INVALID_EVENT:
- return "UR_RESULT_ERROR_INVALID_EVENT";
- case UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST:
- return "UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST";
- case UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET:
- return "UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET";
- case UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE:
- return "UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE";
- case UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE:
- return "UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE";
- case UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE:
- return "UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE";
- case UR_RESULT_ERROR_DEVICE_NOT_FOUND:
- return "UR_RESULT_ERROR_DEVICE_NOT_FOUND";
- case UR_RESULT_ERROR_INVALID_DEVICE:
- return "UR_RESULT_ERROR_INVALID_DEVICE";
- case UR_RESULT_ERROR_DEVICE_LOST:
- return "UR_RESULT_ERROR_DEVICE_LOST";
- case UR_RESULT_ERROR_DEVICE_REQUIRES_RESET:
- return "UR_RESULT_ERROR_DEVICE_REQUIRES_RESET";
- case UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE:
- return "UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE";
- case UR_RESULT_ERROR_DEVICE_PARTITION_FAILED:
- return "UR_RESULT_ERROR_DEVICE_PARTITION_FAILED";
- case UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT:
- return "UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT";
- case UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE:
- return "UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE";
- case UR_RESULT_ERROR_INVALID_WORK_DIMENSION:
- return "UR_RESULT_ERROR_INVALID_WORK_DIMENSION";
- case UR_RESULT_ERROR_INVALID_KERNEL_ARGS:
- return "UR_RESULT_ERROR_INVALID_KERNEL_ARGS";
- case UR_RESULT_ERROR_INVALID_KERNEL:
- return "UR_RESULT_ERROR_INVALID_KERNEL";
- case UR_RESULT_ERROR_INVALID_KERNEL_NAME:
- return "UR_RESULT_ERROR_INVALID_KERNEL_NAME";
- case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX:
- return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX";
- case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE:
- return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE";
- case UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE:
- return "UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE";
- case UR_RESULT_ERROR_INVALID_IMAGE_SIZE:
- return "UR_RESULT_ERROR_INVALID_IMAGE_SIZE";
- case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR:
- return "UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR";
- case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED:
- return "UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED";
- case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE:
- return "UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE";
- case UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE:
- return "UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE";
- case UR_RESULT_ERROR_UNINITIALIZED:
- return "UR_RESULT_ERROR_UNINITIALIZED";
- case UR_RESULT_ERROR_OUT_OF_HOST_MEMORY:
- return "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY";
- case UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY:
- return "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY";
- case UR_RESULT_ERROR_OUT_OF_RESOURCES:
- return "UR_RESULT_ERROR_OUT_OF_RESOURCES";
- case UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE:
- return "UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE";
- case UR_RESULT_ERROR_PROGRAM_LINK_FAILURE:
- return "UR_RESULT_ERROR_PROGRAM_LINK_FAILURE";
- case UR_RESULT_ERROR_UNSUPPORTED_VERSION:
- return "UR_RESULT_ERROR_UNSUPPORTED_VERSION";
- case UR_RESULT_ERROR_UNSUPPORTED_FEATURE:
- return "UR_RESULT_ERROR_UNSUPPORTED_FEATURE";
- case UR_RESULT_ERROR_INVALID_ARGUMENT:
- return "UR_RESULT_ERROR_INVALID_ARGUMENT";
- case UR_RESULT_ERROR_INVALID_NULL_HANDLE:
- return "UR_RESULT_ERROR_INVALID_NULL_HANDLE";
- case UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE:
- return "UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE";
- case UR_RESULT_ERROR_INVALID_NULL_POINTER:
- return "UR_RESULT_ERROR_INVALID_NULL_POINTER";
- case UR_RESULT_ERROR_INVALID_SIZE:
- return "UR_RESULT_ERROR_INVALID_SIZE";
- case UR_RESULT_ERROR_UNSUPPORTED_SIZE:
- return "UR_RESULT_ERROR_UNSUPPORTED_SIZE";
- case UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT:
- return "UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT";
- case UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT:
- return "UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT";
- case UR_RESULT_ERROR_INVALID_ENUMERATION:
- return "UR_RESULT_ERROR_INVALID_ENUMERATION";
- case UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION:
- return "UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION";
- case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT:
- return "UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT";
- case UR_RESULT_ERROR_INVALID_NATIVE_BINARY:
- return "UR_RESULT_ERROR_INVALID_NATIVE_BINARY";
- case UR_RESULT_ERROR_INVALID_GLOBAL_NAME:
- return "UR_RESULT_ERROR_INVALID_GLOBAL_NAME";
- case UR_RESULT_ERROR_INVALID_FUNCTION_NAME:
- return "UR_RESULT_ERROR_INVALID_FUNCTION_NAME";
- case UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION:
- return "UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION";
- case UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION:
- return "UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION";
- case UR_RESULT_ERROR_PROGRAM_UNLINKED:
- return "UR_RESULT_ERROR_PROGRAM_UNLINKED";
- case UR_RESULT_ERROR_OVERLAPPING_REGIONS:
- return "UR_RESULT_ERROR_OVERLAPPING_REGIONS";
- case UR_RESULT_ERROR_INVALID_HOST_PTR:
- return "UR_RESULT_ERROR_INVALID_HOST_PTR";
- case UR_RESULT_ERROR_INVALID_USM_SIZE:
- return "UR_RESULT_ERROR_INVALID_USM_SIZE";
- case UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE:
- return "UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE";
- case UR_RESULT_ERROR_ADAPTER_SPECIFIC:
- return "UR_RESULT_ERROR_ADAPTER_SPECIFIC";
- default:
- return "UR_RESULT_ERROR_UNKNOWN";
- }
-}
-
// Trace an internal UR call
#define UR_TRACE(Call) \
{ \
@@ -169,10 +27,10 @@ static inline const char *getUrResultString(ur_result_t Result) {
#define UR_CALL(Call, Result) \
{ \
if (PrintTrace) \
- fprintf(stderr, "UR ---> %s\n", #Call); \
+ std::cerr << "UR ---> " << #Call << "\n"; \
Result = (Call); \
if (PrintTrace) \
- fprintf(stderr, "UR <--- %s(%s)\n", #Call, getUrResultString(Result)); \
+ std::cerr << "UR <--- " << #Call << "(" << Result << ")\n"; \
}
// Handle to a kernel command.
diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp
index ae4dbe159e..dd20a4f50f 100644
--- a/source/adapters/hip/device.cpp
+++ b/source/adapters/hip/device.cpp
@@ -724,11 +724,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
}
case UR_DEVICE_INFO_GLOBAL_MEM_FREE: {
+    // Work around an issue on some (unsupported) architectures, where
+    // hipMemGetInfo fails internally and returns hipErrorInvalidValue when
+    // querying the amount of available global memory. Since we cannot
+    // distinguish this failure from a genuine error, special-case the known
+    // affected architecture (gfx1031) and report zero free memory instead.
+ hipDeviceProp_t Props;
+ UR_CHECK_ERROR(hipGetDeviceProperties(&Props, hDevice->get()));
+ if (strcmp(Props.gcnArchName, "gfx1031") == 0) {
+ return ReturnValue(size_t{0});
+ }
+
size_t FreeMemory = 0;
size_t TotalMemory = 0;
- detail::ur::assertion(hipMemGetInfo(&FreeMemory, &TotalMemory) ==
- hipSuccess,
- "failed hipMemGetInfo() API.");
+ UR_CHECK_ERROR(hipMemGetInfo(&FreeMemory, &TotalMemory));
return ReturnValue(FreeMemory);
}
@@ -841,6 +850,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
case UR_DEVICE_INFO_COMPOSITE_DEVICE:
// These two are exclusive of L0.
return ReturnValue(0);
+ case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP:
+ return ReturnValue(true);
// TODO: Investigate if this information is available on HIP.
case UR_DEVICE_INFO_GPU_EU_COUNT:
diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp
index 79522d4c93..4b98a9c491 100644
--- a/source/adapters/hip/enqueue.cpp
+++ b/source/adapters/hip/enqueue.cpp
@@ -15,12 +15,13 @@
#include "kernel.hpp"
#include "memory.hpp"
#include "queue.hpp"
+#include "ur_api.h"
 #include <ur/ur.hpp>
extern size_t imageElementByteSize(hipArray_Format ArrayFormat);
-ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
+ur_result_t enqueueEventsWait(ur_queue_handle_t Queue, hipStream_t Stream,
uint32_t NumEventsInWaitList,
const ur_event_handle_t *EventWaitList) {
if (!EventWaitList) {
@@ -29,8 +30,8 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
try {
auto Result = forLatestEvents(
EventWaitList, NumEventsInWaitList,
- [Stream](ur_event_handle_t Event) -> ur_result_t {
- ScopedContext Active(Event->getDevice());
+ [Stream, Queue](ur_event_handle_t Event) -> ur_result_t {
+ ScopedContext Active(Queue->getDevice());
if (Event->isCompleted() || Event->getStream() == Stream) {
return UR_RESULT_SUCCESS;
} else {
@@ -218,8 +219,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
// last queue to write to the MemBuffer, meaning we must perform the copy
// from a different device
if (hBuffer->LastEventWritingToMemObj &&
- hBuffer->LastEventWritingToMemObj->getDevice() != hQueue->getDevice()) {
- Device = hBuffer->LastEventWritingToMemObj->getDevice();
+ hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hQueue->getDevice()) {
+ // This event is never created with interop so getQueue is never null
+ hQueue = hBuffer->LastEventWritingToMemObj->getQueue();
+ Device = hQueue->getDevice();
ScopedContext Active(Device);
HIPStream = hipStream_t{0}; // Default stream for different device
// We may have to wait for an event on another queue if it is the last
@@ -584,8 +588,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect(
// last queue to write to the MemBuffer, meaning we must perform the copy
// from a different device
if (hBuffer->LastEventWritingToMemObj &&
- hBuffer->LastEventWritingToMemObj->getDevice() != hQueue->getDevice()) {
- Device = hBuffer->LastEventWritingToMemObj->getDevice();
+ hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hQueue->getDevice()) {
+ // This event is never created with interop so getQueue is never null
+ hQueue = hBuffer->LastEventWritingToMemObj->getQueue();
+ Device = hQueue->getDevice();
ScopedContext Active(Device);
HIPStream = hipStream_t{0}; // Default stream for different device
// We may have to wait for an event on another queue if it is the last
@@ -1017,8 +1024,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
// last queue to write to the MemBuffer, meaning we must perform the copy
// from a different device
if (hImage->LastEventWritingToMemObj &&
- hImage->LastEventWritingToMemObj->getDevice() != hQueue->getDevice()) {
- Device = hImage->LastEventWritingToMemObj->getDevice();
+ hImage->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hQueue->getDevice()) {
+ hQueue = hImage->LastEventWritingToMemObj->getQueue();
+ Device = hQueue->getDevice();
ScopedContext Active(Device);
HIPStream = hipStream_t{0}; // Default stream for different device
// We may have to wait for an event on another queue if it is the last
@@ -1231,49 +1240,42 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
UR_ASSERT(offset + size <= BufferImpl.getSize(),
UR_RESULT_ERROR_INVALID_SIZE);
- ur_result_t Result = UR_RESULT_ERROR_INVALID_OPERATION;
- const bool IsPinned =
- BufferImpl.MemAllocMode == BufferMem::AllocMode::AllocHostPtr;
-
- // Currently no support for overlapping regions
- if (BufferImpl.getMapPtr() != nullptr) {
- return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+ auto MapPtr = BufferImpl.mapToPtr(size, offset, mapFlags);
+ if (!MapPtr) {
+ return UR_RESULT_ERROR_INVALID_MEM_OBJECT;
}
- // Allocate a pointer in the host to store the mapped information
- auto HostPtr = BufferImpl.mapToPtr(size, offset, mapFlags);
-  *ppRetMap = std::get<BufferMem>(hBuffer->Mem).getMapPtr();
- if (HostPtr) {
- Result = UR_RESULT_SUCCESS;
- }
+ const bool IsPinned =
+ BufferImpl.MemAllocMode == BufferMem::AllocMode::AllocHostPtr;
- if (!IsPinned &&
- ((mapFlags & UR_MAP_FLAG_READ) || (mapFlags & UR_MAP_FLAG_WRITE))) {
- // Pinned host memory is already on host so it doesn't need to be read.
- Result = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size,
- HostPtr, numEventsInWaitList,
- phEventWaitList, phEvent);
- } else {
- ScopedContext Active(hQueue->getDevice());
+ try {
+ if (!IsPinned && (mapFlags & (UR_MAP_FLAG_READ | UR_MAP_FLAG_WRITE))) {
+ // Pinned host memory is already on host so it doesn't need to be read.
+ UR_CHECK_ERROR(urEnqueueMemBufferRead(
+ hQueue, hBuffer, blockingMap, offset, size, MapPtr,
+ numEventsInWaitList, phEventWaitList, phEvent));
+ } else {
+ ScopedContext Active(hQueue->getDevice());
- if (IsPinned) {
- Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList,
- nullptr);
- }
+ if (IsPinned) {
+ UR_CHECK_ERROR(urEnqueueEventsWait(hQueue, numEventsInWaitList,
+ phEventWaitList, nullptr));
+ }
- if (phEvent) {
- try {
+ if (phEvent) {
*phEvent = ur_event_handle_t_::makeNative(
UR_COMMAND_MEM_BUFFER_MAP, hQueue, hQueue->getNextTransferStream());
UR_CHECK_ERROR((*phEvent)->start());
UR_CHECK_ERROR((*phEvent)->record());
- } catch (ur_result_t Error) {
- Result = Error;
}
}
+ } catch (ur_result_t Error) {
+ return Error;
}
- return Result;
+ *ppRetMap = MapPtr;
+
+ return UR_RESULT_SUCCESS;
}
/// Implements the unmap from the host, using a BufferWrite operation.
@@ -1284,47 +1286,44 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
ur_queue_handle_t hQueue, ur_mem_handle_t hMem, void *pMappedPtr,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {
- ur_result_t Result = UR_RESULT_SUCCESS;
UR_ASSERT(hMem->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
-  UR_ASSERT(std::get<BufferMem>(hMem->Mem).getMapPtr() != nullptr,
-            UR_RESULT_ERROR_INVALID_MEM_OBJECT);
-  UR_ASSERT(std::get<BufferMem>(hMem->Mem).getMapPtr() == pMappedPtr,
-            UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+  auto &BufferImpl = std::get<BufferMem>(hMem->Mem);
-  const bool IsPinned = std::get<BufferMem>(hMem->Mem).MemAllocMode ==
- BufferMem::AllocMode::AllocHostPtr;
-
- if (!IsPinned &&
- ((std::get(hMem->Mem).getMapFlags() & UR_MAP_FLAG_WRITE) ||
-      ((std::get<BufferMem>(hMem->Mem).getMapFlags() & UR_MAP_FLAG_WRITE) ||
-       (std::get<BufferMem>(hMem->Mem).getMapFlags() &
- // Pinned host memory is only on host so it doesn't need to be written to.
- Result = urEnqueueMemBufferWrite(
-        hQueue, hMem, true, std::get<BufferMem>(hMem->Mem).getMapOffset(),
-        std::get<BufferMem>(hMem->Mem).getMapSize(), pMappedPtr,
- numEventsInWaitList, phEventWaitList, phEvent);
- } else {
- ScopedContext Active(hQueue->getDevice());
+ auto *Map = BufferImpl.getMapDetails(pMappedPtr);
+ UR_ASSERT(Map != nullptr, UR_RESULT_ERROR_INVALID_MEM_OBJECT);
- if (IsPinned) {
- Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList,
- nullptr);
- }
+ const bool IsPinned =
+ BufferImpl.MemAllocMode == BufferMem::AllocMode::AllocHostPtr;
- if (phEvent) {
- try {
+ try {
+ if (!IsPinned &&
+ (Map->getMapFlags() &
+ (UR_MAP_FLAG_WRITE | UR_MAP_FLAG_WRITE_INVALIDATE_REGION))) {
+ // Pinned host memory is only on host so it doesn't need to be written to.
+ UR_CHECK_ERROR(urEnqueueMemBufferWrite(
+ hQueue, hMem, true, Map->getMapOffset(), Map->getMapSize(),
+ pMappedPtr, numEventsInWaitList, phEventWaitList, phEvent));
+ } else {
+ ScopedContext Active(hQueue->getDevice());
+
+ if (IsPinned) {
+ UR_CHECK_ERROR(urEnqueueEventsWait(hQueue, numEventsInWaitList,
+ phEventWaitList, nullptr));
+ }
+
+ if (phEvent) {
*phEvent = ur_event_handle_t_::makeNative(
UR_COMMAND_MEM_UNMAP, hQueue, hQueue->getNextTransferStream());
UR_CHECK_ERROR((*phEvent)->start());
UR_CHECK_ERROR((*phEvent)->record());
- } catch (ur_result_t Error) {
- Result = Error;
}
}
+ } catch (ur_result_t Error) {
+ return Error;
}
-  std::get<BufferMem>(hMem->Mem).unmap(pMappedPtr);
- return Result;
+ BufferImpl.unmap(pMappedPtr);
+ return UR_RESULT_SUCCESS;
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
@@ -1842,10 +1841,14 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim,
static_cast(Device->getMaxBlockDimY()),
static_cast(Device->getMaxBlockDimZ())};
+ auto &ReqdThreadsPerBlock = Kernel->ReqdThreadsPerBlock;
MaxWorkGroupSize = Device->getMaxWorkGroupSize();
if (LocalWorkSize != nullptr) {
auto isValid = [&](int dim) {
+ UR_ASSERT(ReqdThreadsPerBlock[dim] == 0 ||
+ LocalWorkSize[dim] == ReqdThreadsPerBlock[dim],
+ UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE);
UR_ASSERT(LocalWorkSize[dim] <= MaxThreadsPerBlock[dim],
UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE);
// Checks that local work sizes are a divisor of the global work sizes
@@ -1967,3 +1970,36 @@ void setCopyRectParams(ur_rect_region_t Region, const void *SrcPtr,
: (DstType == hipMemoryTypeDevice ? hipMemcpyHostToDevice
: hipMemcpyHostToHost));
}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
+ ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList,
+ const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+
+ ur_result_t Result = UR_RESULT_SUCCESS;
+  std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
+ try {
+ ScopedContext Active(hQueue->getDevice());
+
+ uint32_t StreamToken;
+    ur_stream_guard_ Guard;
+ hipStream_t HIPStream = hQueue->getNextComputeStream(
+ numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
+ phEventWaitList));
+
+ RetImplEvent =
+        std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
+ UR_COMMAND_TIMESTAMP_RECORDING_EXP, hQueue, HIPStream));
+ UR_CHECK_ERROR(RetImplEvent->start());
+ UR_CHECK_ERROR(RetImplEvent->record());
+
+ if (blocking) {
+ UR_CHECK_ERROR(hipStreamSynchronize(HIPStream));
+ }
+
+ *phEvent = RetImplEvent.release();
+ } catch (ur_result_t Err) {
+ Result = Err;
+ }
+ return Result;
+}
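
A hypothetical host-side use of the new entry point: record a timestamp on the queue and read it back through the standard profiling query. The event.cpp changes below make this work even on queues created without UR_QUEUE_FLAG_PROFILING_ENABLE; readTimestamp is an illustrative helper.

    #include <cstdint>
    #include <ur_api.h>

    ur_result_t readTimestamp(ur_queue_handle_t Queue, uint64_t *Out) {
      ur_event_handle_t Ev = nullptr;
      // blocking=true: the call returns once the timestamp has been recorded.
      ur_result_t Res = urEnqueueTimestampRecordingExp(Queue, /*blocking=*/true,
                                                       0, nullptr, &Ev);
      if (Res != UR_RESULT_SUCCESS)
        return Res;
      Res = urEventGetProfilingInfo(Ev, UR_PROFILING_INFO_COMMAND_END,
                                    sizeof(*Out), Out, nullptr);
      urEventRelease(Ev);
      return Res;
    }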
diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp
index 7478c0e778..5327c43a3b 100644
--- a/source/adapters/hip/event.cpp
+++ b/source/adapters/hip/event.cpp
@@ -22,7 +22,8 @@ ur_event_handle_t_::ur_event_handle_t_(ur_command_t Type,
StreamToken{StreamToken}, EventId{0}, EvEnd{nullptr}, EvStart{nullptr},
EvQueued{nullptr}, Queue{Queue}, Stream{Stream}, Context{Context} {
- bool ProfilingEnabled = Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE;
+ bool ProfilingEnabled =
+ Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent();
UR_CHECK_ERROR(hipEventCreateWithFlags(
&EvEnd, ProfilingEnabled ? hipEventDefault : hipEventDisableTiming));
@@ -58,7 +59,7 @@ ur_result_t ur_event_handle_t_::start() {
ur_result_t Result = UR_RESULT_SUCCESS;
try {
- if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) {
+ if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) {
// NOTE: This relies on the default stream to be unused.
UR_CHECK_ERROR(hipEventRecord(EvQueued, 0));
UR_CHECK_ERROR(hipEventRecord(EvStart, Queue->get()));
@@ -177,7 +178,7 @@ ur_result_t ur_event_handle_t_::release() {
assert(Queue != nullptr);
UR_CHECK_ERROR(hipEventDestroy(EvEnd));
- if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) {
+ if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) {
UR_CHECK_ERROR(hipEventDestroy(EvQueued));
UR_CHECK_ERROR(hipEventDestroy(EvStart));
}
@@ -244,7 +245,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(
UR_ASSERT(!(pPropValue && propValueSize == 0), UR_RESULT_ERROR_INVALID_VALUE);
ur_queue_handle_t Queue = hEvent->getQueue();
- if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE)) {
+ if (Queue == nullptr || (!(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) &&
+ !hEvent->isTimestampEvent())) {
return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE;
}
diff --git a/source/adapters/hip/event.hpp b/source/adapters/hip/event.hpp
index 50de73b14f..5e7c1d7e7d 100644
--- a/source/adapters/hip/event.hpp
+++ b/source/adapters/hip/event.hpp
@@ -28,8 +28,6 @@ struct ur_event_handle_t_ {
ur_queue_handle_t getQueue() const noexcept { return Queue; }
- ur_device_handle_t getDevice() const noexcept { return Queue->getDevice(); }
-
hipStream_t getStream() const noexcept { return Stream; }
uint32_t getComputeStreamToken() const noexcept { return StreamToken; }
@@ -55,6 +53,10 @@ struct ur_event_handle_t_ {
return UR_EVENT_STATUS_COMPLETE;
}
+ bool isTimestampEvent() const noexcept {
+ return getCommandType() == UR_COMMAND_TIMESTAMP_RECORDING_EXP;
+ }
+
ur_context_handle_t getContext() const noexcept { return Context; };
uint32_t incrementReferenceCount() { return ++RefCount; }
diff --git a/source/adapters/hip/image.cpp b/source/adapters/hip/image.cpp
index a8fcd6f465..4085614f31 100644
--- a/source/adapters/hip/image.cpp
+++ b/source/adapters/hip/image.cpp
@@ -59,7 +59,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
[[maybe_unused]] ur_exp_image_mem_handle_t hImageMem,
[[maybe_unused]] const ur_image_format_t *pImageFormat,
[[maybe_unused]] const ur_image_desc_t *pImageDesc,
- [[maybe_unused]] ur_mem_handle_t *phMem,
[[maybe_unused]] ur_exp_image_handle_t *phImage) {
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
@@ -71,7 +70,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp(
[[maybe_unused]] const ur_image_format_t *pImageFormat,
[[maybe_unused]] const ur_image_desc_t *pImageDesc,
[[maybe_unused]] ur_sampler_handle_t hSampler,
- [[maybe_unused]] ur_mem_handle_t *phMem,
[[maybe_unused]] ur_exp_image_handle_t *phImage) {
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp
index ddb8f10595..f35d3957bc 100644
--- a/source/adapters/hip/kernel.cpp
+++ b/source/adapters/hip/kernel.cpp
@@ -91,14 +91,17 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
return ReturnValue(size_t(MaxThreads));
}
case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: {
- size_t group_size[3] = {0, 0, 0};
- // Returns the work-group size specified in the kernel source or IL.
- // If the work-group size is not specified in the kernel source or IL,
- // (0, 0, 0) is returned.
- // https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html
-
- // TODO: can we extract the work group size from the PTX?
- return ReturnValue(group_size, 3);
+ size_t GroupSize[3] = {0, 0, 0};
+ const auto &ReqdWGSizeMDMap =
+ hKernel->getProgram()->KernelReqdWorkGroupSizeMD;
+ const auto ReqdWGSizeMD = ReqdWGSizeMDMap.find(hKernel->getName());
+ if (ReqdWGSizeMD != ReqdWGSizeMDMap.end()) {
+ const auto ReqdWGSize = ReqdWGSizeMD->second;
+ GroupSize[0] = std::get<0>(ReqdWGSize);
+ GroupSize[1] = std::get<1>(ReqdWGSize);
+ GroupSize[2] = std::get<2>(ReqdWGSize);
+ }
+ return ReturnValue(GroupSize, 3);
}
case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: {
// OpenCL LOCAL == HIP SHARED
diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp
index 83693a3d41..afea69832b 100644
--- a/source/adapters/hip/kernel.hpp
+++ b/source/adapters/hip/kernel.hpp
@@ -42,6 +42,9 @@ struct ur_kernel_handle_t_ {
ur_program_handle_t Program;
std::atomic_uint32_t RefCount;
+ static constexpr uint32_t ReqdThreadsPerBlockDimensions = 3u;
+ size_t ReqdThreadsPerBlock[ReqdThreadsPerBlockDimensions];
+
/// Structure that holds the arguments to the kernel.
/// Note earch argument size is known, since it comes
/// from the kernel signature.
@@ -154,6 +157,11 @@ struct ur_kernel_handle_t_ {
ur_context_handle_t Ctxt)
: Function{Func}, FunctionWithOffsetParam{FuncWithOffsetParam},
Name{Name}, Context{Ctxt}, Program{Program}, RefCount{1} {
+ assert(Program->getDevice());
+ UR_CHECK_ERROR(urKernelGetGroupInfo(
+ this, Program->getDevice(),
+ UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
+ sizeof(ReqdThreadsPerBlock), ReqdThreadsPerBlock, nullptr));
urProgramRetain(Program);
urContextRetain(Context);
}
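
Caching ReqdThreadsPerBlock at kernel creation enables the launch-time check added in enqueue.cpp: a user-supplied local size must exactly match any non-zero required work-group size coming from the reqd_work_group_size metadata. A standalone sketch of the predicate; localSizeSatisfiesReqd is an illustrative name.

    #include <cstddef>
    #include <cstdint>

    // 0 in a dimension means "no requirement from kernel metadata".
    bool localSizeSatisfiesReqd(const size_t Reqd[3], const size_t *LocalWorkSize,
                                uint32_t WorkDim) {
      for (uint32_t Dim = 0; Dim < WorkDim; ++Dim)
        if (Reqd[Dim] != 0 && LocalWorkSize[Dim] != Reqd[Dim])
          return false; // adapter reports UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE
      return true;
    }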
diff --git a/source/adapters/hip/memory.cpp b/source/adapters/hip/memory.cpp
index dcc3e34fad..ff209884ce 100644
--- a/source/adapters/hip/memory.cpp
+++ b/source/adapters/hip/memory.cpp
@@ -525,11 +525,12 @@ inline ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem,
UR_CHECK_ERROR(
hipMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size));
}
- } else if (Mem->LastEventWritingToMemObj->getDevice() != hDevice) {
- UR_CHECK_ERROR(
- hipMemcpyDtoD(Buffer.getPtr(hDevice),
- Buffer.getPtr(Mem->LastEventWritingToMemObj->getDevice()),
- Buffer.Size));
+ } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hDevice) {
+ UR_CHECK_ERROR(hipMemcpyDtoD(
+ Buffer.getPtr(hDevice),
+ Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
+ Buffer.Size));
}
return UR_RESULT_SUCCESS;
}
@@ -577,22 +578,24 @@ inline ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
CpyDesc3D.srcHost = Image.HostPtr;
UR_CHECK_ERROR(hipDrvMemcpy3D(&CpyDesc3D));
}
- } else if (Mem->LastEventWritingToMemObj->getDevice() != hDevice) {
+ } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hDevice) {
if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
// FIXME: 1D memcpy from DtoD going through the host.
UR_CHECK_ERROR(hipMemcpyAtoH(
Image.HostPtr,
- Image.getArray(Mem->LastEventWritingToMemObj->getDevice()),
+ Image.getArray(
+ Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
0 /*srcOffset*/, ImageSizeBytes));
UR_CHECK_ERROR(
hipMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
} else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
- CpyDesc2D.srcArray =
- Image.getArray(Mem->LastEventWritingToMemObj->getDevice());
+ CpyDesc2D.srcArray = Image.getArray(
+ Mem->LastEventWritingToMemObj->getQueue()->getDevice());
UR_CHECK_ERROR(hipMemcpyParam2D(&CpyDesc2D));
} else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
- CpyDesc3D.srcArray =
- Image.getArray(Mem->LastEventWritingToMemObj->getDevice());
+ CpyDesc3D.srcArray = Image.getArray(
+ Mem->LastEventWritingToMemObj->getQueue()->getDevice());
UR_CHECK_ERROR(hipDrvMemcpy3D(&CpyDesc3D));
}
}
diff --git a/source/adapters/hip/memory.hpp b/source/adapters/hip/memory.hpp
index 7707794b3c..5d2aa6f9a5 100644
--- a/source/adapters/hip/memory.hpp
+++ b/source/adapters/hip/memory.hpp
@@ -9,12 +9,15 @@
//===----------------------------------------------------------------------===//
#pragma once
-#include "common.hpp"
#include "context.hpp"
#include "event.hpp"
 #include <cassert>
+#include <memory>
+#include <unordered_map>
 #include <variant>
+#include "common.hpp"
+
ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t,
const ur_device_handle_t);
ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
@@ -22,6 +25,41 @@ ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
// Handler for plain, pointer-based HIP allocations
struct BufferMem {
+ struct BufferMap {
+ /// Size of the active mapped region.
+ size_t MapSize;
+ /// Offset of the active mapped region.
+ size_t MapOffset;
+ /// Original flags for the mapped region
+ ur_map_flags_t MapFlags;
+ /// Allocated host memory used exclusively for this map.
+    std::shared_ptr<unsigned char[]> MapMem;
+
+ BufferMap(size_t MapSize, size_t MapOffset, ur_map_flags_t MapFlags)
+ : MapSize(MapSize), MapOffset(MapOffset), MapFlags(MapFlags),
+ MapMem(nullptr) {}
+
+ BufferMap(size_t MapSize, size_t MapOffset, ur_map_flags_t MapFlags,
+              std::unique_ptr<unsigned char[]> &&MapMem)
+ : MapSize(MapSize), MapOffset(MapOffset), MapFlags(MapFlags),
+ MapMem(std::move(MapMem)) {}
+
+ size_t getMapSize() const noexcept { return MapSize; }
+
+ size_t getMapOffset() const noexcept { return MapOffset; }
+
+ ur_map_flags_t getMapFlags() const noexcept { return MapFlags; }
+ };
+
+ /** AllocMode
+ * Classic: Just a normal buffer allocated on the device via hip malloc
+ * UseHostPtr: Use an address on the host for the device
+ * CopyIn: The data for the device comes from the host but the host
+ pointer is not available later for re-use
+ * AllocHostPtr: Uses pinned-memory allocation
+ */
+ enum class AllocMode { Classic, UseHostPtr, CopyIn, AllocHostPtr };
+
using native_type = hipDeviceptr_t;
// If this allocation is a sub-buffer (i.e., a view on an existing
@@ -29,33 +67,14 @@ struct BufferMem {
ur_mem_handle_t Parent = nullptr;
// Outer mem holding this struct in variant
ur_mem_handle_t OuterMemStruct;
-
/// Pointer associated with this device on the host
void *HostPtr;
/// Size of the allocation in bytes
size_t Size;
- /// Size of the active mapped region.
- size_t MapSize;
- /// Offset of the active mapped region.
- size_t MapOffset;
- /// Pointer to the active mapped region, if any
- void *MapPtr;
- /// Original flags for the mapped region
- ur_map_flags_t MapFlags;
+ /// A map that contains all the active mappings for this buffer.
+  std::unordered_map<void *, BufferMap> PtrToBufferMap;
- /** AllocMode
- * Classic: Just a normal buffer allocated on the device via hip malloc
- * UseHostPtr: Use an address on the host for the device
- * CopyIn: The data for the device comes from the host but the host
- pointer is not available later for re-use
- * AllocHostPtr: Uses pinned-memory allocation
- */
- enum class AllocMode {
- Classic,
- UseHostPtr,
- CopyIn,
- AllocHostPtr
- } MemAllocMode;
+ AllocMode MemAllocMode;
private:
// Vector of HIP pointers
@@ -65,10 +84,8 @@ struct BufferMem {
BufferMem(ur_context_handle_t Context, ur_mem_handle_t OuterMemStruct,
AllocMode Mode, void *HostPtr, size_t Size)
: OuterMemStruct{OuterMemStruct}, HostPtr{HostPtr}, Size{Size},
- MapSize{0}, MapOffset{0}, MapPtr{nullptr}, MapFlags{UR_MAP_FLAG_WRITE},
- MemAllocMode{Mode}, Ptrs(Context->Devices.size(), native_type{0}){};
-
- BufferMem(const BufferMem &Buffer) = default;
+ PtrToBufferMap{}, MemAllocMode{Mode},
+ Ptrs(Context->Devices.size(), native_type{0}){};
// This will allocate memory on device if there isn't already an active
// allocation on the device
@@ -98,45 +115,41 @@ struct BufferMem {
size_t getSize() const noexcept { return Size; }
- void *getMapPtr() const noexcept { return MapPtr; }
-
- size_t getMapSize() const noexcept { return MapSize; }
-
- size_t getMapOffset() const noexcept { return MapOffset; }
+ BufferMap *getMapDetails(void *Map) {
+ auto details = PtrToBufferMap.find(Map);
+ if (details != PtrToBufferMap.end()) {
+ return &details->second;
+ }
+ return nullptr;
+ }
/// Returns a pointer to data visible on the host that contains
/// the data on the device associated with this allocation.
/// The offset is used to index into the HIP allocation.
///
- void *mapToPtr(size_t Size, size_t Offset, ur_map_flags_t Flags) noexcept {
- assert(MapPtr == nullptr);
- MapSize = Size;
- MapOffset = Offset;
- MapFlags = Flags;
- if (HostPtr) {
-      MapPtr = static_cast<char *>(HostPtr) + Offset;
+ void *mapToPtr(size_t MapSize, size_t MapOffset,
+ ur_map_flags_t MapFlags) noexcept {
+ void *MapPtr = nullptr;
+ if (HostPtr == nullptr) {
+      /// If HostPtr is null, we need to create a mapping that owns its own
+      /// host memory.
+      auto MapMem = std::make_unique<unsigned char[]>(MapSize);
+ MapPtr = MapMem.get();
+ PtrToBufferMap.insert(
+ {MapPtr, BufferMap(MapSize, MapOffset, MapFlags, std::move(MapMem))});
} else {
- // TODO: Allocate only what is needed based on the offset
-      MapPtr = static_cast<void *>(malloc(this->getSize()));
+ /// However, if HostPtr already has valid memory (e.g. pinned allocation),
+ /// we can just use that memory for the mapping.
+      MapPtr = static_cast<char *>(HostPtr) + MapOffset;
+ PtrToBufferMap.insert({MapPtr, BufferMap(MapSize, MapOffset, MapFlags)});
}
return MapPtr;
}
/// Detach the allocation from the host memory.
- void unmap(void *) noexcept {
+ void unmap(void *MapPtr) noexcept {
assert(MapPtr != nullptr);
-
- if (MapPtr != HostPtr) {
- free(MapPtr);
- }
- MapPtr = nullptr;
- MapSize = 0;
- MapOffset = 0;
- }
-
- ur_map_flags_t getMapFlags() const noexcept {
- assert(MapPtr != nullptr);
- return MapFlags;
+ PtrToBufferMap.erase(MapPtr);
}
ur_result_t clear() {
@@ -414,7 +427,7 @@ struct ur_mem_handle_t_ {
HaveMigratedToDeviceSinceLastWrite(Context->Devices.size(), false),
        Mem{std::in_place_type<BufferMem>, Ctxt, this, Mode, HostPtr, Size} {
urContextRetain(Context);
- };
+ }
// Subbuffer constructor
  ur_mem_handle_t_(ur_mem_handle_t Parent, size_t SubBufferOffset)
@@ -435,7 +448,7 @@ struct ur_mem_handle_t_ {
}
}
urMemRetain(Parent);
- };
+ }
/// Constructs the UR mem handler for an Image object
  ur_mem_handle_t_(ur_context_handle_t Ctxt, ur_mem_flags_t MemFlags,
@@ -498,7 +511,7 @@ struct ur_mem_handle_t_ {
LastEventWritingToMemObj = NewEvent;
for (const auto &Device : Context->getDevices()) {
HaveMigratedToDeviceSinceLastWrite[Device->getIndex()] =
- Device == NewEvent->getDevice();
+ Device == NewEvent->getQueue()->getDevice();
}
}
};
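
The net effect of the BufferMap rework is that a buffer can carry several live mappings at once, keyed by the pointer handed back to the user. A self-contained model of that flow, with toy types (`ToyBuffer`, `MapRecord`) standing in for the adapter's, so this is a sketch rather than the adapter code:

#include <cstddef>
#include <memory>
#include <unordered_map>

struct MapRecord {
  size_t Size, Offset;
  std::shared_ptr<unsigned char[]> Owned; // null when HostPtr backs the map
};

struct ToyBuffer {
  unsigned char *HostPtr = nullptr; // pinned or user memory, may be null
  std::unordered_map<void *, MapRecord> Maps;

  void *map(size_t Size, size_t Offset) {
    if (HostPtr) { // reuse the existing host memory, as mapToPtr does
      void *Ptr = HostPtr + Offset;
      Maps.emplace(Ptr, MapRecord{Size, Offset, nullptr});
      return Ptr;
    }
    auto Mem = std::make_unique<unsigned char[]>(Size);
    void *Ptr = Mem.get();
    // Each mapping owns its allocation, so mappings may coexist.
    Maps.emplace(Ptr, MapRecord{Size, Offset, std::move(Mem)});
    return Ptr;
  }

  void unmap(void *Ptr) { Maps.erase(Ptr); } // frees owned memory, if any
};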
diff --git a/source/adapters/hip/program.cpp b/source/adapters/hip/program.cpp
index 1ee862804a..8e3653ee02 100644
--- a/source/adapters/hip/program.cpp
+++ b/source/adapters/hip/program.cpp
@@ -9,6 +9,7 @@
//===----------------------------------------------------------------------===//
#include "program.hpp"
+#include "ur_util.hpp"
#ifdef SYCL_ENABLE_KERNEL_FUSION
#ifdef UR_COMGR_VERSION4_INCLUDE
@@ -78,15 +79,6 @@ void getCoMgrBuildLog(const amd_comgr_data_set_t BuildDataSet, char *BuildLog,
} // namespace
#endif
-std::pair<std::string, std::string>
-splitMetadataName(const std::string &metadataName) {
- size_t splitPos = metadataName.rfind('@');
- if (splitPos == std::string::npos)
- return std::make_pair(metadataName, std::string{});
- return std::make_pair(metadataName.substr(0, splitPos),
- metadataName.substr(splitPos, metadataName.length()));
-}
-
ur_result_t
ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata,
size_t Length) {
@@ -107,8 +99,29 @@ ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata,
const char *MetadataValPtrEnd =
MetadataValPtr + MetadataElement.size - sizeof(std::uint64_t);
GlobalIDMD[Prefix] = std::string{MetadataValPtr, MetadataValPtrEnd};
+ } else if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) {
+ // If metadata is reqd_work_group_size, record it for the corresponding
+ // kernel name.
+ size_t MDElemsSize = MetadataElement.size - sizeof(std::uint64_t);
+
+ // Expect between 1 and 3 32-bit integer values.
+ UR_ASSERT(MDElemsSize >= sizeof(std::uint32_t) &&
+ MDElemsSize <= sizeof(std::uint32_t) * 3,
+ UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE);
+
+ // Get pointer to data, skipping 64-bit size at the start of the data.
+ const char *ValuePtr =
+          reinterpret_cast<const char *>(MetadataElement.value.pData) +
+ sizeof(std::uint64_t);
+ // Read values and pad with 1's for values not present.
+ std::uint32_t ReqdWorkGroupElements[] = {1, 1, 1};
+ std::memcpy(ReqdWorkGroupElements, ValuePtr, MDElemsSize);
+ KernelReqdWorkGroupSizeMD[Prefix] =
+ std::make_tuple(ReqdWorkGroupElements[0], ReqdWorkGroupElements[1],
+ ReqdWorkGroupElements[2]);
}
}
+
return UR_RESULT_SUCCESS;
}
@@ -459,8 +472,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
  std::unique_ptr<ur_program_handle_t_> RetProgram{
new ur_program_handle_t_{hContext, hDevice}};
- // TODO: Set metadata here and use reqd_work_group_size information.
- // See urProgramCreateWithBinary in CUDA adapter.
if (pProperties) {
if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
@@ -469,8 +480,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
}
Result =
RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count);
+ UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);
}
- UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);
  auto pBinary_string = reinterpret_cast<const char *>(pBinary);
if (size == 0) {
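
The payload decoded in setMetadata above is a 64-bit size prefix followed by one to three 32-bit values, with absent trailing dimensions defaulting to 1. The same decoding as a standalone sketch (hypothetical helper `decodeReqdWGSize`; `Blob`/`BlobSize` describe the raw metadata value):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <tuple>

std::tuple<uint32_t, uint32_t, uint32_t> decodeReqdWGSize(const char *Blob,
                                                          size_t BlobSize) {
  const char *ValuePtr = Blob + sizeof(uint64_t);   // skip the size prefix
  size_t PayloadSize = BlobSize - sizeof(uint64_t); // 4, 8 or 12 bytes
  uint32_t Dims[3] = {1, 1, 1}; // pad missing dimensions with 1
  std::memcpy(Dims, ValuePtr, PayloadSize);
  return {Dims[0], Dims[1], Dims[2]};
}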
diff --git a/source/adapters/hip/program.hpp b/source/adapters/hip/program.hpp
index 651d59a42d..cc5da40af5 100644
--- a/source/adapters/hip/program.hpp
+++ b/source/adapters/hip/program.hpp
@@ -12,6 +12,7 @@
 #include <hip/hip_runtime.h>
 #include <ur_api.h>
+#include <tuple>
#include "context.hpp"
@@ -30,6 +31,8 @@ struct ur_program_handle_t_ {
bool IsRelocatable = false;
  std::unordered_map<std::string, std::string> GlobalIDMD;
+  std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
+      KernelReqdWorkGroupSizeMD;
constexpr static size_t MAX_LOG_SIZE = 8192u;
@@ -38,8 +41,8 @@ struct ur_program_handle_t_ {
ur_program_build_status_t BuildStatus = UR_PROGRAM_BUILD_STATUS_NONE;
ur_program_handle_t_(ur_context_handle_t Ctxt, ur_device_handle_t Device)
- : Module{nullptr}, Binary{},
- BinarySizeInBytes{0}, RefCount{1}, Context{Ctxt}, Device{Device} {
+ : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1},
+ Context{Ctxt}, Device{Device}, KernelReqdWorkGroupSizeMD{} {
urContextRetain(Context);
urDeviceRetain(Device);
}
diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp
index 1cf9431fcb..71979b75b1 100644
--- a/source/adapters/hip/ur_interface_loader.cpp
+++ b/source/adapters/hip/ur_interface_loader.cpp
@@ -377,6 +377,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
pDdiTable->pfnCooperativeKernelLaunchExp =
urEnqueueCooperativeKernelLaunchExp;
+ pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp;
return UR_RESULT_SUCCESS;
}
diff --git a/source/adapters/hip/usm.cpp b/source/adapters/hip/usm.cpp
index 4068c1d865..e871f394f2 100644
--- a/source/adapters/hip/usm.cpp
+++ b/source/adapters/hip/usm.cpp
@@ -27,7 +27,7 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc,
UR_RESULT_ERROR_INVALID_VALUE);
if (!hPool) {
- return USMHostAllocImpl(ppMem, hContext, nullptr, size, alignment);
+ return USMHostAllocImpl(ppMem, hContext, /* flags */ 0, size, alignment);
}
return umfPoolMallocHelper(hPool, ppMem, size, alignment);
@@ -43,7 +43,7 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
UR_RESULT_ERROR_INVALID_VALUE);
if (!hPool) {
- return USMDeviceAllocImpl(ppMem, hContext, hDevice, nullptr, size,
+ return USMDeviceAllocImpl(ppMem, hContext, hDevice, /* flags */ 0, size,
alignment);
}
@@ -60,8 +60,8 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
UR_RESULT_ERROR_INVALID_VALUE);
if (!hPool) {
- return USMSharedAllocImpl(ppMem, hContext, hDevice, nullptr, nullptr, size,
- alignment);
+ return USMSharedAllocImpl(ppMem, hContext, hDevice, /*host flags*/ 0,
+ /*device flags*/ 0, size, alignment);
}
return umfPoolMallocHelper(hPool, ppMem, size, alignment);
@@ -105,7 +105,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext,
ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t,
ur_device_handle_t Device,
- ur_usm_device_mem_flags_t *, size_t Size,
+ ur_usm_device_mem_flags_t, size_t Size,
[[maybe_unused]] uint32_t Alignment) {
try {
ScopedContext Active(Device);
@@ -120,8 +120,8 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t,
ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t,
ur_device_handle_t Device,
- ur_usm_host_mem_flags_t *,
- ur_usm_device_mem_flags_t *, size_t Size,
+ ur_usm_host_mem_flags_t,
+ ur_usm_device_mem_flags_t, size_t Size,
[[maybe_unused]] uint32_t Alignment) {
try {
ScopedContext Active(Device);
@@ -136,7 +136,7 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t,
ur_result_t USMHostAllocImpl(void **ResultPtr,
[[maybe_unused]] ur_context_handle_t Context,
- ur_usm_host_mem_flags_t *, size_t Size,
+ ur_usm_host_mem_flags_t, size_t Size,
[[maybe_unused]] uint32_t Alignment) {
try {
UR_CHECK_ERROR(hipHostMalloc(ResultPtr, Size));
@@ -309,19 +309,19 @@ umf_result_t USMMemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) {
ur_result_t USMSharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size,
uint32_t Alignment) {
- return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size,
- Alignment);
+ return USMSharedAllocImpl(ResultPtr, Context, Device, /*host flags*/ 0,
+ /*device flags*/ 0, Size, Alignment);
}
ur_result_t USMDeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size,
uint32_t Alignment) {
- return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size,
+ return USMDeviceAllocImpl(ResultPtr, Context, Device, /* flags */ 0, Size,
Alignment);
}
ur_result_t USMHostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size,
uint32_t Alignment) {
- return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment);
+ return USMHostAllocImpl(ResultPtr, Context, /* flags */ 0, Size, Alignment);
}
ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context,
diff --git a/source/adapters/hip/usm.hpp b/source/adapters/hip/usm.hpp
index d02145584f..a1c3964263 100644
--- a/source/adapters/hip/usm.hpp
+++ b/source/adapters/hip/usm.hpp
@@ -118,17 +118,17 @@ class USMHostMemoryProvider final : public USMMemoryProvider {
ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context,
ur_device_handle_t Device,
- ur_usm_device_mem_flags_t *Flags, size_t Size,
+ ur_usm_device_mem_flags_t Flags, size_t Size,
uint32_t Alignment);
ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context,
ur_device_handle_t Device,
- ur_usm_host_mem_flags_t *,
- ur_usm_device_mem_flags_t *, size_t Size,
+ ur_usm_host_mem_flags_t,
+ ur_usm_device_mem_flags_t, size_t Size,
uint32_t Alignment);
ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context,
- ur_usm_host_mem_flags_t *Flags, size_t Size,
+ ur_usm_host_mem_flags_t Flags, size_t Size,
uint32_t Alignment);
bool checkUSMAlignment(uint32_t &alignment, const ur_usm_desc_t *pUSMDesc);
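
From the public entry points nothing changes with these flags-by-value signatures; a minimal device allocation still reads as below (sketch only; hypothetical helper `allocDeviceBytes`, valid context/device assumed, return codes unchecked):

#include <cstddef>
#include <ur_api.h>

void *allocDeviceBytes(ur_context_handle_t Context, ur_device_handle_t Device,
                       size_t Size) {
  void *Ptr = nullptr;
  // With no descriptor and no pool, the adapter falls through to
  // USMDeviceAllocImpl(..., /* flags */ 0, ...).
  urUSMDeviceAlloc(Context, Device, /*pUSMDesc=*/nullptr, /*pool=*/nullptr,
                   Size, &Ptr);
  return Ptr;
}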
diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp
index a06163c2a9..67415a0de0 100644
--- a/source/adapters/level_zero/command_buffer.cpp
+++ b/source/adapters/level_zero/command_buffer.cpp
@@ -1048,8 +1048,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
// Create command-list to execute before `CommandListPtr` and will signal
// when `EventWaitList` dependencies are complete.
ur_command_list_ptr_t WaitCommandList{};
- UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList,
- false, false));
+ UR_CALL(Queue->Context->getAvailableCommandList(
+ Queue, WaitCommandList, false, NumEventsInWaitList, EventWaitList,
+ false));
ZE2UR_CALL(zeCommandListAppendBarrier,
(WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent,
@@ -1086,7 +1087,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
// Create a command-list to signal RetEvent on completion
ur_command_list_ptr_t SignalCommandList{};
UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList,
- false, false));
+ false, NumEventsInWaitList,
+ EventWaitList, false));
// Reset the wait-event for the UR command-buffer that is signaled when its
// submission dependencies have been satisfied.
ZE2UR_CALL(zeCommandListAppendEventReset,
diff --git a/source/adapters/level_zero/common.cpp b/source/adapters/level_zero/common.cpp
index 7ae7272355..926d5f4ba4 100644
--- a/source/adapters/level_zero/common.cpp
+++ b/source/adapters/level_zero/common.cpp
@@ -88,7 +88,11 @@ bool setEnvVar(const char *name, const char *value) {
ZeUSMImportExtension ZeUSMImport;
// This will count the calls to Level-Zero
+// TODO: remove the ifdef once
+// https://github.com/oneapi-src/unified-runtime/issues/1454 is implemented
+#ifndef UR_L0_CALL_COUNT_IN_TESTS
std::map<std::string, int> *ZeCallCount = nullptr;
+#endif
inline void zeParseError(ze_result_t ZeError, const char *&ErrorString) {
switch (ZeError) {
diff --git a/source/adapters/level_zero/common.hpp b/source/adapters/level_zero/common.hpp
index 93bf407567..e16d767b71 100644
--- a/source/adapters/level_zero/common.hpp
+++ b/source/adapters/level_zero/common.hpp
@@ -241,6 +241,19 @@ static const uint32_t UrL0QueueSyncNonBlocking = [] {
return L0QueueSyncLockingModeValue;
}();
+// Controls whether the L0 Adapter creates signal events for commands on
+// integrated gpu devices.
+static const uint32_t UrL0OutOfOrderIntegratedSignalEvent = [] {
+ const char *UrL0OutOfOrderIntegratedSignalEventEnv =
+ std::getenv("UR_L0_OOQ_INTEGRATED_SIGNAL_EVENT");
+ uint32_t UrL0OutOfOrderIntegratedSignalEventValue = 1;
+ if (UrL0OutOfOrderIntegratedSignalEventEnv) {
+ UrL0OutOfOrderIntegratedSignalEventValue =
+ std::atoi(UrL0OutOfOrderIntegratedSignalEventEnv);
+ }
+ return UrL0OutOfOrderIntegratedSignalEventValue;
+}();
+
// This class encapsulates actions taken along with a call to Level Zero API.
class ZeCall {
private:
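
The new setting follows the adapter's usual pattern: a constant initialized once from an environment variable, falling back to a default when unset. Generalized into a sketch (hypothetical `envFlag` helper, not adapter code):

#include <cstdint>
#include <cstdlib>

static uint32_t envFlag(const char *Name, uint32_t Default) {
  const char *Val = std::getenv(Name);
  return Val ? static_cast<uint32_t>(std::atoi(Val)) : Default;
}

// Equivalent in effect to the lambda above:
static const uint32_t OutOfOrderIntegratedSignalEvent =
    envFlag("UR_L0_OOQ_INTEGRATED_SIGNAL_EVENT", /*Default=*/1);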
diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp
index 4880c14c4b..85fe582ec3 100644
--- a/source/adapters/level_zero/context.cpp
+++ b/source/adapters/level_zero/context.cpp
@@ -645,7 +645,8 @@ static const size_t CmdListsCleanupThreshold = [] {
// Retrieve an available command list to be used in a PI call.
ur_result_t ur_context_handle_t_::getAvailableCommandList(
ur_queue_handle_t Queue, ur_command_list_ptr_t &CommandList,
- bool UseCopyEngine, bool AllowBatching,
+ bool UseCopyEngine, uint32_t NumEventsInWaitList,
+ const ur_event_handle_t *EventWaitList, bool AllowBatching,
ze_command_queue_handle_t *ForcedCmdQueue) {
// Immediate commandlists have been pre-allocated and are always available.
if (Queue->UsingImmCmdLists) {
@@ -677,9 +678,22 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList(
// for this queue.
if (Queue->hasOpenCommandList(UseCopyEngine)) {
if (AllowBatching) {
- CommandList = CommandBatch.OpenCommandList;
- UR_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList));
- return UR_RESULT_SUCCESS;
+ bool batchingAllowed = true;
+ if (ForcedCmdQueue &&
+ CommandBatch.OpenCommandList->second.ZeQueue != *ForcedCmdQueue) {
+ // Current open batch doesn't match the forced command queue
+ batchingAllowed = false;
+ }
+ if (!UrL0OutOfOrderIntegratedSignalEvent &&
+ Queue->Device->isIntegrated()) {
+ batchingAllowed = eventCanBeBatched(Queue, UseCopyEngine,
+ NumEventsInWaitList, EventWaitList);
+ }
+ if (batchingAllowed) {
+ CommandList = CommandBatch.OpenCommandList;
+ UR_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList));
+ return UR_RESULT_SUCCESS;
+ }
}
// If this command isn't allowed to be batched or doesn't match the forced
// command queue, then we need to go ahead and execute what is already in
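
Condensing the new checks: an open batch may be reused only when any forced command queue matches it, and, on integrated devices with signal events disabled, only when the wait-list itself is batchable. As a boolean sketch (inputs flattened to bools purely for illustration):

bool batchingAllowed(bool ForcedQueueMatchesOpenBatch,
                     bool IntegratedWithSignalEventsDisabled,
                     bool WaitListCanBeBatched) {
  if (!ForcedQueueMatchesOpenBatch)
    return false; // open batch targets a different ZE queue
  if (IntegratedWithSignalEventsDisabled)
    return WaitListCanBeBatched; // defers to eventCanBeBatched()
  return true;
}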
diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp
index ff173aa984..4184411de7 100644
--- a/source/adapters/level_zero/context.hpp
+++ b/source/adapters/level_zero/context.hpp
@@ -292,11 +292,11 @@ struct ur_context_handle_t_ : _ur_object {
// When using immediate commandlists, retrieves an immediate command list
// for executing on this device. Immediate commandlists are created only
// once for each SYCL Queue and after that they are reused.
- ur_result_t
- getAvailableCommandList(ur_queue_handle_t Queue,
- ur_command_list_ptr_t &CommandList,
- bool UseCopyEngine, bool AllowBatching = false,
- ze_command_queue_handle_t *ForcedCmdQueue = nullptr);
+ ur_result_t getAvailableCommandList(
+ ur_queue_handle_t Queue, ur_command_list_ptr_t &CommandList,
+ bool UseCopyEngine, uint32_t NumEventsInWaitList,
+ const ur_event_handle_t *EventWaitList, bool AllowBatching = false,
+ ze_command_queue_handle_t *ForcedCmdQueue = nullptr);
// Checks if Device is covered by this context.
// For that the Device or its root devices need to be in the context.
diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp
index 7f9e9b499b..7f832f30f2 100644
--- a/source/adapters/level_zero/device.cpp
+++ b/source/adapters/level_zero/device.cpp
@@ -1,6 +1,6 @@
//===--------- device.cpp - Level Zero Adapter ----------------------------===//
//
-// Copyright (C) 2023 Intel Corporation
+// Copyright (C) 2023-2024 Intel Corporation
//
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
// Exceptions. See LICENSE.TXT
@@ -188,8 +188,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
}
case UR_DEVICE_INFO_ATOMIC_64:
return ReturnValue(
-        static_cast<bool>(Device->ZeDeviceModuleProperties->flags &
-                          ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS));
+        static_cast<ur_bool_t>(Device->ZeDeviceModuleProperties->flags &
+                               ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS));
case UR_DEVICE_INFO_EXTENSIONS: {
// Convention adopted from OpenCL:
// "Returns a space separated list of extension names (the extension
@@ -258,9 +258,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE:
return ReturnValue(uint32_t{0});
case UR_DEVICE_INFO_COMPILER_AVAILABLE:
-    return ReturnValue(static_cast<bool>(true));
+    return ReturnValue(static_cast<ur_bool_t>(true));
   case UR_DEVICE_INFO_LINKER_AVAILABLE:
-    return ReturnValue(static_cast<bool>(true));
+    return ReturnValue(static_cast<ur_bool_t>(true));
case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: {
uint32_t MaxComputeUnits =
Device->ZeDeviceProperties->numEUsPerSubslice *
@@ -337,8 +337,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
case UR_DEVICE_INFO_DRIVER_VERSION:
case UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION:
return ReturnValue(Device->Platform->ZeDriverVersion.c_str());
- case UR_DEVICE_INFO_VERSION:
- return ReturnValue(Device->Platform->ZeDriverApiVersion.c_str());
+ case UR_DEVICE_INFO_VERSION: {
+ // from compute-runtime/shared/source/helpers/hw_ip_version.h
+ typedef struct {
+ uint32_t revision : 6;
+ uint32_t reserved : 8;
+ uint32_t release : 8;
+ uint32_t architecture : 10;
+ } version_components_t;
+ typedef struct {
+ union {
+ uint32_t value;
+ version_components_t components;
+ };
+ } ipVersion_t;
+ ipVersion_t IpVersion;
+ IpVersion.value = Device->ZeDeviceIpVersionExt->ipVersion;
+ std::stringstream S;
+ S << IpVersion.components.architecture << "."
+ << IpVersion.components.release << "." << IpVersion.components.revision;
+ return ReturnValue(S.str().c_str());
+ }
case UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: {
auto Res = Device->Platform->populateDeviceCacheIfNeeded();
if (Res != UR_RESULT_SUCCESS) {
@@ -410,7 +429,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION:
return ReturnValue("");
case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC:
-    return ReturnValue(static_cast<bool>(true));
+    return ReturnValue(static_cast<ur_bool_t>(true));
case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE:
return ReturnValue(
size_t{Device->ZeDeviceModuleProperties->printfBufferSize});
@@ -427,7 +446,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
return ReturnValue(ur_device_exec_capability_flag_t{
UR_DEVICE_EXEC_CAPABILITY_FLAG_NATIVE_KERNEL});
case UR_DEVICE_INFO_ENDIAN_LITTLE:
-    return ReturnValue(static_cast<bool>(true));
+    return ReturnValue(static_cast<ur_bool_t>(true));
case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT:
     return ReturnValue(static_cast<ur_bool_t>(Device->ZeDeviceProperties->flags &
ZE_DEVICE_PROPERTY_FLAG_ECC));
@@ -604,7 +623,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
}
case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: {
// TODO: Not supported yet. Needs to be updated after support is added.
-    return ReturnValue(static_cast<bool>(false));
+    return ReturnValue(static_cast<ur_bool_t>(false));
}
case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
// ze_device_compute_properties.subGroupSizes is in uint32_t whereas the
@@ -790,7 +809,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
return UR_RESULT_ERROR_INVALID_VALUE;
case UR_DEVICE_INFO_BFLOAT16: {
// bfloat16 math functions are not yet supported on Intel GPUs.
- return ReturnValue(bool{false});
+ return ReturnValue(ur_bool_t{false});
}
case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: {
// There are no explicit restrictions in L0 programming guide, so assume all
@@ -839,9 +858,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
return ReturnValue(capabilities);
}
case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT:
- return ReturnValue(uint32_t{false});
+ return ReturnValue(ur_bool_t{false});
case UR_DEVICE_INFO_IMAGE_SRGB:
- return ReturnValue(uint32_t{false});
+ return ReturnValue(ur_bool_t{false});
case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES:
case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: {
@@ -853,6 +872,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
0)); //__read_write attribute currently undefinde in opencl
}
case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT: {
+    return ReturnValue(static_cast<ur_bool_t>(true));
+  }
+  case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: {
     return ReturnValue(static_cast<ur_bool_t>(true));
}
@@ -893,9 +915,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
// can know if we are in (a) or (b) by checking if a tile is root device
// or not.
ur_device_handle_t URDev = Device->Platform->getDeviceFromNativeHandle(d);
- if (URDev->isSubDevice())
+ if (URDev->isSubDevice()) {
// We are in COMPOSITE mode, return an empty list.
- return ReturnValue(0);
+ if (pSize) {
+ *pSize = 0;
+ }
+ return UR_RESULT_SUCCESS;
+ }
Res.push_back(URDev);
}
@@ -1078,7 +1104,7 @@ bool ur_device_handle_t_::useDriverInOrderLists() {
static const bool UseDriverInOrderLists = [] {
const char *UrRet = std::getenv("UR_L0_USE_DRIVER_INORDER_LISTS");
if (!UrRet)
- return true;
+ return false;
return std::atoi(UrRet) != 0;
}();
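
The bit-field introduced under UR_DEVICE_INFO_VERSION packs, from least to most significant bit, revision (6), reserved (8), release (8) and architecture (10), assuming the usual LSB-first bit-field allocation. Decoded by hand in a sketch (hypothetical `printIpVersion`):

#include <cstdint>
#include <cstdio>

void printIpVersion(uint32_t Value) {
  uint32_t Revision = Value & 0x3Fu;              // bits 0..5
  uint32_t Release = (Value >> 14) & 0xFFu;       // bits 14..21
  uint32_t Architecture = (Value >> 22) & 0x3FFu; // bits 22..31
  std::printf("%u.%u.%u\n", Architecture, Release, Revision);
}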
diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp
index 484890670b..3cdfcbce7e 100644
--- a/source/adapters/level_zero/device.hpp
+++ b/source/adapters/level_zero/device.hpp
@@ -176,6 +176,10 @@ struct ur_device_handle_t_ : _ur_object {
(ZeDeviceProperties->deviceId & 0xff0) == 0xb60;
}
+ bool isIntegrated() {
+ return (ZeDeviceProperties->flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED);
+ }
+
// Does this device represent a single compute slice?
bool isCCS() const {
return QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute]
diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp
index cd61a8aa23..d91353349f 100644
--- a/source/adapters/level_zero/event.cpp
+++ b/source/adapters/level_zero/event.cpp
@@ -84,8 +84,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
- UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
- UseCopyEngine));
+ UR_CALL(Queue->Context->getAvailableCommandList(
+ Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList));
ze_event_handle_t ZeEvent = nullptr;
ur_event_handle_t InternalEvent;
@@ -256,7 +256,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
// Get an arbitrary command-list in the queue.
ur_command_list_ptr_t CmdList;
UR_CALL(Queue->Context->getAvailableCommandList(
- Queue, CmdList, false /*UseCopyEngine=*/, OkToBatch));
+ Queue, CmdList, false /*UseCopyEngine=*/, NumEventsInWaitList,
+ EventWaitList, OkToBatch));
// Insert the barrier into the command-list and execute.
UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, *Event, IsInternal));
@@ -311,7 +312,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
if (ZeQueue) {
ur_command_list_ptr_t CmdList;
UR_CALL(Queue->Context->getAvailableCommandList(
- Queue, CmdList, UseCopyEngine, OkToBatch, &ZeQueue));
+ Queue, CmdList, UseCopyEngine, NumEventsInWaitList,
+ EventWaitList, OkToBatch, &ZeQueue));
CmdLists.push_back(CmdList);
}
}
@@ -324,7 +326,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
// Get any available command list.
ur_command_list_ptr_t CmdList;
UR_CALL(Queue->Context->getAvailableCommandList(
- Queue, CmdList, false /*UseCopyEngine=*/, OkToBatch));
+ Queue, CmdList, false /*UseCopyEngine=*/, NumEventsInWaitList,
+ EventWaitList, OkToBatch));
CmdLists.push_back(CmdList);
}
@@ -365,8 +368,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
}
// Execute each command list so the barriers can be encountered.
- for (ur_command_list_ptr_t &CmdList : CmdLists)
+ for (ur_command_list_ptr_t &CmdList : CmdLists) {
+ bool IsCopy =
+        CmdList->second.isCopy(reinterpret_cast<ur_queue_handle_t>(Queue));
+ const auto &CommandBatch =
+ (IsCopy) ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch;
+ // Only batch if the matching CmdList is already open.
+ OkToBatch = CommandBatch.OpenCommandList == CmdList;
+
UR_CALL(Queue->executeCommandList(CmdList, false, OkToBatch));
+ }
UR_CALL(Queue->ActiveBarriers.clear());
  auto UREvent = reinterpret_cast<ur_event_handle_t>(*Event);
@@ -469,8 +480,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(
) {
  std::shared_lock<ur_shared_mutex> EventLock(Event->Mutex);
- if (Event->UrQueue &&
- (Event->UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) == 0) {
+ // The event must either have profiling enabled or be recording timestamps.
+ bool isTimestampedEvent = Event->isTimestamped();
+ if (!Event->isProfilingEnabled() && !isTimestampedEvent) {
return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE;
}
@@ -483,6 +495,61 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(
UrReturnHelper ReturnValue(PropValueSize, PropValue, PropValueSizeRet);
+ // For timestamped events we have the timestamps ready directly on the event
+ // handle, so we short-circuit the return.
+ if (isTimestampedEvent) {
+ uint64_t ContextStartTime = Event->RecordEventStartTimestamp;
+ switch (PropName) {
+ case UR_PROFILING_INFO_COMMAND_QUEUED:
+ case UR_PROFILING_INFO_COMMAND_SUBMIT:
+ return ReturnValue(ContextStartTime);
+ case UR_PROFILING_INFO_COMMAND_END:
+ case UR_PROFILING_INFO_COMMAND_START: {
+ // If RecordEventEndTimestamp on the event is non-zero it means it has
+ // collected the result of the queue already. In that case it has been
+ // adjusted and is ready for immediate return.
+ if (Event->RecordEventEndTimestamp)
+ return ReturnValue(Event->RecordEventEndTimestamp);
+
+ // Otherwise we need to collect it from the queue.
+ auto Entry = Event->UrQueue->EndTimeRecordings.find(Event);
+
+ // Unexpected state if there is no end-time record.
+ if (Entry == Event->UrQueue->EndTimeRecordings.end())
+ return UR_RESULT_ERROR_UNKNOWN;
+ auto &EndTimeRecording = Entry->second;
+
+ // End time needs to be adjusted for resolution and valid bits.
+ uint64_t ContextEndTime =
+ (EndTimeRecording.RecordEventEndTimestamp & TimestampMaxValue) *
+ ZeTimerResolution;
+
+      // If the result is 0, the device has not yet written the end timestamp
+      // back, so we return the zero value as-is.
+ if (ContextEndTime == 0)
+ return ReturnValue(ContextEndTime);
+
+      // Handle a possible wrap-around (the underlying HW counter is narrower
+      // than 64 bits). Note that the result will be wrong if the counter
+      // wrapped more than once; the longer-term plan is to enlarge the
+      // capacity of the HW timestamps.
+ if (ContextEndTime < ContextStartTime)
+ ContextEndTime += TimestampMaxValue * ZeTimerResolution;
+
+ // Now that we have the result, there is no need to keep it in the queue
+ // anymore, so we cache it on the event and evict the record from the
+ // queue.
+ Event->RecordEventEndTimestamp = ContextEndTime;
+ Event->UrQueue->EndTimeRecordings.erase(Entry);
+
+ return ReturnValue(ContextEndTime);
+ }
+ default:
+ logger::error("urEventGetProfilingInfo: not supported ParamName");
+ return UR_RESULT_ERROR_INVALID_VALUE;
+ }
+ }
+
ze_kernel_timestamp_result_t tsResult;
// A Command-buffer consists of three command-lists for which only a single
@@ -591,6 +658,63 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(
return UR_RESULT_SUCCESS;
}
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
+ ur_queue_handle_t Queue, ///< [in] handle of the queue object
+ bool Blocking, ///< [in] blocking or non-blocking enqueue
+ uint32_t NumEventsInWaitList, ///< [in] size of the event wait list
+ const ur_event_handle_t
+ *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)]
+ ///< pointer to a list of events that must be complete
+ ///< before this command can be executed. If nullptr,
+ ///< the numEventsInWaitList must be 0, indicating
+ ///< that this command does not wait on any event to
+ ///< complete.
+ ur_event_handle_t
+ *OutEvent ///< [in,out] return an event object that identifies
+ ///< this particular command instance.
+) {
+ // Lock automatically releases when this goes out of scope.
+  std::scoped_lock<ur_shared_mutex> lock(Queue->Mutex);
+
+ ur_device_handle_t Device = Queue->Device;
+
+ bool UseCopyEngine = false;
+ _ur_ze_event_list_t TmpWaitList;
+ UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
+ NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
+
+ // Get a new command list to be used on this call
+ ur_command_list_ptr_t CommandList{};
+ UR_CALL(Queue->Context->getAvailableCommandList(
+ Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList,
+ /* AllowBatching */ false));
+
+ UR_CALL(createEventAndAssociateQueue(
+ Queue, OutEvent, UR_COMMAND_TIMESTAMP_RECORDING_EXP, CommandList,
+ /* IsInternal */ false, /* HostVisible */ true));
+ ze_event_handle_t ZeEvent = (*OutEvent)->ZeEvent;
+ (*OutEvent)->WaitList = TmpWaitList;
+
+ uint64_t DeviceStartTimestamp = 0;
+ UR_CALL(urDeviceGetGlobalTimestamps(Device, &DeviceStartTimestamp, nullptr));
+ (*OutEvent)->RecordEventStartTimestamp = DeviceStartTimestamp;
+
+ // Create a new entry in the queue's recordings.
+ Queue->EndTimeRecordings[*OutEvent] =
+ ur_queue_handle_t_::end_time_recording{};
+
+ ZE2UR_CALL(zeCommandListAppendWriteGlobalTimestamp,
+ (CommandList->first,
+ &Queue->EndTimeRecordings[*OutEvent].RecordEventEndTimestamp,
+ ZeEvent, (*OutEvent)->WaitList.Length,
+ (*OutEvent)->WaitList.ZeEventList));
+
+ UR_CALL(
+ Queue->executeCommandList(CommandList, Blocking, /* OkToBatch */ false));
+
+ return UR_RESULT_SUCCESS;
+}
+
ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent(
ze_event_handle_t &ZeHostVisibleEvent) {
@@ -598,6 +722,7 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent(
this->Mutex);
if (!HostVisibleEvent) {
+ this->IsCreatingHostProxyEvent = true;
if (UrQueue->ZeEventsScope != OnDemandHostVisibleProxy)
die("getOrCreateHostVisibleEvent: missing host-visible event");
@@ -612,7 +737,7 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent(
ur_command_list_ptr_t CommandList{};
UR_CALL(UrQueue->Context->getAvailableCommandList(
- UrQueue, CommandList, false /* UseCopyEngine */, OkToBatch))
+ UrQueue, CommandList, false /* UseCopyEngine */, 0, nullptr, OkToBatch))
// Create a "proxy" host-visible event.
UR_CALL(createEventAndAssociateQueue(
@@ -620,12 +745,18 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent(
/* IsInternal */ false, /* IsMultiDevice */ false,
/* HostVisible */ true));
- ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
- (CommandList->first, 1, &ZeEvent));
+ if (this->IsInnerBatchedEvent) {
+ ZE2UR_CALL(zeCommandListAppendBarrier,
+ (CommandList->first, ZeEvent, 0, nullptr));
+ } else {
+ ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
+ (CommandList->first, 1, &ZeEvent));
+ }
ZE2UR_CALL(zeCommandListAppendSignalEvent,
(CommandList->first, HostVisibleEvent->ZeEvent));
UR_CALL(UrQueue->executeCommandList(CommandList, false, OkToBatch))
+ this->IsCreatingHostProxyEvent = false;
}
ZeHostVisibleEvent = HostVisibleEvent->ZeEvent;
@@ -682,7 +813,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventWait(
ze_event_handle_t ZeEvent = HostVisibleEvent->ZeEvent;
logger::debug("ZeEvent = {}", ur_cast(ZeEvent));
- ZE2UR_CALL(zeHostSynchronize, (ZeEvent));
+ // If this event was an inner batched event, then sync with
+ // the Queue instead of waiting on the event.
+ if (HostVisibleEvent->IsInnerBatchedEvent && Event->ZeBatchedQueue) {
+ ZE2UR_CALL(zeHostSynchronize, (Event->ZeBatchedQueue));
+ } else {
+ ZE2UR_CALL(zeHostSynchronize, (ZeEvent));
+ }
Event->Completed = true;
}
}
@@ -877,10 +1014,28 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) {
}
// Save pointer to the queue before deleting/resetting event.
+ auto Queue = Event->UrQueue;
+
+ // If the event was a timestamp recording, we try to evict its entry in the
+ // queue.
+ if (Event->isTimestamped()) {
+ auto Entry = Queue->EndTimeRecordings.find(Event);
+ if (Entry != Queue->EndTimeRecordings.end()) {
+ auto &EndTimeRecording = Entry->second;
+ if (EndTimeRecording.RecordEventEndTimestamp == 0) {
+ // If the end time recording has not finished, we tell the queue that
+ // the event is no longer alive to avoid invalid write-backs.
+ EndTimeRecording.EventHasDied = true;
+ } else {
+ // Otherwise we evict the entry.
+ Event->UrQueue->EndTimeRecordings.erase(Entry);
+ }
+ }
+ }
+
// When we add an event to the cache we need to check whether profiling is
// enabled or not, so we access properties of the queue and that's why queue
// must released later.
- auto Queue = Event->UrQueue;
if (DisableEventsCaching || !Event->OwnNativeHandle) {
delete Event;
} else {
@@ -938,7 +1093,12 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked,
  std::list<ur_event_handle_t> EventsToBeReleased;
ur_queue_handle_t AssociatedQueue = nullptr;
{
-    std::scoped_lock<ur_shared_mutex> EventLock(Event->Mutex);
+    // Try to acquire the event lock without blocking. If it is unavailable
+    // because the event is mid host-proxy-event creation, continue without
+    // it; otherwise block until the lock is acquired.
+    std::unique_lock<ur_shared_mutex> EventLock(Event->Mutex,
+                                                std::try_to_lock);
+ if (!EventLock.owns_lock() && !Event->IsCreatingHostProxyEvent) {
+ EventLock.lock();
+ }
if (SetEventCompleted)
Event->Completed = true;
// Exit early of event was already cleanedup.
@@ -1268,16 +1428,26 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
}
auto Queue = EventList[I]->UrQueue;
- if (Queue) {
- // The caller of createAndRetainUrZeEventList must already hold
- // a lock of the CurQueue. Additionally lock the Queue if it
- // is different from CurQueue.
- // TODO: rework this to avoid deadlock when another thread is
- // locking the same queues but in a different order.
- auto Lock = ((Queue == CurQueue)
-                     ? std::unique_lock<ur_shared_mutex>()
-                     : std::unique_lock<ur_shared_mutex>(Queue->Mutex));
+    auto CurQueueDevice = CurQueue->Device;
+    std::optional<std::unique_lock<ur_shared_mutex>> QueueLock =
+ std::nullopt;
+ // The caller of createAndRetainUrZeEventList must already hold
+ // a lock of the CurQueue. However, if the CurQueue is different
+ // then the Event's Queue, we need to drop that lock and
+ // acquire the Event's Queue lock. This is done to avoid a lock
+ // ordering issue.
+ // For the rest of this scope, CurQueue cannot be accessed.
+ // TODO: This solution is very error-prone. This requires a refactor
+ // to either have fine-granularity locks inside of the queues or
+ // to move any operations on queues other than CurQueue out
+ // of this scope.
+ if (Queue && Queue != CurQueue) {
+ CurQueue->Mutex.unlock();
+      QueueLock = std::unique_lock<ur_shared_mutex>(Queue->Mutex);
+ }
+
+ if (Queue) {
// If the event that is going to be waited is in an open batch
// different from where this next command is going to be added,
// then we have to force execute of that open command-list
@@ -1320,17 +1490,17 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
}
ur_command_list_ptr_t CommandList;
- if (Queue && Queue->Device != CurQueue->Device) {
+ if (Queue && Queue->Device != CurQueueDevice) {
// Get a command list prior to acquiring an event lock.
// This prevents a potential deadlock with recursive
// event locks.
- UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
- false, true));
+ UR_CALL(Queue->Context->getAvailableCommandList(
+ Queue, CommandList, false, 0, nullptr, true));
}
    std::shared_lock<ur_shared_mutex> Lock(EventList[I]->Mutex);
- if (Queue && Queue->Device != CurQueue->Device &&
+ if (Queue && Queue->Device != CurQueueDevice &&
!EventList[I]->IsMultiDevice) {
ze_event_handle_t MultiDeviceZeEvent = nullptr;
ur_event_handle_t MultiDeviceEvent;
@@ -1365,6 +1535,10 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
this->UrEventList[TmpListLength]->RefCount.increment();
}
+ if (QueueLock.has_value()) {
+ QueueLock.reset();
+ CurQueue->Mutex.lock();
+ }
TmpListLength += 1;
}
}
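
The lock juggling above in isolation: the current queue's mutex is dropped before the other queue's is taken, and restored only after the optional lock is released, so the two are never held together. A sketch with plain std::mutex (hypothetical `withOtherQueueLocked`):

#include <mutex>
#include <optional>

void withOtherQueueLocked(std::mutex &CurQueueMutex, std::mutex &OtherMutex,
                          bool NeedOther) {
  // Precondition, as in the adapter: CurQueueMutex is already held.
  std::optional<std::unique_lock<std::mutex>> OtherLock;
  if (NeedOther) {
    CurQueueMutex.unlock();        // drop first to avoid lock-order inversion
    OtherLock.emplace(OtherMutex); // then take the other queue's mutex
  }
  // ... work that may touch the other queue; CurQueue must not be used ...
  if (OtherLock.has_value()) {
    OtherLock.reset();    // release the other queue first
    CurQueueMutex.lock(); // restore the caller's expectation
  }
}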
@@ -1443,3 +1617,12 @@ bool ur_event_handle_t_::isProfilingEnabled() const {
return !UrQueue || // tentatively assume user events are profiling enabled
(UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0;
}
+
+// Tells if this event was created as a timestamp event, allowing profiling
+// info even if profiling is not enabled.
+bool ur_event_handle_t_::isTimestamped() const {
+ // If we are recording, the start time of the event will be non-zero. The
+ // end time might still be missing, depending on whether the corresponding
+ // enqueue is still running.
+ return RecordEventStartTimestamp != 0;
+}
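
The end-timestamp arithmetic used in urEventGetProfilingInfo, as a pure function: mask to the counter's valid bits, scale by the timer resolution, and compensate for at most one wrap of the narrower-than-64-bit HW counter. A sketch only; the names mirror the adapter's constants and `adjustEndTime` is hypothetical:

#include <cstdint>

uint64_t adjustEndTime(uint64_t RawEnd, uint64_t ContextStartTime,
                       uint64_t TimestampMaxValue, uint64_t TimerResolution) {
  uint64_t ContextEndTime = (RawEnd & TimestampMaxValue) * TimerResolution;
  if (ContextEndTime == 0)
    return 0; // device has not written the end timestamp back yet
  if (ContextEndTime < ContextStartTime)
    ContextEndTime += TimestampMaxValue * TimerResolution; // single wrap
  return ContextEndTime;
}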
diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp
index 9fa4663ab4..2d1f536e4e 100644
--- a/source/adapters/level_zero/event.hpp
+++ b/source/adapters/level_zero/event.hpp
@@ -198,6 +198,23 @@ struct ur_event_handle_t_ : _ur_object {
// performance
bool IsMultiDevice = {false};
+ // Indicates inner batched event which was not used as a signal event.
+ bool IsInnerBatchedEvent = {false};
+
+ // Queue where the batched command was executed.
+ ze_command_queue_handle_t ZeBatchedQueue = {nullptr};
+
+ // Indicates within creation of proxy event.
+ bool IsCreatingHostProxyEvent = {false};
+
+ // Indicates the recorded start and end timestamps for the event. These are
+ // only set for events returned by timestamp recording enqueue functions.
+ // A non-zero value for RecordEventStartTimestamp indicates the event was the
+ // result of a timestamp recording. If RecordEventEndTimestamp is non-zero, it
+ // means the event has fetched the end-timestamp from the queue.
+ uint64_t RecordEventStartTimestamp = 0;
+ uint64_t RecordEventEndTimestamp = 0;
+
// Besides each PI object keeping a total reference count in
// _ur_object::RefCount we keep special track of the event *external*
// references. This way we are able to tell when the event is not referenced
@@ -222,6 +239,10 @@ struct ur_event_handle_t_ : _ur_object {
// Tells if this event is with profiling capabilities.
bool isProfilingEnabled() const;
+ // Tells if this event was created as a timestamp event, allowing profiling
+ // info even if profiling is not enabled.
+ bool isTimestamped() const;
+
// Get the host-visible event or create one and enqueue its signal.
ur_result_t getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent);
diff --git a/source/adapters/level_zero/image.cpp b/source/adapters/level_zero/image.cpp
index 3b767f9127..b953e69f06 100644
--- a/source/adapters/level_zero/image.cpp
+++ b/source/adapters/level_zero/image.cpp
@@ -656,13 +656,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageFreeExp(
UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_image_mem_handle_t hImageMem, const ur_image_format_t *pImageFormat,
- const ur_image_desc_t *pImageDesc, ur_mem_handle_t *phMem,
- ur_exp_image_handle_t *phImage) {
+ const ur_image_desc_t *pImageDesc, ur_exp_image_handle_t *phImage) {
  std::shared_lock<ur_shared_mutex> Lock(hContext->Mutex);
UR_ASSERT(hContext && hDevice && hImageMem,
UR_RESULT_ERROR_INVALID_NULL_HANDLE);
- UR_ASSERT(pImageFormat && pImageDesc && phMem && phImage,
+ UR_ASSERT(pImageFormat && pImageDesc && phImage,
UR_RESULT_ERROR_INVALID_NULL_POINTER);
  ZeStruct<ze_image_desc_t> ZeImageDesc;
@@ -687,11 +686,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
UrImage->ZeImage, &ZeImage));
ZE2UR_CALL(zeContextMakeImageResident,
(hContext->ZeContext, hDevice->ZeDevice, ZeImage));
- UR_CALL(createUrMemFromZeImage(hContext, ZeImage, /*OwnZeMemHandle*/ true,
- ZeImageDesc, phMem));
} else {
ZeImage = UrImage->ZeImage;
- *phMem = nullptr;
}
} else if (MemAllocProperties.type == ZE_MEMORY_TYPE_DEVICE) {
ze_image_pitched_exp_desc_t PitchedDesc;
@@ -710,8 +706,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
&ZeImageDesc, &ZeImage));
ZE2UR_CALL(zeContextMakeImageResident,
(hContext->ZeContext, hDevice->ZeDevice, ZeImage));
- UR_CALL(createUrMemFromZeImage(hContext, ZeImage, /*OwnZeMemHandle*/ true,
- ZeImageDesc, phMem));
} else {
return UR_RESULT_ERROR_INVALID_VALUE;
}
@@ -741,10 +735,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_image_mem_handle_t hImageMem, const ur_image_format_t *pImageFormat,
const ur_image_desc_t *pImageDesc, ur_sampler_handle_t hSampler,
- ur_mem_handle_t *phMem, ur_exp_image_handle_t *phImage) {
+ ur_exp_image_handle_t *phImage) {
UR_CALL(urBindlessImagesUnsampledImageCreateExp(
- hContext, hDevice, hImageMem, pImageFormat, pImageDesc, phMem, phImage));
+ hContext, hDevice, hImageMem, pImageFormat, pImageDesc, phImage));
struct combined_sampled_image_handle {
uint64_t RawImageHandle;
@@ -790,8 +784,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
- UR_CALL(hQueue->Context->getAvailableCommandList(hQueue, CommandList,
- UseCopyEngine, OkToBatch));
+ UR_CALL(hQueue->Context->getAvailableCommandList(
+ hQueue, CommandList, UseCopyEngine, numEventsInWaitList, phEventWaitList,
+ OkToBatch));
ze_event_handle_t ZeEvent = nullptr;
ur_event_handle_t InternalEvent;
@@ -800,7 +795,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
UR_CALL(createEventAndAssociateQueue(hQueue, Event, UR_COMMAND_MEM_IMAGE_COPY,
CommandList, IsInternal,
/*IsMultiDevice*/ false));
- ZeEvent = (*Event)->ZeEvent;
+ UR_CALL(setSignalEvent(hQueue, UseCopyEngine, &ZeEvent, Event,
+ numEventsInWaitList, phEventWaitList,
+ CommandList->second.ZeQueue));
(*Event)->WaitList = TmpWaitList;
const auto &ZeCommandList = CommandList->first;
diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp
index d96351dd5d..40d6260ac9 100644
--- a/source/adapters/level_zero/kernel.cpp
+++ b/source/adapters/level_zero/kernel.cpp
@@ -201,7 +201,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
UR_CALL(Queue->Context->getAvailableCommandList(
- Queue, CommandList, UseCopyEngine, true /* AllowBatching */));
+ Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList,
+ true /* AllowBatching */));
ze_event_handle_t ZeEvent = nullptr;
ur_event_handle_t InternalEvent{};
@@ -210,7 +211,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH,
CommandList, IsInternal, false));
- ZeEvent = (*Event)->ZeEvent;
+ UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
+ NumEventsInWaitList, EventWaitList,
+ CommandList->second.ZeQueue));
(*Event)->WaitList = TmpWaitList;
// Save the kernel in the event, so that when the event is signalled
@@ -268,13 +271,264 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
- ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
- const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
- const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
- const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
- return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
- pGlobalWorkSize, pLocalWorkSize,
- numEventsInWaitList, phEventWaitList, phEvent);
+ ur_queue_handle_t Queue, ///< [in] handle of the queue object
+ ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object
+ uint32_t WorkDim, ///< [in] number of dimensions, from 1 to 3, to specify
+ ///< the global and work-group work-items
+ const size_t
+ *GlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned
+ ///< values that specify the offset used to
+ ///< calculate the global ID of a work-item
+ const size_t *GlobalWorkSize, ///< [in] pointer to an array of workDim
+ ///< unsigned values that specify the number
+ ///< of global work-items in workDim that
+ ///< will execute the kernel function
+ const size_t
+ *LocalWorkSize, ///< [in][optional] pointer to an array of workDim
+ ///< unsigned values that specify the number of local
+ ///< work-items forming a work-group that will execute
+ ///< the kernel function. If nullptr, the runtime
+ ///< implementation will choose the work-group size.
+ uint32_t NumEventsInWaitList, ///< [in] size of the event wait list
+ const ur_event_handle_t
+ *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)]
+ ///< pointer to a list of events that must be complete
+ ///< before the kernel execution. If nullptr, the
+ ///< numEventsInWaitList must be 0, indicating that no
+ ///< wait event.
+ ur_event_handle_t
+ *OutEvent ///< [in,out][optional] return an event object that identifies
+ ///< this particular kernel execution instance.
+) {
+ auto ZeDevice = Queue->Device->ZeDevice;
+
+ ze_kernel_handle_t ZeKernel{};
+ if (Kernel->ZeKernelMap.empty()) {
+ ZeKernel = Kernel->ZeKernel;
+ } else {
+ auto It = Kernel->ZeKernelMap.find(ZeDevice);
+ if (It == Kernel->ZeKernelMap.end()) {
+ /* kernel and queue don't match */
+ return UR_RESULT_ERROR_INVALID_QUEUE;
+ }
+ ZeKernel = It->second;
+ }
+ // Lock automatically releases when this goes out of scope.
+  std::scoped_lock<ur_shared_mutex, ur_shared_mutex, ur_shared_mutex> Lock(
+ Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex);
+ if (GlobalWorkOffset != NULL) {
+ if (!Queue->Device->Platform->ZeDriverGlobalOffsetExtensionFound) {
+ logger::error("No global offset extension found on this driver");
+ return UR_RESULT_ERROR_INVALID_VALUE;
+ }
+
+ ZE2UR_CALL(zeKernelSetGlobalOffsetExp,
+ (ZeKernel, GlobalWorkOffset[0], GlobalWorkOffset[1],
+ GlobalWorkOffset[2]));
+ }
+
+ // If there are any pending arguments set them now.
+ for (auto &Arg : Kernel->PendingArguments) {
+ // The ArgValue may be a NULL pointer in which case a NULL value is used for
+ // the kernel argument declared as a pointer to global or constant memory.
+ char **ZeHandlePtr = nullptr;
+ if (Arg.Value) {
+ UR_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode,
+ Queue->Device));
+ }
+ ZE2UR_CALL(zeKernelSetArgumentValue,
+ (ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr));
+ }
+ Kernel->PendingArguments.clear();
+
+ ze_group_count_t ZeThreadGroupDimensions{1, 1, 1};
+ uint32_t WG[3]{};
+
+ // New variable needed because GlobalWorkSize parameter might not be of size 3
+ size_t GlobalWorkSize3D[3]{1, 1, 1};
+ std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D);
+
+ if (LocalWorkSize) {
+ // L0
+    UR_ASSERT(LocalWorkSize[0] < (std::numeric_limits<uint32_t>::max)(),
+              UR_RESULT_ERROR_INVALID_VALUE);
+    UR_ASSERT(LocalWorkSize[1] < (std::numeric_limits<uint32_t>::max)(),