diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000..176a458f94
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+* text=auto
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index d53176133b..56f72c49e9 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -2,16 +2,21 @@
# Level Zero adapter
source/adapters/level_zero @oneapi-src/unified-runtime-level-zero-write
+test/adapters/level_zero @oneapi-src/unified-runtime-level-zero-write
# CUDA and HIP adapters
source/adapters/cuda @oneapi-src/unified-runtime-cuda-write
+test/adapters/cuda @oneapi-src/unified-runtime-cuda-write
source/adapters/hip @oneapi-src/unified-runtime-hip-write
+test/adapters/hip @oneapi-src/unified-runtime-hip-write
# OpenCL adapter
source/adapters/opencl @oneapi-src/unified-runtime-opencl-write
+test/adapters/opencl @oneapi-src/unified-runtime-opencl-write
# Native CPU adapter
source/adapters/native_cpu @oneapi-src/unified-runtime-native-cpu-write
+test/adapters/native_cpu @oneapi-src/unified-runtime-native-cpu-write
# Command-buffer experimental feature
source/adapters/**/command_buffer.* @oneapi-src/unified-runtime-command-buffer-write
@@ -20,6 +25,7 @@ scripts/core/exp-command-buffer.yml @oneapi-src/unified-runtime-command-buff
test/conformance/exp_command_buffer** @oneapi-src/unified-runtime-command-buffer-write
# Bindless Images experimental feature
-scripts/core/EXP-BINDLESS-IMAGES.rst @oneapi-src/unified-runtime-bindless-images-write
-scripts/core/exp-bindless-images.yml @oneapi-src/unified-runtime-bindless-images-write
-source/adapters/**/image.* @oneapi-src/unified-runtime-bindless-images-write
+source/adapters/**/image.* @oneapi-src/unified-runtime-bindless-images-write
+scripts/core/EXP-BINDLESS-IMAGES.rst @oneapi-src/unified-runtime-bindless-images-write
+scripts/core/exp-bindless-images.yml @oneapi-src/unified-runtime-bindless-images-write
+test/conformance/exp_bindless_images** @oneapi-src/unified-runtime-bindless-images-write
diff --git a/.github/docker/ubuntu-22.04.Dockerfile b/.github/docker/ubuntu-22.04.Dockerfile
index e8e88caa72..09eaab03a8 100644
--- a/.github/docker/ubuntu-22.04.Dockerfile
+++ b/.github/docker/ubuntu-22.04.Dockerfile
@@ -67,7 +67,8 @@ RUN /opt/install_dpcpp.sh
COPY install_libbacktrace.sh /opt/install_libbacktrace.sh
RUN /opt/install_libbacktrace.sh
-# Add a new (non-root) 'user'
-ENV USER user
+# Add a new (non-root) 'test_user' and switch to it
+ENV USER test_user
ENV USERPASS pass
RUN useradd -m "${USER}" -g sudo -p "$(mkpasswd ${USERPASS})"
+USER test_user
diff --git a/.github/scripts/compute_benchmarks.py b/.github/scripts/compute_benchmarks.py
new file mode 100755
index 0000000000..01f1b90c40
--- /dev/null
+++ b/.github/scripts/compute_benchmarks.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python3
+
+# Copyright (C) 2024 Intel Corporation
+# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+# See LICENSE.TXT
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import os
+import subprocess # nosec B404
+import csv
+import argparse
+import io
+import json
+from pathlib import Path
+
+# Function to run the benchmark with the given parameters and environment variables
+def run_benchmark(directory, ioq, env_vars):
+ env = os.environ.copy()
+ env.update(env_vars)
+ command = [
+ f"{directory}/api_overhead_benchmark_sycl",
+ "--test=SubmitKernel",
+ f"--Ioq={ioq}",
+ "--DiscardEvents=0",
+ "--MeasureCompletion=0",
+ "--iterations=10000",
+ "--Profiling=0",
+ "--NumKernels=10",
+ "--KernelExecTime=1",
+ "--csv",
+ "--noHeaders"
+ ]
+ result = subprocess.run(command, capture_output=True, text=True, env=env) # nosec B603
+ return command, result.stdout
+
+# Function to parse the CSV output and extract the mean execution time
+def parse_output(output):
+ # Use StringIO to turn the string output into a file-like object for the csv reader
+ csv_file = io.StringIO(output)
+ reader = csv.reader(csv_file)
+
+ # Skip the header row
+ next(reader, None)
+ data_row = next(reader, None)
+ if data_row is None:
+ raise ValueError("Benchmark output does not contain data.")
+ try:
+ name = data_row[0] # Name of the benchmark is the first value
+ mean = float(data_row[1]) # Mean is the second value
+ return (name, mean)
+ except ValueError:
+ raise ValueError(f"Could not convert mean execution time to float: '{data_row[1]}'")
+ except IndexError:
+ raise ValueError("Data row does not contain enough values.")
+
+# Function to generate the mermaid bar chart script
+def generate_mermaid_script(labels, chart_data):
+    mermaid_script = f"""
+---
+config:
+ gantt:
+ rightPadding: 10
+ leftPadding: 120
+ sectionFontSize: 10
+ numberSectionStyles: 2
+---
+gantt
+ title api_overhead_benchmark_sycl, mean execution time per 10 kernels (μs)
+ todayMarker off
+ dateFormat X
+ axisFormat %s
+"""
+ for label in labels:
+ nbars = 0
+        print_label = label.replace(" ", "<br>")
+ mermaid_script += f"""
+ section {print_label}
+"""
+ for (name, data) in chart_data:
+ if data is not None:
+ if label in data:
+ nbars += 1
+ mean = data[label]
+ crit = "crit," if name == "This PR" else ""
+ mermaid_script += f"""
+ {name} ({mean} us) : {crit} 0, {int(mean)}
+"""
+ padding = 4 - nbars
+ if padding > 0:
+ for _ in range(padding):
+ mermaid_script += f"""
+ - : 0, 0
+"""
+
+ return mermaid_script
+
+# Function to generate the markdown collapsible sections for each variant
+def generate_markdown_details(variant_details):
+ markdown_sections = []
+ for label, command, env_vars, output in variant_details:
+ env_vars_str = '\n'.join(f"{key}={value}" for key, value in env_vars.items())
+ markdown_sections.append(f"""
+<details>
+<summary>{label}</summary>
+
+#### Environment Variables:
+{env_vars_str}
+
+#### Command:
+{' '.join(command)}
+
+#### Output:
+{output}
+
+</details>
+""")
+ return "\n".join(markdown_sections)
+
+# Function to generate the full markdown
+def generate_markdown_with_mermaid_chart(mermaid_script, variant_details):
+ return f"""
+# Benchmark Results
+```mermaid
+{mermaid_script}
+```
+## Details
+{generate_markdown_details(variant_details)}
+"""
+
+def save_benchmark_results(save_name, benchmark_data):
+ benchmarks_dir = Path.home() / 'benchmarks'
+ benchmarks_dir.mkdir(exist_ok=True)
+ file_path = benchmarks_dir / f"{save_name}.json"
+ with file_path.open('w') as file:
+ json.dump(benchmark_data, file, indent=4)
+ print(f"Benchmark results saved to {file_path}")
+
+def load_benchmark_results(compare_name):
+ benchmarks_dir = Path.home() / 'benchmarks'
+ file_path = benchmarks_dir / f"{compare_name}.json"
+ if file_path.exists():
+ with file_path.open('r') as file:
+ return json.load(file)
+ else:
+ return None
+
+def main(directory, additional_env_vars, save_name, compare_names):
+ variants = [
+ (1, {'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '0'}, "Imm-CmdLists-OFF"),
+ (0, {'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '0'}, "Imm-CmdLists-OFF"),
+ (1, {'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '1'}, ""),
+ (0, {'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '1'}, ""),
+ ]
+
+ # Run benchmarks and collect means, labels, and variant details
+ means = []
+ labels = []
+ variant_details = []
+ for ioq, env_vars, extra_label in variants:
+ merged_env_vars = {**env_vars, **additional_env_vars}
+ command, output = run_benchmark(directory, ioq, merged_env_vars)
+ (label, mean) = parse_output(output)
+ label += f" {extra_label}"
+ means.append(mean)
+ labels.append(label)
+ variant_details.append((label, command, merged_env_vars, output))
+
+ benchmark_data = {label: mean for label, mean in zip(labels, means)}
+
+ chart_data = [("This PR", benchmark_data)]
+ for name in compare_names:
+ chart_data.append((name, load_benchmark_results(name)))
+
+ if save_name:
+ save_benchmark_results(save_name, benchmark_data)
+
+ mermaid_script = generate_mermaid_script(labels, chart_data)
+
+ markdown_content = generate_markdown_with_mermaid_chart(mermaid_script, variant_details)
+
+ with open('benchmark_results.md', 'w') as file:
+ file.write(markdown_content)
+
+ print("Markdown with benchmark results has been written to benchmark_results.md")
+
+def validate_and_parse_env_args(env_args):
+ env_vars = {}
+ for arg in env_args:
+ if '=' not in arg:
+ raise ValueError(f"Environment variable argument '{arg}' is not in the form Variable=Value.")
+ key, value = arg.split('=', 1)
+ env_vars[key] = value
+ return env_vars
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description='Run benchmarks and generate a Mermaid bar chart script.')
+ parser.add_argument('benchmark_directory', type=str, help='The directory where the benchmarks are located.')
+ parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[])
+ parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.')
+ parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=["baseline"])
+
+ args = parser.parse_args()
+
+ additional_env_vars = validate_and_parse_env_args(args.env)
+
+ main(args.benchmark_directory, additional_env_vars, args.save, args.compare)
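For reviewers, a minimal sketch of how the script's pieces fit together, assuming the functions above are imported from compute_benchmarks.py; the CSV line, label, and values below are hypothetical stand-ins for real `api_overhead_benchmark_sycl --csv` output:

```python
# Hypothetical walkthrough of compute_benchmarks.py's data flow.
from compute_benchmarks import (parse_output, save_benchmark_results,
                                load_benchmark_results, generate_mermaid_script)

# parse_output() skips one leading row, then reads (name, mean) from the next.
sample = "skipped header row\nSubmitKernel In Order,12.34\n"
name, mean = parse_output(sample)  # ("SubmitKernel In Order", 12.34)

# Results keyed by label can be saved under ~/benchmarks/<name>.json ...
label = f"{name} Imm-CmdLists-OFF"
save_benchmark_results("baseline", {label: mean})

# ... and later reloaded for comparison. "This PR" plus each saved run becomes
# a bar in the mermaid gantt chart; absent runs (None) are simply skipped.
chart_data = [("This PR", {label: mean}),
              ("baseline", load_benchmark_results("baseline"))]
print(generate_mermaid_script([label], chart_data))
```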
diff --git a/.github/scripts/get_system_info.sh b/.github/scripts/get_system_info.sh
index 6ca38a33ef..8301c99099 100755
--- a/.github/scripts/get_system_info.sh
+++ b/.github/scripts/get_system_info.sh
@@ -53,7 +53,7 @@ function system_info {
echo "**********/proc/meminfo**********"
cat /proc/meminfo
echo "**********build/bin/urinfo**********"
- $(dirname "$(readlink -f "$0")")/../../build/bin/urinfo || true
+ $(dirname "$(readlink -f "$0")")/../../build/bin/urinfo --no-linear-ids --verbose || true
echo "******OpenCL*******"
# The driver version of OpenCL Graphics is the compute-runtime version
clinfo || echo "OpenCL not installed"
diff --git a/.github/workflows/benchmarks_core.yml b/.github/workflows/benchmarks_core.yml
new file mode 100644
index 0000000000..34ffb7134e
--- /dev/null
+++ b/.github/workflows/benchmarks_core.yml
@@ -0,0 +1,199 @@
+name: Compute Benchmarks
+
+on:
+  # this workflow can only be triggered by other workflows,
+  # for example by benchmarks_level_zero.yml
+ workflow_call:
+ # acceptable input from adapter-specific workflows
+ inputs:
+ name:
+ description: Adapter name
+ type: string
+ required: true
+ str_name:
+ description: Formatted adapter name
+ type: string
+ required: true
+ config:
+ description: Params for sycl configuration
+ type: string
+ required: true
+ unit:
+ description: Test unit (cpu/gpu)
+ type: string
+ required: true
+ runner_tag:
+ description: Tag defined for the runner
+ type: string
+ required: true
+ trigger:
+ description: Type of workflow trigger
+ type: string
+ required: true
+ comment:
+ description: Text if triggered by a comment
+ type: string
+ required: false
+
+permissions:
+ contents: read
+ pull-requests: write
+
+jobs:
+ e2e-build-hw:
+ if: github.repository == 'oneapi-src/unified-runtime' # run only on upstream; forks will not have the HW
+ name: Build SYCL, UR, run Compute Benchmarks
+ strategy:
+ matrix:
+ adapter: [
+ {name: "${{inputs.name}}",
+ str_name: "${{inputs.str_name}}",
+ config: "${{inputs.config}}",
+ unit: "${{inputs.unit}}"}
+ ]
+ build_type: [Release]
+ compiler: [{c: clang, cxx: clang++}]
+
+ runs-on: ${{inputs.runner_tag}}
+
+ steps:
+      # Workspace on self-hosted runners is not cleaned automatically.
+      # We have to manually delete any files created outside of actions.
+ - name: Cleanup self-hosted workspace
+ if: always()
+ run: |
+ ls -la ./
+ rm -rf ./* || true
+
+ - name: Add comment to PR
+ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ if: ${{ always() && inputs.trigger != 'schedule' }}
+ with:
+ script: |
+ const adapter = '${{ matrix.adapter.name }}';
+ const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}';
+ const body = `Compute Benchmarks ${adapter} run: \n${url}`;
+
+ github.rest.issues.createComment({
+ issue_number: context.issue.number,
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ body: body
+ })
+
+ - name: Checkout UR
+ uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ with:
+ path: ur-repo
+
+      # On an issue_comment trigger (for PRs) we need to fetch a special ref to get
+      # the PR's proper merge commit. Note that this ref may be absent if the PR is already merged.
+ - name: Fetch PR's merge commit
+ if: ${{ inputs.trigger != 'schedule' }}
+ working-directory: ${{github.workspace}}/ur-repo
+ env:
+ PR_NO: ${{github.event.issue.number}}
+ run: |
+ git fetch -- https://github.com/${{github.repository}} +refs/pull/${PR_NO}/*:refs/remotes/origin/pr/${PR_NO}/*
+ git checkout origin/pr/${PR_NO}/merge
+ git rev-parse origin/pr/${PR_NO}/merge
+
+ - name: Checkout SYCL
+ uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ with:
+ repository: intel/llvm
+ ref: refs/heads/sycl
+ path: sycl-repo
+ fetch-depth: 1
+ fetch-tags: false
+
+ - name: Set CUDA env vars
+ if: matrix.adapter.name == 'CUDA'
+ run: |
+ echo "CUDA_LIB_PATH=/usr/local/cuda/lib64/stubs" >> $GITHUB_ENV
+ echo "LD_LIBRARY_PATH=/usr/local/cuda/compat/:/usr/local/cuda/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
+
+ - name: Configure SYCL
+ run: >
+ python3 sycl-repo/buildbot/configure.py
+ -t ${{matrix.build_type}}
+ -o ${{github.workspace}}/sycl_build
+ --cmake-gen "Ninja"
+ --ci-defaults ${{matrix.adapter.config}}
+ --cmake-opt="-DLLVM_INSTALL_UTILS=ON"
+ --cmake-opt="-DSYCL_PI_TESTS=OFF"
+ --cmake-opt="-DSYCL_PI_UR_USE_FETCH_CONTENT=OFF"
+ --cmake-opt="-DSYCL_PI_UR_SOURCE_DIR=${{github.workspace}}/ur-repo/"
+ --cmake-opt=-DCMAKE_C_COMPILER_LAUNCHER=ccache
+ --cmake-opt=-DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+
+ - name: Build SYCL
+ run: cmake --build ${{github.workspace}}/sycl_build -j
+
+ - name: Set additional env. vars
+ run: |
+ echo "${{github.workspace}}/sycl_build/bin" >> $GITHUB_PATH
+ echo "LD_LIBRARY_PATH=${{github.workspace}}/sycl_build/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV
+
+ # Running (newly built) sycl-ls sets up some extra variables
+ - name: Setup SYCL variables
+ run: |
+ which clang++ sycl-ls
+ SYCL_PI_TRACE=-1 sycl-ls
+
+ - name: Checkout Compute Benchmarks
+ uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+ with:
+ repository: intel/compute-benchmarks
+ path: compute-benchmarks-repo
+ submodules: recursive
+
+ - name: Configure Compute Benchmarks
+ run: >
+ cmake
+ -B ${{github.workspace}}/compute-benchmarks-build/
+ -S ${{github.workspace}}/compute-benchmarks-repo/
+ -DCMAKE_BUILD_TYPE=Release
+ -DBUILD_SYCL=ON
+ -DSYCL_COMPILER_ROOT=${{github.workspace}}/sycl_build
+ -DALLOW_WARNINGS=ON
+
+ - name: Build Compute Benchmarks
+ run: cmake --build ${{github.workspace}}/compute-benchmarks-build/ -j
+
+ - name: Set oneAPI Device Selector
+ run: |
+ echo "ONEAPI_DEVICE_SELECTOR=${{ matrix.adapter.str_name }}:${{ matrix.adapter.unit }}" >> $GITHUB_ENV
+
+ - name: Extract arguments from comment
+ id: args
+ run: echo "ARGS=$(echo '${{ inputs.comment }}' | sed -n 's/.*\/benchmarks-[^ ]* \(.*\)/\1/p')" >> $GITHUB_ENV
+
+ - name: Run SYCL API Overhead benchmark
+ id: benchmarks
+ run: ${{github.workspace}}/ur-repo/.github/scripts/compute_benchmarks.py ${{github.workspace}}/compute-benchmarks-build/bin/ $ARGS
+
+ - name: Add comment to PR
+ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+ if: ${{ always() && inputs.trigger != 'schedule' }}
+ with:
+ script: |
+ let markdown = ""
+ try {
+ const fs = require('fs');
+ markdown = fs.readFileSync('benchmark_results.md', 'utf8');
+ } catch(err) {
+ }
+
+ const adapter = '${{ matrix.adapter.name }}';
+ const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}';
+ const test_status = '${{ steps.benchmarks.outcome }}';
+ const job_status = '${{ job.status }}';
+ const body = `Compute Benchmarks ${adapter} run:\n${url}\nJob status: ${job_status}. Test status: ${test_status}.\n ${markdown}`;
+
+ github.rest.issues.createComment({
+ issue_number: context.issue.number,
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ body: body
+ })
diff --git a/.github/workflows/benchmarks_level_zero.yml b/.github/workflows/benchmarks_level_zero.yml
new file mode 100644
index 0000000000..6b38aaeedd
--- /dev/null
+++ b/.github/workflows/benchmarks_level_zero.yml
@@ -0,0 +1,26 @@
+name: Compute Benchmarks Level Zero
+
+on:
+ issue_comment:
+ types: [created, edited]
+
+permissions:
+ contents: read
+ pull-requests: write
+
+jobs:
+ e2e-build-hw:
+    # trigger only if the PR comment contains "/benchmarks-level-zero"
+ if: ${{ (github.event.issue.pull_request && contains(github.event.comment.body, '/benchmarks-level-zero'))}}
+ name: Start benchmarks job
+    # use the core flow, run it with L0-specific parameters
+ uses: ./.github/workflows/benchmarks_core.yml
+ # parameters that we pass to the core flow
+ with:
+ name: "L0"
+ runner_tag: "L0_PERF"
+ str_name: "level_zero"
+ config: ""
+ unit: "gpu"
+ trigger: "${{github.event_name}}"
+ comment: ${{github.event.comment.body}}
diff --git a/.github/workflows/build-hw-reusable.yml b/.github/workflows/build-hw-reusable.yml
index a7945e2ac2..8c8f28a868 100644
--- a/.github/workflows/build-hw-reusable.yml
+++ b/.github/workflows/build-hw-reusable.yml
@@ -15,6 +15,13 @@ on:
permissions:
contents: read
+env:
+ UR_LOG_CUDA: "level:error;flush:error"
+ UR_LOG_HIP: "level:error;flush:error"
+ UR_LOG_LEVEL_ZERO: "level:error;flush:error"
+ UR_LOG_NATIVE_CPU: "level:error;flush:error"
+ UR_LOG_OPENCL: "level:error;flush:error"
+
jobs:
adapter-build-hw:
name: Build & Test HW
diff --git a/.github/workflows/e2e_core.yml b/.github/workflows/e2e_core.yml
index 8554c5c476..7f2050d83d 100644
--- a/.github/workflows/e2e_core.yml
+++ b/.github/workflows/e2e_core.yml
@@ -34,6 +34,22 @@ on:
description: Type of workflow trigger
type: string
required: true
+ xfail:
+        description: Tests expected to fail
+ type: string
+ required: false
+ xfail_not:
+        description: Tests to exclude from the xfail list
+ type: string
+ required: false
+ filter_out:
+ description: Tests to filter out completely
+ type: string
+ required: false
+ extra_lit_flags:
+ description: Additional llvm-lit flags to use
+ type: string
+ required: false
permissions:
contents: read
@@ -50,7 +66,8 @@ jobs:
str_name: "${{inputs.str_name}}",
prefix: "${{inputs.prefix}}",
config: "${{inputs.config}}",
- unit: "${{inputs.unit}}"}
+ unit: "${{inputs.unit}}",
+ extra_lit_flags: "${{inputs.extra_lit_flags}}"},
]
build_type: [Release]
compiler: [{c: clang, cxx: clang++}]
@@ -103,7 +120,7 @@ jobs:
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
with:
repository: intel/llvm
- ref: sycl
+ ref: refs/heads/sycl
path: sycl-repo
- name: Set CUDA env vars
@@ -112,11 +129,6 @@ jobs:
echo "CUDA_LIB_PATH=/usr/local/cuda/lib64/stubs" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=/usr/local/cuda/compat/:/usr/local/cuda/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
- - name: Run pre setup
- run: |
- source /opt/intel/oneapi/setvars.sh --force
- sycl-ls
-
- name: Configure SYCL
run: >
python3 sycl-repo/buildbot/configure.py
@@ -134,11 +146,15 @@ jobs:
- name: Build SYCL
run: cmake --build ${{github.workspace}}/sycl_build -j
+ - name: Set extra llvm-lit options
+ if: matrix.adapter.extra_lit_flags != ''
+ run: echo "LIT_OPTS=${{matrix.adapter.extra_lit_flags}}" >> $GITHUB_ENV
+
- name: Run check-sycl
# Remove after fixing SYCL test :: abi/layout_handler.cpp
# This issue does not affect further execution of e2e with UR.
continue-on-error: true
- run: cmake --build ${{github.workspace}}/sycl_build --target check-sycl -j6
+ run: cmake --build ${{github.workspace}}/sycl_build --target check-sycl
- name: Set additional env. vars
run: |
@@ -161,6 +177,13 @@ jobs:
-DCMAKE_CXX_COMPILER="$(which clang++)"
-DLLVM_LIT="${{github.workspace}}/sycl-repo/llvm/utils/lit/lit.py"
+ - name: Set test filters for L0
+ if: matrix.adapter.name == 'L0'
+ run: |
+ echo "LIT_XFAIL_NOT=${{inputs.xfail_not}}" >> $GITHUB_ENV
+ echo "LIT_XFAIL=${{inputs.xfail}}" >> $GITHUB_ENV
+ echo "LIT_FILTER_OUT=${{inputs.filter_out}}" >> $GITHUB_ENV
+
- name: Run e2e tests
id: tests
run: ninja -C build-e2e check-sycl-e2e
diff --git a/.github/workflows/e2e_level_zero.yml b/.github/workflows/e2e_level_zero.yml
new file mode 100644
index 0000000000..66c23715ea
--- /dev/null
+++ b/.github/workflows/e2e_level_zero.yml
@@ -0,0 +1,36 @@
+name: E2E Level Zero
+
+on:
+ schedule:
+ # Run every day at 23:00 UTC
+ - cron: '0 23 * * *'
+ issue_comment:
+ types: [created, edited]
+
+permissions:
+ contents: read
+ pull-requests: write
+
+jobs:
+ e2e-build-hw:
+    # trigger on a schedule, or if a PR comment contains "/e2e-level-zero"
+ if: ${{ (github.event.issue.pull_request && contains(github.event.comment.body, '/e2e-level-zero')) || (github.event_name == 'schedule') }}
+ name: Start e2e job
+    # use the core flow, run it with L0-specific parameters
+ uses: ./.github/workflows/e2e_core.yml
+ # parameters that we pass to the core flow
+ with:
+ name: "L0"
+ runner_tag: "L0_E2E"
+ str_name: "level_zero"
+ prefix: "ext_oneapi_"
+ config: ""
+ unit: "gpu"
+ trigger: "${{github.event_name}}"
+ # Failing tests
+ xfail: "ESIMD/preemption.cpp;syclcompat/atomic/atomic_class.cpp;ProgramManager/uneven_kernel_split.cpp;Plugin/level_zero_ext_intel_queue_index.cpp;Plugin/level_zero_ext_intel_cslice.cpp;Matrix/joint_matrix_rowmajorA_rowmajorB.cpp;Matrix/element_wise_ops.cpp;Matrix/element_wise_all_ops.cpp;Matrix/SG32/element_wise_all_ops.cpp"
+ # Flaky tests
+ filter_out: "GroupAlgorithm/root_group.cpp|Basic/exceptions-SYCL-2020.cpp|Graph/UnsupportedDevice/device_query.cpp|Graph/RecordReplay/exception_inconsistent_contexts.cpp"
+    # These runners by default spawn upwards of 260 workers. That's too many for the GPU.
+    # We also add a timeout in case some test hangs.
+ extra_lit_flags: "-sv -j 50 --max-time 600"
diff --git a/.github/workflows/hadolint.yml b/.github/workflows/hadolint.yml
deleted file mode 100644
index a1063aefe8..0000000000
--- a/.github/workflows/hadolint.yml
+++ /dev/null
@@ -1,34 +0,0 @@
-# Runs linter for Docker files
-name: Hadolint
-
-on:
- workflow_dispatch:
- push:
- pull_request:
- paths:
- - '.github/docker/*Dockerfile'
- - '.github/workflows/hadolint.yml'
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
- cancel-in-progress: true
-
-permissions:
- contents: read
-
-jobs:
- linux:
- name: Hadolint
- runs-on: ubuntu-latest
-
- steps:
- - name: Clone the git repo
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-
- - name: Run Hadolint
- uses: hadolint/hadolint-action@54c9adbab1582c2ef04b2016b760714a4bfde3cf # v3.1.0
- with:
- recursive: true
- dockerfile: ".github/docker/*Dockerfile"
- # ignore pinning apt packages to versions
- ignore: DL3008
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
index 97362d4f1a..d0cb335d96 100644
--- a/.github/workflows/labeler.yml
+++ b/.github/workflows/labeler.yml
@@ -10,6 +10,9 @@ concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
+permissions:
+ contents: read
+
jobs:
labeler:
permissions:
@@ -17,4 +20,4 @@ jobs:
pull-requests: write
runs-on: ubuntu-latest
steps:
- - uses: actions/labeler@v5
+ - uses: actions/labeler@8558fd74291d67161a8a78ce36a881fa63b766a9 # v5.0.0
diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml
new file mode 100644
index 0000000000..2efb04c86a
--- /dev/null
+++ b/.github/workflows/trivy.yml
@@ -0,0 +1,50 @@
+# Runs linter for Docker files
+name: Trivy
+
+on:
+ workflow_dispatch:
+ push:
+ pull_request:
+ paths:
+ - '.github/docker/*Dockerfile'
+ - '.github/workflows/trivy.yml'
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+ cancel-in-progress: true
+
+permissions:
+ contents: read
+
+jobs:
+ linux:
+ name: Trivy
+ runs-on: ubuntu-latest
+ permissions:
+ security-events: write
+
+ steps:
+ - name: Clone repo
+ uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+
+ - name: Run Trivy
+ uses: aquasecurity/trivy-action@84384bd6e777ef152729993b8145ea352e9dd3ef # v0.17.0
+ with:
+ scan-type: 'config'
+ hide-progress: false
+ format: 'sarif'
+ output: 'trivy-results.sarif'
+ exit-code: 1 # Fail if issue found
+ # file with suppressions: .trivyignore (in root dir)
+
+ - name: Print report and trivyignore file
+ run: |
+ echo "### Trivy ignore content:"
+ cat .trivyignore
+ echo "### Trivy report:"
+ cat trivy-results.sarif
+
+ - name: Upload results
+ uses: github/codeql-action/upload-sarif@e8893c57a1f3a2b659b6b55564fdfdbbd2982911 # v3.24.0
+ with:
+ sarif_file: 'trivy-results.sarif'
diff --git a/.trivyignore b/.trivyignore
new file mode 100644
index 0000000000..63749166bc
--- /dev/null
+++ b/.trivyignore
@@ -0,0 +1,6 @@
+# Docs: https://aquasecurity.github.io/trivy/latest/docs/configuration/filtering/#trivyignore
+
+# In Dockerfiles:
+# HEALTHCHECK is not required for development, nor in CI (failed docker = failed CI).
+# We're not hosting any application in these Docker images.
+AVD-DS-0026
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4fcd74e729..cfd8752657 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,8 +3,8 @@
# See LICENSE.TXT
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-cmake_minimum_required(VERSION 3.14.0 FATAL_ERROR)
-project(unified-runtime VERSION 0.9.0)
+cmake_minimum_required(VERSION 3.20.0 FATAL_ERROR)
+project(unified-runtime VERSION 0.10.0)
include(GNUInstallDirs)
include(CheckCXXSourceCompiles)
@@ -47,12 +47,15 @@ option(UR_BUILD_ADAPTER_ALL "Build all currently supported adapters" OFF)
option(UR_BUILD_EXAMPLE_CODEGEN "Build the codegen example." OFF)
option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace for linux" OFF)
option(UR_ENABLE_ASSERTIONS "Enable assertions for all build types" OFF)
+option(UR_BUILD_XPTI_LIBS "Build the XPTI libraries when tracing is enabled" ON)
set(UR_DPCXX "" CACHE FILEPATH "Path of the DPC++ compiler executable")
+set(UR_DPCXX_BUILD_FLAGS "" CACHE STRING "Build flags to pass to DPC++ when compiling device programs")
set(UR_SYCL_LIBRARY_DIR "" CACHE PATH
"Path of the SYCL runtime library directory")
set(UR_CONFORMANCE_TARGET_TRIPLES "" CACHE STRING
"List of sycl targets to build CTS device binaries for")
set(UR_CONFORMANCE_AMD_ARCH "" CACHE STRING "AMD device target ID to build CTS binaries for")
+option(UR_CONFORMANCE_ENABLE_MATCH_FILES "Enable CTS match files" ON)
set(UR_ADAPTER_LEVEL_ZERO_SOURCE_DIR "" CACHE PATH
"Path to external 'level_zero' adapter source dir")
set(UR_ADAPTER_OPENCL_SOURCE_DIR "" CACHE PATH
@@ -107,31 +110,34 @@ endif()
if(UR_ENABLE_TRACING)
add_compile_definitions(UR_ENABLE_TRACING)
- # fetch xpti proxy library for the tracing layer
- FetchContentSparse_Declare(xpti https://github.com/intel/llvm.git "sycl-nightly/20230703" "xpti")
- FetchContent_MakeAvailable(xpti)
+ if (UR_BUILD_XPTI_LIBS)
+ # fetch xpti proxy library for the tracing layer
+ FetchContentSparse_Declare(xpti https://github.com/intel/llvm.git "sycl-nightly/20230703" "xpti")
+ FetchContent_MakeAvailable(xpti)
- # set -fPIC for xpti since we are linking it with a shared library
- set_target_properties(xpti PROPERTIES POSITION_INDEPENDENT_CODE ON)
+ # set -fPIC for xpti since we are linking it with a shared library
+ set_target_properties(xpti PROPERTIES POSITION_INDEPENDENT_CODE ON)
- # fetch the xptifw dispatcher, mostly used for testing
- # these variables need to be set for xptifw to compile
- set(XPTI_SOURCE_DIR ${xpti_SOURCE_DIR})
- set(XPTI_DIR ${xpti_SOURCE_DIR})
- set(XPTI_ENABLE_TESTS OFF CACHE INTERNAL "Turn off xptifw tests")
+ # fetch the xptifw dispatcher, mostly used for testing
+ # these variables need to be set for xptifw to compile
+ set(XPTI_SOURCE_DIR ${xpti_SOURCE_DIR})
+ set(XPTI_DIR ${xpti_SOURCE_DIR})
+ set(XPTI_ENABLE_TESTS OFF CACHE INTERNAL "Turn off xptifw tests")
- FetchContentSparse_Declare(xptifw https://github.com/intel/llvm.git "sycl-nightly/20230703" "xptifw")
+ FetchContentSparse_Declare(xptifw https://github.com/intel/llvm.git "sycl-nightly/20230703" "xptifw")
- FetchContent_MakeAvailable(xptifw)
+ FetchContent_MakeAvailable(xptifw)
- check_cxx_compiler_flag("-Wno-error=maybe-uninitialized" HAS_MAYBE_UNINIT)
- if (HAS_MAYBE_UNINIT)
- target_compile_options(xptifw PRIVATE -Wno-error=maybe-uninitialized)
+ check_cxx_compiler_flag("-Wno-error=maybe-uninitialized" HAS_MAYBE_UNINIT)
+ if (HAS_MAYBE_UNINIT)
+ target_compile_options(xptifw PRIVATE -Wno-error=maybe-uninitialized)
+ endif()
+
+ set_target_properties(xptifw PROPERTIES
+ LIBRARY_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
+ )
endif()
- set_target_properties(xptifw PROPERTIES
- LIBRARY_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
- )
if (MSVC)
        set(TARGET_XPTI $<IF:$<CONFIG:Release>,xpti,xptid>)
else()
diff --git a/README.md b/README.md
index cb43c380b9..3d53d2117e 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,7 @@
[](https://github.com/oneapi-src/unified-runtime/actions/workflows/cmake.yml)
[](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_cuda.yml)
[](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_opencl.yml)
+[](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_level_zero.yml)
[](https://github.com/oneapi-src/unified-runtime/actions/workflows/codeql.yml)
[](https://github.com/oneapi-src/unified-runtime/actions/workflows/bandit.yml)
[](https://scan.coverity.com/projects/oneapi-src-unified-runtime)
@@ -13,7 +14,6 @@
## Table of contents
- [Unified Runtime](#unified-runtime)
- - [Adapters](#adapters)
- [Table of contents](#table-of-contents)
- [Contents of the repo](#contents-of-the-repo)
- [Integration](#integration)
@@ -29,7 +29,7 @@
- [Adapter naming convention](#adapter-naming-convention)
- [Source code generation](#source-code-generation)
- [Documentation](#documentation)
-6. [Release Process](#release-process)
+ - [Release Process](#release-process)
## Contents of the repo
@@ -88,7 +88,7 @@ for more detailed instructions on the correct setup.
Required packages:
- C++ compiler with C++17 support
-- [CMake](https://cmake.org/) >= 3.14.0
+- [CMake](https://cmake.org/) >= 3.20.0
- Python v3.6.6 or later
### Windows
@@ -131,6 +131,7 @@ List of options provided by CMake:
| UR_ENABLE_SANITIZER | Enable device sanitizer layer | ON/OFF | ON |
| UR_CONFORMANCE_TARGET_TRIPLES | SYCL triples to build CTS device binaries for | Comma-separated list | spir64 |
| UR_CONFORMANCE_AMD_ARCH | AMD device target ID to build CTS binaries for | string | `""` |
+| UR_CONFORMANCE_ENABLE_MATCH_FILES | Enable CTS match files | ON/OFF | ON |
| UR_BUILD_ADAPTER_L0 | Build the Level-Zero adapter | ON/OFF | OFF |
| UR_BUILD_ADAPTER_OPENCL | Build the OpenCL adapter | ON/OFF | OFF |
| UR_BUILD_ADAPTER_CUDA | Build the CUDA adapter | ON/OFF | OFF |
@@ -140,6 +141,8 @@ List of options provided by CMake:
| UR_HIP_PLATFORM | Build HIP adapter for AMD or NVIDIA platform | AMD/NVIDIA | AMD |
| UR_ENABLE_COMGR | Enable comgr lib usage | AMD/NVIDIA | AMD |
| UR_DPCXX | Path of the DPC++ compiler executable to build CTS device binaries | File path | `""` |
+| UR_DEVICE_CODE_EXTRACTOR | Path of the `clang-offload-extract` executable from the DPC++ package, required for CTS device binaries | File path | `"${dirname(UR_DPCXX)}/clang-offload-extract"` |
+| UR_DPCXX_BUILD_FLAGS | Build flags to pass to DPC++ when compiling device programs | Space-separated options list | `""` |
| UR_SYCL_LIBRARY_DIR | Path of the SYCL runtime library directory to build CTS device binaries | Directory path | `""` |
| UR_HIP_ROCM_DIR | Path of the default ROCm HIP installation | Directory path | `/opt/rocm` |
| UR_HIP_INCLUDE_DIR | Path of the ROCm HIP include directory | Directory path | `${UR_HIP_ROCM_DIR}/include` |
diff --git a/include/ur_api.h b/include/ur_api.h
index 8680dbeffb..96a58e04ea 100644
--- a/include/ur_api.h
+++ b/include/ur_api.h
@@ -7,7 +7,7 @@
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
* @file ur_api.h
- * @version v0.9-r0
+ * @version v0.10-r0
*
*/
#ifndef UR_API_H_INCLUDED
@@ -222,6 +222,7 @@ typedef enum ur_function_t {
UR_FUNCTION_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_EXP = 220, ///< Enumerator for ::urCommandBufferUpdateKernelLaunchExp
UR_FUNCTION_COMMAND_BUFFER_GET_INFO_EXP = 221, ///< Enumerator for ::urCommandBufferGetInfoExp
UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP = 222, ///< Enumerator for ::urCommandBufferCommandGetInfoExp
+ UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP = 223, ///< Enumerator for ::urEnqueueTimestampRecordingExp
/// @cond
UR_FUNCTION_FORCE_UINT32 = 0x7fffffff
/// @endcond
@@ -1097,11 +1098,12 @@ urPlatformGetInfo(
/// - API versions contain major and minor attributes, use
/// ::UR_MAJOR_VERSION and ::UR_MINOR_VERSION
typedef enum ur_api_version_t {
- UR_API_VERSION_0_6 = UR_MAKE_VERSION(0, 6), ///< version 0.6
- UR_API_VERSION_0_7 = UR_MAKE_VERSION(0, 7), ///< version 0.7
- UR_API_VERSION_0_8 = UR_MAKE_VERSION(0, 8), ///< version 0.8
- UR_API_VERSION_0_9 = UR_MAKE_VERSION(0, 9), ///< version 0.9
- UR_API_VERSION_CURRENT = UR_MAKE_VERSION(0, 9), ///< latest known version
+ UR_API_VERSION_0_6 = UR_MAKE_VERSION(0, 6), ///< version 0.6
+ UR_API_VERSION_0_7 = UR_MAKE_VERSION(0, 7), ///< version 0.7
+ UR_API_VERSION_0_8 = UR_MAKE_VERSION(0, 8), ///< version 0.8
+ UR_API_VERSION_0_9 = UR_MAKE_VERSION(0, 9), ///< version 0.9
+ UR_API_VERSION_0_10 = UR_MAKE_VERSION(0, 10), ///< version 0.10
+ UR_API_VERSION_CURRENT = UR_MAKE_VERSION(0, 10), ///< latest known version
/// @cond
UR_API_VERSION_FORCE_UINT32 = 0x7fffffff
/// @endcond
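For context, the version enumerators above rely on UR_MAKE_VERSION packing major and minor into one 32-bit value, which is why versions compare numerically; a small Python sketch of the assumed packing (the macro's actual definition lives elsewhere in ur_api.h, outside this hunk):

```python
# Assumed packing for UR_MAKE_VERSION: major in the high 16 bits,
# minor in the low 16 bits. Illustrative only; see ur_api.h for the macro.
def ur_make_version(major: int, minor: int) -> int:
    return (major << 16) | (minor & 0xFFFF)

assert ur_make_version(0, 10) == 10                    # UR_API_VERSION_0_10
assert ur_make_version(0, 9) < ur_make_version(0, 10)  # 0.9 precedes 0.10
```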
@@ -1434,201 +1436,214 @@ urDeviceGetSelected(
///////////////////////////////////////////////////////////////////////////////
/// @brief Supported device info
typedef enum ur_device_info_t {
- UR_DEVICE_INFO_TYPE = 0, ///< [::ur_device_type_t] type of the device
- UR_DEVICE_INFO_VENDOR_ID = 1, ///< [uint32_t] vendor Id of the device
- UR_DEVICE_INFO_DEVICE_ID = 2, ///< [uint32_t] Id of the device
- UR_DEVICE_INFO_MAX_COMPUTE_UNITS = 3, ///< [uint32_t] the number of compute units
- UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS = 4, ///< [uint32_t] max work item dimensions
- UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES = 5, ///< [size_t[]] return an array of max work item sizes
- UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE = 6, ///< [size_t] max work group size
- UR_DEVICE_INFO_SINGLE_FP_CONFIG = 7, ///< [::ur_device_fp_capability_flags_t] single precision floating point
- ///< capability
- UR_DEVICE_INFO_HALF_FP_CONFIG = 8, ///< [::ur_device_fp_capability_flags_t] half precision floating point
- ///< capability
- UR_DEVICE_INFO_DOUBLE_FP_CONFIG = 9, ///< [::ur_device_fp_capability_flags_t] double precision floating point
- ///< capability
- UR_DEVICE_INFO_QUEUE_PROPERTIES = 10, ///< [::ur_queue_flags_t] command queue properties supported by the device
- UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR = 11, ///< [uint32_t] preferred vector width for char
- UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT = 12, ///< [uint32_t] preferred vector width for short
- UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT = 13, ///< [uint32_t] preferred vector width for int
- UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG = 14, ///< [uint32_t] preferred vector width for long
- UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT = 15, ///< [uint32_t] preferred vector width for float
- UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE = 16, ///< [uint32_t] preferred vector width for double
- UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF = 17, ///< [uint32_t] preferred vector width for half float
- UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR = 18, ///< [uint32_t] native vector width for char
- UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT = 19, ///< [uint32_t] native vector width for short
- UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT = 20, ///< [uint32_t] native vector width for int
- UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG = 21, ///< [uint32_t] native vector width for long
- UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT = 22, ///< [uint32_t] native vector width for float
- UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE = 23, ///< [uint32_t] native vector width for double
- UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF = 24, ///< [uint32_t] native vector width for half float
- UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY = 25, ///< [uint32_t] max clock frequency in MHz
- UR_DEVICE_INFO_MEMORY_CLOCK_RATE = 26, ///< [uint32_t] memory clock frequency in MHz
- UR_DEVICE_INFO_ADDRESS_BITS = 27, ///< [uint32_t] address bits
- UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE = 28, ///< [uint64_t] max memory allocation size
- UR_DEVICE_INFO_IMAGE_SUPPORTED = 29, ///< [::ur_bool_t] images are supported
- UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS = 30, ///< [uint32_t] max number of image objects arguments of a kernel declared
- ///< with the read_only qualifier
- UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS = 31, ///< [uint32_t] max number of image objects arguments of a kernel declared
- ///< with the write_only qualifier
- UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS = 32, ///< [uint32_t] max number of image objects arguments of a kernel declared
- ///< with the read_write qualifier
- UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH = 33, ///< [size_t] max width of Image2D object
- UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT = 34, ///< [size_t] max height of Image2D object
- UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH = 35, ///< [size_t] max width of Image3D object
- UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT = 36, ///< [size_t] max height of Image3D object
- UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH = 37, ///< [size_t] max depth of Image3D object
- UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE = 38, ///< [size_t] max image buffer size
- UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE = 39, ///< [size_t] max image array size
- UR_DEVICE_INFO_MAX_SAMPLERS = 40, ///< [uint32_t] max number of samplers that can be used in a kernel
- UR_DEVICE_INFO_MAX_PARAMETER_SIZE = 41, ///< [size_t] max size in bytes of all arguments passed to a kernel
- UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN = 42, ///< [uint32_t] memory base address alignment
- UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE = 43, ///< [::ur_device_mem_cache_type_t] global memory cache type
- UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE = 44, ///< [uint32_t] global memory cache line size in bytes
- UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE = 45, ///< [uint64_t] size of global memory cache in bytes
- UR_DEVICE_INFO_GLOBAL_MEM_SIZE = 46, ///< [uint64_t] size of global memory in bytes
- UR_DEVICE_INFO_GLOBAL_MEM_FREE = 47, ///< [uint64_t] size of global memory which is free in bytes
- UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE = 48, ///< [uint64_t] max constant buffer size in bytes
- UR_DEVICE_INFO_MAX_CONSTANT_ARGS = 49, ///< [uint32_t] max number of __const declared arguments in a kernel
- UR_DEVICE_INFO_LOCAL_MEM_TYPE = 50, ///< [::ur_device_local_mem_type_t] local memory type
- UR_DEVICE_INFO_LOCAL_MEM_SIZE = 51, ///< [uint64_t] local memory size in bytes
- UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT = 52, ///< [::ur_bool_t] support error correction to global and local memory
- UR_DEVICE_INFO_HOST_UNIFIED_MEMORY = 53, ///< [::ur_bool_t] unified host device memory
- UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION = 54, ///< [size_t] profiling timer resolution in nanoseconds
- UR_DEVICE_INFO_ENDIAN_LITTLE = 55, ///< [::ur_bool_t] little endian byte order
- UR_DEVICE_INFO_AVAILABLE = 56, ///< [::ur_bool_t] device is available
- UR_DEVICE_INFO_COMPILER_AVAILABLE = 57, ///< [::ur_bool_t] device compiler is available
- UR_DEVICE_INFO_LINKER_AVAILABLE = 58, ///< [::ur_bool_t] device linker is available
- UR_DEVICE_INFO_EXECUTION_CAPABILITIES = 59, ///< [::ur_device_exec_capability_flags_t] device kernel execution
- ///< capability bit-field
- UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES = 60, ///< [::ur_queue_flags_t] device command queue property bit-field
- UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES = 61, ///< [::ur_queue_flags_t] host queue property bit-field
- UR_DEVICE_INFO_BUILT_IN_KERNELS = 62, ///< [char[]] a semi-colon separated list of built-in kernels
- UR_DEVICE_INFO_PLATFORM = 63, ///< [::ur_platform_handle_t] the platform associated with the device
- UR_DEVICE_INFO_REFERENCE_COUNT = 64, ///< [uint32_t] Reference count of the device object.
- ///< The reference count returned should be considered immediately stale.
- ///< It is unsuitable for general use in applications. This feature is
- ///< provided for identifying memory leaks.
- UR_DEVICE_INFO_IL_VERSION = 65, ///< [char[]] IL version
- UR_DEVICE_INFO_NAME = 66, ///< [char[]] Device name
- UR_DEVICE_INFO_VENDOR = 67, ///< [char[]] Device vendor
- UR_DEVICE_INFO_DRIVER_VERSION = 68, ///< [char[]] Driver version
- UR_DEVICE_INFO_PROFILE = 69, ///< [char[]] Device profile
- UR_DEVICE_INFO_VERSION = 70, ///< [char[]] Device version
- UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION = 71, ///< [char[]] Version of backend runtime
- UR_DEVICE_INFO_EXTENSIONS = 72, ///< [char[]] Return a space separated list of extension names
- UR_DEVICE_INFO_PRINTF_BUFFER_SIZE = 73, ///< [size_t] Maximum size in bytes of internal printf buffer
- UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC = 74, ///< [::ur_bool_t] prefer user synchronization when sharing object with
- ///< other API
- UR_DEVICE_INFO_PARENT_DEVICE = 75, ///< [::ur_device_handle_t] return parent device handle
- UR_DEVICE_INFO_SUPPORTED_PARTITIONS = 76, ///< [::ur_device_partition_t[]] Returns an array of partition types
- ///< supported by the device
- UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES = 77, ///< [uint32_t] maximum number of sub-devices when the device is
- ///< partitioned
- UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN = 78, ///< [::ur_device_affinity_domain_flags_t] Returns a bit-field of the
- ///< supported affinity domains for partitioning.
- ///< If the device does not support any affinity domains, then 0 will be returned.
- UR_DEVICE_INFO_PARTITION_TYPE = 79, ///< [::ur_device_partition_property_t[]] return an array of
- ///< ::ur_device_partition_property_t for properties specified in
- ///< ::urDevicePartition
- UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS = 80, ///< [uint32_t] max number of sub groups
- UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS = 81, ///< [::ur_bool_t] support sub group independent forward progress
- UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL = 82, ///< [uint32_t[]] return an array of sub group sizes supported on Intel
- ///< device
- UR_DEVICE_INFO_USM_HOST_SUPPORT = 83, ///< [::ur_device_usm_access_capability_flags_t] support USM host memory
- ///< access
- UR_DEVICE_INFO_USM_DEVICE_SUPPORT = 84, ///< [::ur_device_usm_access_capability_flags_t] support USM device memory
- ///< access
- UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT = 85, ///< [::ur_device_usm_access_capability_flags_t] support USM single device
- ///< shared memory access
- UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT = 86, ///< [::ur_device_usm_access_capability_flags_t] support USM cross device
- ///< shared memory access
- UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT = 87, ///< [::ur_device_usm_access_capability_flags_t] support USM system wide
- ///< shared memory access
- UR_DEVICE_INFO_UUID = 88, ///< [uint8_t[]] return device UUID
- UR_DEVICE_INFO_PCI_ADDRESS = 89, ///< [char[]] return device PCI address
- UR_DEVICE_INFO_GPU_EU_COUNT = 90, ///< [uint32_t] return Intel GPU EU count
- UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH = 91, ///< [uint32_t] return Intel GPU EU SIMD width
- UR_DEVICE_INFO_GPU_EU_SLICES = 92, ///< [uint32_t] return Intel GPU number of slices
- UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 93, ///< [uint32_t] return Intel GPU EU count per subslice
- UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 94, ///< [uint32_t] return Intel GPU number of subslices per slice
- UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU = 95, ///< [uint32_t] return Intel GPU number of threads per EU
- UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH = 96, ///< [uint32_t] return max memory bandwidth in Mb/s
- UR_DEVICE_INFO_IMAGE_SRGB = 97, ///< [::ur_bool_t] device supports sRGB images
- UR_DEVICE_INFO_BUILD_ON_SUBDEVICE = 98, ///< [::ur_bool_t] Return true if sub-device should do its own program
- ///< build
- UR_DEVICE_INFO_ATOMIC_64 = 99, ///< [::ur_bool_t] support 64 bit atomics
- UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 100, ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic
- ///< memory order capabilities
- UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 101, ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic
- ///< memory scope capabilities
- UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES = 102, ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic
- ///< memory fence order capabilities
- UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES = 103, ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic
- ///< memory fence scope capabilities
- UR_DEVICE_INFO_BFLOAT16 = 104, ///< [::ur_bool_t] support for bfloat16
- UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES = 105, ///< [uint32_t] Returns 1 if the device doesn't have a notion of a
- ///< queue index. Otherwise, returns the number of queue indices that are
- ///< available for this device.
- UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS = 106, ///< [::ur_bool_t] support the ::urKernelSetSpecializationConstants entry
- ///< point
- UR_DEVICE_INFO_MEMORY_BUS_WIDTH = 107, ///< [uint32_t] return the width in bits of the memory bus interface of the
- ///< device.
- UR_DEVICE_INFO_MAX_WORK_GROUPS_3D = 108, ///< [size_t[3]] return max 3D work groups
- UR_DEVICE_INFO_ASYNC_BARRIER = 109, ///< [::ur_bool_t] return true if Async Barrier is supported
- UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT = 110, ///< [::ur_bool_t] return true if specifying memory channels is supported
- UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED = 111, ///< [::ur_bool_t] Return true if the device supports enqueueing commands
- ///< to read and write pipes from the host.
- UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP = 112, ///< [uint32_t] The maximum number of registers available per block.
- UR_DEVICE_INFO_IP_VERSION = 113, ///< [uint32_t] The device IP version. The meaning of the device IP version
- ///< is implementation-defined, but newer devices should have a higher
- ///< version than older devices.
- UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT = 114, ///< [::ur_bool_t] return true if the device supports virtual memory.
- UR_DEVICE_INFO_ESIMD_SUPPORT = 115, ///< [::ur_bool_t] return true if the device supports ESIMD.
- UR_DEVICE_INFO_COMPONENT_DEVICES = 116, ///< [::ur_device_handle_t[]] The set of component devices contained by
- ///< this composite device.
- UR_DEVICE_INFO_COMPOSITE_DEVICE = 117, ///< [::ur_device_handle_t] The composite device containing this component
- ///< device.
- UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP = 0x1000, ///< [::ur_bool_t] Returns true if the device supports the use of
- ///< command-buffers.
- UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP = 0x1001, ///< [::ur_bool_t] Returns true if the device supports updating the kernel
- ///< commands in a command-buffer.
- UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP = 0x2000, ///< [::ur_bool_t] returns true if the device supports the creation of
- ///< bindless images
- UR_DEVICE_INFO_BINDLESS_IMAGES_SHARED_USM_SUPPORT_EXP = 0x2001, ///< [::ur_bool_t] returns true if the device supports the creation of
- ///< bindless images backed by shared USM
- UR_DEVICE_INFO_BINDLESS_IMAGES_1D_USM_SUPPORT_EXP = 0x2002, ///< [::ur_bool_t] returns true if the device supports the creation of 1D
- ///< bindless images backed by USM
- UR_DEVICE_INFO_BINDLESS_IMAGES_2D_USM_SUPPORT_EXP = 0x2003, ///< [::ur_bool_t] returns true if the device supports the creation of 2D
- ///< bindless images backed by USM
- UR_DEVICE_INFO_IMAGE_PITCH_ALIGN_EXP = 0x2004, ///< [uint32_t] returns the required alignment of the pitch between two
- ///< rows of an image in bytes
- UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP = 0x2005, ///< [size_t] returns the maximum linear width allowed for images allocated
- ///< using USM
- UR_DEVICE_INFO_MAX_IMAGE_LINEAR_HEIGHT_EXP = 0x2006, ///< [size_t] returns the maximum linear height allowed for images
- ///< allocated using USM
- UR_DEVICE_INFO_MAX_IMAGE_LINEAR_PITCH_EXP = 0x2007, ///< [size_t] returns the maximum linear pitch allowed for images allocated
- ///< using USM
- UR_DEVICE_INFO_MIPMAP_SUPPORT_EXP = 0x2008, ///< [::ur_bool_t] returns true if the device supports allocating mipmap
- ///< resources
- UR_DEVICE_INFO_MIPMAP_ANISOTROPY_SUPPORT_EXP = 0x2009, ///< [::ur_bool_t] returns true if the device supports sampling mipmap
- ///< images with anisotropic filtering
- UR_DEVICE_INFO_MIPMAP_MAX_ANISOTROPY_EXP = 0x200A, ///< [uint32_t] returns the maximum anisotropic ratio supported by the
- ///< device
- UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP = 0x200B, ///< [::ur_bool_t] returns true if the device supports using images created
- ///< from individual mipmap levels
- UR_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP = 0x200C, ///< [::ur_bool_t] returns true if the device supports importing external
- ///< memory resources
- UR_DEVICE_INFO_INTEROP_MEMORY_EXPORT_SUPPORT_EXP = 0x200D, ///< [::ur_bool_t] returns true if the device supports exporting internal
- ///< memory resources
- UR_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP = 0x200E, ///< [::ur_bool_t] returns true if the device supports importing external
- ///< semaphore resources
- UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP = 0x200F, ///< [::ur_bool_t] returns true if the device supports exporting internal
- ///< event resources
- UR_DEVICE_INFO_CUBEMAP_SUPPORT_EXP = 0x2010, ///< [::ur_bool_t] returns true if the device supports allocating and
- ///< accessing cubemap resources
- UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP = 0x2011, ///< [::ur_bool_t] returns true if the device supports sampling cubemapped
- ///< images across face boundaries
+ UR_DEVICE_INFO_TYPE = 0, ///< [::ur_device_type_t] type of the device
+ UR_DEVICE_INFO_VENDOR_ID = 1, ///< [uint32_t] vendor Id of the device
+ UR_DEVICE_INFO_DEVICE_ID = 2, ///< [uint32_t] Id of the device
+ UR_DEVICE_INFO_MAX_COMPUTE_UNITS = 3, ///< [uint32_t] the number of compute units
+ UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS = 4, ///< [uint32_t] max work item dimensions
+ UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES = 5, ///< [size_t[]] return an array of max work item sizes
+ UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE = 6, ///< [size_t] max work group size
+ UR_DEVICE_INFO_SINGLE_FP_CONFIG = 7, ///< [::ur_device_fp_capability_flags_t] single precision floating point
+ ///< capability
+ UR_DEVICE_INFO_HALF_FP_CONFIG = 8, ///< [::ur_device_fp_capability_flags_t] half precision floating point
+ ///< capability
+ UR_DEVICE_INFO_DOUBLE_FP_CONFIG = 9, ///< [::ur_device_fp_capability_flags_t] double precision floating point
+ ///< capability
+ UR_DEVICE_INFO_QUEUE_PROPERTIES = 10, ///< [::ur_queue_flags_t] command queue properties supported by the device
+ UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR = 11, ///< [uint32_t] preferred vector width for char
+ UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT = 12, ///< [uint32_t] preferred vector width for short
+ UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT = 13, ///< [uint32_t] preferred vector width for int
+ UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG = 14, ///< [uint32_t] preferred vector width for long
+ UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT = 15, ///< [uint32_t] preferred vector width for float
+ UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE = 16, ///< [uint32_t] preferred vector width for double
+ UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF = 17, ///< [uint32_t] preferred vector width for half float
+ UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR = 18, ///< [uint32_t] native vector width for char
+ UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT = 19, ///< [uint32_t] native vector width for short
+ UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT = 20, ///< [uint32_t] native vector width for int
+ UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG = 21, ///< [uint32_t] native vector width for long
+ UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT = 22, ///< [uint32_t] native vector width for float
+ UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE = 23, ///< [uint32_t] native vector width for double
+ UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF = 24, ///< [uint32_t] native vector width for half float
+ UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY = 25, ///< [uint32_t] max clock frequency in MHz
+ UR_DEVICE_INFO_MEMORY_CLOCK_RATE = 26, ///< [uint32_t] memory clock frequency in MHz
+ UR_DEVICE_INFO_ADDRESS_BITS = 27, ///< [uint32_t] address bits
+ UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE = 28, ///< [uint64_t] max memory allocation size
+ UR_DEVICE_INFO_IMAGE_SUPPORTED = 29, ///< [::ur_bool_t] images are supported
+ UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS = 30, ///< [uint32_t] max number of image objects arguments of a kernel declared
+ ///< with the read_only qualifier
+ UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS = 31, ///< [uint32_t] max number of image objects arguments of a kernel declared
+ ///< with the write_only qualifier
+ UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS = 32, ///< [uint32_t] max number of image objects arguments of a kernel declared
+ ///< with the read_write qualifier
+ UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH = 33, ///< [size_t] max width of Image2D object
+ UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT = 34, ///< [size_t] max height of Image2D object
+ UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH = 35, ///< [size_t] max width of Image3D object
+ UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT = 36, ///< [size_t] max height of Image3D object
+ UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH = 37, ///< [size_t] max depth of Image3D object
+ UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE = 38, ///< [size_t] max image buffer size
+ UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE = 39, ///< [size_t] max image array size
+ UR_DEVICE_INFO_MAX_SAMPLERS = 40, ///< [uint32_t] max number of samplers that can be used in a kernel
+ UR_DEVICE_INFO_MAX_PARAMETER_SIZE = 41, ///< [size_t] max size in bytes of all arguments passed to a kernel
+ UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN = 42, ///< [uint32_t] memory base address alignment
+ UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE = 43, ///< [::ur_device_mem_cache_type_t] global memory cache type
+ UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE = 44, ///< [uint32_t] global memory cache line size in bytes
+ UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE = 45, ///< [uint64_t] size of global memory cache in bytes
+ UR_DEVICE_INFO_GLOBAL_MEM_SIZE = 46, ///< [uint64_t] size of global memory in bytes
+ UR_DEVICE_INFO_GLOBAL_MEM_FREE = 47, ///< [uint64_t] size of global memory which is free in bytes
+ UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE = 48, ///< [uint64_t] max constant buffer size in bytes
+ UR_DEVICE_INFO_MAX_CONSTANT_ARGS = 49, ///< [uint32_t] max number of __const declared arguments in a kernel
+ UR_DEVICE_INFO_LOCAL_MEM_TYPE = 50, ///< [::ur_device_local_mem_type_t] local memory type
+ UR_DEVICE_INFO_LOCAL_MEM_SIZE = 51, ///< [uint64_t] local memory size in bytes
+ UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT = 52, ///< [::ur_bool_t] support error correction to global and local memory
+ UR_DEVICE_INFO_HOST_UNIFIED_MEMORY = 53, ///< [::ur_bool_t] unified host device memory
+ UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION = 54, ///< [size_t] profiling timer resolution in nanoseconds
+ UR_DEVICE_INFO_ENDIAN_LITTLE = 55, ///< [::ur_bool_t] little endian byte order
+ UR_DEVICE_INFO_AVAILABLE = 56, ///< [::ur_bool_t] device is available
+ UR_DEVICE_INFO_COMPILER_AVAILABLE = 57, ///< [::ur_bool_t] device compiler is available
+ UR_DEVICE_INFO_LINKER_AVAILABLE = 58, ///< [::ur_bool_t] device linker is available
+ UR_DEVICE_INFO_EXECUTION_CAPABILITIES = 59, ///< [::ur_device_exec_capability_flags_t] device kernel execution
+ ///< capability bit-field
+ UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES = 60, ///< [::ur_queue_flags_t] device command queue property bit-field
+ UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES = 61, ///< [::ur_queue_flags_t] host queue property bit-field
+ UR_DEVICE_INFO_BUILT_IN_KERNELS = 62, ///< [char[]] a semi-colon separated list of built-in kernels
+ UR_DEVICE_INFO_PLATFORM = 63, ///< [::ur_platform_handle_t] the platform associated with the device
+ UR_DEVICE_INFO_REFERENCE_COUNT = 64, ///< [uint32_t] Reference count of the device object.
+ ///< The reference count returned should be considered immediately stale.
+ ///< It is unsuitable for general use in applications. This feature is
+ ///< provided for identifying memory leaks.
+ UR_DEVICE_INFO_IL_VERSION = 65, ///< [char[]] IL version
+ UR_DEVICE_INFO_NAME = 66, ///< [char[]] Device name
+ UR_DEVICE_INFO_VENDOR = 67, ///< [char[]] Device vendor
+ UR_DEVICE_INFO_DRIVER_VERSION = 68, ///< [char[]] Driver version
+ UR_DEVICE_INFO_PROFILE = 69, ///< [char[]] Device profile
+ UR_DEVICE_INFO_VERSION = 70, ///< [char[]] Device version
+ UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION = 71, ///< [char[]] Version of backend runtime
+ UR_DEVICE_INFO_EXTENSIONS = 72, ///< [char[]] Return a space separated list of extension names
+ UR_DEVICE_INFO_PRINTF_BUFFER_SIZE = 73, ///< [size_t] Maximum size in bytes of internal printf buffer
+ UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC = 74, ///< [::ur_bool_t] prefer user synchronization when sharing object with
+ ///< other API
+ UR_DEVICE_INFO_PARENT_DEVICE = 75, ///< [::ur_device_handle_t] return parent device handle
+ UR_DEVICE_INFO_SUPPORTED_PARTITIONS = 76, ///< [::ur_device_partition_t[]] Returns an array of partition types
+ ///< supported by the device
+ UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES = 77, ///< [uint32_t] maximum number of sub-devices when the device is
+ ///< partitioned
+ UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN = 78, ///< [::ur_device_affinity_domain_flags_t] Returns a bit-field of the
+ ///< supported affinity domains for partitioning.
+ ///< If the device does not support any affinity domains, then 0 will be returned.
+ UR_DEVICE_INFO_PARTITION_TYPE = 79, ///< [::ur_device_partition_property_t[]] return an array of
+ ///< ::ur_device_partition_property_t for properties specified in
+ ///< ::urDevicePartition
+ UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS = 80, ///< [uint32_t] max number of sub groups
+ UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS = 81, ///< [::ur_bool_t] support sub group independent forward progress
+ UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL = 82, ///< [uint32_t[]] return an array of sub group sizes supported on Intel
+ ///< device
+ UR_DEVICE_INFO_USM_HOST_SUPPORT = 83, ///< [::ur_device_usm_access_capability_flags_t] support USM host memory
+ ///< access
+ UR_DEVICE_INFO_USM_DEVICE_SUPPORT = 84, ///< [::ur_device_usm_access_capability_flags_t] support USM device memory
+ ///< access
+ UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT = 85, ///< [::ur_device_usm_access_capability_flags_t] support USM single device
+ ///< shared memory access
+ UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT = 86, ///< [::ur_device_usm_access_capability_flags_t] support USM cross device
+ ///< shared memory access
+ UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT = 87, ///< [::ur_device_usm_access_capability_flags_t] support USM system wide
+ ///< shared memory access
+ UR_DEVICE_INFO_UUID = 88, ///< [uint8_t[]] return device UUID
+ UR_DEVICE_INFO_PCI_ADDRESS = 89, ///< [char[]] return device PCI address
+ UR_DEVICE_INFO_GPU_EU_COUNT = 90, ///< [uint32_t] return Intel GPU EU count
+ UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH = 91, ///< [uint32_t] return Intel GPU EU SIMD width
+ UR_DEVICE_INFO_GPU_EU_SLICES = 92, ///< [uint32_t] return Intel GPU number of slices
+ UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 93, ///< [uint32_t] return Intel GPU EU count per subslice
+ UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 94, ///< [uint32_t] return Intel GPU number of subslices per slice
+ UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU = 95, ///< [uint32_t] return Intel GPU number of threads per EU
+ UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH = 96, ///< [uint32_t] return max memory bandwidth in Mb/s
+ UR_DEVICE_INFO_IMAGE_SRGB = 97, ///< [::ur_bool_t] device supports sRGB images
+ UR_DEVICE_INFO_BUILD_ON_SUBDEVICE = 98, ///< [::ur_bool_t] Return true if sub-device should do its own program
+ ///< build
+ UR_DEVICE_INFO_ATOMIC_64 = 99, ///< [::ur_bool_t] support 64 bit atomics
+ UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 100, ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic
+ ///< memory order capabilities
+ UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 101, ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic
+ ///< memory scope capabilities
+ UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES = 102, ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic
+ ///< memory fence order capabilities
+ UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES = 103, ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic
+ ///< memory fence scope capabilities
+ UR_DEVICE_INFO_BFLOAT16 = 104, ///< [::ur_bool_t] support for bfloat16
+ UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES = 105, ///< [uint32_t] Returns 1 if the device doesn't have a notion of a
+ ///< queue index. Otherwise, returns the number of queue indices that are
+ ///< available for this device.
+ UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS = 106, ///< [::ur_bool_t] support the ::urKernelSetSpecializationConstants entry
+ ///< point
+ UR_DEVICE_INFO_MEMORY_BUS_WIDTH = 107, ///< [uint32_t] return the width in bits of the memory bus interface of the
+ ///< device.
+ UR_DEVICE_INFO_MAX_WORK_GROUPS_3D = 108, ///< [size_t[3]] return max 3D work groups
+ UR_DEVICE_INFO_ASYNC_BARRIER = 109, ///< [::ur_bool_t] return true if Async Barrier is supported
+ UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT = 110, ///< [::ur_bool_t] return true if specifying memory channels is supported
+ UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED = 111, ///< [::ur_bool_t] Return true if the device supports enqueueing commands
+ ///< to read and write pipes from the host.
+ UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP = 112, ///< [uint32_t] The maximum number of registers available per block.
+ UR_DEVICE_INFO_IP_VERSION = 113, ///< [uint32_t] The device IP version. The meaning of the device IP version
+ ///< is implementation-defined, but newer devices should have a higher
+ ///< version than older devices.
+ UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT = 114, ///< [::ur_bool_t] return true if the device supports virtual memory.
+ UR_DEVICE_INFO_ESIMD_SUPPORT = 115, ///< [::ur_bool_t] return true if the device supports ESIMD.
+ UR_DEVICE_INFO_COMPONENT_DEVICES = 116, ///< [::ur_device_handle_t[]] The set of component devices contained by
+ ///< this composite device.
+ UR_DEVICE_INFO_COMPOSITE_DEVICE = 117, ///< [::ur_device_handle_t] The composite device containing this component
+ ///< device.
+ UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP = 0x1000, ///< [::ur_bool_t] Returns true if the device supports the use of
+ ///< command-buffers.
+ UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP = 0x1001, ///< [::ur_bool_t] Returns true if the device supports updating the kernel
+ ///< commands in a command-buffer.
+ UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP = 0x2000, ///< [::ur_bool_t] returns true if the device supports the creation of
+ ///< bindless images
+ UR_DEVICE_INFO_BINDLESS_IMAGES_SHARED_USM_SUPPORT_EXP = 0x2001, ///< [::ur_bool_t] returns true if the device supports the creation of
+ ///< bindless images backed by shared USM
+ UR_DEVICE_INFO_BINDLESS_IMAGES_1D_USM_SUPPORT_EXP = 0x2002, ///< [::ur_bool_t] returns true if the device supports the creation of 1D
+ ///< bindless images backed by USM
+ UR_DEVICE_INFO_BINDLESS_IMAGES_2D_USM_SUPPORT_EXP = 0x2003, ///< [::ur_bool_t] returns true if the device supports the creation of 2D
+ ///< bindless images backed by USM
+ UR_DEVICE_INFO_IMAGE_PITCH_ALIGN_EXP = 0x2004, ///< [uint32_t] returns the required alignment of the pitch between two
+ ///< rows of an image in bytes
+ UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP = 0x2005, ///< [size_t] returns the maximum linear width allowed for images allocated
+ ///< using USM
+ UR_DEVICE_INFO_MAX_IMAGE_LINEAR_HEIGHT_EXP = 0x2006, ///< [size_t] returns the maximum linear height allowed for images
+ ///< allocated using USM
+ UR_DEVICE_INFO_MAX_IMAGE_LINEAR_PITCH_EXP = 0x2007, ///< [size_t] returns the maximum linear pitch allowed for images allocated
+ ///< using USM
+ UR_DEVICE_INFO_MIPMAP_SUPPORT_EXP = 0x2008, ///< [::ur_bool_t] returns true if the device supports allocating mipmap
+ ///< resources
+ UR_DEVICE_INFO_MIPMAP_ANISOTROPY_SUPPORT_EXP = 0x2009, ///< [::ur_bool_t] returns true if the device supports sampling mipmap
+ ///< images with anisotropic filtering
+ UR_DEVICE_INFO_MIPMAP_MAX_ANISOTROPY_EXP = 0x200A, ///< [uint32_t] returns the maximum anisotropic ratio supported by the
+ ///< device
+ UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP = 0x200B, ///< [::ur_bool_t] returns true if the device supports using images created
+ ///< from individual mipmap levels
+ UR_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP = 0x200C, ///< [::ur_bool_t] returns true if the device supports importing external
+ ///< memory resources
+ UR_DEVICE_INFO_INTEROP_MEMORY_EXPORT_SUPPORT_EXP = 0x200D, ///< [::ur_bool_t] returns true if the device supports exporting internal
+ ///< memory resources
+ UR_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP = 0x200E, ///< [::ur_bool_t] returns true if the device supports importing external
+ ///< semaphore resources
+ UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP = 0x200F, ///< [::ur_bool_t] returns true if the device supports exporting internal
+ ///< semaphore resources
+ UR_DEVICE_INFO_CUBEMAP_SUPPORT_EXP = 0x2010, ///< [::ur_bool_t] returns true if the device supports allocating and
+ ///< accessing cubemap resources
+ UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP = 0x2011, ///< [::ur_bool_t] returns true if the device supports sampling cubemapped
+ ///< images across face boundaries
+ UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP = 0x2012, ///< [::ur_bool_t] returns true if the device is capable of fetching USM
+ ///< backed 1D sampled image data.
+ UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP = 0x2013, ///< [::ur_bool_t] returns true if the device is capable of fetching
+ ///< non-USM backed 1D sampled image data.
+ UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP = 0x2014, ///< [::ur_bool_t] returns true if the device is capable of fetching USM
+ ///< backed 2D sampled image data.
+ UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP = 0x2015, ///< [::ur_bool_t] returns true if the device is capable of fetching
+ ///< non-USM backed 2D sampled image data.
+ UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP = 0x2016, ///< [::ur_bool_t] returns true if the device is capable of fetching USM
+ ///< backed 3D sampled image data.
+ UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP = 0x2017, ///< [::ur_bool_t] returns true if the device is capable of fetching
+ ///< non-USM backed 3D sampled image data.
+ UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP = 0x2018, ///< [::ur_bool_t] returns true if the device supports timestamp recording
/// @cond
UR_DEVICE_INFO_FORCE_UINT32 = 0x7fffffff
/// @endcond
@@ -1654,7 +1669,7 @@ typedef enum ur_device_info_t {
/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
/// + `NULL == hDevice`
/// - ::UR_RESULT_ERROR_INVALID_ENUMERATION
-/// + `::UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP < propName`
+/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName`
/// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION
/// + If `propName` is not supported by the adapter.
/// - ::UR_RESULT_ERROR_INVALID_SIZE
@@ -4757,7 +4772,8 @@ typedef enum ur_kernel_group_info_t {
UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE = 0, ///< [size_t[3]] Return Work Group maximum global size
UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE = 1, ///< [size_t] Return maximum Work Group size
UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE = 2, ///< [size_t[3]] Return Work Group size required by the source code, such
- ///< as __attribute__((required_work_group_size(X,Y,Z))
+ ///< as __attribute__((required_work_group_size(X,Y,Z))), or (0, 0, 0) if
+ ///< unspecified
UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE = 3, ///< [size_t] Return local memory required by the Kernel
UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE = 4, ///< [size_t] Return preferred multiple of Work Group size for launch
UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE = 5, ///< [size_t] Return minimum amount of private memory in bytes used by each
@@ -4773,7 +4789,8 @@ typedef enum ur_kernel_group_info_t {
typedef enum ur_kernel_sub_group_info_t {
UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE = 0, ///< [uint32_t] Return maximum SubGroup size
UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS = 1, ///< [uint32_t] Return maximum number of SubGroup
- UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS = 2, ///< [uint32_t] Return number of SubGroup required by the source code
+ UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS = 2, ///< [uint32_t] Return number of SubGroup required by the source code or 0
+ ///< if unspecified
UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL = 3, ///< [uint32_t] Return SubGroup size required by Intel
/// @cond
UR_KERNEL_SUB_GROUP_INFO_FORCE_UINT32 = 0x7fffffff
@@ -5606,6 +5623,7 @@ typedef enum ur_command_t {
UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP = 0x1000, ///< Event created by ::urCommandBufferEnqueueExp
UR_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP = 0x2000, ///< Event created by ::urBindlessImagesWaitExternalSemaphoreExp
UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP = 0x2001, ///< Event created by ::urBindlessImagesSignalExternalSemaphoreExp
+ UR_COMMAND_TIMESTAMP_RECORDING_EXP = 0x2002, ///< Event created by ::urEnqueueTimestampRecordingExp
/// @cond
UR_COMMAND_FORCE_UINT32 = 0x7fffffff
/// @endcond
@@ -5973,6 +5991,7 @@ urEventSetCallback(
/// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION
/// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE
/// - ::UR_RESULT_ERROR_INVALID_VALUE
+/// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGS - "The kernel argument values have not been specified."
/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY
/// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES
UR_APIEXPORT ur_result_t UR_APICALL
@@ -7535,7 +7554,6 @@ urBindlessImagesImageFreeExp(
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
/// + `NULL == pImageFormat`
/// + `NULL == pImageDesc`
-/// + `NULL == phMem`
/// + `NULL == phImage`
/// - ::UR_RESULT_ERROR_INVALID_CONTEXT
/// - ::UR_RESULT_ERROR_INVALID_VALUE
@@ -7550,7 +7568,6 @@ urBindlessImagesUnsampledImageCreateExp(
ur_exp_image_mem_handle_t hImageMem, ///< [in] handle to memory from which to create the image
const ur_image_format_t *pImageFormat, ///< [in] pointer to image format specification
const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description
- ur_mem_handle_t *phMem, ///< [out] pointer to handle of image object created
ur_exp_image_handle_t *phImage ///< [out] pointer to handle of image object created
);
@@ -7574,7 +7591,6 @@ urBindlessImagesUnsampledImageCreateExp(
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
/// + `NULL == pImageFormat`
/// + `NULL == pImageDesc`
-/// + `NULL == phMem`
/// + `NULL == phImage`
/// - ::UR_RESULT_ERROR_INVALID_CONTEXT
/// - ::UR_RESULT_ERROR_INVALID_VALUE
@@ -7591,7 +7607,6 @@ urBindlessImagesSampledImageCreateExp(
const ur_image_format_t *pImageFormat, ///< [in] pointer to image format specification
const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description
ur_sampler_handle_t hSampler, ///< [in] sampler to be used
- ur_mem_handle_t *phMem, ///< [out] pointer to handle of image object created
ur_exp_image_handle_t *phImage ///< [out] pointer to handle of image object created
);
@@ -8878,6 +8893,48 @@ urKernelSuggestMaxCooperativeGroupCountExp(
uint32_t *pGroupCountRet ///< [out] pointer to maximum number of groups
);
+#if !defined(__GNUC__)
+#pragma endregion
+#endif
+// Intel 'oneAPI' Unified Runtime Experimental APIs for enqueuing timestamp recordings
+#if !defined(__GNUC__)
+#pragma region enqueue timestamp recording(experimental)
+#endif
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Enqueue a command for recording the device timestamp
+///
+/// @returns
+/// - ::UR_RESULT_SUCCESS
+/// - ::UR_RESULT_ERROR_UNINITIALIZED
+/// - ::UR_RESULT_ERROR_DEVICE_LOST
+/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC
+/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE
+/// + `NULL == hQueue`
+/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+/// + `NULL == phEvent`
+/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST
+/// + `phEventWaitList == NULL && numEventsInWaitList > 0`
+/// + `phEventWaitList != NULL && numEventsInWaitList == 0`
+UR_APIEXPORT ur_result_t UR_APICALL
+urEnqueueTimestampRecordingExp(
+ ur_queue_handle_t hQueue, ///< [in] handle of the queue object
+ bool blocking, ///< [in] indicates whether the call to this function should block until
+ ///< the device timestamp recording command has executed on the
+ ///< device.
+ uint32_t numEventsInWaitList, ///< [in] size of the event wait list
+ const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of
+ ///< events that must be complete before this command can execute.
+ ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait
+ ///< events.
+ ur_event_handle_t *phEvent ///< [in,out] return an event object that identifies this particular
+ ///< timestamp recording command. Profiling information can be queried
+ ///< from this event as if `hQueue` had profiling enabled. Querying
+ ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT`
+ ///< reports the timestamp at the time of the call to this function.
+ ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END`
+ ///< reports the timestamp recorded when the command is executed on the device.
+);
+
#if !defined(__GNUC__)
#pragma endregion
#endif
@@ -10588,6 +10645,18 @@ typedef struct ur_enqueue_cooperative_kernel_launch_exp_params_t {
ur_event_handle_t **pphEvent;
} ur_enqueue_cooperative_kernel_launch_exp_params_t;
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for urEnqueueTimestampRecordingExp
+/// @details Each entry is a pointer to the parameter passed to the function;
+/// allowing the callback the ability to modify the parameter's value
+typedef struct ur_enqueue_timestamp_recording_exp_params_t {
+ ur_queue_handle_t *phQueue;
+ bool *pblocking;
+ uint32_t *pnumEventsInWaitList;
+ const ur_event_handle_t **pphEventWaitList;
+ ur_event_handle_t **pphEvent;
+} ur_enqueue_timestamp_recording_exp_params_t;
+
///////////////////////////////////////////////////////////////////////////////
/// @brief Function parameters for urBindlessImagesUnsampledImageHandleDestroyExp
/// @details Each entry is a pointer to the parameter passed to the function;
@@ -10640,7 +10709,6 @@ typedef struct ur_bindless_images_unsampled_image_create_exp_params_t {
ur_exp_image_mem_handle_t *phImageMem;
const ur_image_format_t **ppImageFormat;
const ur_image_desc_t **ppImageDesc;
- ur_mem_handle_t **pphMem;
ur_exp_image_handle_t **pphImage;
} ur_bindless_images_unsampled_image_create_exp_params_t;
@@ -10655,7 +10723,6 @@ typedef struct ur_bindless_images_sampled_image_create_exp_params_t {
const ur_image_format_t **ppImageFormat;
const ur_image_desc_t **ppImageDesc;
ur_sampler_handle_t *phSampler;
- ur_mem_handle_t **pphMem;
ur_exp_image_handle_t **pphImage;
} ur_bindless_images_sampled_image_create_exp_params_t;
diff --git a/include/ur_ddi.h b/include/ur_ddi.h
index 5a069f0881..52ae226c2a 100644
--- a/include/ur_ddi.h
+++ b/include/ur_ddi.h
@@ -7,7 +7,7 @@
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
* @file ur_ddi.h
- * @version v0.9-r0
+ * @version v0.10-r0
*
*/
#ifndef UR_DDI_H_INCLUDED
@@ -1448,10 +1448,20 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueCooperativeKernelLaunchExp_t)(
const ur_event_handle_t *,
ur_event_handle_t *);
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function-pointer for urEnqueueTimestampRecordingExp
+typedef ur_result_t(UR_APICALL *ur_pfnEnqueueTimestampRecordingExp_t)(
+ ur_queue_handle_t,
+ bool,
+ uint32_t,
+ const ur_event_handle_t *,
+ ur_event_handle_t *);
+
///////////////////////////////////////////////////////////////////////////////
/// @brief Table of EnqueueExp functions pointers
typedef struct ur_enqueue_exp_dditable_t {
ur_pfnEnqueueCooperativeKernelLaunchExp_t pfnCooperativeKernelLaunchExp;
+ ur_pfnEnqueueTimestampRecordingExp_t pfnTimestampRecordingExp;
} ur_enqueue_exp_dditable_t;
///////////////////////////////////////////////////////////////////////////////
@@ -1513,7 +1523,6 @@ typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesUnsampledImageCreateExp_t)(
ur_exp_image_mem_handle_t,
const ur_image_format_t *,
const ur_image_desc_t *,
- ur_mem_handle_t *,
ur_exp_image_handle_t *);
///////////////////////////////////////////////////////////////////////////////
@@ -1525,7 +1534,6 @@ typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesSampledImageCreateExp_t)(
const ur_image_format_t *,
const ur_image_desc_t *,
ur_sampler_handle_t,
- ur_mem_handle_t *,
ur_exp_image_handle_t *);
///////////////////////////////////////////////////////////////////////////////
diff --git a/include/ur_print.h b/include/ur_print.h
index 3126c1714b..3377980ce7 100644
--- a/include/ur_print.h
+++ b/include/ur_print.h
@@ -1954,6 +1954,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueWriteHostPipeParams(const stru
/// - `buff_size < out_size`
UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueCooperativeKernelLaunchExpParams(const struct ur_enqueue_cooperative_kernel_launch_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size);
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print ur_enqueue_timestamp_recording_exp_params_t struct
+/// @returns
+/// - ::UR_RESULT_SUCCESS
+/// - ::UR_RESULT_ERROR_INVALID_SIZE
+/// - `buff_size < out_size`
+UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueTimestampRecordingExpParams(const struct ur_enqueue_timestamp_recording_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size);
+
///////////////////////////////////////////////////////////////////////////////
/// @brief Print ur_bindless_images_unsampled_image_handle_destroy_exp_params_t struct
/// @returns
diff --git a/include/ur_print.hpp b/include/ur_print.hpp
index a5074c5da1..bed5f01670 100644
--- a/include/ur_print.hpp
+++ b/include/ur_print.hpp
@@ -7,7 +7,7 @@
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
* @file ur_print.hpp
- * @version v0.9-r0
+ * @version v0.10-r0
*
*/
#ifndef UR_PRINT_HPP
@@ -916,6 +916,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) {
case UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP:
os << "UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP";
break;
+ case UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP:
+ os << "UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP";
+ break;
default:
os << "unknown enumerator";
break;
@@ -2553,6 +2556,27 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) {
case UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP:
os << "UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP";
break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP:
+ os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP";
+ break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP:
+ os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP";
+ break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP:
+ os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP";
+ break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP:
+ os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP";
+ break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP:
+ os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP";
+ break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP:
+ os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP";
+ break;
+ case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP:
+ os << "UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP";
+ break;
default:
os << "unknown enumerator";
break;
@@ -4190,6 +4214,90 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_device_info
os << ")";
} break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP: {
+ const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+ if (sizeof(ur_bool_t) > size) {
+ os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+ return UR_RESULT_ERROR_INVALID_SIZE;
+ }
+ os << (const void *)(tptr) << " (";
+
+ os << *tptr;
+
+ os << ")";
+ } break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP: {
+ const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+ if (sizeof(ur_bool_t) > size) {
+ os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+ return UR_RESULT_ERROR_INVALID_SIZE;
+ }
+ os << (const void *)(tptr) << " (";
+
+ os << *tptr;
+
+ os << ")";
+ } break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP: {
+ const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+ if (sizeof(ur_bool_t) > size) {
+ os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+ return UR_RESULT_ERROR_INVALID_SIZE;
+ }
+ os << (const void *)(tptr) << " (";
+
+ os << *tptr;
+
+ os << ")";
+ } break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP: {
+ const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+ if (sizeof(ur_bool_t) > size) {
+ os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+ return UR_RESULT_ERROR_INVALID_SIZE;
+ }
+ os << (const void *)(tptr) << " (";
+
+ os << *tptr;
+
+ os << ")";
+ } break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP: {
+ const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+ if (sizeof(ur_bool_t) > size) {
+ os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+ return UR_RESULT_ERROR_INVALID_SIZE;
+ }
+ os << (const void *)(tptr) << " (";
+
+ os << *tptr;
+
+ os << ")";
+ } break;
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP: {
+ const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+ if (sizeof(ur_bool_t) > size) {
+ os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+ return UR_RESULT_ERROR_INVALID_SIZE;
+ }
+ os << (const void *)(tptr) << " (";
+
+ os << *tptr;
+
+ os << ")";
+ } break;
+ case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: {
+ const ur_bool_t *tptr = (const ur_bool_t *)ptr;
+ if (sizeof(ur_bool_t) > size) {
+ os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")";
+ return UR_RESULT_ERROR_INVALID_SIZE;
+ }
+ os << (const void *)(tptr) << " (";
+
+ os << *tptr;
+
+ os << ")";
+ } break;
default:
os << "unknown enumerator";
return UR_RESULT_ERROR_INVALID_ENUMERATION;
@@ -8698,6 +8806,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_command_t value) {
case UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP:
os << "UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP";
break;
+ case UR_COMMAND_TIMESTAMP_RECORDING_EXP:
+ os << "UR_COMMAND_TIMESTAMP_RECORDING_EXP";
+ break;
default:
os << "unknown enumerator";
break;
@@ -14014,6 +14125,48 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
return os;
}
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print operator for the ur_enqueue_timestamp_recording_exp_params_t type
+/// @returns
+/// std::ostream &
+inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_enqueue_timestamp_recording_exp_params_t *params) {
+
+ os << ".hQueue = ";
+
+ ur::details::printPtr(os,
+ *(params->phQueue));
+
+ os << ", ";
+ os << ".blocking = ";
+
+ os << *(params->pblocking);
+
+ os << ", ";
+ os << ".numEventsInWaitList = ";
+
+ os << *(params->pnumEventsInWaitList);
+
+ os << ", ";
+ os << ".phEventWaitList = {";
+ for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) {
+ if (i != 0) {
+ os << ", ";
+ }
+
+ ur::details::printPtr(os,
+ (*(params->pphEventWaitList))[i]);
+ }
+ os << "}";
+
+ os << ", ";
+ os << ".phEvent = ";
+
+ ur::details::printPtr(os,
+ *(params->pphEvent));
+
+ return os;
+}
+
///////////////////////////////////////////////////////////////////////////////
/// @brief Print operator for the ur_bindless_images_unsampled_image_handle_destroy_exp_params_t type
/// @returns
@@ -14165,12 +14318,6 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
ur::details::printPtr(os,
*(params->ppImageDesc));
- os << ", ";
- os << ".phMem = ";
-
- ur::details::printPtr(os,
- *(params->pphMem));
-
os << ", ";
os << ".phImage = ";
@@ -14221,12 +14368,6 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
ur::details::printPtr(os,
*(params->phSampler));
- os << ", ";
- os << ".phMem = ";
-
- ur::details::printPtr(os,
- *(params->pphMem));
-
os << ", ";
os << ".phImage = ";
@@ -17036,6 +17177,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_
case UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP: {
os << (const struct ur_enqueue_cooperative_kernel_launch_exp_params_t *)params;
} break;
+ case UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP: {
+ os << (const struct ur_enqueue_timestamp_recording_exp_params_t *)params;
+ } break;
case UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_HANDLE_DESTROY_EXP: {
os << (const struct ur_bindless_images_unsampled_image_handle_destroy_exp_params_t *)params;
} break;
diff --git a/scripts/Doxyfile b/scripts/Doxyfile
index 0329aeffff..64f0ac58a6 100644
--- a/scripts/Doxyfile
+++ b/scripts/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME = "Intel One API Unified Runtime API"
# could be handy for archiving the generated documentation or if some version
# control system is used.
-PROJECT_NUMBER = v0.9
+PROJECT_NUMBER = v0.10
# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
diff --git a/scripts/core/EXP-BINDLESS-IMAGES.rst b/scripts/core/EXP-BINDLESS-IMAGES.rst
index af90c1ea0f..ee54c69291 100644
--- a/scripts/core/EXP-BINDLESS-IMAGES.rst
+++ b/scripts/core/EXP-BINDLESS-IMAGES.rst
@@ -91,6 +91,12 @@ Enums
* ${X}_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP
* ${X}_DEVICE_INFO_CUBEMAP_SUPPORT_EXP
* ${X}_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP
+ * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP
+ * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP
+ * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP
+ * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP
+ * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP
+ * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP
* ${x}_command_t
* ${X}_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP
@@ -198,6 +204,8 @@ Changelog
+----------+-------------------------------------------------------------+
| 10.0 | Added cubemap image type, sampling properties, and device |
| | queries. |
++----------+-------------------------------------------------------------+
+| 11.0 | Added device queries for sampled image fetch capabilities. |
+----------+-------------------------------------------------------------+
Contributors
diff --git a/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst b/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst
new file mode 100644
index 0000000000..3cf46095b2
--- /dev/null
+++ b/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst
@@ -0,0 +1,70 @@
+<%
+ OneApi=tags['$OneApi']
+ x=tags['$x']
+ X=x.upper()
+%>
+
+.. _experimental-enqueue-timestamp-recording:
+
+================================================================================
+Enqueue Timestamp Recording
+================================================================================
+
+.. warning::
+
+ Experimental features:
+
+ * May be replaced, updated, or removed at any time.
+ * Do not require maintaining API/ABI stability of their own additions over
+ time.
+ * Do not require conformance testing of their own additions.
+
+
+Motivation
+--------------------------------------------------------------------------------
+Currently, the only way to get timestamp information is to enable profiling on
+a queue and retrieve the information from events associated with commands
+submitted to that queue. However, not all systems give the programmer who
+wants the profiling information full control over queue construction. To
+address this, this extension adds the ability to enqueue a timestamp recording
+on any queue, with or without profiling enabled. The resulting event can in
+turn be queried for the usual profiling information.
+
+
+API
+--------------------------------------------------------------------------------
+
+Enums
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* ${x}_device_info_t
+ * ${X}_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP
+
+* ${x}_command_t
+ * ${X}_COMMAND_TIMESTAMP_RECORDING_EXP
+
+Functions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+* ${x}EnqueueTimestampRecordingExp
+
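+Example
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The following sketch is illustrative rather than normative: it assumes a
+device handle ``hDevice`` and a queue ``hQueue`` created elsewhere, and it
+omits error handling.
+
+.. code-block:: cpp
+
+    // Feature support must be queried before the entry point is used.
+    ur_bool_t supported = false;
+    ${x}DeviceGetInfo(hDevice, ${X}_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP,
+                      sizeof(supported), &supported, nullptr);
+
+    if (supported) {
+        // Record a device timestamp; hQueue does not need to have been
+        // created with ${X}_QUEUE_FLAG_PROFILING_ENABLE.
+        ur_event_handle_t hEvent = nullptr;
+        ${x}EnqueueTimestampRecordingExp(hQueue, /*blocking*/ true, 0, nullptr,
+                                         &hEvent);
+
+        // The event behaves as if the queue had profiling enabled.
+        uint64_t start = 0;
+        ${x}EventGetProfilingInfo(hEvent, ${X}_PROFILING_INFO_COMMAND_START,
+                                  sizeof(start), &start, nullptr);
+        ${x}EventRelease(hEvent);
+    }
+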
+Changelog
+--------------------------------------------------------------------------------
+
++-----------+------------------------+
+| Revision | Changes |
++===========+========================+
+| 1.0 | Initial Draft |
++-----------+------------------------+
+
+
+Support
+--------------------------------------------------------------------------------
+
+Adapters which support this experimental feature *must* return true for the new
+`${X}_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP` device info query.
+
+
+Contributors
+--------------------------------------------------------------------------------
+
+* Steffen Larsen `steffen.larsen@intel.com <steffen.larsen@intel.com>`_
diff --git a/scripts/core/INTRO.rst b/scripts/core/INTRO.rst
index 52e78153ae..a81c282070 100644
--- a/scripts/core/INTRO.rst
+++ b/scripts/core/INTRO.rst
@@ -182,6 +182,8 @@ Unified Runtime loader implements tracing support through the `XPTI framework
+ - "`phEventWaitList == NULL && numEventsInWaitList > 0`"
+ - "`phEventWaitList != NULL && numEventsInWaitList == 0`"
diff --git a/scripts/core/kernel.yml b/scripts/core/kernel.yml
index 4ce4f9c70a..4a0bf0bab1 100644
--- a/scripts/core/kernel.yml
+++ b/scripts/core/kernel.yml
@@ -135,7 +135,7 @@ etors:
- name: WORK_GROUP_SIZE
desc: "[size_t] Return maximum Work Group size"
- name: COMPILE_WORK_GROUP_SIZE
- desc: "[size_t[3]] Return Work Group size required by the source code, such as __attribute__((required_work_group_size(X,Y,Z))"
+ desc: "[size_t[3]] Return Work Group size required by the source code, such as __attribute__((required_work_group_size(X,Y,Z)), or (0, 0, 0) if unspecified"
- name: LOCAL_MEM_SIZE
desc: "[size_t] Return local memory required by the Kernel"
- name: PREFERRED_WORK_GROUP_SIZE_MULTIPLE
@@ -154,7 +154,7 @@ etors:
- name: MAX_NUM_SUB_GROUPS
desc: "[uint32_t] Return maximum number of SubGroup"
- name: COMPILE_NUM_SUB_GROUPS
- desc: "[uint32_t] Return number of SubGroup required by the source code"
+ desc: "[uint32_t] Return number of SubGroup required by the source code or 0 if unspecified"
- name: SUB_GROUP_SIZE_INTEL
desc: "[uint32_t] Return SubGroup size required by Intel"
--- #--------------------------------------------------------------------------
diff --git a/scripts/core/platform.yml b/scripts/core/platform.yml
index 510a7e16b3..c35b71622d 100644
--- a/scripts/core/platform.yml
+++ b/scripts/core/platform.yml
@@ -140,6 +140,9 @@ etors:
- name: "0_9"
value: "$X_MAKE_VERSION( 0, 9 )"
desc: "version 0.9"
+ - name: "0_10"
+ value: "$X_MAKE_VERSION( 0, 10 )"
+ desc: "version 0.10"
--- #--------------------------------------------------------------------------
type: function
desc: "Returns the API version supported by the specified platform"
diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml
index 5ae70f7bd1..6a551d5821 100644
--- a/scripts/core/registry.yml
+++ b/scripts/core/registry.yml
@@ -580,6 +580,9 @@ etors:
- name: COMMAND_BUFFER_COMMAND_GET_INFO_EXP
desc: Enumerator for $xCommandBufferCommandGetInfoExp
value: '222'
+- name: ENQUEUE_TIMESTAMP_RECORDING_EXP
+ desc: Enumerator for $xEnqueueTimestampRecordingExp
+ value: '223'
---
type: enum
desc: Defines structure types
diff --git a/scripts/ctest_parser.py b/scripts/ctest_parser.py
index bd1a84f3ee..5074a5c9b9 100755
--- a/scripts/ctest_parser.py
+++ b/scripts/ctest_parser.py
@@ -24,7 +24,7 @@ def get_cts_test_suite_names(working_directory):
]
def percent(amount, total):
- return round((amount / total) * 100, 2)
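+ # Avoid a ZeroDivisionError when no tests were run.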
+ return round((amount / (total or 1)) * 100, 2)
def summarize_results(results):
total = results['Total']
diff --git a/scripts/generate_docs.py b/scripts/generate_docs.py
index 5c14305898..9b29123e40 100644
--- a/scripts/generate_docs.py
+++ b/scripts/generate_docs.py
@@ -10,6 +10,7 @@
import subprocess
import util
import re
+from parse_specs import Version
RE_ENABLE = r"^\#\#\s*\-\-validate\s*\=\s*on$"
RE_DISABLE = r"^\#\#\s*\-\-validate\s*\=\s*off$"
@@ -87,7 +88,7 @@ def _make_ref(symbol, symbol_type, meta):
generate a valid reStructuredText file
"""
def _generate_valid_rst(fin, fout, namespace, tags, ver, rev, meta, fast_mode):
- ver=float(ver)
+ ver = Version(ver)
enable = True
code_block = False
diff --git a/scripts/generate_kernel_header.py b/scripts/generate_kernel_header.py
index f765189334..46e78d6ed7 100755
--- a/scripts/generate_kernel_header.py
+++ b/scripts/generate_kernel_header.py
@@ -12,9 +12,7 @@
import subprocess
import sys
-from mako.template import Template
-
-HEADER_TEMPLATE = Template("""/*
+HEADER_TEMPLATE = """/*
*
* Copyright (C) 2023 Intel Corporation
*
@@ -22,7 +20,7 @@
* See LICENSE.TXT
* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
*
- * @file ${file_name}.h
+ * @file %s.h
*
*/
@@ -33,66 +31,63 @@
namespace uur {
namespace device_binaries {
std::map<std::string, std::vector<std::string>> program_kernel_map = {
-% for program, entry_points in kernel_name_dict.items():
- {"${program}", {
- % for entry_point in entry_points:
- "${entry_point}",
- % endfor
- }},
-% endfor
+%s
};
}
}
-""")
+"""
+PROGRAM_TEMPLATE = """\
+ {"%s", {
+%s
+ }},
+"""
+
+ENTRY_POINT_TEMPLATE = """\
+ "%s",
+"""
def generate_header(output_file, kernel_name_dict):
"""Render the template and write it to the output file."""
file_name = os.path.basename(output_file)
- rendered = HEADER_TEMPLATE.render(file_name=file_name,
- kernel_name_dict=kernel_name_dict)
+ device_binaries = ""
+ for program, entry_points in kernel_name_dict.items():
+ content = ""
+ for entry_point in entry_points:
+ content += ENTRY_POINT_TEMPLATE % entry_point
+ device_binaries += PROGRAM_TEMPLATE % (program, content)
+ rendered = HEADER_TEMPLATE % (file_name, device_binaries)
rendered = re.sub(r"\r\n", r"\n", rendered)
-
with open(output_file, "w") as fout:
fout.write(rendered)
-def get_mangled_names(dpcxx_path, source_file, output_header):
+def get_mangled_names(source_file, output_header):
"""Return a list of all the entry point names from a given sycl source file.
Filters out wrapper and offset handler entry points.
"""
output_dir = os.path.dirname(output_header)
- il_file = os.path.join(output_dir, os.path.basename(source_file) + ".ll")
- generate_il_command = f"""\
- {dpcxx_path} -S -fsycl -fsycl-device-code-split=off \
- -fsycl-device-only -o {il_file} {source_file}"""
- subprocess.run(generate_il_command, shell=True)
- kernel_line_regex = re.compile("define.*spir_kernel")
- definition_lines = []
- with open(il_file) as f:
+ name = os.path.splitext(os.path.basename(source_file))[0]
+ ih_file = os.path.join(output_dir, name, name + ".ih")
+ definitions = []
+ writing = False
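+ # The generated integration header declares every kernel name in a
+ # `const char* const kernel_names[] = { ... };` array; collect the quoted
+ # names between that line and the array's closing brace.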
+ with open(ih_file) as f:
lines = f.readlines()
for line in lines:
- if kernel_line_regex.search(line) is not None:
- definition_lines.append(line)
-
- entry_point_names = []
- kernel_name_regex = re.compile(r"@(.*?)\(")
- for line in definition_lines:
- if kernel_name_regex.search(line) is None:
- continue
- kernel_name = kernel_name_regex.search(line).group(1)
- if "kernel_wrapper" not in kernel_name and "with_offset" not in kernel_name:
- entry_point_names.append(kernel_name)
+ if "}" in line and writing:
+ break
+ # __pf_kernel_wrapper seems to be an internal function used by dpcpp
+ if writing and "19__pf_kernel_wrapper" not in line:
+ definitions.append(line.replace(",", "").strip()[1:-1])
+ if "const char* const kernel_names[] = {" in line:
+ writing = True
- os.remove(il_file)
- return entry_point_names
+ return definitions
def main():
parser = argparse.ArgumentParser()
- parser.add_argument("--dpcxx_path",
- help="Full path to dpc++ compiler executable.")
parser.add_argument(
"-o",
"--output",
@@ -105,7 +100,7 @@ def main():
for source_file in args.source_files:
program_name = os.path.splitext(os.path.basename(source_file))[0]
mangled_names[program_name] = get_mangled_names(
- args.dpcxx_path, source_file, args.output)
+ source_file, args.output)
generate_header(args.output, mangled_names)
diff --git a/scripts/parse_specs.py b/scripts/parse_specs.py
index 1bff8887f4..c54e0adbf8 100644
--- a/scripts/parse_specs.py
+++ b/scripts/parse_specs.py
@@ -6,20 +6,23 @@
SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
"""
+
import os
import generate_ids
import util
import re
import hashlib
import json
-import yaml
import copy
from templates.helper import param_traits, type_traits, value_traits
import ctypes
import itertools
+from typing import Dict, List, Optional
+from version import Version
+
-default_version = "0.9"
-all_versions = ["0.6", "0.7", "0.8", "0.9"]
+default_version = Version("0.10")
+all_versions = [Version(ver) for ver in ["0.6", "0.7", "0.8", "0.9", "0.10"]]
"""
preprocess object
@@ -105,14 +108,14 @@ def __validate_version(d, prefix="", base_version=default_version):
raise Exception(prefix+"'version' must be a string: '%s'"%type(d['version']))
try:
- version = str(float(d['version']))
+ version = str(d['version'])
except:
version = None
if version != d['version']:
raise Exception(prefix+"'version' invalid value: '%s'"%d['version'])
- return float(d.get('version', base_version))
+ return Version(d.get('version', base_version))
def __validate_tag(d, key, tags, case):
for x in tags:
@@ -127,7 +130,7 @@ def __validate_desc(desc):
raise Exception(prefix+"'version' must be a string: '%s'"%type(k))
try:
- version = str(float(k))
+ version = str(k)
except:
version = None
@@ -212,8 +215,8 @@ def __validate_etors(d, tags):
typed = d.get('typed_etors', False)
value = -1
- d_ver = d.get('version', default_version)
- max_ver = float(d_ver)
+ d_ver = Version(d.get('version', default_version))
+ max_ver = d_ver
for i, item in enumerate(d['etors']):
prefix="'etors'[%s] "%i
if not isinstance(item, dict):
@@ -300,8 +303,8 @@ def __validate_members(d, tags, meta):
if not isinstance(d['members'], list):
raise Exception("'members' must be a sequence: '%s'"%type(d['members']))
- d_ver = d.get('version', default_version)
- max_ver = float(d_ver)
+ d_ver = Version(d.get('version', default_version))
+ max_ver = d_ver
for i, item in enumerate(d['members']):
prefix="'members'[%s] "%i
if not isinstance(item, dict):
@@ -342,8 +345,8 @@ def __validate_params(d, tags, meta):
if not isinstance(d['params'], list):
raise Exception("'params' must be a sequence: '%s'"%type(d['params']))
- d_ver = d.get('version', default_version)
- max_ver = float(d_ver)
+ d_ver = Version(d.get('version', default_version))
+ max_ver = d_ver
min = {'[in]': None, '[out]': None, '[in,out]': None}
for i, item in enumerate(d['params']):
prefix="'params'[%s] "%i
@@ -501,24 +504,23 @@ def __validate_union_tag(d):
"""
filters object by version
"""
-def _filter_version(d, max_ver):
- ver = float(d.get('version', default_version))
+def _filter_version(d, max_ver: Version) -> Optional[Dict]:
+ ver = Version(d.get('version', default_version))
if ver > max_ver:
return None
- def __filter_desc(d):
+ def __filter_desc(d) -> dict:
if 'desc' in d and isinstance(d['desc'], dict):
for k, v in d['desc'].items():
- if float(k) <= max_ver:
- desc = v
- d['desc'] = desc
+ if Version(k) <= max_ver:
+ d['desc'] = v
return d
flt = []
type = d['type']
if 'enum' == type:
for e in d['etors']:
- ver = float(e.get('version', default_version))
+ ver = Version(e.get('version', default_version))
if ver <= max_ver:
flt.append(__filter_desc(e))
if d['name'].endswith('version_t'):
@@ -531,14 +533,14 @@ def __filter_desc(d):
elif 'function' == type:
for p in d['params']:
- ver = float(p.get('version', default_version))
+ ver = Version(p.get('version', default_version))
if ver <= max_ver:
flt.append(__filter_desc(p))
d['params'] = flt
elif 'struct' == type or 'union' == type or 'class' == type:
for m in d.get('members',[]):
- ver = float(m.get('version', default_version))
+ ver = Version(m.get('version', default_version))
if ver <= max_ver:
flt.append(__filter_desc(m))
d['members'] = flt
@@ -548,15 +550,15 @@ def __filter_desc(d):
"""
creates docs per version
"""
-def _make_versions(d, max_ver):
+def _make_versions(d, max_ver : Version) -> List[Version]:
docs = []
type = d['type']
if 'function' == type or 'struct' == type:
for ver in all_versions:
- if float(ver) > max_ver:
+ if ver > max_ver:
break
- dv = _filter_version(copy.deepcopy(d), float(ver))
+ dv = _filter_version(copy.deepcopy(d), ver)
if not dv:
continue
@@ -936,7 +938,7 @@ def parse(section, version, tags, meta, ref):
if not _validate_doc(f, d, tags, line_nums[i], meta):
continue
- d = _filter_version(d, float(version))
+ d = _filter_version(d, version)
if not d:
continue
@@ -948,7 +950,10 @@ def parse(section, version, tags, meta, ref):
# extract header from objects
if re.match(r"header", d['type']):
header = d
- header['ordinal'] = int(int(header.get('ordinal',"1000")) * float(header.get('version',"1.0")))
+ header["ordinal"] = int(
+ int(header.get("ordinal", "1000"))
+ * Version(header.get("version", "1.0")).major
+ )
header['ordinal'] *= 1000 if re.match(r"extension", header.get('desc',"").lower()) else 1
header['ordinal'] *= 1000 if re.match(r"experimental", header.get('desc',"").lower()) else 1
basename = os.path.splitext(os.path.basename(f))[0]
@@ -961,7 +966,7 @@ def parse(section, version, tags, meta, ref):
for c in '_-':
name = name.replace(c, ' ')
elif header:
- for d in _make_versions(d, float(version)):
+ for d in _make_versions(d, version):
objects.append(d)
meta = _generate_meta(d, header['ordinal'], meta)
diff --git a/scripts/run.py b/scripts/run.py
index b5a9ccc8b8..5d9b475007 100755
--- a/scripts/run.py
+++ b/scripts/run.py
@@ -13,9 +13,12 @@
import parse_specs
import generate_code
import generate_docs
-import os, sys, platform
+import os
+import sys
+import platform
import time
import subprocess
+from version import Version
"""
helper for adding mutually-exclusive boolean arguments "--name" and "--!name"
@@ -95,7 +98,7 @@ def get_version_from_cmakelists():
for line in cmakelists_file.readlines():
line = line.strip()
if line.startswith('project('):
- return re.findall(r'\d+\.\d+', line)[0]
+ return Version(re.findall(r'\d+\.\d+', line)[0])
raise Exception(f'unable to read project version from {cmakelists_path}')
@@ -117,8 +120,13 @@ def main():
add_argument(parser, "pdf", "generation of PDF file.")
add_argument(parser, "rst", "generation of reStructuredText files.", True)
parser.add_argument("--update_spec", type=str, help="root of integrated spec directory to update")
- parser.add_argument("--ver", type=str, default=get_version_from_cmakelists(),
- required=False, help="specification version to generate.")
+ parser.add_argument(
+ "--ver",
+ type=parse_specs.Version,
+ default=get_version_from_cmakelists(),
+ required=False,
+ help="specification version to generate.",
+ )
parser.add_argument("--api-json", type=str, default="unified_runtime.json", required=False, help="json output file for the spec")
parser.add_argument("--clang-format", type=str, default="clang-format", required=False, help="path to clang-format executable")
parser.add_argument('--fast-mode', action='store_true', help='Disable sections which are slow to render')
@@ -136,82 +144,75 @@ def main():
'meta' : {},
'ref' : {}
}
-
- try:
- for section in configParser.sections():
- input['configs'].append({
- 'name' : section,
- 'namespace': configParser.get(section,'namespace'),
- 'tags' : {'$'+key : configParser.get(section,key) for key in configParser.get(section,'tags').split(",")},
- })
-
- # phase 2: parse specs
- for config in input['configs']:
- specs, input['meta'], input['ref'] = parse_specs.parse(config['name'], args['ver'], config['tags'], input['meta'], input['ref'])
- input['specs'].append(specs)
+ for section in configParser.sections():
+ input['configs'].append({
+ 'name' : section,
+ 'namespace': configParser.get(section,'namespace'),
+ 'tags' : {'$'+key : configParser.get(section,key) for key in configParser.get(section,'tags').split(",")},
+ })
- util.jsonWrite(args['api_json'], input)
+ # phase 2: parse specs
+ for config in input['configs']:
+ specs, input['meta'], input['ref'] = parse_specs.parse(config['name'], args['ver'], config['tags'], input['meta'], input['ref'])
+ input['specs'].append(specs)
- # phase 3: generate files
- if args['clean']:
- clean()
+ util.jsonWrite(args['api_json'], input)
- incpath = os.path.join("../include/")
- srcpath = os.path.join("../source/")
- docpath = os.path.join("../docs/")
+ # phase 3: generate files
+ if args['clean']:
+ clean()
- generate_docs.prepare(docpath, args['rst'], args['html'], args['ver'])
+ incpath = os.path.join("../include/")
+ srcpath = os.path.join("../source/")
+ docpath = os.path.join("../docs/")
- for idx, specs in enumerate(input['specs']):
- config = input['configs'][idx]
- if args[config['name']]:
+ generate_docs.prepare(docpath, args['rst'], args['html'], args['ver'])
- generate_code.generate_api(incpath, srcpath, config['namespace'], config['tags'], args['ver'], args['rev'], specs, input['meta'])
+ for idx, specs in enumerate(input['specs']):
+ config = input['configs'][idx]
+ if args[config['name']]:
- # clang-format ur_api.h
- proc = subprocess.run([args['clang_format'], "--style=file", "-i" , "ur_api.h"], stderr=subprocess.PIPE, cwd=incpath)
- if proc.returncode != 0:
- print("-- clang-format failed with non-zero return code. --")
- print(proc.stderr.decode())
- raise Exception("Failed to format ur_api.h")
+ generate_code.generate_api(incpath, srcpath, config['namespace'], config['tags'], args['ver'], args['rev'], specs, input['meta'])
- if args['rst']:
- generate_docs.generate_rst(docpath, config['name'], config['namespace'], config['tags'], args['ver'], args['rev'], specs, input['meta'], args['fast_mode'])
+ # clang-format ur_api.h
+ proc = subprocess.run([args['clang_format'], "--style=file", "-i" , "ur_api.h"], stderr=subprocess.PIPE, cwd=incpath)
+ if proc.returncode != 0:
+ print("-- clang-format failed with non-zero return code. --")
+ print(proc.stderr.decode())
+ raise Exception("Failed to format ur_api.h")
- if util.makeErrorCount():
- print("\n%s Errors found during generation, stopping execution!"%util.makeErrorCount())
- return
+ if args['rst']:
+ generate_docs.generate_rst(docpath, config['name'], config['namespace'], config['tags'], args['ver'], args['rev'], specs, input['meta'], args['fast_mode'])
- if args['debug']:
- util.makoFileListWrite("generated.json")
+ if util.makeErrorCount():
+ print("\n%s Errors found during generation, stopping execution!"%util.makeErrorCount())
+ return
- # phase 4: build code
- if args['build']:
- if not build():
- print("\nBuild failed, stopping execution!")
- return
+ if args['debug']:
+ util.makoFileListWrite("generated.json")
- # phase 5: prep for publication of html or pdf
- if args['html'] or args['pdf']:
- generate_docs.generate_common(docpath, configParser.sections(), args['ver'], args['rev'])
+ # phase 4: build code
+ if args['build']:
+ if not build():
+ print("\nBuild failed, stopping execution!")
+ return
- # phase 5: publish documentation
- if args['html']:
- generate_docs.generate_html(docpath)
+ # phase 5: prep for publication of html or pdf
+ if args['html'] or args['pdf']:
+ generate_docs.generate_common(docpath, configParser.sections(), args['ver'], args['rev'])
- if args['pdf']:
- generate_docs.generate_pdf(docpath)
+ # phase 5: publish documentation
+ if args['html']:
+ generate_docs.generate_html(docpath)
- if args['update_spec']:
- update_spec(args['update_spec'])
+ if args['pdf']:
+ generate_docs.generate_pdf(docpath)
- print("\nCompleted in %.1f seconds!"%(time.time() - start))
+ if args['update_spec']:
+ update_spec(args['update_spec'])
- except BaseException as e:
- print("Failed to generate specification.")
- print(e)
- return sys.exit(1)
+ print("\nCompleted in %.1f seconds!"%(time.time() - start))
if __name__ == '__main__':
diff --git a/scripts/templates/helper.py b/scripts/templates/helper.py
index 3b8e5693b7..1d539d70fe 100644
--- a/scripts/templates/helper.py
+++ b/scripts/templates/helper.py
@@ -12,6 +12,7 @@
# allow imports from top-level scripts directory
sys.path.append("..")
+from version import Version
"""
Extracts traits from a spec object
@@ -1081,15 +1082,20 @@ def make_param_checks(namespace, tags, obj, cpp=False, meta=None):
def get_class_function_objs(specs, cname, version = None):
objects = []
for s in specs:
- for obj in s['objects']:
+ for obj in s["objects"]:
is_function = obj_traits.is_function(obj)
match_cls = cname == obj_traits.class_name(obj)
if is_function and match_cls:
if version is None:
objects.append(obj)
- elif float(obj.get('version',"1.0")) <= version:
+ elif Version(obj.get("version", "1.0")) <= version:
objects.append(obj)
- return sorted(objects, key=lambda obj: (float(obj.get('version',"1.0"))*10000) + int(obj.get('ordinal',"100")))
+ return sorted(
+ objects,
+ key=lambda obj: (Version(obj.get("version", "1.0")).major * 10000)
+ + int(obj.get("ordinal", "100")),
+ )
+
"""
Public:
@@ -1107,8 +1113,16 @@ def get_class_function_objs_exp(specs, cname):
exp_objects.append(obj)
else:
objects.append(obj)
- objects = sorted(objects, key=lambda obj: (float(obj.get('version',"1.0"))*10000) + int(obj.get('ordinal',"100")))
- exp_objects = sorted(exp_objects, key=lambda obj: (float(obj.get('version',"1.0"))*10000) + int(obj.get('ordinal',"100")))
+ objects = sorted(
+ objects,
+ key=lambda obj: (Version(obj.get("version", "1.0")).major * 10000)
+ + int(obj.get("ordinal", "100")),
+ )
+ exp_objects = sorted(
+ exp_objects,
+ key=lambda obj: (Version(obj.get("version", "1.0")).major * 10000)
+ + int(obj.get("ordinal", "100")),
+ )
return objects, exp_objects
"""
@@ -1222,7 +1236,7 @@ def _filter_param_list(params, filters1=["[in]", "[in,out]", "[out]"], filters2=
def get_pfncbtables(specs, meta, namespace, tags):
tables = []
for cname in sorted(meta['class'], key=lambda x: meta['class'][x]['ordinal']):
- objs = get_class_function_objs(specs, cname, 1.0)
+ objs = get_class_function_objs(specs, cname, Version('1.0'))
if len(objs) > 0:
name = get_table_name(namespace, tags, {'class': cname})
print(name)
diff --git a/scripts/templates/trcddi.cpp.mako b/scripts/templates/trcddi.cpp.mako
index 2ace43072b..6f6579d5ac 100644
--- a/scripts/templates/trcddi.cpp.mako
+++ b/scripts/templates/trcddi.cpp.mako
@@ -45,10 +45,16 @@ namespace ur_tracing_layer
${th.make_pfncb_param_type(n, tags, obj)} params = { &${",&".join(th.make_param_lines(n, tags, obj, format=["name"]))} };
uint64_t instance = context.notify_begin(${th.make_func_etor(n, tags, obj)}, "${th.make_func_name(n, tags, obj)}", &params);
+ context.logger.info("---> ${th.make_func_name(n, tags, obj)}");
+
${x}_result_t result = ${th.make_pfn_name(n, tags, obj)}( ${", ".join(th.make_param_lines(n, tags, obj, format=["name"]))} );
context.notify_end(${th.make_func_etor(n, tags, obj)}, "${th.make_func_name(n, tags, obj)}", &params, &result, instance);
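+ // Pretty-print the intercepted parameters and result for the tracing log.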
+ std::ostringstream args_str;
+ ur::extras::printFunctionParams(args_str, ${th.make_func_etor(n, tags, obj)}, &params);
+ context.logger.info("({}) -> {};\n", args_str.str(), result);
+
return result;
}
%if 'condition' in obj:
@@ -112,6 +118,10 @@ namespace ur_tracing_layer
return result;
}
+ // Recreate the logger in case env variables have been modified between
+ // program launch and the call to `urLoaderInit`
+ logger = logger::create_logger("tracing", true, true);
+
ur_tracing_layer::context.codelocData = codelocData;
%for tbl in th.get_pfntables(specs, meta, n, tags):
diff --git a/scripts/version.py b/scripts/version.py
new file mode 100644
index 0000000000..8fa72d65a7
--- /dev/null
+++ b/scripts/version.py
@@ -0,0 +1,41 @@
+"""
+Copyright (C) 2024 Intel Corporation
+
+Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+See LICENSE.TXT
+SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""
+
+import functools
+import re
+
+
+@functools.total_ordering
+class Version:
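+ """A "major.minor" version string with numeric comparison semantics."""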
+ def __init__(self, version: str):
+ assert isinstance(version, (str, Version))
+ if isinstance(version, str):
+ self.str = version
+ match = re.match(r"^(\d+)\.(\d+)$", self.str)
+ assert isinstance(match, re.Match)
+ self.major = int(match.groups()[0])
+ self.minor = int(match.groups()[1])
+ else:
+ self.str = version.str
+ self.major = version.major
+ self.minor = version.minor
+
+ def __eq__(self, other) -> bool:
+ assert isinstance(other, Version)
+ return self.major == other.major and self.minor == other.minor
+
+ def __lt__(self, other) -> bool:
+ assert isinstance(other, Version)
+ return self.major < other.major or (
+ self.major == other.major and self.minor < other.minor
+ )
+
+ def __str__(self) -> str:
+ return self.str
diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp
index 8f1ede3010..8d21a93c75 100644
--- a/source/adapters/cuda/command_buffer.cpp
+++ b/source/adapters/cuda/command_buffer.cpp
@@ -203,9 +203,10 @@ static ur_result_t enqueueCommandBufferFillHelper(
}
}
- UR_CHECK_ERROR(cuGraphAddMemsetNode(
- &GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
- DepsList.size(), &NodeParams, CommandBuffer->Device->getContext()));
+ UR_CHECK_ERROR(
+ cuGraphAddMemsetNode(&GraphNode, CommandBuffer->CudaGraph,
+ DepsList.data(), DepsList.size(), &NodeParams,
+ CommandBuffer->Device->getNativeContext()));
// Get sync point and register the cuNode with it.
*SyncPoint =
@@ -237,7 +238,7 @@ static ur_result_t enqueueCommandBufferFillHelper(
UR_CHECK_ERROR(cuGraphAddMemsetNode(
&GraphNodeFirst, CommandBuffer->CudaGraph, DepsList.data(),
DepsList.size(), &NodeParamsStepFirst,
- CommandBuffer->Device->getContext()));
+ CommandBuffer->Device->getNativeContext()));
// Get sync point and register the cuNode with it.
*SyncPoint = CommandBuffer->addSyncPoint(
@@ -269,7 +270,7 @@ static ur_result_t enqueueCommandBufferFillHelper(
UR_CHECK_ERROR(cuGraphAddMemsetNode(
&GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
DepsList.size(), &NodeParamsStep,
- CommandBuffer->Device->getContext()));
+ CommandBuffer->Device->getNativeContext()));
GraphNodePtr = std::make_shared<CUgraphNode>(GraphNode);
// Get sync point and register the cuNode with it.
@@ -478,7 +479,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp(
UR_CHECK_ERROR(cuGraphAddMemcpyNode(
&GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(),
- &NodeParams, hCommandBuffer->Device->getContext()));
+ &NodeParams, hCommandBuffer->Device->getNativeContext()));
// Get sync point and register the cuNode with it.
*pSyncPoint =
@@ -513,8 +514,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp(
}
try {
- auto Src = std::get<BufferMem>(hSrcMem->Mem).get() + srcOffset;
- auto Dst = std::get<BufferMem>(hDstMem->Mem).get() + dstOffset;
+ auto Src = std::get<BufferMem>(hSrcMem->Mem)
+ .getPtrWithOffset(hCommandBuffer->Device, srcOffset);
+ auto Dst = std::get<BufferMem>(hDstMem->Mem)
+ .getPtrWithOffset(hCommandBuffer->Device, dstOffset);
CUDA_MEMCPY3D NodeParams = {};
setCopyParams(&Src, CU_MEMORYTYPE_DEVICE, &Dst, CU_MEMORYTYPE_DEVICE, size,
@@ -522,7 +525,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp(
UR_CHECK_ERROR(cuGraphAddMemcpyNode(
&GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(),
- &NodeParams, hCommandBuffer->Device->getContext()));
+ &NodeParams, hCommandBuffer->Device->getNativeContext()));
// Get sync point and register the cuNode with it.
*pSyncPoint =
@@ -553,8 +556,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp(
}
try {
- CUdeviceptr SrcPtr = std::get<BufferMem>(hSrcMem->Mem).get();
- CUdeviceptr DstPtr = std::get<BufferMem>(hDstMem->Mem).get();
+ auto SrcPtr =
+ std::get<BufferMem>(hSrcMem->Mem).getPtr(hCommandBuffer->Device);
+ auto DstPtr =
+ std::get<BufferMem>(hDstMem->Mem).getPtr(hCommandBuffer->Device);
CUDA_MEMCPY3D NodeParams = {};
setCopyRectParams(region, &SrcPtr, CU_MEMORYTYPE_DEVICE, srcOrigin,
@@ -563,7 +568,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp(
UR_CHECK_ERROR(cuGraphAddMemcpyNode(
&GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(),
- &NodeParams, hCommandBuffer->Device->getContext()));
+ &NodeParams, hCommandBuffer->Device->getNativeContext()));
// Get sync point and register the cuNode with it.
*pSyncPoint =
@@ -593,7 +598,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp(
}
try {
- auto Dst = std::get<BufferMem>(hBuffer->Mem).get() + offset;
+ auto Dst = std::get<BufferMem>(hBuffer->Mem)
+ .getPtrWithOffset(hCommandBuffer->Device, offset);
CUDA_MEMCPY3D NodeParams = {};
setCopyParams(pSrc, CU_MEMORYTYPE_HOST, &Dst, CU_MEMORYTYPE_DEVICE, size,
@@ -601,7 +607,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp(
UR_CHECK_ERROR(cuGraphAddMemcpyNode(
&GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(),
- &NodeParams, hCommandBuffer->Device->getContext()));
+ &NodeParams, hCommandBuffer->Device->getNativeContext()));
// Get sync point and register the cuNode with it.
*pSyncPoint =
@@ -630,7 +636,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp(
}
try {
- auto Src = std::get<BufferMem>(hBuffer->Mem).get() + offset;
+ auto Src = std::get<BufferMem>(hBuffer->Mem)
+ .getPtrWithOffset(hCommandBuffer->Device, offset);
CUDA_MEMCPY3D NodeParams = {};
setCopyParams(&Src, CU_MEMORYTYPE_DEVICE, pDst, CU_MEMORYTYPE_HOST, size,
@@ -638,7 +645,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp(
UR_CHECK_ERROR(cuGraphAddMemcpyNode(
&GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(),
- &NodeParams, hCommandBuffer->Device->getContext()));
+ &NodeParams, hCommandBuffer->Device->getNativeContext()));
// Get sync point and register the cuNode with it.
*pSyncPoint =
@@ -670,7 +677,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp(
}
try {
- CUdeviceptr DstPtr = std::get<BufferMem>(hBuffer->Mem).get();
+ auto DstPtr =
+ std::get<BufferMem>(hBuffer->Mem).getPtr(hCommandBuffer->Device);
CUDA_MEMCPY3D NodeParams = {};
setCopyRectParams(region, pSrc, CU_MEMORYTYPE_HOST, hostOffset,
@@ -680,7 +688,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp(
UR_CHECK_ERROR(cuGraphAddMemcpyNode(
&GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(),
- &NodeParams, hCommandBuffer->Device->getContext()));
+ &NodeParams, hCommandBuffer->Device->getNativeContext()));
// Get sync point and register the cuNode with it.
*pSyncPoint =
@@ -712,7 +720,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp(
}
try {
- CUdeviceptr SrcPtr = std::get<BufferMem>(hBuffer->Mem).get();
+ auto SrcPtr =
+ std::get<BufferMem>(hBuffer->Mem).getPtr(hCommandBuffer->Device);
CUDA_MEMCPY3D NodeParams = {};
setCopyRectParams(region, &SrcPtr, CU_MEMORYTYPE_DEVICE, bufferOffset,
@@ -722,7 +731,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp(
UR_CHECK_ERROR(cuGraphAddMemcpyNode(
&GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(),
- &NodeParams, hCommandBuffer->Device->getContext()));
+ &NodeParams, hCommandBuffer->Device->getNativeContext()));
// Get sync point and register the cuNode with it.
*pSyncPoint =
@@ -821,7 +830,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp(
PatternSizeIsValid,
UR_RESULT_ERROR_INVALID_SIZE);
- auto DstDevice = std::get<BufferMem>(hBuffer->Mem).get() + offset;
+ auto DstDevice = std::get<BufferMem>(hBuffer->Mem)
+ .getPtrWithOffset(hCommandBuffer->Device, offset);
return enqueueCommandBufferFillHelper(
hCommandBuffer, &DstDevice, CU_MEMORYTYPE_DEVICE, pPattern, patternSize,
@@ -854,7 +864,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
try {
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
uint32_t StreamToken;
ur_stream_guard_ Guard;
CUstream CuStream = hQueue->getNextComputeStream(
@@ -972,7 +982,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
if (ArgValue == nullptr) {
Kernel->setKernelArg(ArgIndex, 0, nullptr);
} else {
- CUdeviceptr CuPtr = std::get<BufferMem>(ArgValue->Mem).get();
+ CUdeviceptr CuPtr =
+ std::get<BufferMem>(ArgValue->Mem).getPtr(CommandBuffer->Device);
Kernel->setKernelArg(ArgIndex, sizeof(CUdeviceptr), (void *)&CuPtr);
}
} catch (ur_result_t Err) {
diff --git a/source/adapters/cuda/command_buffer.hpp b/source/adapters/cuda/command_buffer.hpp
index 84a9e0405b..d83269f2ae 100644
--- a/source/adapters/cuda/command_buffer.hpp
+++ b/source/adapters/cuda/command_buffer.hpp
@@ -10,155 +10,13 @@
#include <ur/ur.hpp>
#include <ur_api.h>
+#include <ur_print.hpp>
#include "context.hpp"
#include "logger/ur_logger.hpp"
#include <cuda.h>
#include <memory>
-static inline const char *getUrResultString(ur_result_t Result) {
- switch (Result) {
- case UR_RESULT_SUCCESS:
- return "UR_RESULT_SUCCESS";
- case UR_RESULT_ERROR_INVALID_OPERATION:
- return "UR_RESULT_ERROR_INVALID_OPERATION";
- case UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES:
- return "UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES";
- case UR_RESULT_ERROR_INVALID_QUEUE:
- return "UR_RESULT_ERROR_INVALID_QUEUE";
- case UR_RESULT_ERROR_INVALID_VALUE:
- return "UR_RESULT_ERROR_INVALID_VALUE";
- case UR_RESULT_ERROR_INVALID_CONTEXT:
- return "UR_RESULT_ERROR_INVALID_CONTEXT";
- case UR_RESULT_ERROR_INVALID_PLATFORM:
- return "UR_RESULT_ERROR_INVALID_PLATFORM";
- case UR_RESULT_ERROR_INVALID_BINARY:
- return "UR_RESULT_ERROR_INVALID_BINARY";
- case UR_RESULT_ERROR_INVALID_PROGRAM:
- return "UR_RESULT_ERROR_INVALID_PROGRAM";
- case UR_RESULT_ERROR_INVALID_SAMPLER:
- return "UR_RESULT_ERROR_INVALID_SAMPLER";
- case UR_RESULT_ERROR_INVALID_BUFFER_SIZE:
- return "UR_RESULT_ERROR_INVALID_BUFFER_SIZE";
- case UR_RESULT_ERROR_INVALID_MEM_OBJECT:
- return "UR_RESULT_ERROR_INVALID_MEM_OBJECT";
- case UR_RESULT_ERROR_INVALID_EVENT:
- return "UR_RESULT_ERROR_INVALID_EVENT";
- case UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST:
- return "UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST";
- case UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET:
- return "UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET";
- case UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE:
- return "UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE";
- case UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE:
- return "UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE";
- case UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE:
- return "UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE";
- case UR_RESULT_ERROR_DEVICE_NOT_FOUND:
- return "UR_RESULT_ERROR_DEVICE_NOT_FOUND";
- case UR_RESULT_ERROR_INVALID_DEVICE:
- return "UR_RESULT_ERROR_INVALID_DEVICE";
- case UR_RESULT_ERROR_DEVICE_LOST:
- return "UR_RESULT_ERROR_DEVICE_LOST";
- case UR_RESULT_ERROR_DEVICE_REQUIRES_RESET:
- return "UR_RESULT_ERROR_DEVICE_REQUIRES_RESET";
- case UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE:
- return "UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE";
- case UR_RESULT_ERROR_DEVICE_PARTITION_FAILED:
- return "UR_RESULT_ERROR_DEVICE_PARTITION_FAILED";
- case UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT:
- return "UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT";
- case UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE:
- return "UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE";
- case UR_RESULT_ERROR_INVALID_WORK_DIMENSION:
- return "UR_RESULT_ERROR_INVALID_WORK_DIMENSION";
- case UR_RESULT_ERROR_INVALID_KERNEL_ARGS:
- return "UR_RESULT_ERROR_INVALID_KERNEL_ARGS";
- case UR_RESULT_ERROR_INVALID_KERNEL:
- return "UR_RESULT_ERROR_INVALID_KERNEL";
- case UR_RESULT_ERROR_INVALID_KERNEL_NAME:
- return "UR_RESULT_ERROR_INVALID_KERNEL_NAME";
- case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX:
- return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX";
- case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE:
- return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE";
- case UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE:
- return "UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE";
- case UR_RESULT_ERROR_INVALID_IMAGE_SIZE:
- return "UR_RESULT_ERROR_INVALID_IMAGE_SIZE";
- case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR:
- return "UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR";
- case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED:
- return "UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED";
- case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE:
- return "UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE";
- case UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE:
- return "UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE";
- case UR_RESULT_ERROR_UNINITIALIZED:
- return "UR_RESULT_ERROR_UNINITIALIZED";
- case UR_RESULT_ERROR_OUT_OF_HOST_MEMORY:
- return "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY";
- case UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY:
- return "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY";
- case UR_RESULT_ERROR_OUT_OF_RESOURCES:
- return "UR_RESULT_ERROR_OUT_OF_RESOURCES";
- case UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE:
- return "UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE";
- case UR_RESULT_ERROR_PROGRAM_LINK_FAILURE:
- return "UR_RESULT_ERROR_PROGRAM_LINK_FAILURE";
- case UR_RESULT_ERROR_UNSUPPORTED_VERSION:
- return "UR_RESULT_ERROR_UNSUPPORTED_VERSION";
- case UR_RESULT_ERROR_UNSUPPORTED_FEATURE:
- return "UR_RESULT_ERROR_UNSUPPORTED_FEATURE";
- case UR_RESULT_ERROR_INVALID_ARGUMENT:
- return "UR_RESULT_ERROR_INVALID_ARGUMENT";
- case UR_RESULT_ERROR_INVALID_NULL_HANDLE:
- return "UR_RESULT_ERROR_INVALID_NULL_HANDLE";
- case UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE:
- return "UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE";
- case UR_RESULT_ERROR_INVALID_NULL_POINTER:
- return "UR_RESULT_ERROR_INVALID_NULL_POINTER";
- case UR_RESULT_ERROR_INVALID_SIZE:
- return "UR_RESULT_ERROR_INVALID_SIZE";
- case UR_RESULT_ERROR_UNSUPPORTED_SIZE:
- return "UR_RESULT_ERROR_UNSUPPORTED_SIZE";
- case UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT:
- return "UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT";
- case UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT:
- return "UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT";
- case UR_RESULT_ERROR_INVALID_ENUMERATION:
- return "UR_RESULT_ERROR_INVALID_ENUMERATION";
- case UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION:
- return "UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION";
- case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT:
- return "UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT";
- case UR_RESULT_ERROR_INVALID_NATIVE_BINARY:
- return "UR_RESULT_ERROR_INVALID_NATIVE_BINARY";
- case UR_RESULT_ERROR_INVALID_GLOBAL_NAME:
- return "UR_RESULT_ERROR_INVALID_GLOBAL_NAME";
- case UR_RESULT_ERROR_INVALID_FUNCTION_NAME:
- return "UR_RESULT_ERROR_INVALID_FUNCTION_NAME";
- case UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION:
- return "UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION";
- case UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION:
- return "UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION";
- case UR_RESULT_ERROR_PROGRAM_UNLINKED:
- return "UR_RESULT_ERROR_PROGRAM_UNLINKED";
- case UR_RESULT_ERROR_OVERLAPPING_REGIONS:
- return "UR_RESULT_ERROR_OVERLAPPING_REGIONS";
- case UR_RESULT_ERROR_INVALID_HOST_PTR:
- return "UR_RESULT_ERROR_INVALID_HOST_PTR";
- case UR_RESULT_ERROR_INVALID_USM_SIZE:
- return "UR_RESULT_ERROR_INVALID_USM_SIZE";
- case UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE:
- return "UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE";
- case UR_RESULT_ERROR_ADAPTER_SPECIFIC:
- return "UR_RESULT_ERROR_ADAPTER_SPECIFIC";
- default:
- return "UR_RESULT_ERROR_UNKNOWN";
- }
-}
-
// Trace an internal UR call
#define UR_TRACE(Call) \
{ \
@@ -173,7 +31,7 @@ static inline const char *getUrResultString(ur_result_t Result) {
logger::always("UR ---> {}", #Call); \
Result = (Call); \
if (PrintTrace) \
- logger::always("UR <--- {}({})", #Call, getUrResultString(Result)); \
+ logger::always("UR <--- {}({})", #Call, Result); \
}
// Handle to a kernel command.
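
The simplified UR_TRACE above leans on the stream operator that ur_print.hpp declares for UR enums, so the hand-maintained getUrResultString switch is no longer needed. A minimal sketch of that mechanism, assuming only the public ur_print.hpp header:

    #include <iostream>
    #include <sstream>
    #include <ur_api.h>
    #include <ur_print.hpp> // declares operator<< for UR enums such as ur_result_t

    int main() {
        std::ostringstream OS;
        OS << UR_RESULT_ERROR_INVALID_DEVICE;
        // Prints the enumerator name, e.g. "UR_RESULT_ERROR_INVALID_DEVICE".
        std::cout << OS.str() << "\n";
        return 0;
    }
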
diff --git a/source/adapters/cuda/context.cpp b/source/adapters/cuda/context.cpp
index 40ae0ce4ad..0d08dab76c 100644
--- a/source/adapters/cuda/context.cpp
+++ b/source/adapters/cuda/context.cpp
@@ -46,23 +46,19 @@ UR_APIEXPORT ur_result_t UR_APICALL
urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices,
const ur_context_properties_t *pProperties,
ur_context_handle_t *phContext) {
- std::ignore = DeviceCount;
std::ignore = pProperties;
- assert(DeviceCount == 1);
- ur_result_t RetErr = UR_RESULT_SUCCESS;
-
std::unique_ptr<ur_context_handle_t_> ContextPtr{nullptr};
try {
ContextPtr = std::unique_ptr<ur_context_handle_t_>(
- new ur_context_handle_t_{*phDevices});
+ new ur_context_handle_t_{phDevices, DeviceCount});
*phContext = ContextPtr.release();
} catch (ur_result_t Err) {
- RetErr = Err;
+ return Err;
} catch (...) {
- RetErr = UR_RESULT_ERROR_OUT_OF_RESOURCES;
+ return UR_RESULT_ERROR_OUT_OF_RESOURCES;
}
- return RetErr;
+ return UR_RESULT_SUCCESS;
}
UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo(
@@ -72,9 +68,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo(
switch (static_cast<ur_context_info_t>(ContextInfoType)) {
case UR_CONTEXT_INFO_NUM_DEVICES:
- return ReturnValue(1);
+ return ReturnValue(static_cast<uint32_t>(hContext->getDevices().size()));
case UR_CONTEXT_INFO_DEVICES:
- return ReturnValue(hContext->getDevice());
+ return ReturnValue(hContext->getDevices().data(),
+ hContext->getDevices().size());
case UR_CONTEXT_INFO_REFERENCE_COUNT:
return ReturnValue(hContext->getReferenceCount());
case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
@@ -88,7 +85,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo(
int Major = 0;
UR_CHECK_ERROR(cuDeviceGetAttribute(
&Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
- hContext->getDevice()->get()));
+ hContext->getDevices()[0]->get()));
uint32_t Capabilities =
(Major >= 7) ? UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM |
UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP |
@@ -137,7 +134,10 @@ urContextRetain(ur_context_handle_t hContext) {
UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle(
ur_context_handle_t hContext, ur_native_handle_t *phNativeContext) {
- *phNativeContext = reinterpret_cast<ur_native_handle_t>(hContext->get());
+ // FIXME: this entry point has been deprecated in the SYCL RT and should be
+ // changed to unsupported once the deprecation period has elapsed.
+ *phNativeContext = reinterpret_cast<ur_native_handle_t>(
+ hContext->getDevices()[0]->getNativeContext());
return UR_RESULT_SUCCESS;
}
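
With this change a context is created over the full device list rather than a single device. A hedged usage sketch (error handling elided; `Devices` is assumed to have been populated by urDeviceGet on this platform):

    ur_context_handle_t Ctx = nullptr;
    urContextCreate(static_cast<uint32_t>(Devices.size()), Devices.data(),
                    /*pProperties=*/nullptr, &Ctx);

    uint32_t NumDevices = 0;
    urContextGetInfo(Ctx, UR_CONTEXT_INFO_NUM_DEVICES, sizeof(NumDevices),
                     &NumDevices, /*pPropSizeRet=*/nullptr);
    // NumDevices now reports every device passed in, not a hard-coded 1.
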
diff --git a/source/adapters/cuda/context.hpp b/source/adapters/cuda/context.hpp
index 4ddcaf3a87..f28e58afe7 100644
--- a/source/adapters/cuda/context.hpp
+++ b/source/adapters/cuda/context.hpp
@@ -33,27 +33,26 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data);
///
/// One of the main differences between the UR API and the CUDA driver API is
/// that the second modifies the state of the threads by assigning
-/// `CUcontext` objects to threads. `CUcontext` objects store data associated
+/// \c CUcontext objects to threads. \c CUcontext objects store data associated
/// with a given device and control access to said device from the user side.
/// UR API context are objects that are passed to functions, and not bound
/// to threads.
-/// The ur_context_handle_t_ object doesn't implement this behavior. It only
-/// holds the CUDA context data. The RAII object \ref ScopedContext implements
-/// the active context behavior.
///
-/// Primary vs User-defined context
+/// Since the \c ur_context_handle_t can contain multiple devices, and a \c
+/// CUcontext refers to only a single device, the \c CUcontext is more tightly
+/// coupled to a \c ur_device_handle_t than a \c ur_context_handle_t. In order
+/// to remove some ambiguities about the different semantics of
+/// \c ur_context_handle_t and native \c CUcontext, we access the native \c
+/// CUcontext solely through the \c ur_device_handle_t class, by using the
+/// object \ref ScopedContext, which sets the active device (by setting the
+/// active native \c CUcontext).
///
-/// CUDA has two different types of context, the Primary context,
-/// which is usable by all threads on a given process for a given device, and
-/// the aforementioned custom contexts.
-/// The CUDA documentation, confirmed with performance analysis, suggest using
-/// the Primary context whenever possible.
-/// The Primary context is also used by the CUDA Runtime API.
-/// For UR applications to interop with CUDA Runtime API, they have to use
-/// the primary context - and make that active in the thread.
-/// The `ur_context_handle_t_` object can be constructed with a `kind` parameter
-/// that allows to construct a Primary or `user-defined` context, so that
-/// the UR object interface is always the same.
+/// Primary vs User-defined \c CUcontext
+///
+/// CUDA has two different types of \c CUcontext, the Primary context, which is
+/// usable by all threads on a given process for a given device, and the
+/// aforementioned custom \c CUcontext objects. The CUDA documentation,
+/// confirmed with performance analysis, suggests using the Primary context
+/// whenever possible.
///
/// Destructor callback
///
@@ -63,6 +62,18 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data);
/// See proposal for details.
/// https://github.com/codeplaysoftware/standards-proposals/blob/master/extended-context-destruction/index.md
///
+///
+/// <b> Memory Management for Devices in a Context </b>
+///
+/// A \c ur_mem_handle_t is associated with a \c ur_context_handle_t_, which
+/// may refer to multiple devices. Therefore the \c ur_mem_handle_t must
+/// handle a native allocation for each device in the context. UR is
+/// responsible for automatically handling event dependencies for kernels
+/// writing to or reading from the same \c ur_mem_handle_t and migrating memory
+/// between native allocations for devices in the same \c ur_context_handle_t_
+/// if necessary.
+///
+///
struct ur_context_handle_t_ {
struct deleter_data {
@@ -72,18 +83,21 @@ struct ur_context_handle_t_ {
void operator()() { Function(UserData); }
};
- using native_type = CUcontext;
-
- native_type CUContext;
- ur_device_handle_t DeviceID;
+ std::vector<ur_device_handle_t> Devices;
std::atomic_uint32_t RefCount;
- ur_context_handle_t_(ur_device_handle_t_ *DevID)
- : CUContext{DevID->getContext()}, DeviceID{DevID}, RefCount{1} {
- urDeviceRetain(DeviceID);
+ ur_context_handle_t_(const ur_device_handle_t *Devs, uint32_t NumDevices)
+ : Devices{Devs, Devs + NumDevices}, RefCount{1} {
+ for (auto &Dev : Devices) {
+ urDeviceRetain(Dev);
+ }
};
- ~ur_context_handle_t_() { urDeviceRelease(DeviceID); }
+ ~ur_context_handle_t_() {
+ for (auto &Dev : Devices) {
+ urDeviceRelease(Dev);
+ }
+ }
void invokeExtendedDeleters() {
std::lock_guard<std::mutex> Guard(Mutex);
@@ -98,9 +112,9 @@ struct ur_context_handle_t_ {
ExtendedDeleters.emplace_back(deleter_data{Function, UserData});
}
- ur_device_handle_t getDevice() const noexcept { return DeviceID; }
-
- native_type get() const noexcept { return CUContext; }
+ const std::vector<ur_device_handle_t> &getDevices() const noexcept {
+ return Devices;
+ }
uint32_t incrementReferenceCount() noexcept { return ++RefCount; }
@@ -123,12 +137,11 @@ struct ur_context_handle_t_ {
namespace {
class ScopedContext {
public:
- ScopedContext(ur_context_handle_t Context) {
- if (!Context) {
- throw UR_RESULT_ERROR_INVALID_CONTEXT;
+ ScopedContext(ur_device_handle_t Device) {
+ if (!Device) {
+ throw UR_RESULT_ERROR_INVALID_DEVICE;
}
-
- setContext(Context->get());
+ setContext(Device->getNativeContext());
}
ScopedContext(CUcontext NativeContext) { setContext(NativeContext); }
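
Activation is now expressed against a device rather than a UR context. A minimal sketch of the intended pattern inside adapter entry points, assuming a valid `Device` handle:

    void doDriverWork(ur_device_handle_t Device) {
        // Binds Device's native CUcontext to this thread (via cuCtxSetCurrent)
        // if it is not already current.
        ScopedContext Active(Device);
        // ... CUDA driver API calls issued here target Device ...
    }
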
diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp
index da11f3f1bf..3a94587d1f 100644
--- a/source/adapters/cuda/device.cpp
+++ b/source/adapters/cuda/device.cpp
@@ -47,7 +47,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
static constexpr uint32_t MaxWorkItemDimensions = 3u;
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
switch ((uint32_t)propName) {
case UR_DEVICE_INFO_TYPE: {
@@ -926,6 +926,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
// CUDA supports cubemap seamless filtering.
return ReturnValue(true);
}
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP: {
+ // CUDA does support fetching 1D USM sampled image data.
+ return ReturnValue(true);
+ }
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP: {
+ // CUDA does not support fetching 1D non-USM sampled image data.
+ return ReturnValue(false);
+ }
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP: {
+ // CUDA does support fetching 2D USM sampled image data.
+ return ReturnValue(true);
+ }
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP: {
+ // CUDA does support fetching 2D non-USM sampled image data.
+ return ReturnValue(true);
+ }
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP: {
+ // CUDA does not support fetching 3D USM sampled image data.
+ return ReturnValue(false);
+ }
+ case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP: {
+ // CUDA does support fetching 3D non-USM sampled image data.
+ return ReturnValue(true);
+ }
+ case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: {
+ // CUDA supports recording timestamp events.
+ return ReturnValue(true);
+ }
case UR_DEVICE_INFO_DEVICE_ID: {
int Value = 0;
UR_CHECK_ERROR(cuDeviceGetAttribute(
@@ -1206,7 +1234,7 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice,
uint64_t *pDeviceTimestamp,
uint64_t *pHostTimestamp) {
CUevent Event;
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
if (pDeviceTimestamp) {
UR_CHECK_ERROR(cuEventCreate(&Event, CU_EVENT_DEFAULT));
diff --git a/source/adapters/cuda/device.hpp b/source/adapters/cuda/device.hpp
index 373c8a8ab9..0a40329026 100644
--- a/source/adapters/cuda/device.hpp
+++ b/source/adapters/cuda/device.hpp
@@ -22,6 +22,7 @@ struct ur_device_handle_t_ {
CUevent EvBase; // CUDA event used as base counter
std::atomic_uint32_t RefCount;
ur_platform_handle_t Platform;
+ uint32_t DeviceIndex;
static constexpr uint32_t MaxWorkItemDimensions = 3u;
size_t MaxWorkItemSizes[MaxWorkItemDimensions];
@@ -34,9 +35,9 @@ struct ur_device_handle_t_ {
public:
ur_device_handle_t_(native_type cuDevice, CUcontext cuContext, CUevent evBase,
- ur_platform_handle_t platform)
+ ur_platform_handle_t platform, uint32_t DevIndex)
: CuDevice(cuDevice), CuContext(cuContext), EvBase(evBase), RefCount{1},
- Platform(platform) {
+ Platform(platform), DeviceIndex{DevIndex} {
UR_CHECK_ERROR(cuDeviceGetAttribute(
&MaxRegsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
@@ -79,12 +80,16 @@ struct ur_device_handle_t_ {
native_type get() const noexcept { return CuDevice; };
- CUcontext getContext() const noexcept { return CuContext; };
+ CUcontext getNativeContext() const noexcept { return CuContext; };
uint32_t getReferenceCount() const noexcept { return RefCount; }
ur_platform_handle_t getPlatform() const noexcept { return Platform; };
+ // Returns the index of the device relative to the other devices in the same
+ // platform
+ uint32_t getIndex() const noexcept { return DeviceIndex; }
+
uint64_t getElapsedTime(CUevent) const;
size_t getMaxWorkItemSizes(int index) const noexcept {
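
The new platform-relative index is what lets a mem object pick the right native allocation for a device. A hypothetical sketch (not the adapter's verbatim code) of how such a lookup can be keyed:

    struct MultiDeviceAllocs {
        // One native allocation per device in the context, in platform order.
        std::vector<CUdeviceptr> Ptrs;
        CUdeviceptr getPtr(ur_device_handle_t Device) const {
            return Ptrs[Device->getIndex()];
        }
    };
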
diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 6f99941095..9627fc6da2 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -26,7 +26,7 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
UR_ASSERT(EventWaitList, UR_RESULT_SUCCESS);
try {
- ScopedContext Active(CommandQueue->getContext());
+ ScopedContext Active(CommandQueue->getDevice());
auto Result = forLatestEvents(
EventWaitList, NumEventsInWaitList,
@@ -188,7 +188,7 @@ bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device,
// @param [out] ThreadsPerBlock Number of threads per block we should run
// @param [out] BlocksPerGrid Number of blocks per grid we should run
ur_result_t
-setKernelParams(const ur_context_handle_t Context,
+setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
const ur_device_handle_t Device, const uint32_t WorkDim,
const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize,
const size_t *LocalWorkSize, ur_kernel_handle_t &Kernel,
@@ -201,7 +201,7 @@ setKernelParams(const ur_context_handle_t Context,
try {
// Set the active context here as guessLocalWorkSize needs an active context
- ScopedContext Active(Context);
+ ScopedContext Active(Device);
{
size_t *ReqdThreadsPerBlock = Kernel->ReqdThreadsPerBlock;
MaxWorkGroupSize = Device->getMaxWorkGroupSize();
@@ -271,7 +271,6 @@ setKernelParams(const ur_context_handle_t Context,
CudaImplicitOffset);
}
- auto Device = Context->getDevice();
if (LocalSize > static_cast(Device->getMaxCapacityLocalMem())) {
setErrorMessage("Excessive allocation of local memory on the device",
UR_RESULT_ERROR_ADAPTER_SPECIFIC);
@@ -329,7 +328,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
// This function makes one stream work on the previous work (or work
// represented by input events) and then all future work waits on that stream.
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
uint32_t StreamToken;
ur_stream_guard_ Guard;
CUstream CuStream = hQueue->getNextComputeStream(
@@ -410,11 +409,43 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
// Preconditions
- UR_ASSERT(hQueue->getContext() == hKernel->getContext(),
+ UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(),
UR_RESULT_ERROR_INVALID_KERNEL);
UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
+ std::vector<ur_event_handle_t> DepEvents(
+ phEventWaitList, phEventWaitList + numEventsInWaitList);
+ std::vector<std::pair<ur_mem_handle_t, ur_lock>> MemMigrationLocks;
+
+ // phEventWaitList only contains events that are handed to UR by the SYCL
+ // runtime. However since UR handles memory dependencies within a context
+ // we may need to add more events to our dependent events list if the UR
+ // context contains multiple devices
+ if (hQueue->getContext()->Devices.size() > 1) {
+ MemMigrationLocks.reserve(hKernel->Args.MemObjArgs.size());
+ for (auto &MemArg : hKernel->Args.MemObjArgs) {
+ bool PushBack = false;
+ if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj;
+ MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(),
+ MemDepEvent) == DepEvents.end()) {
+ DepEvents.push_back(MemDepEvent);
+ PushBack = true;
+ }
+ if ((MemArg.AccessFlags &
+ (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) ||
+ PushBack) {
+ if (std::find_if(MemMigrationLocks.begin(), MemMigrationLocks.end(),
+ [MemArg](auto &Lock) {
+ return Lock.first == MemArg.Mem;
+ }) == MemMigrationLocks.end())
+ MemMigrationLocks.emplace_back(
+ std::pair{MemArg.Mem, ur_lock{MemArg.Mem->MemoryMigrationMutex}});
+ }
+ }
+ }
+
+ // Early exit for zero size kernel
if (*pGlobalWorkSize == 0) {
return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
phEventWaitList, phEvent);
@@ -426,26 +457,37 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
uint32_t LocalSize = hKernel->getLocalSize();
- ur_result_t Result = UR_RESULT_SUCCESS;
CUfunction CuFunc = hKernel->get();
- Result = setKernelParams(hQueue->getContext(), hQueue->Device, workDim,
- pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
- hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
- if (Result != UR_RESULT_SUCCESS) {
- return Result;
- }
+ // This might return UR_RESULT_ERROR_ADAPTER_SPECIFIC, which cannot be handled
+ // using the standard UR_CHECK_ERROR
+ if (ur_result_t Ret =
+ setKernelParams(hQueue->getContext(), hQueue->Device, workDim,
+ pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
+ hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
+ Ret != UR_RESULT_SUCCESS)
+ return Ret;
try {
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
+ ScopedContext Active(hQueue->getDevice());
uint32_t StreamToken;
ur_stream_guard_ Guard;
CUstream CuStream = hQueue->getNextComputeStream(
numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
- Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
- phEventWaitList);
+ if (DepEvents.size()) {
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, DepEvents.size(),
+ DepEvents.data()));
+ }
+
+ // For memory migration across devices in the same context
+ if (hQueue->getContext()->Devices.size() > 1) {
+ for (auto &MemArg : hKernel->Args.MemObjArgs) {
+ migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice());
+ }
+ }
if (phEvent) {
RetImplEvent =
@@ -454,6 +496,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
UR_CHECK_ERROR(RetImplEvent->start());
}
+ // Once the event has been started we can unlock the MemoryMigrationMutexes
+ if (hQueue->getContext()->Devices.size() > 1) {
+ for (auto &MemArg : hKernel->Args.MemObjArgs) {
+ // Tell the ur_mem_handle_t that it will need to wait on this kernel's
+ // event, since the kernel may have written to it
+ if (phEvent && (MemArg.AccessFlags &
+ (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))) {
+ MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get());
+ }
+ }
+ // We can release the MemoryMigrationMutexes now
+ MemMigrationLocks.clear();
+ }
+
auto &ArgIndices = hKernel->getArgIndices();
UR_CHECK_ERROR(cuLaunchKernel(
CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2],
@@ -469,9 +525,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
}
} catch (ur_result_t Err) {
- Result = Err;
+ return Err;
}
- return Result;
+ return UR_RESULT_SUCCESS;
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
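
The extra bookkeeping in urEnqueueKernelLaunch condenses to a simple last-writer protocol. A hedged sketch of the idea (simplified; the real code above also takes the MemoryMigrationMutex locks and only runs for multi-device contexts):

    // Before launch: depend on the last event that wrote each mem argument,
    // and migrate its allocation to this queue's device if needed.
    for (auto &MemArg : hKernel->Args.MemObjArgs) {
        if (auto E = MemArg.Mem->LastEventWritingToMemObj)
            DepEvents.push_back(E);
        migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice());
    }
    // After the launch event is recorded: writers become the new last event.
    for (auto &MemArg : hKernel->Args.MemObjArgs)
        if (MemArg.AccessFlags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))
            MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get());
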
@@ -565,35 +621,52 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect(
size_t hostRowPitch, size_t hostSlicePitch, void *pDst,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {
- ur_result_t Result = UR_RESULT_SUCCESS;
- CUdeviceptr DevPtr = std::get(hBuffer->Mem).get();
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
- try {
- ScopedContext Active(hQueue->getContext());
- CUstream CuStream = hQueue->getNextTransferStream();
+ ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex};
+ auto Device = hQueue->getDevice();
+ ScopedContext Active(Device);
+ CUstream Stream = hQueue->getNextTransferStream();
- Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
- phEventWaitList);
+ try {
+ // Note that this entry point may be called on a queue that may not be the
+ // last queue to write to the MemBuffer, meaning we must perform the copy
+ // from a different device
+ if (hBuffer->LastEventWritingToMemObj &&
+ hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hQueue->getDevice()) {
+ hQueue = hBuffer->LastEventWritingToMemObj->getQueue();
+ Device = hQueue->getDevice();
+ ScopedContext Active(Device);
+ Stream = CUstream{0}; // Default stream for different device
+ // We may have to wait for an event on another queue if it is the last
+ // event writing to mem obj
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1,
+ &hBuffer->LastEventWritingToMemObj));
+ }
+
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList,
+ phEventWaitList));
if (phEvent) {
RetImplEvent =
std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
- UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, CuStream));
+ UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, Stream));
UR_CHECK_ERROR(RetImplEvent->start());
}
- Result = commonEnqueueMemBufferCopyRect(
- CuStream, region, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin,
+ auto DevPtr = std::get<BufferMem>(hBuffer->Mem).getPtr(Device);
+ UR_CHECK_ERROR(commonEnqueueMemBufferCopyRect(
+ Stream, region, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin,
bufferRowPitch, bufferSlicePitch, pDst, CU_MEMORYTYPE_HOST, hostOrigin,
- hostRowPitch, hostSlicePitch);
+ hostRowPitch, hostSlicePitch));
if (phEvent) {
UR_CHECK_ERROR(RetImplEvent->record());
}
if (blockingRead) {
- UR_CHECK_ERROR(cuStreamSynchronize(CuStream));
+ UR_CHECK_ERROR(cuStreamSynchronize(Stream));
}
if (phEvent) {
@@ -601,9 +674,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect(
}
} catch (ur_result_t Err) {
- Result = Err;
+ return Err;
}
- return Result;
+ return UR_RESULT_SUCCESS;
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect(
@@ -613,15 +686,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect(
size_t hostRowPitch, size_t hostSlicePitch, void *pSrc,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {
- ur_result_t Result = UR_RESULT_SUCCESS;
- CUdeviceptr DevPtr = std::get<BufferMem>(hBuffer->Mem).get();
+ CUdeviceptr DevPtr =
+ std::get<BufferMem>(hBuffer->Mem).getPtr(hQueue->getDevice());
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
try {
- ScopedContext active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream cuStream = hQueue->getNextTransferStream();
- Result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList,
- phEventWaitList);
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, cuStream, numEventsInWaitList,
+ phEventWaitList));
if (phEvent) {
RetImplEvent =
@@ -630,10 +703,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect(
UR_CHECK_ERROR(RetImplEvent->start());
}
- Result = commonEnqueueMemBufferCopyRect(
+ UR_CHECK_ERROR(commonEnqueueMemBufferCopyRect(
cuStream, region, pSrc, CU_MEMORYTYPE_HOST, hostOrigin, hostRowPitch,
hostSlicePitch, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin,
- bufferRowPitch, bufferSlicePitch);
+ bufferRowPitch, bufferSlicePitch));
if (phEvent) {
UR_CHECK_ERROR(RetImplEvent->record());
@@ -648,9 +721,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect(
}
} catch (ur_result_t Err) {
- Result = Err;
+ return Err;
}
- return Result;
+ return UR_RESULT_SUCCESS;
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy(
@@ -666,7 +739,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy(
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
ur_result_t Result = UR_RESULT_SUCCESS;
auto Stream = hQueue->getNextTransferStream();
@@ -680,8 +753,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy(
UR_CHECK_ERROR(RetImplEvent->start());
}
- auto Src = std::get<BufferMem>(hBufferSrc->Mem).get() + srcOffset;
- auto Dst = std::get<BufferMem>(hBufferDst->Mem).get() + dstOffset;
+ auto Src = std::get<BufferMem>(hBufferSrc->Mem)
+ .getPtrWithOffset(hQueue->getDevice(), srcOffset);
+ auto Dst = std::get<BufferMem>(hBufferDst->Mem)
+ .getPtrWithOffset(hQueue->getDevice(), dstOffset);
UR_CHECK_ERROR(cuMemcpyDtoDAsync(Dst, Src, size, Stream));
@@ -706,12 +781,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect(
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {
ur_result_t Result = UR_RESULT_SUCCESS;
- CUdeviceptr SrcPtr = std::get<BufferMem>(hBufferSrc->Mem).get();
- CUdeviceptr DstPtr = std::get<BufferMem>(hBufferDst->Mem).get();
+ CUdeviceptr SrcPtr =
+ std::get<BufferMem>(hBufferSrc->Mem).getPtr(hQueue->getDevice());
+ CUdeviceptr DstPtr =
+ std::get<BufferMem>(hBufferDst->Mem).getPtr(hQueue->getDevice());
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream CuStream = hQueue->getNextTransferStream();
Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
phEventWaitList);
@@ -784,7 +861,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
auto Stream = hQueue->getNextTransferStream();
ur_result_t Result =
@@ -797,7 +874,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
UR_CHECK_ERROR(RetImplEvent->start());
}
- auto DstDevice = std::get<BufferMem>(hBuffer->Mem).get() + offset;
+ auto DstDevice = std::get<BufferMem>(hBuffer->Mem)
+ .getPtrWithOffset(hQueue->getDevice(), offset);
auto N = size / patternSize;
// pattern size in bytes
@@ -933,18 +1011,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
std::ignore = rowPitch;
std::ignore = slicePitch;
- UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface,
- UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+ UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
- ur_result_t Result = UR_RESULT_SUCCESS;
+ ur_lock MemoryMigrationLock{hImage->MemoryMigrationMutex};
+ auto Device = hQueue->getDevice();
+ CUstream Stream = hQueue->getNextTransferStream();
try {
- ScopedContext Active(hQueue->getContext());
- CUstream CuStream = hQueue->getNextTransferStream();
- Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
- phEventWaitList);
+ // Note that this entry point may be called on a queue that may not be the
+ // last queue to write to the Image, meaning we must perform the copy
+ // from a different device
+ if (hImage->LastEventWritingToMemObj &&
+ hImage->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hQueue->getDevice()) {
+ hQueue = hImage->LastEventWritingToMemObj->getQueue();
+ Device = hQueue->getDevice();
+ ScopedContext Active(Device);
+ Stream = CUstream{0}; // Default stream for different device
+ // We may have to wait for an event on another queue if it is the last
+ // event writing to mem obj
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1,
+ &hImage->LastEventWritingToMemObj));
+ }
+
+ ScopedContext Active(Device);
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList,
+ phEventWaitList));
- CUarray Array = std::get<SurfaceMem>(hImage->Mem).getArray();
+ CUarray Array = std::get<SurfaceMem>(hImage->Mem).getArray(Device);
CUDA_ARRAY_DESCRIPTOR ArrayDesc;
UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array));
@@ -954,29 +1048,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
size_t ByteOffsetX = origin.x * ElementByteSize * ArrayDesc.NumChannels;
size_t BytesToCopy = ElementByteSize * ArrayDesc.NumChannels * region.width;
- ur_mem_type_t ImgType = std::get<SurfaceMem>(hImage->Mem).getImageType();
+ ur_mem_type_t ImgType = std::get<SurfaceMem>(hImage->Mem).getType();
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
if (phEvent) {
RetImplEvent =
std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
- UR_COMMAND_MEM_IMAGE_READ, hQueue, CuStream));
+ UR_COMMAND_MEM_IMAGE_READ, hQueue, Stream));
UR_CHECK_ERROR(RetImplEvent->start());
}
if (ImgType == UR_MEM_TYPE_IMAGE1D) {
UR_CHECK_ERROR(
- cuMemcpyAtoHAsync(pDst, Array, ByteOffsetX, BytesToCopy, CuStream));
+ cuMemcpyAtoHAsync(pDst, Array, ByteOffsetX, BytesToCopy, Stream));
} else {
ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height,
region.depth};
ur_rect_offset_t SrcOffset = {ByteOffsetX, origin.y, origin.z};
- Result = commonEnqueueMemImageNDCopy(
- CuStream, ImgType, AdjustedRegion, &Array, CU_MEMORYTYPE_ARRAY,
- SrcOffset, pDst, CU_MEMORYTYPE_HOST, ur_rect_offset_t{});
- if (Result != UR_RESULT_SUCCESS) {
- return Result;
- }
+ UR_CHECK_ERROR(commonEnqueueMemImageNDCopy(
+ Stream, ImgType, AdjustedRegion, &Array, CU_MEMORYTYPE_ARRAY,
+ SrcOffset, pDst, CU_MEMORYTYPE_HOST, ur_rect_offset_t{}));
}
if (phEvent) {
@@ -985,7 +1076,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
}
if (blockingRead) {
- UR_CHECK_ERROR(cuStreamSynchronize(CuStream));
+ UR_CHECK_ERROR(cuStreamSynchronize(Stream));
}
} catch (ur_result_t Err) {
return Err;
@@ -993,7 +1084,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
return UR_RESULT_ERROR_UNKNOWN;
}
- return Result;
+ return UR_RESULT_SUCCESS;
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite(
@@ -1005,18 +1096,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite(
std::ignore = rowPitch;
std::ignore = slicePitch;
- UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface,
- UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+ UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+ auto &Image = std::get<SurfaceMem>(hImage->Mem);
+ // FIXME: We are assuming that the host pointer outlives the image
+ if (!Image.HostPtr)
+ Image.HostPtr = pSrc;
ur_result_t Result = UR_RESULT_SUCCESS;
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream CuStream = hQueue->getNextTransferStream();
Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
phEventWaitList);
- CUarray Array = std::get<SurfaceMem>(hImage->Mem).getArray();
+ CUarray Array = Image.getArray(hQueue->getDevice());
CUDA_ARRAY_DESCRIPTOR ArrayDesc;
UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array));
@@ -1034,7 +1129,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite(
UR_CHECK_ERROR(RetImplEvent->start());
}
- ur_mem_type_t ImgType = std::get<SurfaceMem>(hImage->Mem).getImageType();
+ ur_mem_type_t ImgType = Image.getType();
if (ImgType == UR_MEM_TYPE_IMAGE1D) {
UR_CHECK_ERROR(
cuMemcpyHtoAAsync(Array, ByteOffsetX, pSrc, BytesToCopy, CuStream));
@@ -1071,24 +1166,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy(
ur_rect_offset_t dstOrigin, ur_rect_region_t region,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {
- UR_ASSERT(hImageSrc->MemType == ur_mem_handle_t_::Type::Surface,
- UR_RESULT_ERROR_INVALID_MEM_OBJECT);
- UR_ASSERT(hImageDst->MemType == ur_mem_handle_t_::Type::Surface,
- UR_RESULT_ERROR_INVALID_MEM_OBJECT);
- UR_ASSERT(std::get<SurfaceMem>(hImageSrc->Mem).getImageType() ==
- std::get<SurfaceMem>(hImageDst->Mem).getImageType(),
+ UR_ASSERT(hImageSrc->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+ UR_ASSERT(hImageDst->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+ UR_ASSERT(std::get<SurfaceMem>(hImageSrc->Mem).getType() ==
+ std::get<SurfaceMem>(hImageDst->Mem).getType(),
UR_RESULT_ERROR_INVALID_MEM_OBJECT);
ur_result_t Result = UR_RESULT_SUCCESS;
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream CuStream = hQueue->getNextTransferStream();
Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
phEventWaitList);
- CUarray SrcArray = std::get<SurfaceMem>(hImageSrc->Mem).getArray();
- CUarray DstArray = std::get<SurfaceMem>(hImageDst->Mem).getArray();
+ CUarray SrcArray =
+ std::get<SurfaceMem>(hImageSrc->Mem).getArray(hQueue->getDevice());
+ CUarray DstArray =
+ std::get<SurfaceMem>(hImageDst->Mem).getArray(hQueue->getDevice());
CUDA_ARRAY_DESCRIPTOR SrcArrayDesc;
UR_CHECK_ERROR(cuArrayGetDescriptor(&SrcArrayDesc, SrcArray));
@@ -1117,7 +1212,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy(
UR_CHECK_ERROR(RetImplEvent->start());
}
- ur_mem_type_t ImgType = std::get<SurfaceMem>(hImageSrc->Mem).getImageType();
+ ur_mem_type_t ImgType = std::get<SurfaceMem>(hImageSrc->Mem).getType();
ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height,
region.depth};
@@ -1154,8 +1249,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
ur_map_flags_t mapFlags, size_t offset, size_t size,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent, void **ppRetMap) {
- UR_ASSERT(hBuffer->MemType == ur_mem_handle_t_::Type::Buffer,
- UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+ UR_ASSERT(hBuffer->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
UR_ASSERT(offset + size <= std::get<BufferMem>(hBuffer->Mem).getSize(),
UR_RESULT_ERROR_INVALID_SIZE);
@@ -1177,7 +1271,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
MapPtr, numEventsInWaitList,
phEventWaitList, phEvent);
} else {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
if (IsPinned) {
Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList,
@@ -1208,8 +1302,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
ur_queue_handle_t hQueue, ur_mem_handle_t hMem, void *pMappedPtr,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {
- UR_ASSERT(hMem->MemType == ur_mem_handle_t_::Type::Buffer,
- UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+ UR_ASSERT(hMem->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
auto &BufferImpl = std::get<BufferMem>(hMem->Mem);
auto *Map = BufferImpl.getMapDetails(pMappedPtr);
@@ -1225,7 +1318,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
hQueue, hMem, true, Map->getMapOffset(), Map->getMapSize(), pMappedPtr,
numEventsInWaitList, phEventWaitList, phEvent);
} else {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
if (IsPinned) {
Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList,
@@ -1256,7 +1349,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
uint32_t StreamToken;
ur_stream_guard_ Guard;
CUstream CuStream = hQueue->getNextComputeStream(
@@ -1310,7 +1403,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy(
std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream CuStream = hQueue->getNextTransferStream();
Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
phEventWaitList);
@@ -1347,7 +1440,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
UR_CHECK_ERROR(cuPointerGetAttribute(
&PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem));
UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);
- ur_device_handle_t Device = hQueue->getContext()->getDevice();
+ ur_device_handle_t Device = hQueue->getDevice();
// Certain cuda devices and Windows do not have support for some Unified
// Memory features. cuMemPrefetchAsync requires concurrent memory access
@@ -1373,7 +1466,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream CuStream = hQueue->getNextTransferStream();
Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
phEventWaitList);
@@ -1414,7 +1507,7 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
(advice & UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE) ||
(advice & UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE) ||
(advice & UR_USM_ADVICE_FLAG_DEFAULT)) {
- ur_device_handle_t Device = hQueue->getContext()->getDevice();
+ ur_device_handle_t Device = hQueue->getDevice();
if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
setErrorMessage("Mem advise ignored as device does not support "
"concurrent managed access",
@@ -1441,7 +1534,7 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
if (phEvent) {
EventPtr =
@@ -1453,16 +1546,16 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
if (advice & UR_USM_ADVICE_FLAG_DEFAULT) {
UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size,
CU_MEM_ADVISE_UNSET_READ_MOSTLY,
- hQueue->getContext()->getDevice()->get()));
+ hQueue->getDevice()->get()));
UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size,
CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION,
- hQueue->getContext()->getDevice()->get()));
+ hQueue->getDevice()->get()));
UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size,
CU_MEM_ADVISE_UNSET_ACCESSED_BY,
- hQueue->getContext()->getDevice()->get()));
+ hQueue->getDevice()->get()));
} else {
Result = setCuMemAdvise((CUdeviceptr)pMem, size, advice,
- hQueue->getContext()->getDevice()->get());
+ hQueue->getDevice()->get());
}
if (phEvent) {
@@ -1493,7 +1586,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D(
ur_result_t result = UR_RESULT_SUCCESS;
try {
- ScopedContext active(hQueue->getContext());
+ ScopedContext active(hQueue->getDevice());
CUstream cuStream = hQueue->getNextTransferStream();
result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList,
phEventWaitList);
@@ -1543,33 +1636,50 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
UR_ASSERT(!hBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
UR_ASSERT(offset + size <= std::get<BufferMem>(hBuffer->Mem).Size,
UR_RESULT_ERROR_INVALID_SIZE);
-
- ur_result_t Result = UR_RESULT_SUCCESS;
- CUdeviceptr DevPtr = std::get<BufferMem>(hBuffer->Mem).get();
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
+ ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex};
+ auto Device = hQueue->getDevice();
+ ScopedContext Active(Device);
+ CUstream Stream = hQueue->getNextTransferStream();
try {
- ScopedContext Active(hQueue->getContext());
- CUstream CuStream = hQueue->getNextTransferStream();
-
- Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
- phEventWaitList);
+ // Note that this entry point may be called on a queue that may not be the
+ // last queue to write to the MemBuffer, meaning we must perform the copy
+ // from a different device
+ if (hBuffer->LastEventWritingToMemObj &&
+ hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hQueue->getDevice()) {
+ hQueue = hBuffer->LastEventWritingToMemObj->getQueue();
+ Device = hQueue->getDevice();
+ ScopedContext Active(Device);
+ Stream = CUstream{0}; // Default stream for different device
+ // We may have to wait for an event on another queue if it is the last
+ // event writing to mem obj
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1,
+ &hBuffer->LastEventWritingToMemObj));
+ }
+
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList,
+ phEventWaitList));
if (phEvent) {
RetImplEvent =
std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
- UR_COMMAND_MEM_BUFFER_READ, hQueue, CuStream));
+ UR_COMMAND_MEM_BUFFER_READ, hQueue, Stream));
UR_CHECK_ERROR(RetImplEvent->start());
}
- UR_CHECK_ERROR(cuMemcpyDtoHAsync(pDst, DevPtr + offset, size, CuStream));
+ UR_CHECK_ERROR(cuMemcpyDtoHAsync(
+ pDst,
+ std::get<BufferMem>(hBuffer->Mem).getPtrWithOffset(Device, offset),
+ size, Stream));
if (phEvent) {
UR_CHECK_ERROR(RetImplEvent->record());
}
if (blockingRead) {
- UR_CHECK_ERROR(cuStreamSynchronize(CuStream));
+ UR_CHECK_ERROR(cuStreamSynchronize(Stream));
}
if (phEvent) {
@@ -1577,10 +1687,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
}
} catch (ur_result_t Err) {
- Result = Err;
+ return Err;
}
-
- return Result;
+ return UR_RESULT_SUCCESS;
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite(
@@ -1592,11 +1701,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite(
UR_RESULT_ERROR_INVALID_SIZE);
ur_result_t Result = UR_RESULT_SUCCESS;
- CUdeviceptr DevPtr = std::get<BufferMem>(hBuffer->Mem).get();
+ CUdeviceptr DevPtr =
+ std::get<BufferMem>(hBuffer->Mem).getPtr(hQueue->getDevice());
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream CuStream = hQueue->getNextTransferStream();
Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
@@ -1709,3 +1819,33 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe(
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
+ ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList,
+ const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+
+ ur_result_t Result = UR_RESULT_SUCCESS;
std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
+ try {
+ ScopedContext Active(hQueue->getDevice());
+ CUstream CuStream = hQueue->getNextComputeStream();
+
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList,
+ phEventWaitList));
+
+ RetImplEvent =
std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
+ UR_COMMAND_TIMESTAMP_RECORDING_EXP, hQueue, CuStream));
+ UR_CHECK_ERROR(RetImplEvent->start());
+ UR_CHECK_ERROR(RetImplEvent->record());
+
+ if (blocking) {
+ UR_CHECK_ERROR(cuStreamSynchronize(CuStream));
+ }
+
+ *phEvent = RetImplEvent.release();
+ } catch (ur_result_t Err) {
+ Result = Err;
+ }
+ return Result;
+}
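
At the API level the new entry point pairs with the existing profiling query. A hedged usage sketch (error codes ignored; UR_PROFILING_INFO_COMMAND_START is assumed here to be the property read back for these events):

    ur_event_handle_t TsEvent = nullptr;
    urEnqueueTimestampRecordingExp(Queue, /*blocking=*/false,
                                   /*numEventsInWaitList=*/0, nullptr,
                                   &TsEvent);
    urEventWait(1, &TsEvent);

    uint64_t Timestamp = 0;
    urEventGetProfilingInfo(TsEvent, UR_PROFILING_INFO_COMMAND_START,
                            sizeof(Timestamp), &Timestamp, nullptr);
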
diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp
index ac66bf479e..f9889a3f46 100644
--- a/source/adapters/cuda/event.cpp
+++ b/source/adapters/cuda/event.cpp
@@ -54,7 +54,7 @@ ur_result_t ur_event_handle_t_::start() {
ur_result_t Result = UR_RESULT_SUCCESS;
try {
- if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) {
+ if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) {
// NOTE: This relies on the default stream to be unused.
UR_CHECK_ERROR(cuEventRecord(EvQueued, 0));
UR_CHECK_ERROR(cuEventRecord(EvStart, Stream));
@@ -149,7 +149,7 @@ ur_result_t ur_event_handle_t_::release() {
UR_CHECK_ERROR(cuEventDestroy(EvEnd));
- if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) {
+ if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) {
UR_CHECK_ERROR(cuEventDestroy(EvQueued));
UR_CHECK_ERROR(cuEventDestroy(EvStart));
}
@@ -190,7 +190,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(
UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet);
ur_queue_handle_t Queue = hEvent->getQueue();
- if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE)) {
+ if (Queue == nullptr || (!(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) &&
+ !hEvent->isTimestampEvent())) {
return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE;
}
@@ -220,7 +221,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t,
UR_APIEXPORT ur_result_t UR_APICALL
urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) {
try {
- ScopedContext Active(phEventWaitList[0]->getContext());
+ ScopedContext Active(phEventWaitList[0]->getQueue()->getDevice());
auto WaitFunc = [](ur_event_handle_t Event) -> ur_result_t {
UR_ASSERT(Event, UR_RESULT_ERROR_INVALID_EVENT);
@@ -255,7 +256,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) {
std::unique_ptr<ur_event_handle_t_> event_ptr{hEvent};
ur_result_t Result = UR_RESULT_ERROR_INVALID_EVENT;
try {
- ScopedContext Active(hEvent->getContext());
Result = hEvent->release();
} catch (...) {
Result = UR_RESULT_ERROR_OUT_OF_RESOURCES;
diff --git a/source/adapters/cuda/event.hpp b/source/adapters/cuda/event.hpp
index 390fd7833a..5ed68f0f25 100644
--- a/source/adapters/cuda/event.hpp
+++ b/source/adapters/cuda/event.hpp
@@ -57,6 +57,10 @@ struct ur_event_handle_t_ {
return UR_EVENT_STATUS_COMPLETE;
}
+ bool isTimestampEvent() const noexcept {
+ return getCommandType() == UR_COMMAND_TIMESTAMP_RECORDING_EXP;
+ }
+
ur_context_handle_t getContext() const noexcept { return Context; };
uint32_t incrementReferenceCount() { return ++RefCount; }
@@ -83,13 +87,14 @@ struct ur_event_handle_t_ {
static ur_event_handle_t
makeNative(ur_command_t Type, ur_queue_handle_t Queue, CUstream Stream,
uint32_t StreamToken = std::numeric_limits<uint32_t>::max()) {
- const bool ProfilingEnabled =
- Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE;
+ const bool RequiresTimings =
+ Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE ||
+ Type == UR_COMMAND_TIMESTAMP_RECORDING_EXP;
native_type EvEnd = nullptr, EvQueued = nullptr, EvStart = nullptr;
UR_CHECK_ERROR(cuEventCreate(
- &EvEnd, ProfilingEnabled ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING));
+ &EvEnd, RequiresTimings ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING));
- if (ProfilingEnabled) {
+ if (RequiresTimings) {
UR_CHECK_ERROR(cuEventCreate(&EvQueued, CU_EVENT_DEFAULT));
UR_CHECK_ERROR(cuEventCreate(&EvStart, CU_EVENT_DEFAULT));
}
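
Timing-capable CUDA events are more expensive to record, so EvQueued and
EvStart are only created when the queue enables profiling or the event is a
timestamp recording; everything else keeps CU_EVENT_DISABLE_TIMING. A hedged
sketch of how a duration could be derived from the events created above
(assumes the EvStart/EvEnd members of ur_event_handle_t_):

    float Ms = 0.0f;
    UR_CHECK_ERROR(cuEventSynchronize(EvEnd));
    UR_CHECK_ERROR(cuEventElapsedTime(&Ms, EvStart, EvEnd));
    uint64_t Ns = static_cast<uint64_t>(Ms * 1.0e6f);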
diff --git a/source/adapters/cuda/image.cpp b/source/adapters/cuda/image.cpp
index 3dd1a7f34b..95dc2e258e 100644
--- a/source/adapters/cuda/image.cpp
+++ b/source/adapters/cuda/image.cpp
@@ -331,7 +331,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPitchedAllocExp(
const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool,
size_t widthInBytes, size_t height, size_t elementSizeBytes, void **ppMem,
size_t *pResultPitch) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
std::ignore = pUSMDesc;
std::ignore = pool;
@@ -350,7 +352,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPitchedAllocExp(
UR_RESULT_ERROR_INVALID_VALUE);
ur_result_t Result = UR_RESULT_SUCCESS;
try {
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
UR_CHECK_ERROR(cuMemAllocPitch((CUdeviceptr *)ppMem, pResultPitch,
widthInBytes, height, elementSizeBytes));
} catch (ur_result_t error) {
@@ -366,7 +368,9 @@ UR_APIEXPORT ur_result_t UR_APICALL
urBindlessImagesUnsampledImageHandleDestroyExp(ur_context_handle_t hContext,
ur_device_handle_t hDevice,
ur_exp_image_handle_t hImage) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
UR_CHECK_ERROR(cuSurfObjectDestroy((CUsurfObject)hImage));
@@ -377,7 +381,9 @@ UR_APIEXPORT ur_result_t UR_APICALL
urBindlessImagesSampledImageHandleDestroyExp(ur_context_handle_t hContext,
ur_device_handle_t hDevice,
ur_exp_image_handle_t hImage) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
UR_CHECK_ERROR(cuTexObjectDestroy((CUtexObject)hImage));
@@ -388,7 +394,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc,
ur_exp_image_mem_handle_t *phImageMem) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
// Populate descriptor
@@ -435,7 +443,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp(
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
// Allocate a cuArray
if (pImageDesc->numMipLevel == 1) {
@@ -475,10 +483,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp(
UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageFreeExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_image_mem_handle_t hImageMem) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
try {
UR_CHECK_ERROR(cuArrayDestroy((CUarray)hImageMem));
} catch (ur_result_t Err) {
@@ -492,9 +502,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageFreeExp(
UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_image_mem_handle_t hImageMem, const ur_image_format_t *pImageFormat,
- const ur_image_desc_t *pImageDesc, ur_mem_handle_t *phMem,
+ [[maybe_unused]] const ur_image_desc_t *pImageDesc,
ur_exp_image_handle_t *phImage) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
unsigned int NumChannels = 0;
@@ -509,7 +521,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
try {
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
CUDA_RESOURCE_DESC image_res_desc = {};
@@ -523,15 +535,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
UR_CHECK_ERROR(cuSurfObjectCreate(&surface, &image_res_desc));
*phImage = (ur_exp_image_handle_t)surface;
- auto urMemObj = std::unique_ptr<ur_mem_handle_t_>(new ur_mem_handle_t_{
- hContext, (CUarray)hImageMem, surface, pImageDesc->type});
-
- if (urMemObj == nullptr) {
- return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
- }
-
- *phMem = urMemObj.release();
-
} catch (ur_result_t Err) {
return Err;
} catch (...) {
@@ -545,11 +548,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_image_mem_handle_t hImageMem, const ur_image_format_t *pImageFormat,
const ur_image_desc_t *pImageDesc, ur_sampler_handle_t hSampler,
- ur_mem_handle_t *phMem, ur_exp_image_handle_t *phImage) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ ur_exp_image_handle_t *phImage) {
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
unsigned int NumChannels = 0;
UR_CHECK_ERROR(
@@ -611,15 +616,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp(
UR_CHECK_ERROR(
urTextureCreate(hSampler, pImageDesc, image_res_desc, phImage));
- auto urMemObj = std::unique_ptr<ur_mem_handle_t_>(new ur_mem_handle_t_{
- hContext, (CUarray)hImageMem, (CUtexObject)*phImage, hSampler,
- pImageDesc->type});
-
- if (urMemObj == nullptr) {
- return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
- }
-
- *phMem = urMemObj.release();
} catch (ur_result_t Err) {
return Err;
} catch (...) {
@@ -654,7 +650,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
&PixelSizeBytes));
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream Stream = hQueue->getNextTransferStream();
enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList);
@@ -986,11 +982,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapGetLevelExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_image_mem_handle_t hImageMem, uint32_t mipmapLevel,
ur_exp_image_mem_handle_t *phImageMem) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
try {
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
CUarray ImageArray;
UR_CHECK_ERROR(cuMipmappedArrayGetLevel(
&ImageArray, (CUmipmappedArray)hImageMem, mipmapLevel));
@@ -1007,10 +1005,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapGetLevelExp(
UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapFreeExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_image_mem_handle_t hMem) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
try {
UR_CHECK_ERROR(cuMipmappedArrayDestroy((CUmipmappedArray)hMem));
} catch (ur_result_t Err) {
@@ -1025,11 +1025,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportOpaqueFDExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size,
ur_exp_interop_mem_desc_t *pInteropMemDesc,
ur_exp_interop_mem_handle_t *phInteropMem) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
try {
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc = {};
extMemDesc.size = size;
@@ -1068,7 +1070,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp(
const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc,
ur_exp_interop_mem_handle_t hInteropMem,
ur_exp_image_mem_handle_t *phImageMem) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
unsigned int NumChannels = 0;
@@ -1080,7 +1084,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp(
pImageFormat->channelType, pImageFormat->channelOrder, &format, nullptr));
try {
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
CUDA_ARRAY3D_DESCRIPTOR ArrayDesc = {};
ArrayDesc.Width = pImageDesc->width;
@@ -1119,11 +1123,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp(
UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_interop_mem_handle_t hInteropMem) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
try {
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
UR_CHECK_ERROR(cuDestroyExternalMemory((CUexternalMemory)hInteropMem));
} catch (ur_result_t Err) {
return Err;
@@ -1138,11 +1144,13 @@ urBindlessImagesImportExternalSemaphoreOpaqueFDExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_interop_semaphore_desc_t *pInteropSemaphoreDesc,
ur_exp_interop_semaphore_handle_t *phInteropSemaphoreHandle) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
try {
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC extSemDesc = {};
@@ -1177,11 +1185,13 @@ urBindlessImagesImportExternalSemaphoreOpaqueFDExp(
UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesDestroyExternalSemaphoreExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_interop_semaphore_handle_t hInteropSemaphore) {
- UR_ASSERT((hContext->getDevice()->get() == hDevice->get()),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
try {
- ScopedContext Active(hDevice->getContext());
+ ScopedContext Active(hDevice);
UR_CHECK_ERROR(
cuDestroyExternalSemaphore((CUexternalSemaphore)hInteropSemaphore));
} catch (ur_result_t Err) {
@@ -1198,7 +1208,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp(
ur_event_handle_t *phEvent) {
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream Stream = hQueue->getNextTransferStream();
enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList);
@@ -1230,7 +1240,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp(
ur_event_handle_t *phEvent) {
try {
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
CUstream Stream = hQueue->getNextTransferStream();
enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList);
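
Every bindless-images entry point above now checks that hDevice belongs to
the context instead of comparing against a single device. The repeated
std::find could be factored into a helper; a hypothetical sketch, not part of
this patch (assumes getDevices() returns a std::vector<ur_device_handle_t>):

    inline bool contextHasDevice(ur_context_handle_t hContext,
                                 ur_device_handle_t hDevice) {
      const auto &Devs = hContext->getDevices();
      return std::find(Devs.begin(), Devs.end(), hDevice) != Devs.end();
    }
    // UR_ASSERT(contextHasDevice(hContext, hDevice),
    //           UR_RESULT_ERROR_INVALID_CONTEXT);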
diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp
index c9334add15..675fdbe0a3 100644
--- a/source/adapters/cuda/kernel.cpp
+++ b/source/adapters/cuda/kernel.cpp
@@ -19,7 +19,7 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName,
std::unique_ptr<ur_kernel_handle_t_> Kernel{nullptr};
try {
- ScopedContext Active(hProgram->getContext());
+ ScopedContext Active(hProgram->getDevice());
CUfunction CuFunc;
CUresult FunctionResult =
@@ -293,8 +293,6 @@ UR_APIEXPORT ur_result_t UR_APICALL
urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex,
const ur_kernel_arg_mem_obj_properties_t *Properties,
ur_mem_handle_t hArgValue) {
- std::ignore = Properties;
-
// Below sets kernel arg when zero-sized buffers are handled.
// In such case the corresponding memory is null.
if (hArgValue == nullptr) {
@@ -304,10 +302,14 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex,
ur_result_t Result = UR_RESULT_SUCCESS;
try {
- if (hArgValue->MemType == ur_mem_handle_t_::Type::Surface) {
+ auto Device = hKernel->getProgram()->getDevice();
+ ur_mem_flags_t MemAccess =
+ Properties ? Properties->memoryAccess : UR_MEM_FLAG_READ_WRITE;
+ hKernel->Args.addMemObjArg(argIndex, hArgValue, MemAccess);
+ if (hArgValue->isImage()) {
CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
UR_CHECK_ERROR(cuArray3DGetDescriptor(
- &arrayDesc, std::get<SurfaceMem>(hArgValue->Mem).getArray()));
+ &arrayDesc, std::get<SurfaceMem>(hArgValue->Mem).getArray(Device)));
if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 &&
arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 &&
arrayDesc.Format != CU_AD_FORMAT_HALF &&
@@ -317,10 +319,11 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex,
UR_RESULT_ERROR_ADAPTER_SPECIFIC);
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
}
- CUsurfObject CuSurf = std::get<SurfaceMem>(hArgValue->Mem).getSurface();
+ CUsurfObject CuSurf =
+ std::get<SurfaceMem>(hArgValue->Mem).getSurface(Device);
hKernel->setKernelArg(argIndex, sizeof(CuSurf), (void *)&CuSurf);
} else {
- CUdeviceptr CuPtr = std::get<BufferMem>(hArgValue->Mem).get();
+ CUdeviceptr CuPtr = std::get<BufferMem>(hArgValue->Mem).getPtr(Device);
hKernel->setKernelArg(argIndex, sizeof(CUdeviceptr), (void *)&CuPtr);
}
} catch (ur_result_t Err) {
diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp
index 9e65066ba7..b7a7358b27 100644
--- a/source/adapters/cuda/kernel.hpp
+++ b/source/adapters/cuda/kernel.hpp
@@ -63,6 +63,14 @@ struct ur_kernel_handle_t_ {
args_size_t ParamSizes;
args_index_t Indices;
args_size_t OffsetPerIndex;
+ // A struct to keep track of memargs so that we can do dependency analysis
+ // at urEnqueueKernelLaunch
+ struct mem_obj_arg {
+ ur_mem_handle_t_ *Mem;
+ int Index;
+ ur_mem_flags_t AccessFlags;
+ };
+ std::vector<mem_obj_arg> MemObjArgs;
std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0};
@@ -116,6 +124,20 @@ struct ur_kernel_handle_t_ {
Size + (AlignedLocalOffset - LocalOffset));
}
+ void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) {
+ assert(hMem && "Invalid mem handle");
+ // To avoid wasted space, MemObjArgs is not indexed by argument index;
+ // search for an existing entry with this index and overwrite it.
+ for (auto &Arg : MemObjArgs) {
+ if (Arg.Index == Index) {
+ // Overwrite the mem obj with the same index
+ Arg = arguments::mem_obj_arg{hMem, Index, Flags};
+ return;
+ }
+ }
+ MemObjArgs.push_back(arguments::mem_obj_arg{hMem, Index, Flags});
+ }
+
void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) {
assert(Size == sizeof(std::uint32_t) * 3);
std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size);
@@ -142,7 +164,7 @@ struct ur_kernel_handle_t_ {
urContextRetain(Context);
/// Note: this code assumes that there is only one device per context
ur_result_t RetError = urKernelGetGroupInfo(
- this, Context->getDevice(),
+ this, Program->getDevice(),
UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
sizeof(ReqdThreadsPerBlock), ReqdThreadsPerBlock, nullptr);
(void)RetError;
@@ -165,6 +187,7 @@ struct ur_kernel_handle_t_ {
uint32_t getReferenceCount() const noexcept { return RefCount; }
native_type get() const noexcept { return Function; };
+ ur_program_handle_t getProgram() const noexcept { return Program; };
native_type get_with_offset_parameter() const noexcept {
return FunctionWithOffsetParam;
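
MemObjArgs records each buffer or image argument together with its access
flags so the launch path can allocate, migrate, and mark writes per device.
The enqueue.cpp changes are outside this section; a hedged sketch of how a
launch might consume the vector (Device and RetEvent are hypothetical locals
of the launch routine):

    for (const auto &Arg : hKernel->Args.MemObjArgs) {
      ur_lock Lock(Arg.Mem->MemoryMigrationMutex);
      UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(Arg.Mem, Device));
      if (Arg.AccessFlags & (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))
        // Only valid for multi-device contexts, per the assert in
        // setLastEventWritingToMemObj.
        Arg.Mem->setLastEventWritingToMemObj(RetEvent);
    }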
diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp
index f097d2474e..c28ce98748 100644
--- a/source/adapters/cuda/memory.cpp
+++ b/source/adapters/cuda/memory.cpp
@@ -36,59 +36,50 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate(
const bool PerformInitialCopy =
(flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) ||
((flags & UR_MEM_FLAG_USE_HOST_POINTER) && !EnableUseHostPtr);
- ur_result_t Result = UR_RESULT_SUCCESS;
ur_mem_handle_t MemObj = nullptr;
try {
- ScopedContext Active(hContext);
- CUdeviceptr Ptr = 0;
auto HostPtr = pProperties ? pProperties->pHost : nullptr;
-
BufferMem::AllocMode AllocMode = BufferMem::AllocMode::Classic;
if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && EnableUseHostPtr) {
UR_CHECK_ERROR(
cuMemHostRegister(HostPtr, size, CU_MEMHOSTREGISTER_DEVICEMAP));
- UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0));
AllocMode = BufferMem::AllocMode::UseHostPtr;
} else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) {
UR_CHECK_ERROR(cuMemAllocHost(&HostPtr, size));
- UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0));
AllocMode = BufferMem::AllocMode::AllocHostPtr;
- } else {
- UR_CHECK_ERROR(cuMemAlloc(&Ptr, size));
- if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) {
- AllocMode = BufferMem::AllocMode::CopyIn;
- }
+ } else if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) {
+ AllocMode = BufferMem::AllocMode::CopyIn;
}
- ur_mem_handle_t parentBuffer = nullptr;
+ auto URMemObj = std::unique_ptr<ur_mem_handle_t_>(
+ new ur_mem_handle_t_{hContext, flags, AllocMode, HostPtr, size});
+ if (URMemObj == nullptr) {
+ return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
+ }
- auto URMemObj = std::unique_ptr<ur_mem_handle_t_>(new ur_mem_handle_t_{
- hContext, parentBuffer, flags, AllocMode, Ptr, HostPtr, size});
- if (URMemObj != nullptr) {
- MemObj = URMemObj.release();
- if (PerformInitialCopy) {
- // Operates on the default stream of the current CUDA context.
+ // When an initial copy is required, device allocations are made here;
+ // otherwise they are deferred until first use on a device
+ if (PerformInitialCopy && HostPtr) {
+ // Perform initial copy to every device in context
+ for (auto &Device : hContext->getDevices()) {
+ ScopedContext Active(Device);
+ // getPtr may allocate mem if not already allocated
+ const auto &Ptr = std::get<BufferMem>(URMemObj->Mem).getPtr(Device);
UR_CHECK_ERROR(cuMemcpyHtoD(Ptr, HostPtr, size));
- // Synchronize with default stream implicitly used by cuMemcpyHtoD
- // to make buffer data available on device before any other UR call
- // uses it.
- CUstream defaultStream = 0;
- UR_CHECK_ERROR(cuStreamSynchronize(defaultStream));
}
- } else {
- Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
}
+ MemObj = URMemObj.release();
} catch (ur_result_t Err) {
- Result = Err;
+ return Err;
} catch (...) {
- Result = UR_RESULT_ERROR_OUT_OF_RESOURCES;
+ return UR_RESULT_ERROR_OUT_OF_RESOURCES;
}
*phBuffer = MemObj;
- return Result;
+ return UR_RESULT_SUCCESS;
}
UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) {
@@ -117,26 +108,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) {
return UR_RESULT_SUCCESS;
}
- ScopedContext Active(MemObjPtr->getContext());
-
- if (hMem->MemType == ur_mem_handle_t_::Type::Buffer) {
- auto &BufferImpl = std::get<BufferMem>(MemObjPtr->Mem);
- switch (BufferImpl.MemAllocMode) {
- case BufferMem::AllocMode::CopyIn:
- case BufferMem::AllocMode::Classic:
- UR_CHECK_ERROR(cuMemFree(BufferImpl.Ptr));
- break;
- case BufferMem::AllocMode::UseHostPtr:
- UR_CHECK_ERROR(cuMemHostUnregister(BufferImpl.HostPtr));
- break;
- case BufferMem::AllocMode::AllocHostPtr:
- UR_CHECK_ERROR(cuMemFreeHost(BufferImpl.HostPtr));
- };
- } else if (hMem->MemType == ur_mem_handle_t_::Type::Surface) {
- auto &SurfaceImpl = std::get<SurfaceMem>(MemObjPtr->Mem);
- UR_CHECK_ERROR(cuSurfObjectDestroy(SurfaceImpl.getSurface()));
- UR_CHECK_ERROR(cuArrayDestroy(SurfaceImpl.getArray()));
- }
+ UR_CHECK_ERROR(hMem->clear());
} catch (ur_result_t Err) {
Result = Err;
@@ -161,10 +133,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) {
/// \param[out] phNativeMem Set to the native handle of the UR mem object.
///
/// \return UR_RESULT_SUCCESS
-UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle(
- ur_mem_handle_t hMem, ur_device_handle_t, ur_native_handle_t *phNativeMem) {
- *phNativeMem = reinterpret_cast<ur_native_handle_t>(
- std::get(hMem->Mem).get());
+UR_APIEXPORT ur_result_t UR_APICALL
+urMemGetNativeHandle(ur_mem_handle_t hMem, ur_device_handle_t Device,
+ ur_native_handle_t *phNativeMem) {
+ try {
+ *phNativeMem = reinterpret_cast<ur_native_handle_t>(
+ std::get(hMem->Mem).getPtr(Device));
+ } catch (ur_result_t Err) {
+ return Err;
+ } catch (...) {
+ return UR_RESULT_ERROR_UNKNOWN;
+ }
return UR_RESULT_SUCCESS;
}
@@ -177,14 +156,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory,
UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet);
- ScopedContext Active(hMemory->getContext());
+ // Any device in context will do
+ auto Device = hMemory->getContext()->getDevices()[0];
+ ScopedContext Active(Device);
switch (MemInfoType) {
case UR_MEM_INFO_SIZE: {
try {
size_t AllocSize = 0;
UR_CHECK_ERROR(cuMemGetAddressRange(
- nullptr, &AllocSize, std::get<BufferMem>(hMemory->Mem).Ptr));
+ nullptr, &AllocSize,
+ std::get<BufferMem>(hMemory->Mem).getPtr(Device)));
return ReturnValue(AllocSize);
} catch (ur_result_t Err) {
return Err;
@@ -242,160 +224,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate(
UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR);
}
- ur_result_t Result = UR_RESULT_SUCCESS;
-
// We only support RGBA channel order
// TODO: check SYCL CTS and spec. May also have to support BGRA
UR_ASSERT(pImageFormat->channelOrder == UR_IMAGE_CHANNEL_ORDER_RGBA,
UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION);
- // We have to use cuArray3DCreate, which has some caveats. The height and
- // depth parameters must be set to 0 produce 1D or 2D arrays. pImageDesc gives
- // a minimum value of 1, so we need to convert the answer.
- CUDA_ARRAY3D_DESCRIPTOR ArrayDesc;
- ArrayDesc.NumChannels = 4; // Only support 4 channel image
- ArrayDesc.Flags = 0; // No flags required
- ArrayDesc.Width = pImageDesc->width;
- if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) {
- ArrayDesc.Height = 0;
- ArrayDesc.Depth = 0;
- } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
- ArrayDesc.Height = pImageDesc->height;
- ArrayDesc.Depth = 0;
- } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) {
- ArrayDesc.Height = pImageDesc->height;
- ArrayDesc.Depth = pImageDesc->depth;
- }
-
- // We need to get this now in bytes for calculating the total image size later
- size_t PixelTypeSizeBytes;
-
- switch (pImageFormat->channelType) {
- case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8:
- case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8:
- ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
- PixelTypeSizeBytes = 1;
- break;
- case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8:
- ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT8;
- PixelTypeSizeBytes = 1;
- break;
- case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16:
- case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16:
- ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT16;
- PixelTypeSizeBytes = 2;
- break;
- case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16:
- ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT16;
- PixelTypeSizeBytes = 2;
- break;
- case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT:
- ArrayDesc.Format = CU_AD_FORMAT_HALF;
- PixelTypeSizeBytes = 2;
- break;
- case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32:
- ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT32;
- PixelTypeSizeBytes = 4;
- break;
- case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32:
- ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT32;
- PixelTypeSizeBytes = 4;
- break;
- case UR_IMAGE_CHANNEL_TYPE_FLOAT:
- ArrayDesc.Format = CU_AD_FORMAT_FLOAT;
- PixelTypeSizeBytes = 4;
- break;
- default:
- detail::ur::die(
- "urMemImageCreate given unsupported image_channel_data_type");
- }
-
- // When a dimension isn't used pImageDesc has the size set to 1
- size_t PixelSizeBytes =
- PixelTypeSizeBytes * 4; // 4 is the only number of channels we support
- size_t ImageSizeBytes = PixelSizeBytes * pImageDesc->width *
- pImageDesc->height * pImageDesc->depth;
-
- ScopedContext Active(hContext);
- CUarray ImageArray = nullptr;
- try {
- UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &ArrayDesc));
- } catch (ur_result_t Err) {
- if (Err == UR_RESULT_ERROR_INVALID_VALUE) {
- return UR_RESULT_ERROR_INVALID_IMAGE_SIZE;
- }
- return Err;
- } catch (...) {
- return UR_RESULT_ERROR_UNKNOWN;
- }
+ auto URMemObj = std::unique_ptr<ur_mem_handle_t_>(
+ new ur_mem_handle_t_{hContext, flags, *pImageFormat, *pImageDesc, pHost});
try {
if (PerformInitialCopy) {
- // We have to use a different copy function for each image dimensionality
- if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) {
- UR_CHECK_ERROR(cuMemcpyHtoA(ImageArray, 0, pHost, ImageSizeBytes));
- } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
- CUDA_MEMCPY2D CpyDesc;
- memset(&CpyDesc, 0, sizeof(CpyDesc));
- CpyDesc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
- CpyDesc.srcHost = pHost;
- CpyDesc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
- CpyDesc.dstArray = ImageArray;
- CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width;
- CpyDesc.Height = pImageDesc->height;
- UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc));
- } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) {
- CUDA_MEMCPY3D CpyDesc;
- memset(&CpyDesc, 0, sizeof(CpyDesc));
- CpyDesc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
- CpyDesc.srcHost = pHost;
- CpyDesc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
- CpyDesc.dstArray = ImageArray;
- CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width;
- CpyDesc.Height = pImageDesc->height;
- CpyDesc.Depth = pImageDesc->depth;
- UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc));
+ for (const auto &Device : hContext->getDevices()) {
+ UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(URMemObj.get(), Device));
}
}
- // CUDA_RESOURCE_DESC is a union of different structs, shown here
- // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html
- // We need to fill it as described here to use it for a surface or texture
- // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html
- // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and
- // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA array
- // handle.
- // CUDA_RESOURCE_DESC::flags must be set to zero
-
- CUDA_RESOURCE_DESC ImageResDesc;
- ImageResDesc.res.array.hArray = ImageArray;
- ImageResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
- ImageResDesc.flags = 0;
-
- CUsurfObject Surface;
- UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc));
-
- auto MemObj = std::unique_ptr<ur_mem_handle_t_>(new ur_mem_handle_t_(
- hContext, ImageArray, Surface, flags, pImageDesc->type, phMem));
-
- if (MemObj == nullptr) {
+ if (URMemObj == nullptr) {
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
}
- *phMem = MemObj.release();
+ *phMem = URMemObj.release();
} catch (ur_result_t Err) {
- if (ImageArray) {
- cuArrayDestroy(ImageArray);
- }
+ (*phMem)->clear();
return Err;
} catch (...) {
- if (ImageArray) {
- cuArrayDestroy(ImageArray);
- }
+ (*phMem)->clear();
return UR_RESULT_ERROR_UNKNOWN;
}
-
- return Result;
+ return UR_RESULT_SUCCESS;
}
UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t hMemory,
@@ -407,14 +263,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t hMemory,
auto Context = hMemory->getContext();
- ScopedContext Active(Context);
+ // Any device will do
+ auto Device = Context->getDevices()[0];
+ ScopedContext Active(Device);
UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
try {
CUDA_ARRAY3D_DESCRIPTOR ArrayInfo;
UR_CHECK_ERROR(cuArray3DGetDescriptor(
- &ArrayInfo, std::get<SurfaceMem>(hMemory->Mem).getArray()));
+ &ArrayInfo, std::get<SurfaceMem>(hMemory->Mem).getArray(Device)));
const auto cuda2urFormat = [](CUarray_format CUFormat,
ur_image_channel_type_t *ChannelType) {
@@ -544,27 +402,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition(
UR_ASSERT(pRegion->size != 0u, UR_RESULT_ERROR_INVALID_BUFFER_SIZE);
auto &BufferImpl = std::get<BufferMem>(hBuffer->Mem);
-
- assert((pRegion->origin <= (pRegion->origin + pRegion->size)) && "Overflow");
UR_ASSERT(((pRegion->origin + pRegion->size) <= BufferImpl.getSize()),
UR_RESULT_ERROR_INVALID_BUFFER_SIZE);
- // Retained indirectly due to retaining parent buffer below.
- ur_context_handle_t Context = hBuffer->Context;
-
- BufferMem::AllocMode AllocMode = BufferMem::AllocMode::Classic;
- assert(BufferImpl.Ptr != BufferMem::native_type{0});
- BufferMem::native_type Ptr = BufferImpl.Ptr + pRegion->origin;
-
- void *HostPtr = nullptr;
- if (BufferImpl.HostPtr) {
- HostPtr = static_cast<uint8_t *>(BufferImpl.HostPtr) + pRegion->origin;
- }
-
- std::unique_ptr<ur_mem_handle_t_> MemObj{nullptr};
+ std::unique_ptr<ur_mem_handle_t_> RetMemObj{nullptr};
try {
- MemObj = std::unique_ptr<ur_mem_handle_t_>{new ur_mem_handle_t_{
- Context, hBuffer, flags, AllocMode, Ptr, HostPtr, pRegion->size}};
+ for (auto Device : hBuffer->Context->getDevices()) {
+ // getPtr lazily allocates the device pointer; the parent's
+ // allocations must exist before the sub-buffer can alias them.
+ BufferImpl.getPtr(Device);
+ }
+ RetMemObj = std::unique_ptr<ur_mem_handle_t_>{
+ new ur_mem_handle_t_{hBuffer, pRegion->origin}};
} catch (ur_result_t Err) {
*phMem = nullptr;
return Err;
@@ -573,6 +422,189 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition(
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
}
- *phMem = MemObj.release();
+ *phMem = RetMemObj.release();
+ return UR_RESULT_SUCCESS;
+}
+
+ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem,
+ const ur_device_handle_t hDevice) {
+ ScopedContext Active(hDevice);
+ ur_lock LockGuard(Mem->MemoryAllocationMutex);
+
+ if (Mem->isBuffer()) {
+ auto &Buffer = std::get<BufferMem>(Mem->Mem);
+ auto &DevPtr = Buffer.Ptrs[hDevice->getIndex() % Buffer.Ptrs.size()];
+
+ // Allocation has already been made
+ if (DevPtr != BufferMem::native_type{0}) {
+ return UR_RESULT_SUCCESS;
+ }
+
+ if (Buffer.MemAllocMode == BufferMem::AllocMode::AllocHostPtr) {
+ // Host allocation has already been made
+ UR_CHECK_ERROR(cuMemHostGetDevicePointer(&DevPtr, Buffer.HostPtr, 0));
+ } else if (Buffer.MemAllocMode == BufferMem::AllocMode::UseHostPtr) {
+ UR_CHECK_ERROR(cuMemHostRegister(Buffer.HostPtr, Buffer.Size,
+ CU_MEMHOSTALLOC_DEVICEMAP));
+ UR_CHECK_ERROR(cuMemHostGetDevicePointer(&DevPtr, Buffer.HostPtr, 0));
+ } else {
+ UR_CHECK_ERROR(cuMemAlloc(&DevPtr, Buffer.Size));
+ }
+ } else {
+ CUarray ImageArray;
+ CUsurfObject Surface;
+ try {
+ auto &Image = std::get<SurfaceMem>(Mem->Mem);
+ // Allocation has already been made
+ if (Image.Arrays[hDevice->getIndex() % Image.Arrays.size()]) {
+ return UR_RESULT_SUCCESS;
+ }
+ UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &Image.ArrayDesc));
+ Image.Arrays[hDevice->getIndex() % Image.Arrays.size()] = ImageArray;
+
+ // CUDA_RESOURCE_DESC is a union of different structs, shown here
+ // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html
+ // We need to fill it as described here to use it for a surface or texture
+ // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html
+ // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and
+ // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA
+ // array handle. CUDA_RESOURCE_DESC::flags must be set to zero
+ CUDA_RESOURCE_DESC ImageResDesc;
+ ImageResDesc.res.array.hArray = ImageArray;
+ ImageResDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+ ImageResDesc.flags = 0;
+
+ UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc));
+ Image.SurfObjs[hDevice->getIndex() % Image.SurfObjs.size()] = Surface;
+ } catch (ur_result_t Err) {
+ if (ImageArray) {
+ UR_CHECK_ERROR(cuArrayDestroy(ImageArray));
+ }
+ return Err;
+ } catch (...) {
+ if (ImageArray) {
+ UR_CHECK_ERROR(cuArrayDestroy(ImageArray));
+ }
+ return UR_RESULT_ERROR_UNKNOWN;
+ }
+ }
+ return UR_RESULT_SUCCESS;
+}
+
+namespace {
+ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem,
+ ur_device_handle_t hDevice) {
+ auto &Buffer = std::get<BufferMem>(Mem->Mem);
+ if (Mem->LastEventWritingToMemObj == nullptr) {
+ // Device allocation being initialized from host for the first time
+ if (Buffer.HostPtr) {
+ UR_CHECK_ERROR(
+ cuMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size));
+ }
+ } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hDevice) {
+ UR_CHECK_ERROR(cuMemcpyDtoD(
+ Buffer.getPtr(hDevice),
+ Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
+ Buffer.Size));
+ }
+ return UR_RESULT_SUCCESS;
+}
+
+ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
+ ur_device_handle_t hDevice) {
+ auto &Image = std::get<SurfaceMem>(Mem->Mem);
+ // When a dimension isn't used image_desc has the size set to 1
+ size_t PixelSizeBytes = Image.PixelTypeSizeBytes *
+ 4; // 4 is the only number of channels we support
+ size_t ImageSizeBytes = PixelSizeBytes * Image.ImageDesc.width *
+ Image.ImageDesc.height * Image.ImageDesc.depth;
+
+ CUarray ImageArray = Image.getArray(hDevice);
+
+ CUDA_MEMCPY2D CpyDesc2D;
+ CUDA_MEMCPY3D CpyDesc3D;
+ // We have to use a different copy function for each image
+ // dimensionality
+ if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
+ memset(&CpyDesc2D, 0, sizeof(CpyDesc2D));
+ CpyDesc2D.srcHost = Image.HostPtr;
+ CpyDesc2D.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
+ CpyDesc2D.dstArray = ImageArray;
+ CpyDesc2D.WidthInBytes = PixelSizeBytes * Image.ImageDesc.width;
+ CpyDesc2D.Height = Image.ImageDesc.height;
+ } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
+ memset(&CpyDesc3D, 0, sizeof(CpyDesc3D));
+ CpyDesc3D.srcHost = Image.HostPtr;
+ CpyDesc3D.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
+ CpyDesc3D.dstArray = ImageArray;
+ CpyDesc3D.WidthInBytes = PixelSizeBytes * Image.ImageDesc.width;
+ CpyDesc3D.Height = Image.ImageDesc.height;
+ CpyDesc3D.Depth = Image.ImageDesc.depth;
+ }
+
+ if (Mem->LastEventWritingToMemObj == nullptr) {
+ if (Image.HostPtr) {
+ if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
+ UR_CHECK_ERROR(
+ cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
+ } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
+ CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
+ CpyDesc2D.srcHost = Image.HostPtr;
+ UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D));
+ } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
+ CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
+ CpyDesc3D.srcHost = Image.HostPtr;
+ UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D));
+ }
+ }
+ } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hDevice) {
+ if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
+ // FIXME: 1D memcpy from DtoD going through the host.
+ UR_CHECK_ERROR(cuMemcpyAtoH(
+ Image.HostPtr,
+ Image.getArray(
+ Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
+ 0 /*srcOffset*/, ImageSizeBytes));
+ UR_CHECK_ERROR(
+ cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
+ } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
+ CpyDesc2D.srcArray = Image.getArray(
+ Mem->LastEventWritingToMemObj->getQueue()->getDevice());
+ UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D));
+ } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
+ CpyDesc3D.srcArray = Image.getArray(
+ Mem->LastEventWritingToMemObj->getQueue()->getDevice());
+ UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D));
+ }
+ }
+ return UR_RESULT_SUCCESS;
+}
+} // namespace
+
+// Callers of this entry point must hold the MemoryMigrationMutex of the
+// given ur_mem_handle_t beforehand
+ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem,
+ const ur_device_handle_t hDevice) {
+ UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE);
+ // Device allocation has already been initialized with the most
+ // up-to-date data in the buffer
+ if (Mem->HaveMigratedToDeviceSinceLastWrite
+ [hDevice->getIndex() %
+ Mem->HaveMigratedToDeviceSinceLastWrite.size()]) {
+ return UR_RESULT_SUCCESS;
+ }
+
+ ScopedContext Active(hDevice);
+ if (Mem->isBuffer()) {
+ UR_CHECK_ERROR(migrateBufferToDevice(Mem, hDevice));
+ } else {
+ UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice));
+ }
+
+ Mem->HaveMigratedToDeviceSinceLastWrite
+ [hDevice->getIndex() % Mem->HaveMigratedToDeviceSinceLastWrite.size()] =
+ true;
return UR_RESULT_SUCCESS;
}
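
Enqueue paths that read a buffer are expected to wait on the last write and
migrate before touching the device allocation. A hedged sketch of that
pattern for a buffer read (the real enqueue.cpp hunks live elsewhere in this
patch; pDst, offset, size and CuStream are hypothetical locals):

    ur_device_handle_t Device = hQueue->getDevice();
    {
      ur_lock Lock(hBuffer->MemoryMigrationMutex);
      if (hBuffer->LastEventWritingToMemObj != nullptr)
        UR_CHECK_ERROR(urEventWait(1, &hBuffer->LastEventWritingToMemObj));
      UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(hBuffer, Device));
    }
    CUdeviceptr Src =
        std::get<BufferMem>(hBuffer->Mem).getPtrWithOffset(Device, offset);
    UR_CHECK_ERROR(cuMemcpyDtoHAsync(pDst, Src, size, CuStream));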
diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp
index 3f09552038..6b7e9d0156 100644
--- a/source/adapters/cuda/memory.hpp
+++ b/source/adapters/cuda/memory.hpp
@@ -11,10 +11,19 @@
#include <cassert>
#include <cuda.h>
+#include <memory>
#include <variant>
#include <vector>
#include "common.hpp"
+#include "context.hpp"
+#include "device.hpp"
+#include "event.hpp"
+
+ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t,
+ const ur_device_handle_t);
+ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
+ const ur_device_handle_t);
// Handler for plain, pointer-based CUDA allocations
struct BufferMem {
@@ -27,7 +36,7 @@ struct BufferMem {
/// Original flags for the mapped region
ur_map_flags_t MapFlags;
/// Allocated host memory used exclusively for this map.
- std::unique_ptr<unsigned char[]> MapMem;
+ std::shared_ptr<unsigned char[]> MapMem;
BufferMap(size_t MapSize, size_t MapOffset, ur_map_flags_t MapFlags)
: MapSize(MapSize), MapOffset(MapOffset), MapFlags(MapFlags),
@@ -61,11 +70,16 @@ struct BufferMem {
using native_type = CUdeviceptr;
+private:
+ /// CUDA handler for the pointer
+ std::vector<native_type> Ptrs;
+
+public:
/// If this allocation is a sub-buffer (i.e., a view on an existing
/// allocation), this is the pointer to the parent handler structure
- ur_mem_handle_t Parent;
- /// CUDA handler for the pointer
- native_type Ptr;
+ ur_mem_handle_t Parent = nullptr;
+ /// Outer UR mem holding this BufferMem in variant
+ ur_mem_handle_t OuterMemStruct;
/// Pointer associated with this device on the host
void *HostPtr;
/// Size of the allocation in bytes
@@ -75,12 +89,34 @@ struct BufferMem {
AllocMode MemAllocMode;
- BufferMem(ur_mem_handle_t Parent, BufferMem::AllocMode Mode, CUdeviceptr Ptr,
- void *HostPtr, size_t Size)
- : Parent{Parent}, Ptr{Ptr}, HostPtr{HostPtr}, Size{Size},
- PtrToBufferMap{}, MemAllocMode{Mode} {};
+ BufferMem(ur_context_handle_t Context, ur_mem_handle_t OuterMemStruct,
+ AllocMode Mode, void *HostPtr, size_t Size)
+ : Ptrs(Context->getDevices().size(), native_type{0}),
+ OuterMemStruct{OuterMemStruct}, HostPtr{HostPtr}, Size{Size},
+ MemAllocMode{Mode} {};
- native_type get() const noexcept { return Ptr; }
+ BufferMem(const BufferMem &Buffer) = default;
+
+ native_type getPtrWithOffset(const ur_device_handle_t Device, size_t Offset) {
+ if (ur_result_t Err =
+ allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
+ Err != UR_RESULT_SUCCESS) {
+ throw Err;
+ }
+ return reinterpret_cast<native_type>(
+ reinterpret_cast<uint8_t *>(Ptrs[Device->getIndex() % Ptrs.size()]) +
+ Offset);
+ }
+
+ native_type getPtr(const ur_device_handle_t Device) {
+ return getPtrWithOffset(Device, 0);
+ }
+
+ void *getVoid(const ur_device_handle_t Device) {
+ return reinterpret_cast<void *>(getPtrWithOffset(Device, 0));
+ }
+
+ bool isSubBuffer() const noexcept { return Parent != nullptr; }
size_t getSize() const noexcept { return Size; }
@@ -120,104 +156,315 @@ struct BufferMem {
assert(MapPtr != nullptr);
PtrToBufferMap.erase(MapPtr);
}
-};
-// Handler data for surface object (i.e. Images)
-struct SurfaceMem {
- CUarray Array;
- CUsurfObject SurfObj;
- ur_mem_type_t ImageType;
+ ur_result_t clear() {
+ if (Parent != nullptr) {
+ return UR_RESULT_SUCCESS;
+ }
- SurfaceMem(CUarray Array, CUsurfObject Surf, ur_mem_type_t ImageType,
- void *HostPtr)
- : Array{Array}, SurfObj{Surf}, ImageType{ImageType} {
- (void)HostPtr;
+ switch (MemAllocMode) {
+ case AllocMode::CopyIn:
+ case AllocMode::Classic:
+ for (auto &DevPtr : Ptrs) {
+ if (DevPtr != native_type{0}) {
+ UR_CHECK_ERROR(cuMemFree(DevPtr));
+ }
+ }
+ break;
+ case AllocMode::UseHostPtr:
+ UR_CHECK_ERROR(cuMemHostUnregister(HostPtr));
+ break;
+ case AllocMode::AllocHostPtr:
+ UR_CHECK_ERROR(cuMemFreeHost(HostPtr));
+ }
+ return UR_RESULT_SUCCESS;
}
- CUarray getArray() const noexcept { return Array; }
+ friend struct ur_mem_handle_t_;
+ friend ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t,
+ const ur_device_handle_t);
+};
- CUsurfObject getSurface() const noexcept { return SurfObj; }
+// Handler data for surface object (i.e. Images)
+struct SurfaceMem {
+private:
+ std::vector<CUarray> Arrays;
+ std::vector<CUsurfObject> SurfObjs;
- ur_mem_type_t getImageType() const noexcept { return ImageType; }
-};
+public:
+ ur_mem_handle_t OuterMemStruct;
-// For sampled/unsampled images
-struct ImageMem {
- CUarray Array;
- void *Handle;
- ur_mem_type_t ImageType;
- ur_sampler_handle_t Sampler;
+ ur_image_format_t ImageFormat;
+ ur_image_desc_t ImageDesc;
+ CUDA_ARRAY3D_DESCRIPTOR ArrayDesc;
+ size_t PixelTypeSizeBytes;
+ void *HostPtr;
- ImageMem(CUarray Array, void *Handle, ur_mem_type_t ImageType,
- ur_sampler_handle_t Sampler)
- : Array{Array}, Handle{Handle}, ImageType{ImageType}, Sampler{Sampler} {};
+ SurfaceMem(ur_context_handle_t Context, ur_mem_handle_t OuterMemStruct,
+ ur_image_format_t ImageFormat, ur_image_desc_t ImageDesc,
+ void *HostPtr)
+ : Arrays(Context->Devices.size(), CUarray{0}),
+ SurfObjs(Context->Devices.size(), CUsurfObject{0}),
+ OuterMemStruct{OuterMemStruct},
+ ImageFormat{ImageFormat}, ImageDesc{ImageDesc}, HostPtr{HostPtr} {
+ // We have to use cuArray3DCreate, which has some caveats. The height and
+ // depth parameters must be set to 0 to produce 1D or 2D arrays. image_desc
+ // gives a minimum value of 1, so we need to convert the answer.
+ ArrayDesc.NumChannels = 4; // Only support 4 channel image
+ ArrayDesc.Flags = 0; // No flags required
+ ArrayDesc.Width = ImageDesc.width;
+ if (ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
+ ArrayDesc.Height = 0;
+ ArrayDesc.Depth = 0;
+ } else if (ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
+ ArrayDesc.Height = ImageDesc.height;
+ ArrayDesc.Depth = 0;
+ } else if (ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
+ ArrayDesc.Height = ImageDesc.height;
+ ArrayDesc.Depth = ImageDesc.depth;
+ }
- CUarray get_array() const noexcept { return Array; }
+ // We need to get PixelTypeSizeBytes for calculating the total image size
+ // later
+ switch (ImageFormat.channelType) {
+
+ case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8:
+ case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8:
+ ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
+ PixelTypeSizeBytes = 1;
+ break;
+ case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8:
+ ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT8;
+ PixelTypeSizeBytes = 1;
+ break;
+ case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16:
+ case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16:
+ ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT16;
+ PixelTypeSizeBytes = 2;
+ break;
+ case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16:
+ ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT16;
+ PixelTypeSizeBytes = 2;
+ break;
+ case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT:
+ ArrayDesc.Format = CU_AD_FORMAT_HALF;
+ PixelTypeSizeBytes = 2;
+ break;
+ case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32:
+ ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT32;
+ PixelTypeSizeBytes = 4;
+ break;
+ case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32:
+ ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT32;
+ PixelTypeSizeBytes = 4;
+ break;
+ case UR_IMAGE_CHANNEL_TYPE_FLOAT:
+ ArrayDesc.Format = CU_AD_FORMAT_FLOAT;
+ PixelTypeSizeBytes = 4;
+ break;
+ default:
+ detail::ur::die(
+ "urMemImageCreate given unsupported image_channel_data_type");
+ }
+ }
- void *get_handle() const noexcept { return Handle; }
+ // Will allocate a new array on device if not already allocated
+ CUarray getArray(const ur_device_handle_t Device) {
+ if (ur_result_t Err =
+ allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
+ Err != UR_RESULT_SUCCESS) {
+ throw Err;
+ }
+ return Arrays[Device->getIndex() % Arrays.size()];
+ }
+ // Will allocate a new surface on device if not already allocated
+ CUsurfObject getSurface(const ur_device_handle_t Device) {
+ if (ur_result_t Err =
+ allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device);
+ Err != UR_RESULT_SUCCESS) {
+ throw Err;
+ }
+ return SurfObjs[Device->getIndex() % SurfObjs.size()];
+ }
- ur_mem_type_t get_image_type() const noexcept { return ImageType; }
+ ur_mem_type_t getType() { return ImageDesc.type; }
- ur_sampler_handle_t get_sampler() const noexcept { return Sampler; }
+ ur_result_t clear() {
+ for (auto Array : Arrays) {
+ if (Array) {
+ UR_CHECK_ERROR(cuArrayDestroy(Array));
+ }
+ }
+ for (auto Surf : SurfObjs) {
+ if (Surf != CUsurfObject{0}) {
+ UR_CHECK_ERROR(cuSurfObjectDestroy(Surf));
+ }
+ }
+ return UR_RESULT_SUCCESS;
+ }
+ friend ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t,
+ const ur_device_handle_t);
};
/// UR Mem mapping to CUDA memory allocations, both data and texture/surface.
/// \brief Represents non-SVM allocations on the CUDA backend.
/// Keeps track of all mapped regions used for Map/Unmap calls.
/// Only one region can be active at the same time per allocation.
+///
+/// The ur_mem_handle_t is responsible for memory allocation and migration
+/// across devices in the same ur_context_handle_t. If a kernel writes to a
+/// ur_mem_handle_t, the event for that write is stored in
+/// LastEventWritingToMemObj, and all subsequent operations that read from
+/// the ur_mem_handle_t must wait on that event.
+///
+/// Since urMemBufferCreate/urMemImageCreate do not take a queue or device
+/// object, only a ur_context_handle_t, at mem obj creation we don't know which
+/// device we must make a native image/allocation on. Therefore no allocations
+/// are made at urMemBufferCreate/urMemImageCreate. Instead device
+/// images/allocations are made lazily. These allocations are made implicitly
+/// with a call to getPtr/getArray which will allocate a new allocation/image on
+/// device if need be.
+///
+/// Memory migration between native allocations for devices in the same
+/// ur_context_handle_t will occur at:
+///
+/// 1. urEnqueueKernelLaunch
+/// 2. urEnqueueMem(Buffer|Image)Read(Rect)
+///
+/// Migrations will occur in both cases if the most recent version of data
+/// is on a different device, marked by
+/// LastEventWritingToMemObj->getQueue()->getDevice()
+///
+/// Example trace:
+/// ~~~~~~~~~~~~~~
+///
+/// =====> urContextCreate([device0, device1], ...) // associated with [q0, q1]
+/// -> OUT: hContext
+///
+/// =====> urMemBufferCreate(hContext,...);
+/// -> No native allocations made
+/// -> OUT: hBuffer
+///
+/// =====> urEnqueueMemBufferWrite(q0, hBuffer,...);
+/// -> Allocation made on q0 ie device0
+/// -> New allocation initialized with host data.
+///
+/// =====> urKernelSetArgMemObj(hKernel0, hBuffer, ...);
+/// -> ur_kernel_handle_t associated with a ur_program_handle_t,
+/// which is in turn unique to a device. So we can set the kernel
+/// arg with the ptr of the device specific allocation.
+/// -> hKernel0->getProgram()->getDevice() == device0
+/// -> allocateMemObjOnDeviceIfNeeded(device0);
+/// -> Native allocation already made on device0, continue.
+///
+/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...);
+/// -> Suppose that hKernel0 writes to hBuffer.
+/// -> Call hBuffer->setLastEventWritingToMemObj with return event
+/// from this operation
+/// -> Enqueue native kernel launch
+///
+/// =====> urKernelSetArgMemObj(hKernel1, hBuffer, ...);
+/// -> hKernel1->getProgram()->getDevice() == device1
+/// -> New allocation will be made on device1 when calling
+/// getPtr(device1)
+/// -> No native allocation on device1
+/// -> Make native allocation on device1
+///
+/// =====> urEnqueueKernelLaunch(q1, hKernel1, ...);
+/// -> Suppose hKernel1 wants to read from hBuffer and not write.
+/// -> migrateMemoryToDeviceIfNeeded(device1);
+/// -> hBuffer->LastEventWritingToMemObj is not nullptr
+/// -> Check if memory has been migrated to device1 since the
+/// last write
+/// -> Hasn't been migrated
+/// -> Wait on LastEventWritingToMemObj.
+/// -> Migrate memory from device0's native allocation to
+/// device1's native allocation.
+/// -> Enqueue native kernel launch
+///
+/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...);
+/// -> migrateMemoryToDeviceIfNeeded(device0);
+/// -> hBuffer->LastEventWritingToMemObj refers to an event
+/// from q0
+/// -> Migration not necessary
+/// -> Enqueue native kernel launch
+///
struct ur_mem_handle_t_ {
// Context where the memory object is accessible
ur_context_handle_t Context;
/// Reference counting of the handler
std::atomic_uint32_t RefCount;
- enum class Type { Buffer, Surface, Texture } MemType;
// Original mem flags passed
ur_mem_flags_t MemFlags;
+ // If we make a ur_mem_handle_t_ from a native allocation, it can be useful to
+ // associate it with the device that holds the native allocation.
+ ur_device_handle_t DeviceWithNativeAllocation{nullptr};
+
+ // Has the memory been migrated to a device since the last write?
+ std::vector<bool> HaveMigratedToDeviceSinceLastWrite;
+
+ // We should wait on this event prior to migrating memory across allocations
+ // in this ur_mem_handle_t_
+ ur_event_handle_t LastEventWritingToMemObj{nullptr};
+
+ // Enumerates all possible types of accesses.
+ enum access_mode_t { unknown, read_write, read_only, write_only };
+
+ ur_mutex MemoryAllocationMutex; // A mutex for allocations
+ ur_mutex MemoryMigrationMutex; // A mutex for memory transfers
+
/// A UR Memory object represents either plain memory allocations ("Buffers"
/// in OpenCL) or typed allocations ("Images" in OpenCL).
/// In CUDA their API handlers are different. Whereas "Buffers" are allocated
/// as pointer-like structs, "Images" are stored in Textures or Surfaces.
- /// This union allows implementation to use either from the same handler.
- std::variant<BufferMem, SurfaceMem, ImageMem> Mem;
+ /// This variant allows implementation to use either from the same handler.
+ std::variant<BufferMem, SurfaceMem> Mem;
/// Constructs the UR mem handler for a non-typed allocation ("buffer")
- ur_mem_handle_t_(ur_context_handle_t Context, ur_mem_handle_t Parent,
- ur_mem_flags_t MemFlags, BufferMem::AllocMode Mode,
- CUdeviceptr Ptr, void *HostPtr, size_t Size)
- : Context{Context}, RefCount{1}, MemType{Type::Buffer},
- MemFlags{MemFlags}, Mem{BufferMem{Parent, Mode, Ptr, HostPtr, Size}} {
- if (isSubBuffer()) {
- urMemRetain(std::get(Mem).Parent);
- } else {
- urContextRetain(Context);
- }
- };
-
- /// Constructs the UR allocation for an Image object (surface in CUDA)
- ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array,
- CUsurfObject Surf, ur_mem_flags_t MemFlags,
- ur_mem_type_t ImageType, void *HostPtr)
- : Context{Context}, RefCount{1}, MemType{Type::Surface},
- MemFlags{MemFlags}, Mem{SurfaceMem{Array, Surf, ImageType, HostPtr}} {
+ ur_mem_handle_t_(ur_context_handle_t Ctxt, ur_mem_flags_t MemFlags,
+ BufferMem::AllocMode Mode, void *HostPtr, size_t Size)
+ : Context{Ctxt}, RefCount{1}, MemFlags{MemFlags},
+ HaveMigratedToDeviceSinceLastWrite(Context->Devices.size(), false),
+ Mem{std::in_place_type<BufferMem>, Ctxt, this, Mode, HostPtr, Size} {
urContextRetain(Context);
- }
+ };
- /// Constructs the UR allocation for an unsampled image object
- ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array,
- CUsurfObject Surf, ur_mem_type_t ImageType)
- : Context{Context}, RefCount{1}, MemType{Type::Surface}, MemFlags{0},
- Mem{ImageMem{Array, (void *)Surf, ImageType, nullptr}} {
- urContextRetain(Context);
- }
+ // Subbuffer constructor
+ ur_mem_handle_t_(ur_mem_handle_t Parent, size_t SubBufferOffset)
+ : Context{Parent->Context}, RefCount{1}, MemFlags{Parent->MemFlags},
+ HaveMigratedToDeviceSinceLastWrite(Parent->Context->Devices.size(),
+ false),
+ Mem{BufferMem{std::get<BufferMem>(Parent->Mem)}} {
+ auto &SubBuffer = std::get(Mem);
+ SubBuffer.Parent = Parent;
+ SubBuffer.OuterMemStruct = this;
+ if (SubBuffer.HostPtr) {
+ SubBuffer.HostPtr =
+ static_cast<uint8_t *>(SubBuffer.HostPtr) + SubBufferOffset;
+ }
+ for (auto &DevPtr : SubBuffer.Ptrs) {
+ if (DevPtr) {
+ DevPtr += SubBufferOffset;
+ }
+ }
+ urMemRetain(Parent);
+ };
- /// Constructs the UR allocation for a sampled image object
- ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array, CUtexObject Tex,
- ur_sampler_handle_t Sampler, ur_mem_type_t ImageType)
- : Context{Context}, RefCount{1}, MemType{Type::Texture}, MemFlags{0},
- Mem{ImageMem{Array, (void *)Tex, ImageType, Sampler}} {
+ /// Constructs the UR mem handler for an Image object
+ ur_mem_handle_t_(ur_context_handle_t Ctxt, ur_mem_flags_t MemFlags,
+ ur_image_format_t ImageFormat, ur_image_desc_t ImageDesc,
+ void *HostPtr)
+ : Context{Ctxt}, RefCount{1}, MemFlags{MemFlags},
+ HaveMigratedToDeviceSinceLastWrite(Context->Devices.size(), false),
+ Mem{std::in_place_type<SurfaceMem>,
+ Ctxt,
+ this,
+ ImageFormat,
+ ImageDesc,
+ HostPtr} {
urContextRetain(Context);
}
@@ -229,13 +476,24 @@ struct ur_mem_handle_t_ {
urContextRelease(Context);
}
- bool isBuffer() const noexcept { return MemType == Type::Buffer; }
+ bool isBuffer() const noexcept {
+ return std::holds_alternative<BufferMem>(Mem);
+ }
bool isSubBuffer() const noexcept {
return (isBuffer() && (std::get<BufferMem>(Mem).Parent != nullptr));
}
- bool isImage() const noexcept { return MemType == Type::Surface; }
+ bool isImage() const noexcept {
+ return std::holds_alternative<SurfaceMem>(Mem);
+ }
+
+ ur_result_t clear() {
+ if (isBuffer()) {
+ return std::get<BufferMem>(Mem).clear();
+ }
+ return std::get<SurfaceMem>(Mem).clear();
+ }
ur_context_handle_t getContext() const noexcept { return Context; }
@@ -244,4 +502,22 @@ struct ur_mem_handle_t_ {
uint32_t decrementReferenceCount() noexcept { return --RefCount; }
uint32_t getReferenceCount() const noexcept { return RefCount; }
+
+ void setLastEventWritingToMemObj(ur_event_handle_t NewEvent) {
+ assert(NewEvent && "Invalid event!");
+ // This entry point should only ever be called for multi-device contexts
+ assert(Context->Devices.size() > 1);
+ urEventRetain(NewEvent);
+ if (LastEventWritingToMemObj != nullptr) {
+ urEventRelease(LastEventWritingToMemObj);
+ }
+ LastEventWritingToMemObj = NewEvent;
+ for (const auto &Device : Context->getDevices()) {
+ // This event is never an interop event so will always have an associated
+ // queue
+ HaveMigratedToDeviceSinceLastWrite
+ [Device->getIndex() % HaveMigratedToDeviceSinceLastWrite.size()] =
+ Device == NewEvent->getQueue()->getDevice();
+ }
+ }
};
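
Since the sub-buffer constructor copies the parent's BufferMem and offsets
any device pointers that already exist, a view never owns memory and its
clear() returns early. A small hypothetical usage sketch through the public
API (hBuffer is an existing buffer of at least 1280 bytes):

    ur_buffer_region_t Region{UR_STRUCTURE_TYPE_BUFFER_REGION, nullptr,
                              /*origin=*/1024, /*size=*/256};
    ur_mem_handle_t SubBuf = nullptr;
    UR_CHECK_ERROR(urMemBufferPartition(hBuffer, UR_MEM_FLAG_READ_WRITE,
                                        UR_BUFFER_CREATE_TYPE_REGION,
                                        &Region, &SubBuf));
    // SubBuf aliases hBuffer's per-device allocations at +1024 bytes.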
diff --git a/source/adapters/cuda/physical_mem.cpp b/source/adapters/cuda/physical_mem.cpp
index e2c46696a0..c8d4adf0ef 100644
--- a/source/adapters/cuda/physical_mem.cpp
+++ b/source/adapters/cuda/physical_mem.cpp
@@ -23,7 +23,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urPhysicalMemCreate(
CUmemAllocationProp AllocProps = {};
AllocProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
AllocProps.type = CU_MEM_ALLOCATION_TYPE_PINNED;
- UR_CHECK_ERROR(GetDeviceOrdinal(hDevice, AllocProps.location.id));
+ AllocProps.location.id = hDevice->getIndex();
CUmemGenericAllocationHandle ResHandle;
switch (auto Result = cuMemCreate(&ResHandle, size, &AllocProps, 0)) {
@@ -32,7 +32,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urPhysicalMemCreate(
default:
UR_CHECK_ERROR(Result);
}
- *phPhysicalMem = new ur_physical_mem_handle_t_(ResHandle, hContext);
+ *phPhysicalMem = new ur_physical_mem_handle_t_(ResHandle, hContext, hDevice);
return UR_RESULT_SUCCESS;
}
@@ -51,7 +51,7 @@ urPhysicalMemRelease(ur_physical_mem_handle_t hPhysicalMem) {
try {
std::unique_ptr PhysicalMemGuard(hPhysicalMem);
- ScopedContext Active(hPhysicalMem->getContext());
+ ScopedContext Active(hPhysicalMem->getDevice());
UR_CHECK_ERROR(cuMemRelease(hPhysicalMem->get()));
return UR_RESULT_SUCCESS;
} catch (ur_result_t err) {
diff --git a/source/adapters/cuda/physical_mem.hpp b/source/adapters/cuda/physical_mem.hpp
index 0ce332e112..c77ad0d547 100644
--- a/source/adapters/cuda/physical_mem.hpp
+++ b/source/adapters/cuda/physical_mem.hpp
@@ -26,41 +26,29 @@ struct ur_physical_mem_handle_t_ {
std::atomic_uint32_t RefCount;
native_type PhysicalMem;
ur_context_handle_t_ *Context;
+ ur_device_handle_t Device;
- ur_physical_mem_handle_t_(native_type PhysMem, ur_context_handle_t_ *Ctx)
- : RefCount(1), PhysicalMem(PhysMem), Context(Ctx) {
+ ur_physical_mem_handle_t_(native_type PhysMem, ur_context_handle_t_ *Ctx,
+ ur_device_handle_t Device)
+ : RefCount(1), PhysicalMem(PhysMem), Context(Ctx), Device(Device) {
urContextRetain(Context);
+ urDeviceRetain(Device);
}
- ~ur_physical_mem_handle_t_() { urContextRelease(Context); }
+ ~ur_physical_mem_handle_t_() {
+ urContextRelease(Context);
+ urDeviceRelease(Device);
+ }
native_type get() const noexcept { return PhysicalMem; }
ur_context_handle_t_ *getContext() const noexcept { return Context; }
+ ur_device_handle_t_ *getDevice() const noexcept { return Device; }
+
uint32_t incrementReferenceCount() noexcept { return ++RefCount; }
uint32_t decrementReferenceCount() noexcept { return --RefCount; }
uint32_t getReferenceCount() const noexcept { return RefCount; }
};
-
-// Find a device ordinal of a device.
-inline ur_result_t GetDeviceOrdinal(ur_device_handle_t Device, int &Ordinal) {
- ur_adapter_handle_t AdapterHandle = &adapter;
- // Get list of platforms
- uint32_t NumPlatforms;
- UR_CHECK_ERROR(urPlatformGet(&AdapterHandle, 1, 0, nullptr, &NumPlatforms));
- UR_ASSERT(NumPlatforms, UR_RESULT_ERROR_UNKNOWN);
-
-  std::vector<ur_platform_handle_t> Platforms{NumPlatforms};
- UR_CHECK_ERROR(urPlatformGet(&AdapterHandle, 1, NumPlatforms,
- Platforms.data(), nullptr));
-
- // Ordinal corresponds to the platform ID as each device has its own platform.
- CUdevice NativeDevice = Device->get();
- for (Ordinal = 0; size_t(Ordinal) < Platforms.size(); ++Ordinal)
- if (Platforms[Ordinal]->Devices[0]->get() == NativeDevice)
- return UR_RESULT_SUCCESS;
- return UR_RESULT_ERROR_INVALID_DEVICE;
-}
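
The removed GetDeviceOrdinal helper scanned every platform to recover a device's ordinal; with a single platform the ordinal is stored on the device at enumeration time and read back via getIndex(). The handle also co-owns its device now. A generic sketch of the retain-in-constructor/release-in-destructor pairing, using the illustrative name OwningHandle (the real handle is reference counted rather than non-copyable):

    #include <ur_api.h>

    class OwningHandle {
      ur_context_handle_t Ctx;
      ur_device_handle_t Dev;

    public:
      OwningHandle(ur_context_handle_t C, ur_device_handle_t D) : Ctx(C), Dev(D) {
        urContextRetain(Ctx); // keep the context alive while we exist
        urDeviceRetain(Dev);  // likewise for the device
      }
      ~OwningHandle() {
        urContextRelease(Ctx);
        urDeviceRelease(Dev);
      }
      // Copying would double-release; forbid it in this sketch.
      OwningHandle(const OwningHandle &) = delete;
      OwningHandle &operator=(const OwningHandle &) = delete;
    };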
diff --git a/source/adapters/cuda/platform.cpp b/source/adapters/cuda/platform.cpp
index 4c730f997a..27b94f756f 100644
--- a/source/adapters/cuda/platform.cpp
+++ b/source/adapters/cuda/platform.cpp
@@ -52,9 +52,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo(
/// There is only one CUDA platform, and contains all devices on the system.
/// Triggers the CUDA Driver initialization (cuInit) the first time, so this
/// must be the first PI API called.
-///
-/// However because multiple devices in a context is not currently supported,
-/// place each device in a separate platform.
UR_APIEXPORT ur_result_t UR_APICALL
urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries,
ur_platform_handle_t *phPlatforms, uint32_t *pNumPlatforms) {
@@ -62,7 +59,7 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries,
try {
static std::once_flag InitFlag;
static uint32_t NumPlatforms = 1;
-    static std::vector<ur_platform_handle_t_> Platforms;
+ static ur_platform_handle_t_ Platform;
UR_ASSERT(phPlatforms || pNumPlatforms, UR_RESULT_ERROR_INVALID_VALUE);
UR_ASSERT(!phPlatforms || NumEntries > 0, UR_RESULT_ERROR_INVALID_SIZE);
@@ -76,39 +73,34 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries,
int NumDevices = 0;
UR_CHECK_ERROR(cuDeviceGetCount(&NumDevices));
try {
- // make one platform per device
- NumPlatforms = NumDevices;
- Platforms.resize(NumDevices);
-
for (int i = 0; i < NumDevices; ++i) {
CUdevice Device;
UR_CHECK_ERROR(cuDeviceGet(&Device, i));
CUcontext Context;
UR_CHECK_ERROR(cuDevicePrimaryCtxRetain(&Context, Device));
- ScopedContext active(Context);
+ ScopedContext Active(Context); // Set native ctx as active
CUevent EvBase;
UR_CHECK_ERROR(cuEventCreate(&EvBase, CU_EVENT_DEFAULT));
// Use default stream to record base event counter
UR_CHECK_ERROR(cuEventRecord(EvBase, 0));
- Platforms[i].Devices.emplace_back(new ur_device_handle_t_{
- Device, Context, EvBase, &Platforms[i]});
+ Platform.Devices.emplace_back(
+ new ur_device_handle_t_{Device, Context, EvBase, &Platform,
+                                   static_cast<uint32_t>(i)});
}
} catch (const std::bad_alloc &) {
// Signal out-of-memory situation
for (int i = 0; i < NumDevices; ++i) {
- Platforms[i].Devices.clear();
+ Platform.Devices.clear();
}
- Platforms.clear();
Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
} catch (ur_result_t Err) {
// Clear and rethrow to allow retry
for (int i = 0; i < NumDevices; ++i) {
- Platforms[i].Devices.clear();
+ Platform.Devices.clear();
}
- Platforms.clear();
Result = Err;
throw Err;
} catch (...) {
@@ -123,9 +115,7 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries,
}
if (phPlatforms != nullptr) {
- for (unsigned i = 0; i < std::min(NumEntries, NumPlatforms); ++i) {
- phPlatforms[i] = &Platforms[i];
- }
+ *phPlatforms = &Platform;
}
return Result;
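
From the client's perspective, the adapter now reports one CUDA platform containing every device instead of one platform per device. A hypothetical enumeration sequence under that assumption; getAllCudaDevices is an illustrative helper and error handling is elided.

    #include <ur_api.h>
    #include <vector>

    std::vector<ur_device_handle_t> getAllCudaDevices(ur_adapter_handle_t Adapter) {
      uint32_t NumPlatforms = 0;
      urPlatformGet(&Adapter, 1, 0, nullptr, &NumPlatforms); // expected: 1
      if (NumPlatforms == 0)
        return {};
      std::vector<ur_platform_handle_t> Platforms(NumPlatforms);
      urPlatformGet(&Adapter, 1, NumPlatforms, Platforms.data(), nullptr);

      // All devices now hang off the single platform.
      uint32_t NumDevices = 0;
      urDeviceGet(Platforms[0], UR_DEVICE_TYPE_GPU, 0, nullptr, &NumDevices);
      std::vector<ur_device_handle_t> Devices(NumDevices);
      urDeviceGet(Platforms[0], UR_DEVICE_TYPE_GPU, NumDevices, Devices.data(),
                  nullptr);
      return Devices;
    }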
diff --git a/source/adapters/cuda/program.cpp b/source/adapters/cuda/program.cpp
index 390e96b34c..c5cb763ada 100644
--- a/source/adapters/cuda/program.cpp
+++ b/source/adapters/cuda/program.cpp
@@ -9,6 +9,7 @@
//===----------------------------------------------------------------------===//
#include "program.hpp"
+#include "ur_util.hpp"
bool getMaxRegistersJitOptionValue(const std::string &BuildOptions,
unsigned int &Value) {
@@ -44,23 +45,6 @@ bool getMaxRegistersJitOptionValue(const std::string &BuildOptions,
return true;
}
-ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Context)
- : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1},
- Context{Context}, KernelReqdWorkGroupSizeMD{} {
- urContextRetain(Context);
-}
-
-ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(Context); }
-
-std::pair<std::string, std::string>
-splitMetadataName(const std::string &metadataName) {
- size_t splitPos = metadataName.rfind('@');
- if (splitPos == std::string::npos)
- return std::make_pair(metadataName, std::string{});
- return std::make_pair(metadataName.substr(0, splitPos),
- metadataName.substr(splitPos, metadataName.length()));
-}
-
ur_result_t
ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata,
size_t Length) {
@@ -197,12 +181,14 @@ ur_result_t createProgram(ur_context_handle_t hContext,
const uint8_t *pBinary,
const ur_program_properties_t *pProperties,
ur_program_handle_t *phProgram) {
- UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
+ UR_ASSERT(std::find(hContext->getDevices().begin(),
+ hContext->getDevices().end(),
+ hDevice) != hContext->getDevices().end(),
UR_RESULT_ERROR_INVALID_CONTEXT);
UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE);
   std::unique_ptr<ur_program_handle_t_> RetProgram{
- new ur_program_handle_t_{hContext}};
+ new ur_program_handle_t_{hContext, hDevice}};
if (pProperties) {
if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
@@ -222,18 +208,12 @@ ur_result_t createProgram(ur_context_handle_t hContext,
return UR_RESULT_SUCCESS;
}
-/// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object.
-/// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in
-/// terms of CUDA adapter. See \ref urProgramCreateWithBinary.
+// A program is unique to a device, so this entry point cannot be supported
+// with a multi-device context.
UR_APIEXPORT ur_result_t UR_APICALL
-urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
- size_t length, const ur_program_properties_t *pProperties,
- ur_program_handle_t *phProgram) {
- ur_device_handle_t hDevice = hContext->getDevice();
-  auto pBinary = reinterpret_cast<const uint8_t *>(pIL);
-
- return createProgram(hContext, hDevice, length, pBinary, pProperties,
- phProgram);
+urProgramCreateWithIL(ur_context_handle_t, const void *, size_t,
+ const ur_program_properties_t *, ur_program_handle_t *) {
+ return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
/// CUDA will handle the PTX/CUBIN binaries internally through a call to
@@ -272,7 +252,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext,
ur_result_t Result = UR_RESULT_SUCCESS;
try {
- ScopedContext Active(hProgram->getContext());
+ ScopedContext Active(hProgram->getDevice());
hProgram->buildProgram(pOptions);
hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE;
@@ -297,13 +277,17 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count,
const ur_program_handle_t *phPrograms, const char *pOptions,
ur_program_handle_t *phProgram) {
ur_result_t Result = UR_RESULT_SUCCESS;
+ // All programs must be associated with the same device
+ for (auto i = 1u; i < count; ++i)
+ UR_ASSERT(phPrograms[i]->getDevice() == phPrograms[0]->getDevice(),
+ UR_RESULT_ERROR_INVALID_DEVICE);
try {
- ScopedContext Active(hContext);
+ ScopedContext Active(phPrograms[0]->getDevice());
CUlinkState State;
     std::unique_ptr<ur_program_handle_t_> RetProgram{
- new ur_program_handle_t_{hContext}};
+ new ur_program_handle_t_{hContext, phPrograms[0]->getDevice()}};
UR_CHECK_ERROR(cuLinkCreate(0, nullptr, nullptr, &State));
try {
@@ -390,7 +374,7 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName,
case UR_PROGRAM_INFO_NUM_DEVICES:
return ReturnValue(1u);
case UR_PROGRAM_INFO_DEVICES:
- return ReturnValue(&hProgram->Context->DeviceID, 1);
+ return ReturnValue(&hProgram->Device, 1);
case UR_PROGRAM_INFO_SOURCE:
return ReturnValue(hProgram->Binary);
case UR_PROGRAM_INFO_BINARY_SIZES:
@@ -434,7 +418,7 @@ urProgramRelease(ur_program_handle_t hProgram) {
ur_result_t Result = UR_RESULT_ERROR_INVALID_PROGRAM;
try {
- ScopedContext Active(hProgram->getContext());
+ ScopedContext Active(hProgram->getDevice());
auto cuModule = hProgram->get();
// "0" is a valid handle for a cuModule, so the best way to check if we
// actually loaded a module and need to unload it is to look at the build
@@ -491,8 +475,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer(
ur_device_handle_t hDevice, ur_program_handle_t hProgram,
const char *pFunctionName, void **ppFunctionPointer) {
// Check if device passed is the same the device bound to the context
- UR_ASSERT(hDevice == hProgram->getContext()->getDevice(),
- UR_RESULT_ERROR_INVALID_DEVICE);
+ UR_ASSERT(hDevice == hProgram->getDevice(), UR_RESULT_ERROR_INVALID_DEVICE);
CUfunction Func;
CUresult Ret = cuModuleGetFunction(&Func, hProgram->get(), pFunctionName);
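
With urProgramCreateWithIL now returning UR_RESULT_ERROR_UNSUPPORTED_FEATURE, a caller targeting this adapter binds each program to one device of the context explicitly. A hedged sketch of that flow; buildForDevice is an illustrative helper and the binary is assumed to be PTX or CUBIN for the chosen device.

    #include <cstddef>
    #include <cstdint>
    #include <ur_api.h>

    ur_program_handle_t buildForDevice(ur_context_handle_t Ctx,
                                       ur_device_handle_t Dev,
                                       const uint8_t *Binary, size_t Size) {
      ur_program_handle_t Program = nullptr;
      if (urProgramCreateWithBinary(Ctx, Dev, Size, Binary,
                                    /*pProperties=*/nullptr,
                                    &Program) != UR_RESULT_SUCCESS)
        return nullptr;
      if (urProgramBuild(Ctx, Program, /*pOptions=*/nullptr) !=
          UR_RESULT_SUCCESS) {
        urProgramRelease(Program);
        return nullptr;
      }
      return Program;
    }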
diff --git a/source/adapters/cuda/program.hpp b/source/adapters/cuda/program.hpp
index feb3a09f31..5d41374d34 100644
--- a/source/adapters/cuda/program.hpp
+++ b/source/adapters/cuda/program.hpp
@@ -24,6 +24,7 @@ struct ur_program_handle_t_ {
size_t BinarySizeInBytes;
std::atomic_uint32_t RefCount;
ur_context_handle_t Context;
+ ur_device_handle_t Device;
/* The ur_program_binary_type_t property is defined individually for every
* device in a program. However, since the CUDA adapter only has 1 device per
@@ -42,8 +43,17 @@ struct ur_program_handle_t_ {
std::string BuildOptions;
ur_program_build_status_t BuildStatus = UR_PROGRAM_BUILD_STATUS_NONE;
- ur_program_handle_t_(ur_context_handle_t Context);
- ~ur_program_handle_t_();
+ ur_program_handle_t_(ur_context_handle_t Context, ur_device_handle_t Device)
+ : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1},
+ Context{Context}, Device{Device}, KernelReqdWorkGroupSizeMD{} {
+ urContextRetain(Context);
+ urDeviceRetain(Device);
+ }
+
+ ~ur_program_handle_t_() {
+ urContextRelease(Context);
+ urDeviceRelease(Device);
+ }
ur_result_t setMetadata(const ur_program_metadata_t *Metadata, size_t Length);
@@ -51,6 +61,7 @@ struct ur_program_handle_t_ {
ur_result_t buildProgram(const char *BuildOptions);
ur_context_handle_t getContext() const { return Context; };
+ ur_device_handle_t getDevice() const noexcept { return Device; };
native_type get() const noexcept { return Module; };
diff --git a/source/adapters/cuda/queue.cpp b/source/adapters/cuda/queue.cpp
index 120d665524..773126f4d5 100644
--- a/source/adapters/cuda/queue.cpp
+++ b/source/adapters/cuda/queue.cpp
@@ -123,7 +123,8 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice,
try {
     std::unique_ptr<ur_queue_handle_t_> Queue{nullptr};
- if (hContext->getDevice() != hDevice) {
+ if (std::find(hContext->getDevices().begin(), hContext->getDevices().end(),
+ hDevice) == hContext->getDevices().end()) {
*phQueue = nullptr;
return UR_RESULT_ERROR_INVALID_DEVICE;
}
@@ -145,10 +146,10 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice,
IsOutOfOrder = true;
}
if (URFlags & UR_QUEUE_FLAG_PRIORITY_HIGH) {
- ScopedContext Active(hContext);
+ ScopedContext Active(hDevice);
UR_CHECK_ERROR(cuCtxGetStreamPriorityRange(nullptr, &Priority));
} else if (URFlags & UR_QUEUE_FLAG_PRIORITY_LOW) {
- ScopedContext Active(hContext);
+ ScopedContext Active(hDevice);
UR_CHECK_ERROR(cuCtxGetStreamPriorityRange(&Priority, nullptr));
}
}
@@ -193,7 +194,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) {
if (!hQueue->backendHasOwnership())
return UR_RESULT_SUCCESS;
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
hQueue->forEachStream([](CUstream S) {
UR_CHECK_ERROR(cuStreamSynchronize(S));
@@ -212,7 +213,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) {
ur_result_t Result = UR_RESULT_SUCCESS;
try {
- ScopedContext active(hQueue->getContext());
+ ScopedContext active(hQueue->getDevice());
     hQueue->syncStreams</*ResetUsed=*/true>(
[](CUstream s) { UR_CHECK_ERROR(cuStreamSynchronize(s)); });
@@ -242,7 +243,7 @@ urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc,
ur_native_handle_t *phNativeQueue) {
std::ignore = pDesc;
- ScopedContext Active(hQueue->getContext());
+ ScopedContext Active(hQueue->getDevice());
*phNativeQueue =
       reinterpret_cast<ur_native_handle_t>(hQueue->getNextComputeStream());
return UR_RESULT_SUCCESS;
@@ -276,7 +277,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle(
new ur_queue_handle_t_{std::move(ComputeCuStreams),
std::move(TransferCuStreams),
hContext,
- hContext->getDevice(),
+ hDevice,
CuFlags,
Flags,
/*priority*/ 0,
diff --git a/source/adapters/cuda/queue.hpp b/source/adapters/cuda/queue.hpp
index c79ca18a9b..46e9968fa9 100644
--- a/source/adapters/cuda/queue.hpp
+++ b/source/adapters/cuda/queue.hpp
@@ -97,6 +97,7 @@ struct ur_queue_handle_t_ {
uint32_t *StreamToken = nullptr);
native_type getNextTransferStream();
native_type get() { return getNextComputeStream(); };
+ ur_device_handle_t getDevice() const noexcept { return Device; };
bool hasBeenSynchronized(uint32_t StreamToken) {
// stream token not associated with one of the compute streams
diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp
index 2ffc0755ee..80fd211863 100644
--- a/source/adapters/cuda/ur_interface_loader.cpp
+++ b/source/adapters/cuda/ur_interface_loader.cpp
@@ -407,6 +407,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
pDdiTable->pfnCooperativeKernelLaunchExp =
urEnqueueCooperativeKernelLaunchExp;
+ pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp;
return UR_RESULT_SUCCESS;
}
diff --git a/source/adapters/cuda/usm.cpp b/source/adapters/cuda/usm.cpp
index 59d3ba70fd..4e6c6898d5 100644
--- a/source/adapters/cuda/usm.cpp
+++ b/source/adapters/cuda/usm.cpp
@@ -33,7 +33,7 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc,
UR_RESULT_ERROR_INVALID_VALUE);
if (!hPool) {
- return USMHostAllocImpl(ppMem, hContext, nullptr, size, alignment);
+ return USMHostAllocImpl(ppMem, hContext, /* flags */ 0, size, alignment);
}
auto UMFPool = hPool->HostMemPool.get();
@@ -57,7 +57,7 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
UR_RESULT_ERROR_INVALID_VALUE);
if (!hPool) {
- return USMDeviceAllocImpl(ppMem, hContext, hDevice, nullptr, size,
+ return USMDeviceAllocImpl(ppMem, hContext, hDevice, /* flags */ 0, size,
alignment);
}
@@ -82,8 +82,8 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
UR_RESULT_ERROR_INVALID_VALUE);
if (!hPool) {
- return USMSharedAllocImpl(ppMem, hContext, hDevice, nullptr, nullptr, size,
- alignment);
+ return USMSharedAllocImpl(ppMem, hContext, hDevice, /*host flags*/ 0,
+ /*device flags*/ 0, size, alignment);
}
auto UMFPool = hPool->SharedMemPool.get();
@@ -95,11 +95,10 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
return UR_RESULT_SUCCESS;
}
-ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Pointer) {
+ur_result_t USMFreeImpl(ur_context_handle_t, void *Pointer) {
ur_result_t Result = UR_RESULT_SUCCESS;
try {
- ScopedContext Active(Context);
- bool IsManaged;
+ unsigned int IsManaged;
unsigned int Type;
void *AttributeValues[2] = {&IsManaged, &Type};
CUpointer_attribute Attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED,
@@ -131,11 +130,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext,
return USMFreeImpl(hContext, pMem);
}
-ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context,
- ur_device_handle_t, ur_usm_device_mem_flags_t *,
- size_t Size, uint32_t Alignment) {
+ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t,
+ ur_device_handle_t Device,
+ ur_usm_device_mem_flags_t, size_t Size,
+ uint32_t Alignment) {
try {
- ScopedContext Active(Context);
+ ScopedContext Active(Device);
UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)ResultPtr, Size));
} catch (ur_result_t Err) {
return Err;
@@ -150,12 +150,13 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context,
return UR_RESULT_SUCCESS;
}
-ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context,
- ur_device_handle_t, ur_usm_host_mem_flags_t *,
- ur_usm_device_mem_flags_t *, size_t Size,
+ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t,
+ ur_device_handle_t Device,
+ ur_usm_host_mem_flags_t,
+ ur_usm_device_mem_flags_t, size_t Size,
uint32_t Alignment) {
try {
- ScopedContext Active(Context);
+ ScopedContext Active(Device);
UR_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)ResultPtr, Size,
CU_MEM_ATTACH_GLOBAL));
} catch (ur_result_t Err) {
@@ -171,11 +172,10 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context,
return UR_RESULT_SUCCESS;
}
-ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context,
- ur_usm_host_mem_flags_t *, size_t Size,
+ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t,
+ ur_usm_host_mem_flags_t, size_t Size,
uint32_t Alignment) {
try {
- ScopedContext Active(Context);
UR_CHECK_ERROR(cuMemAllocHost(ResultPtr, Size));
} catch (ur_result_t Err) {
return Err;
@@ -199,7 +199,6 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem,
UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet);
try {
- ScopedContext Active(hContext);
switch (propName) {
case UR_USM_ALLOC_INFO_TYPE: {
unsigned int Value;
@@ -358,24 +357,24 @@ umf_result_t USMMemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) {
ur_result_t USMSharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size,
uint32_t Alignment) {
- return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size,
- Alignment);
+ return USMSharedAllocImpl(ResultPtr, Context, Device, /*host flags*/ 0,
+ /*device flags*/ 0, Size, Alignment);
}
ur_result_t USMDeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size,
uint32_t Alignment) {
- return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size,
+ return USMDeviceAllocImpl(ResultPtr, Context, Device, /* flags */ 0, Size,
Alignment);
}
ur_result_t USMHostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size,
uint32_t Alignment) {
- return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment);
+ return USMHostAllocImpl(ResultPtr, Context, /* flags */ 0, Size, Alignment);
}
ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context,
ur_usm_pool_desc_t *PoolDesc)
- : Context(Context) {
+ : Context{Context} {
const void *pNext = PoolDesc->pNext;
while (pNext != nullptr) {
     const ur_base_desc_t *BaseDesc = static_cast<const ur_base_desc_t *>(pNext);
@@ -406,25 +405,25 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context,
&this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Host])
.second;
- auto Device = Context->DeviceID;
- MemProvider =
-      umf::memoryProviderMakeUnique<USMDeviceMemoryProvider>(Context, Device)
- .second;
- DeviceMemPool =
- umf::poolMakeUniqueFromOps(
- &UMF_DISJOINT_POOL_OPS, std::move(MemProvider),
- &this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Device])
- .second;
-
- MemProvider =
-      umf::memoryProviderMakeUnique<USMSharedMemoryProvider>(Context, Device)
- .second;
- SharedMemPool =
- umf::poolMakeUniqueFromOps(
- &UMF_DISJOINT_POOL_OPS, std::move(MemProvider),
- &this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Shared])
- .second;
- Context->addPool(this);
+ for (const auto &Device : Context->getDevices()) {
+ MemProvider =
+        umf::memoryProviderMakeUnique<USMDeviceMemoryProvider>(Context, Device)
+ .second;
+ DeviceMemPool = umf::poolMakeUniqueFromOps(
+ &UMF_DISJOINT_POOL_OPS, std::move(MemProvider),
+ &this->DisjointPoolConfigs
+ .Configs[usm::DisjointPoolMemType::Device])
+ .second;
+ MemProvider =
+        umf::memoryProviderMakeUnique<USMSharedMemoryProvider>(Context, Device)
+ .second;
+ SharedMemPool = umf::poolMakeUniqueFromOps(
+ &UMF_DISJOINT_POOL_OPS, std::move(MemProvider),
+ &this->DisjointPoolConfigs
+ .Configs[usm::DisjointPoolMemType::Shared])
+ .second;
+ Context->addPool(this);
+ }
}
bool ur_usm_pool_handle_t_::hasUMFPool(umf_memory_pool_t *umf_pool) {
diff --git a/source/adapters/cuda/usm.hpp b/source/adapters/cuda/usm.hpp
index 2ec3df150f..e5d1f7fbaa 100644
--- a/source/adapters/cuda/usm.hpp
+++ b/source/adapters/cuda/usm.hpp
@@ -118,15 +118,15 @@ class USMHostMemoryProvider final : public USMMemoryProvider {
ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context,
ur_device_handle_t Device,
- ur_usm_device_mem_flags_t *Flags, size_t Size,
+ ur_usm_device_mem_flags_t Flags, size_t Size,
uint32_t Alignment);
ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context,
ur_device_handle_t Device,
- ur_usm_host_mem_flags_t *,
- ur_usm_device_mem_flags_t *, size_t Size,
+ ur_usm_host_mem_flags_t,
+ ur_usm_device_mem_flags_t, size_t Size,
uint32_t Alignment);
ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context,
- ur_usm_host_mem_flags_t *Flags, size_t Size,
+ ur_usm_host_mem_flags_t Flags, size_t Size,
uint32_t Alignment);
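
The USM flags parameters above changed from nullable pointers to plain values, so a poolless allocation now reaches the *AllocImpl functions with flags of 0 rather than nullptr. A hypothetical call site; deviceAlloc is an illustrative helper.

    #include <cstddef>
    #include <ur_api.h>

    // A poolless device allocation: with no pool handle, the adapter forwards
    // /*flags*/ 0 straight to USMDeviceAllocImpl.
    void *deviceAlloc(ur_context_handle_t Ctx, ur_device_handle_t Dev,
                      size_t Bytes) {
      void *Ptr = nullptr;
      if (urUSMDeviceAlloc(Ctx, Dev, /*pUSMDesc=*/nullptr, /*pool=*/nullptr,
                           Bytes, &Ptr) != UR_RESULT_SUCCESS)
        return nullptr;
      return Ptr;
    }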
diff --git a/source/adapters/cuda/usm_p2p.cpp b/source/adapters/cuda/usm_p2p.cpp
index 810a11ef84..e09220d730 100644
--- a/source/adapters/cuda/usm_p2p.cpp
+++ b/source/adapters/cuda/usm_p2p.cpp
@@ -14,8 +14,8 @@
UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PEnablePeerAccessExp(
ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) {
try {
- ScopedContext active(commandDevice->getContext());
- UR_CHECK_ERROR(cuCtxEnablePeerAccess(peerDevice->getContext(), 0));
+ ScopedContext active(commandDevice);
+ UR_CHECK_ERROR(cuCtxEnablePeerAccess(peerDevice->getNativeContext(), 0));
} catch (ur_result_t err) {
return err;
}
@@ -25,8 +25,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PEnablePeerAccessExp(
UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PDisablePeerAccessExp(
ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) {
try {
- ScopedContext active(commandDevice->getContext());
- UR_CHECK_ERROR(cuCtxDisablePeerAccess(peerDevice->getContext()));
+ ScopedContext active(commandDevice);
+ UR_CHECK_ERROR(cuCtxDisablePeerAccess(peerDevice->getNativeContext()));
} catch (ur_result_t err) {
return err;
}
@@ -43,7 +43,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp(
int value;
CUdevice_P2PAttribute cuAttr;
try {
- ScopedContext active(commandDevice->getContext());
+ ScopedContext active(commandDevice);
switch (propName) {
case UR_EXP_PEER_INFO_UR_PEER_ACCESS_SUPPORTED: {
cuAttr = CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED;
diff --git a/source/adapters/cuda/virtual_mem.cpp b/source/adapters/cuda/virtual_mem.cpp
index 9c37dda4fb..29908ad1d4 100644
--- a/source/adapters/cuda/virtual_mem.cpp
+++ b/source/adapters/cuda/virtual_mem.cpp
@@ -17,12 +17,12 @@
 #include <cuda.h>
UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo(
- ur_context_handle_t hContext, ur_device_handle_t hDevice,
+ ur_context_handle_t, ur_device_handle_t hDevice,
ur_virtual_mem_granularity_info_t propName, size_t propSize,
void *pPropValue, size_t *pPropSizeRet) {
UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
- ScopedContext Active(hContext);
+ ScopedContext Active(hDevice);
switch (propName) {
case UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM:
case UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED: {
@@ -33,7 +33,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo(
CUmemAllocationProp AllocProps = {};
AllocProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
AllocProps.type = CU_MEM_ALLOCATION_TYPE_PINNED;
- UR_CHECK_ERROR(GetDeviceOrdinal(hDevice, AllocProps.location.id));
+ AllocProps.location.id = hDevice->getIndex();
size_t Granularity;
UR_CHECK_ERROR(
@@ -50,15 +50,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo(
UR_APIEXPORT ur_result_t UR_APICALL
urVirtualMemReserve(ur_context_handle_t hContext, const void *pStart,
size_t size, void **ppStart) {
- ScopedContext Active(hContext);
+  // Reserve the virtual mem. This only needs doing once; any device in the
+  // context can supply the active native context.
+ ScopedContext Active(hContext->getDevices()[0]);
UR_CHECK_ERROR(cuMemAddressReserve((CUdeviceptr *)ppStart, size, 0,
(CUdeviceptr)pStart, 0));
return UR_RESULT_SUCCESS;
}
-UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemFree(
- ur_context_handle_t hContext, const void *pStart, size_t size) {
- ScopedContext Active(hContext);
+UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemFree(ur_context_handle_t,
+ const void *pStart,
+ size_t size) {
UR_CHECK_ERROR(cuMemAddressFree((CUdeviceptr)pStart, size));
return UR_RESULT_SUCCESS;
}
@@ -66,22 +67,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemFree(
UR_APIEXPORT ur_result_t UR_APICALL
urVirtualMemSetAccess(ur_context_handle_t hContext, const void *pStart,
size_t size, ur_virtual_mem_access_flags_t flags) {
- CUmemAccessDesc AccessDesc = {};
- if (flags & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE)
- AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
- else if (flags & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_ONLY)
- AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READ;
- else
- AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_NONE;
- AccessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
- // TODO: When contexts support multiple devices, we should create a descriptor
- // for each. We may also introduce a variant of this function with a
- // specific device.
- UR_CHECK_ERROR(
- GetDeviceOrdinal(hContext->getDevice(), AccessDesc.location.id));
-
- ScopedContext Active(hContext);
- UR_CHECK_ERROR(cuMemSetAccess((CUdeviceptr)pStart, size, &AccessDesc, 1));
+ // Set access for every device in the context
+ for (auto &Device : hContext->getDevices()) {
+ CUmemAccessDesc AccessDesc = {};
+ if (flags & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE)
+ AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+ else if (flags & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_ONLY)
+ AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READ;
+ else
+ AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_NONE;
+ AccessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+ AccessDesc.location.id = Device->getIndex();
+ ScopedContext Active(Device);
+ UR_CHECK_ERROR(cuMemSetAccess((CUdeviceptr)pStart, size, &AccessDesc, 1));
+ }
return UR_RESULT_SUCCESS;
}
@@ -89,7 +88,8 @@ UR_APIEXPORT ur_result_t UR_APICALL
urVirtualMemMap(ur_context_handle_t hContext, const void *pStart, size_t size,
ur_physical_mem_handle_t hPhysicalMem, size_t offset,
ur_virtual_mem_access_flags_t flags) {
- ScopedContext Active(hContext);
+  // Map the virtual mem. This only needs doing once; any device in the
+  // context can supply the active native context.
+ ScopedContext Active(hContext->getDevices()[0]);
UR_CHECK_ERROR(
cuMemMap((CUdeviceptr)pStart, size, offset, hPhysicalMem->get(), 0));
if (flags)
@@ -99,7 +99,8 @@ urVirtualMemMap(ur_context_handle_t hContext, const void *pStart, size_t size,
UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemUnmap(
ur_context_handle_t hContext, const void *pStart, size_t size) {
- ScopedContext Active(hContext);
+  // Unmap the virtual mem. This only needs doing once; any device in the
+  // context can supply the active native context.
+ ScopedContext Active(hContext->getDevices()[0]);
UR_CHECK_ERROR(cuMemUnmap((CUdeviceptr)pStart, size));
return UR_RESULT_SUCCESS;
}
@@ -110,12 +111,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGetInfo(
size_t propSize, void *pPropValue, size_t *pPropSizeRet) {
UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
- ScopedContext Active(hContext);
+  // Any device in the context can supply the active native context here
+ ScopedContext Active(hContext->getDevices()[0]);
switch (propName) {
case UR_VIRTUAL_MEM_INFO_ACCESS_MODE: {
CUmemLocation MemLocation = {};
MemLocation.type = CU_MEM_LOCATION_TYPE_DEVICE;
- UR_CHECK_ERROR(GetDeviceOrdinal(hContext->getDevice(), MemLocation.id));
+ MemLocation.id = hContext->getDevices()[0]->getIndex();
unsigned long long CuAccessFlags;
UR_CHECK_ERROR(
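
Taken together, the virtual-memory entry points above support this sequence: reserve an address range once, back it with physical memory on a device, map it once, and let the flags passed to urVirtualMemMap grant access on every device in the context via urVirtualMemSetAccess. A hedged sketch under the assumption that Size is already granularity-aligned; mapVirtual is an illustrative helper and cleanup on failure is elided.

    #include <cstddef>
    #include <ur_api.h>

    ur_result_t mapVirtual(ur_context_handle_t Ctx, ur_device_handle_t Dev,
                           size_t Size, void **OutPtr) {
      void *Start = nullptr;
      ur_result_t Res = urVirtualMemReserve(Ctx, /*pStart=*/nullptr, Size, &Start);
      if (Res != UR_RESULT_SUCCESS)
        return Res;

      ur_physical_mem_handle_t Phys = nullptr;
      Res = urPhysicalMemCreate(Ctx, Dev, Size, /*pProperties=*/nullptr, &Phys);
      if (Res != UR_RESULT_SUCCESS)
        return Res;

      // Non-zero access flags here make the adapter call urVirtualMemSetAccess,
      // which now installs one access descriptor per device in the context.
      Res = urVirtualMemMap(Ctx, Start, Size, Phys, /*offset=*/0,
                            UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE);
      if (Res != UR_RESULT_SUCCESS)
        return Res;

      *OutPtr = Start;
      return UR_RESULT_SUCCESS;
    }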
diff --git a/source/adapters/hip/command_buffer.hpp b/source/adapters/hip/command_buffer.hpp
index 50fddc5448..751fde3720 100644
--- a/source/adapters/hip/command_buffer.hpp
+++ b/source/adapters/hip/command_buffer.hpp
@@ -10,154 +10,12 @@
 #include <ur/ur.hpp>
 #include <ur_api.h>
+#include <ur_print.hpp>
#include "context.hpp"
 #include <hip/hip_runtime.h>
 #include <memory>
-static inline const char *getUrResultString(ur_result_t Result) {
- switch (Result) {
- case UR_RESULT_SUCCESS:
- return "UR_RESULT_SUCCESS";
- case UR_RESULT_ERROR_INVALID_OPERATION:
- return "UR_RESULT_ERROR_INVALID_OPERATION";
- case UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES:
- return "UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES";
- case UR_RESULT_ERROR_INVALID_QUEUE:
- return "UR_RESULT_ERROR_INVALID_QUEUE";
- case UR_RESULT_ERROR_INVALID_VALUE:
- return "UR_RESULT_ERROR_INVALID_VALUE";
- case UR_RESULT_ERROR_INVALID_CONTEXT:
- return "UR_RESULT_ERROR_INVALID_CONTEXT";
- case UR_RESULT_ERROR_INVALID_PLATFORM:
- return "UR_RESULT_ERROR_INVALID_PLATFORM";
- case UR_RESULT_ERROR_INVALID_BINARY:
- return "UR_RESULT_ERROR_INVALID_BINARY";
- case UR_RESULT_ERROR_INVALID_PROGRAM:
- return "UR_RESULT_ERROR_INVALID_PROGRAM";
- case UR_RESULT_ERROR_INVALID_SAMPLER:
- return "UR_RESULT_ERROR_INVALID_SAMPLER";
- case UR_RESULT_ERROR_INVALID_BUFFER_SIZE:
- return "UR_RESULT_ERROR_INVALID_BUFFER_SIZE";
- case UR_RESULT_ERROR_INVALID_MEM_OBJECT:
- return "UR_RESULT_ERROR_INVALID_MEM_OBJECT";
- case UR_RESULT_ERROR_INVALID_EVENT:
- return "UR_RESULT_ERROR_INVALID_EVENT";
- case UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST:
- return "UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST";
- case UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET:
- return "UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET";
- case UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE:
- return "UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE";
- case UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE:
- return "UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE";
- case UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE:
- return "UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE";
- case UR_RESULT_ERROR_DEVICE_NOT_FOUND:
- return "UR_RESULT_ERROR_DEVICE_NOT_FOUND";
- case UR_RESULT_ERROR_INVALID_DEVICE:
- return "UR_RESULT_ERROR_INVALID_DEVICE";
- case UR_RESULT_ERROR_DEVICE_LOST:
- return "UR_RESULT_ERROR_DEVICE_LOST";
- case UR_RESULT_ERROR_DEVICE_REQUIRES_RESET:
- return "UR_RESULT_ERROR_DEVICE_REQUIRES_RESET";
- case UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE:
- return "UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE";
- case UR_RESULT_ERROR_DEVICE_PARTITION_FAILED:
- return "UR_RESULT_ERROR_DEVICE_PARTITION_FAILED";
- case UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT:
- return "UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT";
- case UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE:
- return "UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE";
- case UR_RESULT_ERROR_INVALID_WORK_DIMENSION:
- return "UR_RESULT_ERROR_INVALID_WORK_DIMENSION";
- case UR_RESULT_ERROR_INVALID_KERNEL_ARGS:
- return "UR_RESULT_ERROR_INVALID_KERNEL_ARGS";
- case UR_RESULT_ERROR_INVALID_KERNEL:
- return "UR_RESULT_ERROR_INVALID_KERNEL";
- case UR_RESULT_ERROR_INVALID_KERNEL_NAME:
- return "UR_RESULT_ERROR_INVALID_KERNEL_NAME";
- case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX:
- return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX";
- case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE:
- return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE";
- case UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE:
- return "UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE";
- case UR_RESULT_ERROR_INVALID_IMAGE_SIZE:
- return "UR_RESULT_ERROR_INVALID_IMAGE_SIZE";
- case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR:
- return "UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR";
- case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED:
- return "UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED";
- case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE:
- return "UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE";
- case UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE:
- return "UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE";
- case UR_RESULT_ERROR_UNINITIALIZED:
- return "UR_RESULT_ERROR_UNINITIALIZED";
- case UR_RESULT_ERROR_OUT_OF_HOST_MEMORY:
- return "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY";
- case UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY:
- return "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY";
- case UR_RESULT_ERROR_OUT_OF_RESOURCES:
- return "UR_RESULT_ERROR_OUT_OF_RESOURCES";
- case UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE:
- return "UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE";
- case UR_RESULT_ERROR_PROGRAM_LINK_FAILURE:
- return "UR_RESULT_ERROR_PROGRAM_LINK_FAILURE";
- case UR_RESULT_ERROR_UNSUPPORTED_VERSION:
- return "UR_RESULT_ERROR_UNSUPPORTED_VERSION";
- case UR_RESULT_ERROR_UNSUPPORTED_FEATURE:
- return "UR_RESULT_ERROR_UNSUPPORTED_FEATURE";
- case UR_RESULT_ERROR_INVALID_ARGUMENT:
- return "UR_RESULT_ERROR_INVALID_ARGUMENT";
- case UR_RESULT_ERROR_INVALID_NULL_HANDLE:
- return "UR_RESULT_ERROR_INVALID_NULL_HANDLE";
- case UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE:
- return "UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE";
- case UR_RESULT_ERROR_INVALID_NULL_POINTER:
- return "UR_RESULT_ERROR_INVALID_NULL_POINTER";
- case UR_RESULT_ERROR_INVALID_SIZE:
- return "UR_RESULT_ERROR_INVALID_SIZE";
- case UR_RESULT_ERROR_UNSUPPORTED_SIZE:
- return "UR_RESULT_ERROR_UNSUPPORTED_SIZE";
- case UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT:
- return "UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT";
- case UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT:
- return "UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT";
- case UR_RESULT_ERROR_INVALID_ENUMERATION:
- return "UR_RESULT_ERROR_INVALID_ENUMERATION";
- case UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION:
- return "UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION";
- case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT:
- return "UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT";
- case UR_RESULT_ERROR_INVALID_NATIVE_BINARY:
- return "UR_RESULT_ERROR_INVALID_NATIVE_BINARY";
- case UR_RESULT_ERROR_INVALID_GLOBAL_NAME:
- return "UR_RESULT_ERROR_INVALID_GLOBAL_NAME";
- case UR_RESULT_ERROR_INVALID_FUNCTION_NAME:
- return "UR_RESULT_ERROR_INVALID_FUNCTION_NAME";
- case UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION:
- return "UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION";
- case UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION:
- return "UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION";
- case UR_RESULT_ERROR_PROGRAM_UNLINKED:
- return "UR_RESULT_ERROR_PROGRAM_UNLINKED";
- case UR_RESULT_ERROR_OVERLAPPING_REGIONS:
- return "UR_RESULT_ERROR_OVERLAPPING_REGIONS";
- case UR_RESULT_ERROR_INVALID_HOST_PTR:
- return "UR_RESULT_ERROR_INVALID_HOST_PTR";
- case UR_RESULT_ERROR_INVALID_USM_SIZE:
- return "UR_RESULT_ERROR_INVALID_USM_SIZE";
- case UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE:
- return "UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE";
- case UR_RESULT_ERROR_ADAPTER_SPECIFIC:
- return "UR_RESULT_ERROR_ADAPTER_SPECIFIC";
- default:
- return "UR_RESULT_ERROR_UNKNOWN";
- }
-}
-
// Trace an internal UR call
#define UR_TRACE(Call) \
{ \
@@ -169,10 +27,10 @@ static inline const char *getUrResultString(ur_result_t Result) {
#define UR_CALL(Call, Result) \
{ \
if (PrintTrace) \
- fprintf(stderr, "UR ---> %s\n", #Call); \
+ std::cerr << "UR ---> " << #Call << "\n"; \
Result = (Call); \
if (PrintTrace) \
- fprintf(stderr, "UR <--- %s(%s)\n", #Call, getUrResultString(Result)); \
+ std::cerr << "UR <--- " << #Call << "(" << Result << ")\n"; \
}
// Handle to a kernel command.
diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp
index ae4dbe159e..dd20a4f50f 100644
--- a/source/adapters/hip/device.cpp
+++ b/source/adapters/hip/device.cpp
@@ -724,11 +724,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
}
case UR_DEVICE_INFO_GLOBAL_MEM_FREE: {
+    // Work around an issue on some (unsupported) architectures, where
+    // hipMemGetInfo fails internally and returns hipErrorInvalidValue when
+    // querying the amount of available global memory. Since we cannot
+    // distinguish this failure from a genuine error, special-case the known
+    // affected architecture (gfx1031) and report zero free memory instead.
+ hipDeviceProp_t Props;
+ UR_CHECK_ERROR(hipGetDeviceProperties(&Props, hDevice->get()));
+ if (strcmp(Props.gcnArchName, "gfx1031") == 0) {
+ return ReturnValue(size_t{0});
+ }
+
size_t FreeMemory = 0;
size_t TotalMemory = 0;
- detail::ur::assertion(hipMemGetInfo(&FreeMemory, &TotalMemory) ==
- hipSuccess,
- "failed hipMemGetInfo() API.");
+ UR_CHECK_ERROR(hipMemGetInfo(&FreeMemory, &TotalMemory));
return ReturnValue(FreeMemory);
}
@@ -841,6 +850,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
case UR_DEVICE_INFO_COMPOSITE_DEVICE:
// These two are exclusive of L0.
return ReturnValue(0);
+ case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP:
+ return ReturnValue(true);
// TODO: Investigate if this information is available on HIP.
case UR_DEVICE_INFO_GPU_EU_COUNT:
diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp
index 79522d4c93..4b98a9c491 100644
--- a/source/adapters/hip/enqueue.cpp
+++ b/source/adapters/hip/enqueue.cpp
@@ -15,12 +15,13 @@
#include "kernel.hpp"
#include "memory.hpp"
#include "queue.hpp"
+#include "ur_api.h"
 #include <ur/ur.hpp>
extern size_t imageElementByteSize(hipArray_Format ArrayFormat);
-ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
+ur_result_t enqueueEventsWait(ur_queue_handle_t Queue, hipStream_t Stream,
uint32_t NumEventsInWaitList,
const ur_event_handle_t *EventWaitList) {
if (!EventWaitList) {
@@ -29,8 +30,8 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
try {
auto Result = forLatestEvents(
EventWaitList, NumEventsInWaitList,
- [Stream](ur_event_handle_t Event) -> ur_result_t {
- ScopedContext Active(Event->getDevice());
+ [Stream, Queue](ur_event_handle_t Event) -> ur_result_t {
+ ScopedContext Active(Queue->getDevice());
if (Event->isCompleted() || Event->getStream() == Stream) {
return UR_RESULT_SUCCESS;
} else {
@@ -218,8 +219,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
// last queue to write to the MemBuffer, meaning we must perform the copy
// from a different device
if (hBuffer->LastEventWritingToMemObj &&
- hBuffer->LastEventWritingToMemObj->getDevice() != hQueue->getDevice()) {
- Device = hBuffer->LastEventWritingToMemObj->getDevice();
+ hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hQueue->getDevice()) {
+ // This event is never created with interop so getQueue is never null
+ hQueue = hBuffer->LastEventWritingToMemObj->getQueue();
+ Device = hQueue->getDevice();
ScopedContext Active(Device);
HIPStream = hipStream_t{0}; // Default stream for different device
// We may have to wait for an event on another queue if it is the last
@@ -584,8 +588,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect(
// last queue to write to the MemBuffer, meaning we must perform the copy
// from a different device
if (hBuffer->LastEventWritingToMemObj &&
- hBuffer->LastEventWritingToMemObj->getDevice() != hQueue->getDevice()) {
- Device = hBuffer->LastEventWritingToMemObj->getDevice();
+ hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hQueue->getDevice()) {
+ // This event is never created with interop so getQueue is never null
+ hQueue = hBuffer->LastEventWritingToMemObj->getQueue();
+ Device = hQueue->getDevice();
ScopedContext Active(Device);
HIPStream = hipStream_t{0}; // Default stream for different device
// We may have to wait for an event on another queue if it is the last
@@ -1017,8 +1024,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
// last queue to write to the MemBuffer, meaning we must perform the copy
// from a different device
if (hImage->LastEventWritingToMemObj &&
- hImage->LastEventWritingToMemObj->getDevice() != hQueue->getDevice()) {
- Device = hImage->LastEventWritingToMemObj->getDevice();
+ hImage->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hQueue->getDevice()) {
+ hQueue = hImage->LastEventWritingToMemObj->getQueue();
+ Device = hQueue->getDevice();
ScopedContext Active(Device);
HIPStream = hipStream_t{0}; // Default stream for different device
// We may have to wait for an event on another queue if it is the last
@@ -1231,49 +1240,42 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
UR_ASSERT(offset + size <= BufferImpl.getSize(),
UR_RESULT_ERROR_INVALID_SIZE);
- ur_result_t Result = UR_RESULT_ERROR_INVALID_OPERATION;
- const bool IsPinned =
- BufferImpl.MemAllocMode == BufferMem::AllocMode::AllocHostPtr;
-
- // Currently no support for overlapping regions
- if (BufferImpl.getMapPtr() != nullptr) {
- return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+ auto MapPtr = BufferImpl.mapToPtr(size, offset, mapFlags);
+ if (!MapPtr) {
+ return UR_RESULT_ERROR_INVALID_MEM_OBJECT;
}
- // Allocate a pointer in the host to store the mapped information
- auto HostPtr = BufferImpl.mapToPtr(size, offset, mapFlags);
-  *ppRetMap = std::get<BufferMem>(hBuffer->Mem).getMapPtr();
- if (HostPtr) {
- Result = UR_RESULT_SUCCESS;
- }
+ const bool IsPinned =
+ BufferImpl.MemAllocMode == BufferMem::AllocMode::AllocHostPtr;
- if (!IsPinned &&
- ((mapFlags & UR_MAP_FLAG_READ) || (mapFlags & UR_MAP_FLAG_WRITE))) {
- // Pinned host memory is already on host so it doesn't need to be read.
- Result = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size,
- HostPtr, numEventsInWaitList,
- phEventWaitList, phEvent);
- } else {
- ScopedContext Active(hQueue->getDevice());
+ try {
+ if (!IsPinned && (mapFlags & (UR_MAP_FLAG_READ | UR_MAP_FLAG_WRITE))) {
+ // Pinned host memory is already on host so it doesn't need to be read.
+ UR_CHECK_ERROR(urEnqueueMemBufferRead(
+ hQueue, hBuffer, blockingMap, offset, size, MapPtr,
+ numEventsInWaitList, phEventWaitList, phEvent));
+ } else {
+ ScopedContext Active(hQueue->getDevice());
- if (IsPinned) {
- Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList,
- nullptr);
- }
+ if (IsPinned) {
+ UR_CHECK_ERROR(urEnqueueEventsWait(hQueue, numEventsInWaitList,
+ phEventWaitList, nullptr));
+ }
- if (phEvent) {
- try {
+ if (phEvent) {
*phEvent = ur_event_handle_t_::makeNative(
UR_COMMAND_MEM_BUFFER_MAP, hQueue, hQueue->getNextTransferStream());
UR_CHECK_ERROR((*phEvent)->start());
UR_CHECK_ERROR((*phEvent)->record());
- } catch (ur_result_t Error) {
- Result = Error;
}
}
+ } catch (ur_result_t Error) {
+ return Error;
}
- return Result;
+ *ppRetMap = MapPtr;
+
+ return UR_RESULT_SUCCESS;
}
/// Implements the unmap from the host, using a BufferWrite operation.
@@ -1284,47 +1286,44 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
ur_queue_handle_t hQueue, ur_mem_handle_t hMem, void *pMappedPtr,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {
- ur_result_t Result = UR_RESULT_SUCCESS;
UR_ASSERT(hMem->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
-  UR_ASSERT(std::get<BufferMem>(hMem->Mem).getMapPtr() != nullptr,
-            UR_RESULT_ERROR_INVALID_MEM_OBJECT);
-  UR_ASSERT(std::get<BufferMem>(hMem->Mem).getMapPtr() == pMappedPtr,
-            UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+  auto &BufferImpl = std::get<BufferMem>(hMem->Mem);
-  const bool IsPinned = std::get<BufferMem>(hMem->Mem).MemAllocMode ==
- BufferMem::AllocMode::AllocHostPtr;
-
- if (!IsPinned &&
- ((std::get(hMem->Mem).getMapFlags() & UR_MAP_FLAG_WRITE) ||
-      ((std::get<BufferMem>(hMem->Mem).getMapFlags() & UR_MAP_FLAG_WRITE) ||
-       (std::get<BufferMem>(hMem->Mem).getMapFlags() &
- // Pinned host memory is only on host so it doesn't need to be written to.
- Result = urEnqueueMemBufferWrite(
-        hQueue, hMem, true, std::get<BufferMem>(hMem->Mem).getMapOffset(),
-        std::get<BufferMem>(hMem->Mem).getMapSize(), pMappedPtr,
- numEventsInWaitList, phEventWaitList, phEvent);
- } else {
- ScopedContext Active(hQueue->getDevice());
+ auto *Map = BufferImpl.getMapDetails(pMappedPtr);
+ UR_ASSERT(Map != nullptr, UR_RESULT_ERROR_INVALID_MEM_OBJECT);
- if (IsPinned) {
- Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList,
- nullptr);
- }
+ const bool IsPinned =
+ BufferImpl.MemAllocMode == BufferMem::AllocMode::AllocHostPtr;
- if (phEvent) {
- try {
+ try {
+ if (!IsPinned &&
+ (Map->getMapFlags() &
+ (UR_MAP_FLAG_WRITE | UR_MAP_FLAG_WRITE_INVALIDATE_REGION))) {
+ // Pinned host memory is only on host so it doesn't need to be written to.
+ UR_CHECK_ERROR(urEnqueueMemBufferWrite(
+ hQueue, hMem, true, Map->getMapOffset(), Map->getMapSize(),
+ pMappedPtr, numEventsInWaitList, phEventWaitList, phEvent));
+ } else {
+ ScopedContext Active(hQueue->getDevice());
+
+ if (IsPinned) {
+ UR_CHECK_ERROR(urEnqueueEventsWait(hQueue, numEventsInWaitList,
+ phEventWaitList, nullptr));
+ }
+
+ if (phEvent) {
*phEvent = ur_event_handle_t_::makeNative(
UR_COMMAND_MEM_UNMAP, hQueue, hQueue->getNextTransferStream());
UR_CHECK_ERROR((*phEvent)->start());
UR_CHECK_ERROR((*phEvent)->record());
- } catch (ur_result_t Error) {
- Result = Error;
}
}
+ } catch (ur_result_t Error) {
+ return Error;
}
-  std::get<BufferMem>(hMem->Mem).unmap(pMappedPtr);
- return Result;
+ BufferImpl.unmap(pMappedPtr);
+ return UR_RESULT_SUCCESS;
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
@@ -1842,10 +1841,14 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim,
static_cast(Device->getMaxBlockDimY()),
static_cast(Device->getMaxBlockDimZ())};
+ auto &ReqdThreadsPerBlock = Kernel->ReqdThreadsPerBlock;
MaxWorkGroupSize = Device->getMaxWorkGroupSize();
if (LocalWorkSize != nullptr) {
auto isValid = [&](int dim) {
+ UR_ASSERT(ReqdThreadsPerBlock[dim] == 0 ||
+ LocalWorkSize[dim] == ReqdThreadsPerBlock[dim],
+ UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE);
UR_ASSERT(LocalWorkSize[dim] <= MaxThreadsPerBlock[dim],
UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE);
// Checks that local work sizes are a divisor of the global work sizes
@@ -1967,3 +1970,36 @@ void setCopyRectParams(ur_rect_region_t Region, const void *SrcPtr,
: (DstType == hipMemoryTypeDevice ? hipMemcpyHostToDevice
: hipMemcpyHostToHost));
}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
+ ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList,
+ const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+
+ ur_result_t Result = UR_RESULT_SUCCESS;
+  std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
+ try {
+ ScopedContext Active(hQueue->getDevice());
+
+ uint32_t StreamToken;
+    ur_stream_guard_ Guard;
+ hipStream_t HIPStream = hQueue->getNextComputeStream(
+ numEventsInWaitList, phEventWaitList, Guard, &StreamToken);
+ UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
+ phEventWaitList));
+
+ RetImplEvent =
+        std::unique_ptr<ur_event_handle_t_>(ur_event_handle_t_::makeNative(
+ UR_COMMAND_TIMESTAMP_RECORDING_EXP, hQueue, HIPStream));
+ UR_CHECK_ERROR(RetImplEvent->start());
+ UR_CHECK_ERROR(RetImplEvent->record());
+
+ if (blocking) {
+ UR_CHECK_ERROR(hipStreamSynchronize(HIPStream));
+ }
+
+ *phEvent = RetImplEvent.release();
+ } catch (ur_result_t Err) {
+ Result = Err;
+ }
+ return Result;
+}
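
A hypothetical host-side use of the new entry point: record a timestamp on the queue and read it back through the standard profiling query. The event.cpp changes below make this work even on queues created without UR_QUEUE_FLAG_PROFILING_ENABLE; readTimestamp is an illustrative helper.

    #include <cstdint>
    #include <ur_api.h>

    ur_result_t readTimestamp(ur_queue_handle_t Queue, uint64_t *Out) {
      ur_event_handle_t Ev = nullptr;
      // blocking=true: the call returns once the timestamp has been recorded.
      ur_result_t Res = urEnqueueTimestampRecordingExp(Queue, /*blocking=*/true,
                                                       0, nullptr, &Ev);
      if (Res != UR_RESULT_SUCCESS)
        return Res;
      Res = urEventGetProfilingInfo(Ev, UR_PROFILING_INFO_COMMAND_END,
                                    sizeof(*Out), Out, nullptr);
      urEventRelease(Ev);
      return Res;
    }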
diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp
index 7478c0e778..5327c43a3b 100644
--- a/source/adapters/hip/event.cpp
+++ b/source/adapters/hip/event.cpp
@@ -22,7 +22,8 @@ ur_event_handle_t_::ur_event_handle_t_(ur_command_t Type,
StreamToken{StreamToken}, EventId{0}, EvEnd{nullptr}, EvStart{nullptr},
EvQueued{nullptr}, Queue{Queue}, Stream{Stream}, Context{Context} {
- bool ProfilingEnabled = Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE;
+ bool ProfilingEnabled =
+ Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent();
UR_CHECK_ERROR(hipEventCreateWithFlags(
&EvEnd, ProfilingEnabled ? hipEventDefault : hipEventDisableTiming));
@@ -58,7 +59,7 @@ ur_result_t ur_event_handle_t_::start() {
ur_result_t Result = UR_RESULT_SUCCESS;
try {
- if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) {
+ if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) {
// NOTE: This relies on the default stream to be unused.
UR_CHECK_ERROR(hipEventRecord(EvQueued, 0));
UR_CHECK_ERROR(hipEventRecord(EvStart, Queue->get()));
@@ -177,7 +178,7 @@ ur_result_t ur_event_handle_t_::release() {
assert(Queue != nullptr);
UR_CHECK_ERROR(hipEventDestroy(EvEnd));
- if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) {
+ if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) {
UR_CHECK_ERROR(hipEventDestroy(EvQueued));
UR_CHECK_ERROR(hipEventDestroy(EvStart));
}
@@ -244,7 +245,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(
UR_ASSERT(!(pPropValue && propValueSize == 0), UR_RESULT_ERROR_INVALID_VALUE);
ur_queue_handle_t Queue = hEvent->getQueue();
- if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE)) {
+ if (Queue == nullptr || (!(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) &&
+ !hEvent->isTimestampEvent())) {
return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE;
}
diff --git a/source/adapters/hip/event.hpp b/source/adapters/hip/event.hpp
index 50de73b14f..5e7c1d7e7d 100644
--- a/source/adapters/hip/event.hpp
+++ b/source/adapters/hip/event.hpp
@@ -28,8 +28,6 @@ struct ur_event_handle_t_ {
ur_queue_handle_t getQueue() const noexcept { return Queue; }
- ur_device_handle_t getDevice() const noexcept { return Queue->getDevice(); }
-
hipStream_t getStream() const noexcept { return Stream; }
uint32_t getComputeStreamToken() const noexcept { return StreamToken; }
@@ -55,6 +53,10 @@ struct ur_event_handle_t_ {
return UR_EVENT_STATUS_COMPLETE;
}
+ bool isTimestampEvent() const noexcept {
+ return getCommandType() == UR_COMMAND_TIMESTAMP_RECORDING_EXP;
+ }
+
ur_context_handle_t getContext() const noexcept { return Context; };
uint32_t incrementReferenceCount() { return ++RefCount; }
diff --git a/source/adapters/hip/image.cpp b/source/adapters/hip/image.cpp
index a8fcd6f465..4085614f31 100644
--- a/source/adapters/hip/image.cpp
+++ b/source/adapters/hip/image.cpp
@@ -59,7 +59,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
[[maybe_unused]] ur_exp_image_mem_handle_t hImageMem,
[[maybe_unused]] const ur_image_format_t *pImageFormat,
[[maybe_unused]] const ur_image_desc_t *pImageDesc,
- [[maybe_unused]] ur_mem_handle_t *phMem,
[[maybe_unused]] ur_exp_image_handle_t *phImage) {
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
@@ -71,7 +70,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp(
[[maybe_unused]] const ur_image_format_t *pImageFormat,
[[maybe_unused]] const ur_image_desc_t *pImageDesc,
[[maybe_unused]] ur_sampler_handle_t hSampler,
- [[maybe_unused]] ur_mem_handle_t *phMem,
[[maybe_unused]] ur_exp_image_handle_t *phImage) {
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp
index ddb8f10595..f35d3957bc 100644
--- a/source/adapters/hip/kernel.cpp
+++ b/source/adapters/hip/kernel.cpp
@@ -91,14 +91,17 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
return ReturnValue(size_t(MaxThreads));
}
case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: {
- size_t group_size[3] = {0, 0, 0};
- // Returns the work-group size specified in the kernel source or IL.
- // If the work-group size is not specified in the kernel source or IL,
- // (0, 0, 0) is returned.
- // https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html
-
- // TODO: can we extract the work group size from the PTX?
- return ReturnValue(group_size, 3);
+ size_t GroupSize[3] = {0, 0, 0};
+ const auto &ReqdWGSizeMDMap =
+ hKernel->getProgram()->KernelReqdWorkGroupSizeMD;
+ const auto ReqdWGSizeMD = ReqdWGSizeMDMap.find(hKernel->getName());
+ if (ReqdWGSizeMD != ReqdWGSizeMDMap.end()) {
+ const auto ReqdWGSize = ReqdWGSizeMD->second;
+ GroupSize[0] = std::get<0>(ReqdWGSize);
+ GroupSize[1] = std::get<1>(ReqdWGSize);
+ GroupSize[2] = std::get<2>(ReqdWGSize);
+ }
+ return ReturnValue(GroupSize, 3);
}
case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: {
// OpenCL LOCAL == HIP SHARED
diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp
index 83693a3d41..afea69832b 100644
--- a/source/adapters/hip/kernel.hpp
+++ b/source/adapters/hip/kernel.hpp
@@ -42,6 +42,9 @@ struct ur_kernel_handle_t_ {
ur_program_handle_t Program;
std::atomic_uint32_t RefCount;
+ static constexpr uint32_t ReqdThreadsPerBlockDimensions = 3u;
+ size_t ReqdThreadsPerBlock[ReqdThreadsPerBlockDimensions];
+
/// Structure that holds the arguments to the kernel.
/// Note earch argument size is known, since it comes
/// from the kernel signature.
@@ -154,6 +157,11 @@ struct ur_kernel_handle_t_ {
ur_context_handle_t Ctxt)
: Function{Func}, FunctionWithOffsetParam{FuncWithOffsetParam},
Name{Name}, Context{Ctxt}, Program{Program}, RefCount{1} {
+ assert(Program->getDevice());
+ UR_CHECK_ERROR(urKernelGetGroupInfo(
+ this, Program->getDevice(),
+ UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
+ sizeof(ReqdThreadsPerBlock), ReqdThreadsPerBlock, nullptr));
urProgramRetain(Program);
urContextRetain(Context);
}
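
Caching ReqdThreadsPerBlock at kernel creation enables the launch-time check added in enqueue.cpp: a user-supplied local size must exactly match any non-zero required work-group size coming from the reqd_work_group_size metadata. A standalone sketch of the predicate; localSizeSatisfiesReqd is an illustrative name.

    #include <cstddef>
    #include <cstdint>

    // 0 in a dimension means "no requirement from kernel metadata".
    bool localSizeSatisfiesReqd(const size_t Reqd[3], const size_t *LocalWorkSize,
                                uint32_t WorkDim) {
      for (uint32_t Dim = 0; Dim < WorkDim; ++Dim)
        if (Reqd[Dim] != 0 && LocalWorkSize[Dim] != Reqd[Dim])
          return false; // adapter reports UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE
      return true;
    }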
diff --git a/source/adapters/hip/memory.cpp b/source/adapters/hip/memory.cpp
index dcc3e34fad..ff209884ce 100644
--- a/source/adapters/hip/memory.cpp
+++ b/source/adapters/hip/memory.cpp
@@ -525,11 +525,12 @@ inline ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem,
UR_CHECK_ERROR(
hipMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size));
}
- } else if (Mem->LastEventWritingToMemObj->getDevice() != hDevice) {
- UR_CHECK_ERROR(
- hipMemcpyDtoD(Buffer.getPtr(hDevice),
- Buffer.getPtr(Mem->LastEventWritingToMemObj->getDevice()),
- Buffer.Size));
+ } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hDevice) {
+ UR_CHECK_ERROR(hipMemcpyDtoD(
+ Buffer.getPtr(hDevice),
+ Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
+ Buffer.Size));
}
return UR_RESULT_SUCCESS;
}
@@ -577,22 +578,24 @@ inline ur_result_t migrateImageToDevice(ur_mem_handle_t Mem,
CpyDesc3D.srcHost = Image.HostPtr;
UR_CHECK_ERROR(hipDrvMemcpy3D(&CpyDesc3D));
}
- } else if (Mem->LastEventWritingToMemObj->getDevice() != hDevice) {
+ } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() !=
+ hDevice) {
if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) {
// FIXME: 1D memcpy from DtoD going through the host.
UR_CHECK_ERROR(hipMemcpyAtoH(
Image.HostPtr,
- Image.getArray(Mem->LastEventWritingToMemObj->getDevice()),
+ Image.getArray(
+ Mem->LastEventWritingToMemObj->getQueue()->getDevice()),
0 /*srcOffset*/, ImageSizeBytes));
UR_CHECK_ERROR(
hipMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes));
} else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) {
- CpyDesc2D.srcArray =
- Image.getArray(Mem->LastEventWritingToMemObj->getDevice());
+ CpyDesc2D.srcArray = Image.getArray(
+ Mem->LastEventWritingToMemObj->getQueue()->getDevice());
UR_CHECK_ERROR(hipMemcpyParam2D(&CpyDesc2D));
} else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) {
- CpyDesc3D.srcArray =
- Image.getArray(Mem->LastEventWritingToMemObj->getDevice());
+ CpyDesc3D.srcArray = Image.getArray(
+ Mem->LastEventWritingToMemObj->getQueue()->getDevice());
UR_CHECK_ERROR(hipDrvMemcpy3D(&CpyDesc3D));
}
}
diff --git a/source/adapters/hip/memory.hpp b/source/adapters/hip/memory.hpp
index 7707794b3c..5d2aa6f9a5 100644
--- a/source/adapters/hip/memory.hpp
+++ b/source/adapters/hip/memory.hpp
@@ -9,12 +9,15 @@
//===----------------------------------------------------------------------===//
#pragma once
-#include "common.hpp"
#include "context.hpp"
#include "event.hpp"
 #include <cassert>
+#include <memory>
+#include <unordered_map>
 #include <variant>
+#include "common.hpp"
+
ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t,
const ur_device_handle_t);
ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
@@ -22,6 +25,41 @@ ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t,
// Handler for plain, pointer-based HIP allocations
struct BufferMem {
+ struct BufferMap {
+ /// Size of the active mapped region.
+ size_t MapSize;
+ /// Offset of the active mapped region.
+ size_t MapOffset;
+ /// Original flags for the mapped region
+ ur_map_flags_t MapFlags;
+ /// Allocated host memory used exclusively for this map.
+    std::shared_ptr<unsigned char[]> MapMem;
+
+ BufferMap(size_t MapSize, size_t MapOffset, ur_map_flags_t MapFlags)
+ : MapSize(MapSize), MapOffset(MapOffset), MapFlags(MapFlags),
+ MapMem(nullptr) {}
+
+ BufferMap(size_t MapSize, size_t MapOffset, ur_map_flags_t MapFlags,
+              std::unique_ptr<unsigned char[]> &&MapMem)
+ : MapSize(MapSize), MapOffset(MapOffset), MapFlags(MapFlags),
+ MapMem(std::move(MapMem)) {}
+
+ size_t getMapSize() const noexcept { return MapSize; }
+
+ size_t getMapOffset() const noexcept { return MapOffset; }
+
+ ur_map_flags_t getMapFlags() const noexcept { return MapFlags; }
+ };
+
+ /** AllocMode
+ * Classic: Just a normal buffer allocated on the device via hip malloc
+ * UseHostPtr: Use an address on the host for the device
+ * CopyIn: The data for the device comes from the host but the host
+ pointer is not available later for re-use
+ * AllocHostPtr: Uses pinned-memory allocation
+ */
+ enum class AllocMode { Classic, UseHostPtr, CopyIn, AllocHostPtr };
+
using native_type = hipDeviceptr_t;
// If this allocation is a sub-buffer (i.e., a view on an existing
@@ -29,33 +67,14 @@ struct BufferMem {
ur_mem_handle_t Parent = nullptr;
// Outer mem holding this struct in variant
ur_mem_handle_t OuterMemStruct;
-
/// Pointer associated with this device on the host
void *HostPtr;
/// Size of the allocation in bytes
size_t Size;
- /// Size of the active mapped region.
- size_t MapSize;
- /// Offset of the active mapped region.
- size_t MapOffset;
- /// Pointer to the active mapped region, if any
- void *MapPtr;
- /// Original flags for the mapped region
- ur_map_flags_t MapFlags;
+ /// A map that contains all the active mappings for this buffer.
+  std::unordered_map<void *, BufferMap> PtrToBufferMap;
- /** AllocMode
- * Classic: Just a normal buffer allocated on the device via hip malloc
- * UseHostPtr: Use an address on the host for the device
- * CopyIn: The data for the device comes from the host but the host
- pointer is not available later for re-use
- * AllocHostPtr: Uses pinned-memory allocation
- */
- enum class AllocMode {
- Classic,
- UseHostPtr,
- CopyIn,
- AllocHostPtr
- } MemAllocMode;
+ AllocMode MemAllocMode;
private:
// Vector of HIP pointers
@@ -65,10 +84,8 @@ struct BufferMem {
BufferMem(ur_context_handle_t Context, ur_mem_handle_t OuterMemStruct,
AllocMode Mode, void *HostPtr, size_t Size)
: OuterMemStruct{OuterMemStruct}, HostPtr{HostPtr}, Size{Size},
- MapSize{0}, MapOffset{0}, MapPtr{nullptr}, MapFlags{UR_MAP_FLAG_WRITE},
- MemAllocMode{Mode}, Ptrs(Context->Devices.size(), native_type{0}){};
-
- BufferMem(const BufferMem &Buffer) = default;
+ PtrToBufferMap{}, MemAllocMode{Mode},
+ Ptrs(Context->Devices.size(), native_type{0}){};
// This will allocate memory on device if there isn't already an active
// allocation on the device
@@ -98,45 +115,41 @@ struct BufferMem {
size_t getSize() const noexcept { return Size; }
- void *getMapPtr() const noexcept { return MapPtr; }
-
- size_t getMapSize() const noexcept { return MapSize; }
-
- size_t getMapOffset() const noexcept { return MapOffset; }
+ BufferMap *getMapDetails(void *Map) {
+ auto details = PtrToBufferMap.find(Map);
+ if (details != PtrToBufferMap.end()) {
+ return &details->second;
+ }
+ return nullptr;
+ }
/// Returns a pointer to data visible on the host that contains
/// the data on the device associated with this allocation.
/// The offset is used to index into the HIP allocation.
///
- void *mapToPtr(size_t Size, size_t Offset, ur_map_flags_t Flags) noexcept {
- assert(MapPtr == nullptr);
- MapSize = Size;
- MapOffset = Offset;
- MapFlags = Flags;
- if (HostPtr) {
-      MapPtr = static_cast<char *>(HostPtr) + Offset;
+ void *mapToPtr(size_t MapSize, size_t MapOffset,
+ ur_map_flags_t MapFlags) noexcept {
+ void *MapPtr = nullptr;
+ if (HostPtr == nullptr) {
+      /// If HostPtr is null, we need to create a mapping that owns its own
+      /// host memory.
+      auto MapMem = std::make_unique<unsigned char[]>(MapSize);
+ MapPtr = MapMem.get();
+ PtrToBufferMap.insert(
+ {MapPtr, BufferMap(MapSize, MapOffset, MapFlags, std::move(MapMem))});
} else {
- // TODO: Allocate only what is needed based on the offset
-      MapPtr = static_cast<void *>(malloc(this->getSize()));
+ /// However, if HostPtr already has valid memory (e.g. pinned allocation),
+ /// we can just use that memory for the mapping.
+      MapPtr = static_cast<char *>(HostPtr) + MapOffset;
+ PtrToBufferMap.insert({MapPtr, BufferMap(MapSize, MapOffset, MapFlags)});
}
return MapPtr;
}
/// Detach the allocation from the host memory.
- void unmap(void *) noexcept {
+ void unmap(void *MapPtr) noexcept {
assert(MapPtr != nullptr);
-
- if (MapPtr != HostPtr) {
- free(MapPtr);
- }
- MapPtr = nullptr;
- MapSize = 0;
- MapOffset = 0;
- }
-
- ur_map_flags_t getMapFlags() const noexcept {
- assert(MapPtr != nullptr);
- return MapFlags;
+ PtrToBufferMap.erase(MapPtr);
}
ur_result_t clear() {
@@ -414,7 +427,7 @@ struct ur_mem_handle_t_ {
HaveMigratedToDeviceSinceLastWrite(Context->Devices.size(), false),
        Mem{std::in_place_type<BufferMem>, Ctxt, this, Mode, HostPtr, Size} {
urContextRetain(Context);
- };
+ }
// Subbuffer constructor
  ur_mem_handle_t_(ur_mem_handle_t Parent, size_t SubBufferOffset)
@@ -435,7 +448,7 @@ struct ur_mem_handle_t_ {
}
}
urMemRetain(Parent);
- };
+ }
/// Constructs the UR mem handler for an Image object
  ur_mem_handle_t_(ur_context_handle_t Ctxt, ur_mem_flags_t MemFlags,
@@ -498,7 +511,7 @@ struct ur_mem_handle_t_ {
LastEventWritingToMemObj = NewEvent;
for (const auto &Device : Context->getDevices()) {
HaveMigratedToDeviceSinceLastWrite[Device->getIndex()] =
- Device == NewEvent->getDevice();
+ Device == NewEvent->getQueue()->getDevice();
}
}
};
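
The net effect of the BufferMap rework is that a buffer can carry several live mappings at once, keyed by the pointer handed back to the user. A self-contained model of that flow, with toy types (`ToyBuffer`, `MapRecord`) standing in for the adapter's, so this is a sketch rather than the adapter code:

#include <cstddef>
#include <memory>
#include <unordered_map>

struct MapRecord {
  size_t Size, Offset;
  std::shared_ptr<unsigned char[]> Owned; // null when HostPtr backs the map
};

struct ToyBuffer {
  unsigned char *HostPtr = nullptr; // pinned or user memory, may be null
  std::unordered_map<void *, MapRecord> Maps;

  void *map(size_t Size, size_t Offset) {
    if (HostPtr) { // reuse the existing host memory, as mapToPtr does
      void *Ptr = HostPtr + Offset;
      Maps.emplace(Ptr, MapRecord{Size, Offset, nullptr});
      return Ptr;
    }
    auto Mem = std::make_unique<unsigned char[]>(Size);
    void *Ptr = Mem.get();
    // Each mapping owns its allocation, so mappings may coexist.
    Maps.emplace(Ptr, MapRecord{Size, Offset, std::move(Mem)});
    return Ptr;
  }

  void unmap(void *Ptr) { Maps.erase(Ptr); } // frees owned memory, if any
};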
diff --git a/source/adapters/hip/program.cpp b/source/adapters/hip/program.cpp
index 1ee862804a..8e3653ee02 100644
--- a/source/adapters/hip/program.cpp
+++ b/source/adapters/hip/program.cpp
@@ -9,6 +9,7 @@
//===----------------------------------------------------------------------===//
#include "program.hpp"
+#include "ur_util.hpp"
#ifdef SYCL_ENABLE_KERNEL_FUSION
#ifdef UR_COMGR_VERSION4_INCLUDE
@@ -78,15 +79,6 @@ void getCoMgrBuildLog(const amd_comgr_data_set_t BuildDataSet, char *BuildLog,
} // namespace
#endif
-std::pair<std::string, std::string>
-splitMetadataName(const std::string &metadataName) {
- size_t splitPos = metadataName.rfind('@');
- if (splitPos == std::string::npos)
- return std::make_pair(metadataName, std::string{});
- return std::make_pair(metadataName.substr(0, splitPos),
- metadataName.substr(splitPos, metadataName.length()));
-}
-
ur_result_t
ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata,
size_t Length) {
@@ -107,8 +99,29 @@ ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata,
const char *MetadataValPtrEnd =
MetadataValPtr + MetadataElement.size - sizeof(std::uint64_t);
GlobalIDMD[Prefix] = std::string{MetadataValPtr, MetadataValPtrEnd};
+ } else if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) {
+ // If metadata is reqd_work_group_size, record it for the corresponding
+ // kernel name.
+ size_t MDElemsSize = MetadataElement.size - sizeof(std::uint64_t);
+
+ // Expect between 1 and 3 32-bit integer values.
+ UR_ASSERT(MDElemsSize >= sizeof(std::uint32_t) &&
+ MDElemsSize <= sizeof(std::uint32_t) * 3,
+ UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE);
+
+ // Get pointer to data, skipping 64-bit size at the start of the data.
+ const char *ValuePtr =
+          reinterpret_cast<const char *>(MetadataElement.value.pData) +
+ sizeof(std::uint64_t);
+ // Read values and pad with 1's for values not present.
+ std::uint32_t ReqdWorkGroupElements[] = {1, 1, 1};
+ std::memcpy(ReqdWorkGroupElements, ValuePtr, MDElemsSize);
+ KernelReqdWorkGroupSizeMD[Prefix] =
+ std::make_tuple(ReqdWorkGroupElements[0], ReqdWorkGroupElements[1],
+ ReqdWorkGroupElements[2]);
}
}
+
return UR_RESULT_SUCCESS;
}
@@ -459,8 +472,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
  std::unique_ptr<ur_program_handle_t_> RetProgram{
new ur_program_handle_t_{hContext, hDevice}};
- // TODO: Set metadata here and use reqd_work_group_size information.
- // See urProgramCreateWithBinary in CUDA adapter.
if (pProperties) {
if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
@@ -469,8 +480,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
}
Result =
RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count);
+ UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);
}
- UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);
  auto pBinary_string = reinterpret_cast<const char *>(pBinary);
if (size == 0) {
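
The payload decoded in setMetadata above is a 64-bit size prefix followed by one to three 32-bit values, with absent trailing dimensions defaulting to 1. The same decoding as a standalone sketch (hypothetical helper `decodeReqdWGSize`; `Blob`/`BlobSize` describe the raw metadata value):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <tuple>

std::tuple<uint32_t, uint32_t, uint32_t> decodeReqdWGSize(const char *Blob,
                                                          size_t BlobSize) {
  const char *ValuePtr = Blob + sizeof(uint64_t);   // skip the size prefix
  size_t PayloadSize = BlobSize - sizeof(uint64_t); // 4, 8 or 12 bytes
  uint32_t Dims[3] = {1, 1, 1}; // pad missing dimensions with 1
  std::memcpy(Dims, ValuePtr, PayloadSize);
  return {Dims[0], Dims[1], Dims[2]};
}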
diff --git a/source/adapters/hip/program.hpp b/source/adapters/hip/program.hpp
index 651d59a42d..cc5da40af5 100644
--- a/source/adapters/hip/program.hpp
+++ b/source/adapters/hip/program.hpp
@@ -12,6 +12,7 @@
 #include <hip/hip_runtime.h>
 #include <ur_api.h>
+#include <tuple>
#include "context.hpp"
@@ -30,6 +31,8 @@ struct ur_program_handle_t_ {
bool IsRelocatable = false;
  std::unordered_map<std::string, std::string> GlobalIDMD;
+  std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
+      KernelReqdWorkGroupSizeMD;
constexpr static size_t MAX_LOG_SIZE = 8192u;
@@ -38,8 +41,8 @@ struct ur_program_handle_t_ {
ur_program_build_status_t BuildStatus = UR_PROGRAM_BUILD_STATUS_NONE;
ur_program_handle_t_(ur_context_handle_t Ctxt, ur_device_handle_t Device)
- : Module{nullptr}, Binary{},
- BinarySizeInBytes{0}, RefCount{1}, Context{Ctxt}, Device{Device} {
+ : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1},
+ Context{Ctxt}, Device{Device}, KernelReqdWorkGroupSizeMD{} {
urContextRetain(Context);
urDeviceRetain(Device);
}
diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp
index 1cf9431fcb..71979b75b1 100644
--- a/source/adapters/hip/ur_interface_loader.cpp
+++ b/source/adapters/hip/ur_interface_loader.cpp
@@ -377,6 +377,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
pDdiTable->pfnCooperativeKernelLaunchExp =
urEnqueueCooperativeKernelLaunchExp;
+ pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp;
return UR_RESULT_SUCCESS;
}
diff --git a/source/adapters/hip/usm.cpp b/source/adapters/hip/usm.cpp
index 4068c1d865..e871f394f2 100644
--- a/source/adapters/hip/usm.cpp
+++ b/source/adapters/hip/usm.cpp
@@ -27,7 +27,7 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc,
UR_RESULT_ERROR_INVALID_VALUE);
if (!hPool) {
- return USMHostAllocImpl(ppMem, hContext, nullptr, size, alignment);
+ return USMHostAllocImpl(ppMem, hContext, /* flags */ 0, size, alignment);
}
return umfPoolMallocHelper(hPool, ppMem, size, alignment);
@@ -43,7 +43,7 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
UR_RESULT_ERROR_INVALID_VALUE);
if (!hPool) {
- return USMDeviceAllocImpl(ppMem, hContext, hDevice, nullptr, size,
+ return USMDeviceAllocImpl(ppMem, hContext, hDevice, /* flags */ 0, size,
alignment);
}
@@ -60,8 +60,8 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
UR_RESULT_ERROR_INVALID_VALUE);
if (!hPool) {
- return USMSharedAllocImpl(ppMem, hContext, hDevice, nullptr, nullptr, size,
- alignment);
+ return USMSharedAllocImpl(ppMem, hContext, hDevice, /*host flags*/ 0,
+ /*device flags*/ 0, size, alignment);
}
return umfPoolMallocHelper(hPool, ppMem, size, alignment);
@@ -105,7 +105,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext,
ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t,
ur_device_handle_t Device,
- ur_usm_device_mem_flags_t *, size_t Size,
+ ur_usm_device_mem_flags_t, size_t Size,
[[maybe_unused]] uint32_t Alignment) {
try {
ScopedContext Active(Device);
@@ -120,8 +120,8 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t,
ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t,
ur_device_handle_t Device,
- ur_usm_host_mem_flags_t *,
- ur_usm_device_mem_flags_t *, size_t Size,
+ ur_usm_host_mem_flags_t,
+ ur_usm_device_mem_flags_t, size_t Size,
[[maybe_unused]] uint32_t Alignment) {
try {
ScopedContext Active(Device);
@@ -136,7 +136,7 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t,
ur_result_t USMHostAllocImpl(void **ResultPtr,
[[maybe_unused]] ur_context_handle_t Context,
- ur_usm_host_mem_flags_t *, size_t Size,
+ ur_usm_host_mem_flags_t, size_t Size,
[[maybe_unused]] uint32_t Alignment) {
try {
UR_CHECK_ERROR(hipHostMalloc(ResultPtr, Size));
@@ -309,19 +309,19 @@ umf_result_t USMMemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) {
ur_result_t USMSharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size,
uint32_t Alignment) {
- return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size,
- Alignment);
+ return USMSharedAllocImpl(ResultPtr, Context, Device, /*host flags*/ 0,
+ /*device flags*/ 0, Size, Alignment);
}
ur_result_t USMDeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size,
uint32_t Alignment) {
- return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size,
+ return USMDeviceAllocImpl(ResultPtr, Context, Device, /* flags */ 0, Size,
Alignment);
}
ur_result_t USMHostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size,
uint32_t Alignment) {
- return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment);
+ return USMHostAllocImpl(ResultPtr, Context, /* flags */ 0, Size, Alignment);
}
ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context,
diff --git a/source/adapters/hip/usm.hpp b/source/adapters/hip/usm.hpp
index d02145584f..a1c3964263 100644
--- a/source/adapters/hip/usm.hpp
+++ b/source/adapters/hip/usm.hpp
@@ -118,17 +118,17 @@ class USMHostMemoryProvider final : public USMMemoryProvider {
ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context,
ur_device_handle_t Device,
- ur_usm_device_mem_flags_t *Flags, size_t Size,
+ ur_usm_device_mem_flags_t Flags, size_t Size,
uint32_t Alignment);
ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context,
ur_device_handle_t Device,
- ur_usm_host_mem_flags_t *,
- ur_usm_device_mem_flags_t *, size_t Size,
+ ur_usm_host_mem_flags_t,
+ ur_usm_device_mem_flags_t, size_t Size,
uint32_t Alignment);
ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context,
- ur_usm_host_mem_flags_t *Flags, size_t Size,
+ ur_usm_host_mem_flags_t Flags, size_t Size,
uint32_t Alignment);
bool checkUSMAlignment(uint32_t &alignment, const ur_usm_desc_t *pUSMDesc);
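
From the public entry points nothing changes with these flags-by-value signatures; a minimal device allocation still reads as below (sketch only; hypothetical helper `allocDeviceBytes`, valid context/device assumed, return codes unchecked):

#include <cstddef>
#include <ur_api.h>

void *allocDeviceBytes(ur_context_handle_t Context, ur_device_handle_t Device,
                       size_t Size) {
  void *Ptr = nullptr;
  // With no descriptor and no pool, the adapter falls through to
  // USMDeviceAllocImpl(..., /* flags */ 0, ...).
  urUSMDeviceAlloc(Context, Device, /*pUSMDesc=*/nullptr, /*pool=*/nullptr,
                   Size, &Ptr);
  return Ptr;
}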
diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp
index a06163c2a9..67415a0de0 100644
--- a/source/adapters/level_zero/command_buffer.cpp
+++ b/source/adapters/level_zero/command_buffer.cpp
@@ -1048,8 +1048,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
// Create command-list to execute before `CommandListPtr` and will signal
// when `EventWaitList` dependencies are complete.
ur_command_list_ptr_t WaitCommandList{};
- UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList,
- false, false));
+ UR_CALL(Queue->Context->getAvailableCommandList(
+ Queue, WaitCommandList, false, NumEventsInWaitList, EventWaitList,
+ false));
ZE2UR_CALL(zeCommandListAppendBarrier,
(WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent,
@@ -1086,7 +1087,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
// Create a command-list to signal RetEvent on completion
ur_command_list_ptr_t SignalCommandList{};
UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList,
- false, false));
+ false, NumEventsInWaitList,
+ EventWaitList, false));
// Reset the wait-event for the UR command-buffer that is signaled when its
// submission dependencies have been satisfied.
ZE2UR_CALL(zeCommandListAppendEventReset,
diff --git a/source/adapters/level_zero/common.cpp b/source/adapters/level_zero/common.cpp
index 7ae7272355..926d5f4ba4 100644
--- a/source/adapters/level_zero/common.cpp
+++ b/source/adapters/level_zero/common.cpp
@@ -88,7 +88,11 @@ bool setEnvVar(const char *name, const char *value) {
ZeUSMImportExtension ZeUSMImport;
// This will count the calls to Level-Zero
+// TODO: remove the ifdef once
+// https://github.com/oneapi-src/unified-runtime/issues/1454 is implemented
+#ifndef UR_L0_CALL_COUNT_IN_TESTS
std::map<std::string, int> *ZeCallCount = nullptr;
+#endif
inline void zeParseError(ze_result_t ZeError, const char *&ErrorString) {
switch (ZeError) {
diff --git a/source/adapters/level_zero/common.hpp b/source/adapters/level_zero/common.hpp
index 93bf407567..e16d767b71 100644
--- a/source/adapters/level_zero/common.hpp
+++ b/source/adapters/level_zero/common.hpp
@@ -241,6 +241,19 @@ static const uint32_t UrL0QueueSyncNonBlocking = [] {
return L0QueueSyncLockingModeValue;
}();
+// Controls whether the L0 Adapter creates signal events for commands on
+// integrated gpu devices.
+static const uint32_t UrL0OutOfOrderIntegratedSignalEvent = [] {
+ const char *UrL0OutOfOrderIntegratedSignalEventEnv =
+ std::getenv("UR_L0_OOQ_INTEGRATED_SIGNAL_EVENT");
+ uint32_t UrL0OutOfOrderIntegratedSignalEventValue = 1;
+ if (UrL0OutOfOrderIntegratedSignalEventEnv) {
+ UrL0OutOfOrderIntegratedSignalEventValue =
+ std::atoi(UrL0OutOfOrderIntegratedSignalEventEnv);
+ }
+ return UrL0OutOfOrderIntegratedSignalEventValue;
+}();
+
// This class encapsulates actions taken along with a call to Level Zero API.
class ZeCall {
private:
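
The new setting follows the adapter's usual pattern: a constant initialized once from an environment variable, falling back to a default when unset. Generalized into a sketch (hypothetical `envFlag` helper, not adapter code):

#include <cstdint>
#include <cstdlib>

static uint32_t envFlag(const char *Name, uint32_t Default) {
  const char *Val = std::getenv(Name);
  return Val ? static_cast<uint32_t>(std::atoi(Val)) : Default;
}

// Equivalent in effect to the lambda above:
static const uint32_t OutOfOrderIntegratedSignalEvent =
    envFlag("UR_L0_OOQ_INTEGRATED_SIGNAL_EVENT", /*Default=*/1);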
diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp
index 4880c14c4b..85fe582ec3 100644
--- a/source/adapters/level_zero/context.cpp
+++ b/source/adapters/level_zero/context.cpp
@@ -645,7 +645,8 @@ static const size_t CmdListsCleanupThreshold = [] {
// Retrieve an available command list to be used in a PI call.
ur_result_t ur_context_handle_t_::getAvailableCommandList(
ur_queue_handle_t Queue, ur_command_list_ptr_t &CommandList,
- bool UseCopyEngine, bool AllowBatching,
+ bool UseCopyEngine, uint32_t NumEventsInWaitList,
+ const ur_event_handle_t *EventWaitList, bool AllowBatching,
ze_command_queue_handle_t *ForcedCmdQueue) {
// Immediate commandlists have been pre-allocated and are always available.
if (Queue->UsingImmCmdLists) {
@@ -677,9 +678,22 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList(
// for this queue.
if (Queue->hasOpenCommandList(UseCopyEngine)) {
if (AllowBatching) {
- CommandList = CommandBatch.OpenCommandList;
- UR_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList));
- return UR_RESULT_SUCCESS;
+ bool batchingAllowed = true;
+ if (ForcedCmdQueue &&
+ CommandBatch.OpenCommandList->second.ZeQueue != *ForcedCmdQueue) {
+ // Current open batch doesn't match the forced command queue
+ batchingAllowed = false;
+ }
+ if (!UrL0OutOfOrderIntegratedSignalEvent &&
+ Queue->Device->isIntegrated()) {
+ batchingAllowed = eventCanBeBatched(Queue, UseCopyEngine,
+ NumEventsInWaitList, EventWaitList);
+ }
+ if (batchingAllowed) {
+ CommandList = CommandBatch.OpenCommandList;
+ UR_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList));
+ return UR_RESULT_SUCCESS;
+ }
}
// If this command isn't allowed to be batched or doesn't match the forced
// command queue, then we need to go ahead and execute what is already in
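
Condensing the new checks: an open batch may be reused only when any forced command queue matches it, and, on integrated devices with signal events disabled, only when the wait-list itself is batchable. As a boolean sketch (inputs flattened to bools purely for illustration):

bool batchingAllowed(bool ForcedQueueMatchesOpenBatch,
                     bool IntegratedWithSignalEventsDisabled,
                     bool WaitListCanBeBatched) {
  if (!ForcedQueueMatchesOpenBatch)
    return false; // open batch targets a different ZE queue
  if (IntegratedWithSignalEventsDisabled)
    return WaitListCanBeBatched; // defers to eventCanBeBatched()
  return true;
}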
diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp
index ff173aa984..4184411de7 100644
--- a/source/adapters/level_zero/context.hpp
+++ b/source/adapters/level_zero/context.hpp
@@ -292,11 +292,11 @@ struct ur_context_handle_t_ : _ur_object {
// When using immediate commandlists, retrieves an immediate command list
// for executing on this device. Immediate commandlists are created only
// once for each SYCL Queue and after that they are reused.
- ur_result_t
- getAvailableCommandList(ur_queue_handle_t Queue,
- ur_command_list_ptr_t &CommandList,
- bool UseCopyEngine, bool AllowBatching = false,
- ze_command_queue_handle_t *ForcedCmdQueue = nullptr);
+ ur_result_t getAvailableCommandList(
+ ur_queue_handle_t Queue, ur_command_list_ptr_t &CommandList,
+ bool UseCopyEngine, uint32_t NumEventsInWaitList,
+ const ur_event_handle_t *EventWaitList, bool AllowBatching = false,
+ ze_command_queue_handle_t *ForcedCmdQueue = nullptr);
// Checks if Device is covered by this context.
// For that the Device or its root devices need to be in the context.
diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp
index 7f9e9b499b..7f832f30f2 100644
--- a/source/adapters/level_zero/device.cpp
+++ b/source/adapters/level_zero/device.cpp
@@ -1,6 +1,6 @@
//===--------- device.cpp - Level Zero Adapter ----------------------------===//
//
-// Copyright (C) 2023 Intel Corporation
+// Copyright (C) 2023-2024 Intel Corporation
//
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
// Exceptions. See LICENSE.TXT
@@ -188,8 +188,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
}
case UR_DEVICE_INFO_ATOMIC_64:
return ReturnValue(
-        static_cast<bool>(Device->ZeDeviceModuleProperties->flags &
-                          ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS));
+        static_cast<ur_bool_t>(Device->ZeDeviceModuleProperties->flags &
+                               ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS));
case UR_DEVICE_INFO_EXTENSIONS: {
// Convention adopted from OpenCL:
// "Returns a space separated list of extension names (the extension
@@ -258,9 +258,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE:
return ReturnValue(uint32_t{0});
case UR_DEVICE_INFO_COMPILER_AVAILABLE:
-    return ReturnValue(static_cast<bool>(true));
+    return ReturnValue(static_cast<ur_bool_t>(true));
   case UR_DEVICE_INFO_LINKER_AVAILABLE:
-    return ReturnValue(static_cast<bool>(true));
+    return ReturnValue(static_cast<ur_bool_t>(true));
case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: {
uint32_t MaxComputeUnits =
Device->ZeDeviceProperties->numEUsPerSubslice *
@@ -337,8 +337,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
case UR_DEVICE_INFO_DRIVER_VERSION:
case UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION:
return ReturnValue(Device->Platform->ZeDriverVersion.c_str());
- case UR_DEVICE_INFO_VERSION:
- return ReturnValue(Device->Platform->ZeDriverApiVersion.c_str());
+ case UR_DEVICE_INFO_VERSION: {
+ // from compute-runtime/shared/source/helpers/hw_ip_version.h
+ typedef struct {
+ uint32_t revision : 6;
+ uint32_t reserved : 8;
+ uint32_t release : 8;
+ uint32_t architecture : 10;
+ } version_components_t;
+ typedef struct {
+ union {
+ uint32_t value;
+ version_components_t components;
+ };
+ } ipVersion_t;
+ ipVersion_t IpVersion;
+ IpVersion.value = Device->ZeDeviceIpVersionExt->ipVersion;
+ std::stringstream S;
+ S << IpVersion.components.architecture << "."
+ << IpVersion.components.release << "." << IpVersion.components.revision;
+ return ReturnValue(S.str().c_str());
+ }
case UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: {
auto Res = Device->Platform->populateDeviceCacheIfNeeded();
if (Res != UR_RESULT_SUCCESS) {
@@ -410,7 +429,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION:
return ReturnValue("");
case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC:
-    return ReturnValue(static_cast<bool>(true));
+    return ReturnValue(static_cast<ur_bool_t>(true));
case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE:
return ReturnValue(
size_t{Device->ZeDeviceModuleProperties->printfBufferSize});
@@ -427,7 +446,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
return ReturnValue(ur_device_exec_capability_flag_t{
UR_DEVICE_EXEC_CAPABILITY_FLAG_NATIVE_KERNEL});
case UR_DEVICE_INFO_ENDIAN_LITTLE:
-    return ReturnValue(static_cast<bool>(true));
+    return ReturnValue(static_cast<ur_bool_t>(true));
case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT:
     return ReturnValue(static_cast<ur_bool_t>(Device->ZeDeviceProperties->flags &
ZE_DEVICE_PROPERTY_FLAG_ECC));
@@ -604,7 +623,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
}
case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: {
// TODO: Not supported yet. Needs to be updated after support is added.
-    return ReturnValue(static_cast<bool>(false));
+    return ReturnValue(static_cast<ur_bool_t>(false));
}
case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: {
// ze_device_compute_properties.subGroupSizes is in uint32_t whereas the
@@ -790,7 +809,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
return UR_RESULT_ERROR_INVALID_VALUE;
case UR_DEVICE_INFO_BFLOAT16: {
// bfloat16 math functions are not yet supported on Intel GPUs.
- return ReturnValue(bool{false});
+ return ReturnValue(ur_bool_t{false});
}
case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: {
// There are no explicit restrictions in L0 programming guide, so assume all
@@ -839,9 +858,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
return ReturnValue(capabilities);
}
case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT:
- return ReturnValue(uint32_t{false});
+ return ReturnValue(ur_bool_t{false});
case UR_DEVICE_INFO_IMAGE_SRGB:
- return ReturnValue(uint32_t{false});
+ return ReturnValue(ur_bool_t{false});
case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES:
case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: {
@@ -853,6 +872,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
0)); //__read_write attribute currently undefinde in opencl
}
case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT: {
+    return ReturnValue(static_cast<ur_bool_t>(true));
+  }
+  case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: {
     return ReturnValue(static_cast<ur_bool_t>(true));
}
@@ -893,9 +915,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
// can know if we are in (a) or (b) by checking if a tile is root device
// or not.
ur_device_handle_t URDev = Device->Platform->getDeviceFromNativeHandle(d);
- if (URDev->isSubDevice())
+ if (URDev->isSubDevice()) {
// We are in COMPOSITE mode, return an empty list.
- return ReturnValue(0);
+ if (pSize) {
+ *pSize = 0;
+ }
+ return UR_RESULT_SUCCESS;
+ }
Res.push_back(URDev);
}
@@ -1078,7 +1104,7 @@ bool ur_device_handle_t_::useDriverInOrderLists() {
static const bool UseDriverInOrderLists = [] {
const char *UrRet = std::getenv("UR_L0_USE_DRIVER_INORDER_LISTS");
if (!UrRet)
- return true;
+ return false;
return std::atoi(UrRet) != 0;
}();
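
The bit-field introduced under UR_DEVICE_INFO_VERSION packs, from least to most significant bit, revision (6), reserved (8), release (8) and architecture (10), assuming the usual LSB-first bit-field allocation. Decoded by hand in a sketch (hypothetical `printIpVersion`):

#include <cstdint>
#include <cstdio>

void printIpVersion(uint32_t Value) {
  uint32_t Revision = Value & 0x3Fu;              // bits 0..5
  uint32_t Release = (Value >> 14) & 0xFFu;       // bits 14..21
  uint32_t Architecture = (Value >> 22) & 0x3FFu; // bits 22..31
  std::printf("%u.%u.%u\n", Architecture, Release, Revision);
}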
diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp
index 484890670b..3cdfcbce7e 100644
--- a/source/adapters/level_zero/device.hpp
+++ b/source/adapters/level_zero/device.hpp
@@ -176,6 +176,10 @@ struct ur_device_handle_t_ : _ur_object {
(ZeDeviceProperties->deviceId & 0xff0) == 0xb60;
}
+ bool isIntegrated() {
+ return (ZeDeviceProperties->flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED);
+ }
+
// Does this device represent a single compute slice?
bool isCCS() const {
return QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute]
diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp
index cd61a8aa23..d91353349f 100644
--- a/source/adapters/level_zero/event.cpp
+++ b/source/adapters/level_zero/event.cpp
@@ -84,8 +84,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
- UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
- UseCopyEngine));
+ UR_CALL(Queue->Context->getAvailableCommandList(
+ Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList));
ze_event_handle_t ZeEvent = nullptr;
ur_event_handle_t InternalEvent;
@@ -256,7 +256,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
// Get an arbitrary command-list in the queue.
ur_command_list_ptr_t CmdList;
UR_CALL(Queue->Context->getAvailableCommandList(
- Queue, CmdList, false /*UseCopyEngine=*/, OkToBatch));
+ Queue, CmdList, false /*UseCopyEngine=*/, NumEventsInWaitList,
+ EventWaitList, OkToBatch));
// Insert the barrier into the command-list and execute.
UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, *Event, IsInternal));
@@ -311,7 +312,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
if (ZeQueue) {
ur_command_list_ptr_t CmdList;
UR_CALL(Queue->Context->getAvailableCommandList(
- Queue, CmdList, UseCopyEngine, OkToBatch, &ZeQueue));
+ Queue, CmdList, UseCopyEngine, NumEventsInWaitList,
+ EventWaitList, OkToBatch, &ZeQueue));
CmdLists.push_back(CmdList);
}
}
@@ -324,7 +326,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
// Get any available command list.
ur_command_list_ptr_t CmdList;
UR_CALL(Queue->Context->getAvailableCommandList(
- Queue, CmdList, false /*UseCopyEngine=*/, OkToBatch));
+ Queue, CmdList, false /*UseCopyEngine=*/, NumEventsInWaitList,
+ EventWaitList, OkToBatch));
CmdLists.push_back(CmdList);
}
@@ -365,8 +368,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
}
// Execute each command list so the barriers can be encountered.
- for (ur_command_list_ptr_t &CmdList : CmdLists)
+ for (ur_command_list_ptr_t &CmdList : CmdLists) {
+ bool IsCopy =
+        CmdList->second.isCopy(reinterpret_cast<ur_queue_handle_t>(Queue));
+ const auto &CommandBatch =
+ (IsCopy) ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch;
+ // Only batch if the matching CmdList is already open.
+ OkToBatch = CommandBatch.OpenCommandList == CmdList;
+
UR_CALL(Queue->executeCommandList(CmdList, false, OkToBatch));
+ }
UR_CALL(Queue->ActiveBarriers.clear());
  auto UREvent = reinterpret_cast<ur_event_handle_t>(*Event);
@@ -469,8 +480,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(
) {
  std::shared_lock<ur_shared_mutex> EventLock(Event->Mutex);
- if (Event->UrQueue &&
- (Event->UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) == 0) {
+ // The event must either have profiling enabled or be recording timestamps.
+ bool isTimestampedEvent = Event->isTimestamped();
+ if (!Event->isProfilingEnabled() && !isTimestampedEvent) {
return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE;
}
@@ -483,6 +495,61 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(
UrReturnHelper ReturnValue(PropValueSize, PropValue, PropValueSizeRet);
+ // For timestamped events we have the timestamps ready directly on the event
+ // handle, so we short-circuit the return.
+ if (isTimestampedEvent) {
+ uint64_t ContextStartTime = Event->RecordEventStartTimestamp;
+ switch (PropName) {
+ case UR_PROFILING_INFO_COMMAND_QUEUED:
+ case UR_PROFILING_INFO_COMMAND_SUBMIT:
+ return ReturnValue(ContextStartTime);
+ case UR_PROFILING_INFO_COMMAND_END:
+ case UR_PROFILING_INFO_COMMAND_START: {
+ // If RecordEventEndTimestamp on the event is non-zero it means it has
+ // collected the result of the queue already. In that case it has been
+ // adjusted and is ready for immediate return.
+ if (Event->RecordEventEndTimestamp)
+ return ReturnValue(Event->RecordEventEndTimestamp);
+
+ // Otherwise we need to collect it from the queue.
+ auto Entry = Event->UrQueue->EndTimeRecordings.find(Event);
+
+ // Unexpected state if there is no end-time record.
+ if (Entry == Event->UrQueue->EndTimeRecordings.end())
+ return UR_RESULT_ERROR_UNKNOWN;
+ auto &EndTimeRecording = Entry->second;
+
+ // End time needs to be adjusted for resolution and valid bits.
+ uint64_t ContextEndTime =
+ (EndTimeRecording.RecordEventEndTimestamp & TimestampMaxValue) *
+ ZeTimerResolution;
+
+      // If the result is 0, the device has not yet written the end timestamp
+      // back, so we return the zero value as-is.
+ if (ContextEndTime == 0)
+ return ReturnValue(ContextEndTime);
+
+      // Handle a possible wrap-around (the underlying HW counter is narrower
+      // than 64 bits). Note that the result will be wrong if the counter
+      // wrapped more than once; the longer-term plan is to enlarge the
+      // capacity of the HW timestamps.
+ if (ContextEndTime < ContextStartTime)
+ ContextEndTime += TimestampMaxValue * ZeTimerResolution;
+
+ // Now that we have the result, there is no need to keep it in the queue
+ // anymore, so we cache it on the event and evict the record from the
+ // queue.
+ Event->RecordEventEndTimestamp = ContextEndTime;
+ Event->UrQueue->EndTimeRecordings.erase(Entry);
+
+ return ReturnValue(ContextEndTime);
+ }
+ default:
+ logger::error("urEventGetProfilingInfo: not supported ParamName");
+ return UR_RESULT_ERROR_INVALID_VALUE;
+ }
+ }
+
ze_kernel_timestamp_result_t tsResult;
// A Command-buffer consists of three command-lists for which only a single
@@ -591,6 +658,63 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo(
return UR_RESULT_SUCCESS;
}
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp(
+ ur_queue_handle_t Queue, ///< [in] handle of the queue object
+ bool Blocking, ///< [in] blocking or non-blocking enqueue
+ uint32_t NumEventsInWaitList, ///< [in] size of the event wait list
+ const ur_event_handle_t
+ *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)]
+ ///< pointer to a list of events that must be complete
+ ///< before this command can be executed. If nullptr,
+ ///< the numEventsInWaitList must be 0, indicating
+ ///< that this command does not wait on any event to
+ ///< complete.
+ ur_event_handle_t
+ *OutEvent ///< [in,out] return an event object that identifies
+ ///< this particular command instance.
+) {
+ // Lock automatically releases when this goes out of scope.
+  std::scoped_lock<ur_shared_mutex> lock(Queue->Mutex);
+
+ ur_device_handle_t Device = Queue->Device;
+
+ bool UseCopyEngine = false;
+ _ur_ze_event_list_t TmpWaitList;
+ UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
+ NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
+
+ // Get a new command list to be used on this call
+ ur_command_list_ptr_t CommandList{};
+ UR_CALL(Queue->Context->getAvailableCommandList(
+ Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList,
+ /* AllowBatching */ false));
+
+ UR_CALL(createEventAndAssociateQueue(
+ Queue, OutEvent, UR_COMMAND_TIMESTAMP_RECORDING_EXP, CommandList,
+ /* IsInternal */ false, /* HostVisible */ true));
+ ze_event_handle_t ZeEvent = (*OutEvent)->ZeEvent;
+ (*OutEvent)->WaitList = TmpWaitList;
+
+ uint64_t DeviceStartTimestamp = 0;
+ UR_CALL(urDeviceGetGlobalTimestamps(Device, &DeviceStartTimestamp, nullptr));
+ (*OutEvent)->RecordEventStartTimestamp = DeviceStartTimestamp;
+
+ // Create a new entry in the queue's recordings.
+ Queue->EndTimeRecordings[*OutEvent] =
+ ur_queue_handle_t_::end_time_recording{};
+
+ ZE2UR_CALL(zeCommandListAppendWriteGlobalTimestamp,
+ (CommandList->first,
+ &Queue->EndTimeRecordings[*OutEvent].RecordEventEndTimestamp,
+ ZeEvent, (*OutEvent)->WaitList.Length,
+ (*OutEvent)->WaitList.ZeEventList));
+
+ UR_CALL(
+ Queue->executeCommandList(CommandList, Blocking, /* OkToBatch */ false));
+
+ return UR_RESULT_SUCCESS;
+}
+
ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent(
ze_event_handle_t &ZeHostVisibleEvent) {
@@ -598,6 +722,7 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent(
this->Mutex);
if (!HostVisibleEvent) {
+ this->IsCreatingHostProxyEvent = true;
if (UrQueue->ZeEventsScope != OnDemandHostVisibleProxy)
die("getOrCreateHostVisibleEvent: missing host-visible event");
@@ -612,7 +737,7 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent(
ur_command_list_ptr_t CommandList{};
UR_CALL(UrQueue->Context->getAvailableCommandList(
- UrQueue, CommandList, false /* UseCopyEngine */, OkToBatch))
+ UrQueue, CommandList, false /* UseCopyEngine */, 0, nullptr, OkToBatch))
// Create a "proxy" host-visible event.
UR_CALL(createEventAndAssociateQueue(
@@ -620,12 +745,18 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent(
/* IsInternal */ false, /* IsMultiDevice */ false,
/* HostVisible */ true));
- ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
- (CommandList->first, 1, &ZeEvent));
+ if (this->IsInnerBatchedEvent) {
+ ZE2UR_CALL(zeCommandListAppendBarrier,
+ (CommandList->first, ZeEvent, 0, nullptr));
+ } else {
+ ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
+ (CommandList->first, 1, &ZeEvent));
+ }
ZE2UR_CALL(zeCommandListAppendSignalEvent,
(CommandList->first, HostVisibleEvent->ZeEvent));
UR_CALL(UrQueue->executeCommandList(CommandList, false, OkToBatch))
+ this->IsCreatingHostProxyEvent = false;
}
ZeHostVisibleEvent = HostVisibleEvent->ZeEvent;
@@ -682,7 +813,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventWait(
ze_event_handle_t ZeEvent = HostVisibleEvent->ZeEvent;
logger::debug("ZeEvent = {}", ur_cast(ZeEvent));
- ZE2UR_CALL(zeHostSynchronize, (ZeEvent));
+ // If this event was an inner batched event, then sync with
+ // the Queue instead of waiting on the event.
+ if (HostVisibleEvent->IsInnerBatchedEvent && Event->ZeBatchedQueue) {
+ ZE2UR_CALL(zeHostSynchronize, (Event->ZeBatchedQueue));
+ } else {
+ ZE2UR_CALL(zeHostSynchronize, (ZeEvent));
+ }
Event->Completed = true;
}
}
@@ -877,10 +1014,28 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) {
}
// Save pointer to the queue before deleting/resetting event.
+ auto Queue = Event->UrQueue;
+
+ // If the event was a timestamp recording, we try to evict its entry in the
+ // queue.
+ if (Event->isTimestamped()) {
+ auto Entry = Queue->EndTimeRecordings.find(Event);
+ if (Entry != Queue->EndTimeRecordings.end()) {
+ auto &EndTimeRecording = Entry->second;
+ if (EndTimeRecording.RecordEventEndTimestamp == 0) {
+ // If the end time recording has not finished, we tell the queue that
+ // the event is no longer alive to avoid invalid write-backs.
+ EndTimeRecording.EventHasDied = true;
+ } else {
+ // Otherwise we evict the entry.
+ Event->UrQueue->EndTimeRecordings.erase(Entry);
+ }
+ }
+ }
+
// When we add an event to the cache we need to check whether profiling is
// enabled or not, so we access properties of the queue and that's why queue
// must released later.
- auto Queue = Event->UrQueue;
if (DisableEventsCaching || !Event->OwnNativeHandle) {
delete Event;
} else {
@@ -938,7 +1093,12 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked,
  std::list<ur_event_handle_t> EventsToBeReleased;
ur_queue_handle_t AssociatedQueue = nullptr;
{
-    std::scoped_lock<ur_shared_mutex> EventLock(Event->Mutex);
+    // Try to acquire the event lock without blocking. If it is unavailable
+    // because the event is mid host-proxy-event creation, continue without
+    // it; otherwise block until the lock is acquired.
+    std::unique_lock<ur_shared_mutex> EventLock(Event->Mutex,
+                                                std::try_to_lock);
+ if (!EventLock.owns_lock() && !Event->IsCreatingHostProxyEvent) {
+ EventLock.lock();
+ }
if (SetEventCompleted)
Event->Completed = true;
// Exit early of event was already cleanedup.
@@ -1268,16 +1428,26 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
}
auto Queue = EventList[I]->UrQueue;
- if (Queue) {
- // The caller of createAndRetainUrZeEventList must already hold
- // a lock of the CurQueue. Additionally lock the Queue if it
- // is different from CurQueue.
- // TODO: rework this to avoid deadlock when another thread is
- // locking the same queues but in a different order.
- auto Lock = ((Queue == CurQueue)
-                     ? std::unique_lock<ur_shared_mutex>()
-                     : std::unique_lock<ur_shared_mutex>(Queue->Mutex));
+    auto CurQueueDevice = CurQueue->Device;
+    std::optional<std::unique_lock<ur_shared_mutex>> QueueLock =
+ std::nullopt;
+ // The caller of createAndRetainUrZeEventList must already hold
+ // a lock of the CurQueue. However, if the CurQueue is different
+ // then the Event's Queue, we need to drop that lock and
+ // acquire the Event's Queue lock. This is done to avoid a lock
+ // ordering issue.
+ // For the rest of this scope, CurQueue cannot be accessed.
+ // TODO: This solution is very error-prone. This requires a refactor
+ // to either have fine-granularity locks inside of the queues or
+ // to move any operations on queues other than CurQueue out
+ // of this scope.
+ if (Queue && Queue != CurQueue) {
+ CurQueue->Mutex.unlock();
+      QueueLock = std::unique_lock<ur_shared_mutex>(Queue->Mutex);
+ }
+
+ if (Queue) {
// If the event that is going to be waited is in an open batch
// different from where this next command is going to be added,
// then we have to force execute of that open command-list
@@ -1320,17 +1490,17 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
}
ur_command_list_ptr_t CommandList;
- if (Queue && Queue->Device != CurQueue->Device) {
+ if (Queue && Queue->Device != CurQueueDevice) {
// Get a command list prior to acquiring an event lock.
// This prevents a potential deadlock with recursive
// event locks.
- UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList,
- false, true));
+ UR_CALL(Queue->Context->getAvailableCommandList(
+ Queue, CommandList, false, 0, nullptr, true));
}
    std::shared_lock<ur_shared_mutex> Lock(EventList[I]->Mutex);
- if (Queue && Queue->Device != CurQueue->Device &&
+ if (Queue && Queue->Device != CurQueueDevice &&
!EventList[I]->IsMultiDevice) {
ze_event_handle_t MultiDeviceZeEvent = nullptr;
ur_event_handle_t MultiDeviceEvent;
@@ -1365,6 +1535,10 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
this->UrEventList[TmpListLength]->RefCount.increment();
}
+ if (QueueLock.has_value()) {
+ QueueLock.reset();
+ CurQueue->Mutex.lock();
+ }
TmpListLength += 1;
}
}
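
The lock juggling above in isolation: the current queue's mutex is dropped before the other queue's is taken, and restored only after the optional lock is released, so the two are never held together. A sketch with plain std::mutex (hypothetical `withOtherQueueLocked`):

#include <mutex>
#include <optional>

void withOtherQueueLocked(std::mutex &CurQueueMutex, std::mutex &OtherMutex,
                          bool NeedOther) {
  // Precondition, as in the adapter: CurQueueMutex is already held.
  std::optional<std::unique_lock<std::mutex>> OtherLock;
  if (NeedOther) {
    CurQueueMutex.unlock();        // drop first to avoid lock-order inversion
    OtherLock.emplace(OtherMutex); // then take the other queue's mutex
  }
  // ... work that may touch the other queue; CurQueue must not be used ...
  if (OtherLock.has_value()) {
    OtherLock.reset();    // release the other queue first
    CurQueueMutex.lock(); // restore the caller's expectation
  }
}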
@@ -1443,3 +1617,12 @@ bool ur_event_handle_t_::isProfilingEnabled() const {
return !UrQueue || // tentatively assume user events are profiling enabled
(UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0;
}
+
+// Tells if this event was created as a timestamp event, allowing profiling
+// info even if profiling is not enabled.
+bool ur_event_handle_t_::isTimestamped() const {
+ // If we are recording, the start time of the event will be non-zero. The
+ // end time might still be missing, depending on whether the corresponding
+ // enqueue is still running.
+ return RecordEventStartTimestamp != 0;
+}
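
The end-timestamp arithmetic used in urEventGetProfilingInfo, as a pure function: mask to the counter's valid bits, scale by the timer resolution, and compensate for at most one wrap of the narrower-than-64-bit HW counter. A sketch only; the names mirror the adapter's constants and `adjustEndTime` is hypothetical:

#include <cstdint>

uint64_t adjustEndTime(uint64_t RawEnd, uint64_t ContextStartTime,
                       uint64_t TimestampMaxValue, uint64_t TimerResolution) {
  uint64_t ContextEndTime = (RawEnd & TimestampMaxValue) * TimerResolution;
  if (ContextEndTime == 0)
    return 0; // device has not written the end timestamp back yet
  if (ContextEndTime < ContextStartTime)
    ContextEndTime += TimestampMaxValue * TimerResolution; // single wrap
  return ContextEndTime;
}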
diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp
index 9fa4663ab4..2d1f536e4e 100644
--- a/source/adapters/level_zero/event.hpp
+++ b/source/adapters/level_zero/event.hpp
@@ -198,6 +198,23 @@ struct ur_event_handle_t_ : _ur_object {
// performance
bool IsMultiDevice = {false};
+ // Indicates inner batched event which was not used as a signal event.
+ bool IsInnerBatchedEvent = {false};
+
+ // Queue where the batched command was executed.
+ ze_command_queue_handle_t ZeBatchedQueue = {nullptr};
+
+ // Indicates within creation of proxy event.
+ bool IsCreatingHostProxyEvent = {false};
+
+ // Indicates the recorded start and end timestamps for the event. These are
+ // only set for events returned by timestamp recording enqueue functions.
+ // A non-zero value for RecordEventStartTimestamp indicates the event was the
+ // result of a timestamp recording. If RecordEventEndTimestamp is non-zero, it
+ // means the event has fetched the end-timestamp from the queue.
+ uint64_t RecordEventStartTimestamp = 0;
+ uint64_t RecordEventEndTimestamp = 0;
+
// Besides each PI object keeping a total reference count in
// _ur_object::RefCount we keep special track of the event *external*
// references. This way we are able to tell when the event is not referenced
@@ -222,6 +239,10 @@ struct ur_event_handle_t_ : _ur_object {
// Tells if this event is with profiling capabilities.
bool isProfilingEnabled() const;
+ // Tells if this event was created as a timestamp event, allowing profiling
+ // info even if profiling is not enabled.
+ bool isTimestamped() const;
+
// Get the host-visible event or create one and enqueue its signal.
ur_result_t getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent);
diff --git a/source/adapters/level_zero/image.cpp b/source/adapters/level_zero/image.cpp
index 3b767f9127..b953e69f06 100644
--- a/source/adapters/level_zero/image.cpp
+++ b/source/adapters/level_zero/image.cpp
@@ -656,13 +656,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageFreeExp(
UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_image_mem_handle_t hImageMem, const ur_image_format_t *pImageFormat,
- const ur_image_desc_t *pImageDesc, ur_mem_handle_t *phMem,
- ur_exp_image_handle_t *phImage) {
+ const ur_image_desc_t *pImageDesc, ur_exp_image_handle_t *phImage) {
  std::shared_lock<ur_shared_mutex> Lock(hContext->Mutex);
UR_ASSERT(hContext && hDevice && hImageMem,
UR_RESULT_ERROR_INVALID_NULL_HANDLE);
- UR_ASSERT(pImageFormat && pImageDesc && phMem && phImage,
+ UR_ASSERT(pImageFormat && pImageDesc && phImage,
UR_RESULT_ERROR_INVALID_NULL_POINTER);
  ZeStruct<ze_image_desc_t> ZeImageDesc;
@@ -687,11 +686,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
UrImage->ZeImage, &ZeImage));
ZE2UR_CALL(zeContextMakeImageResident,
(hContext->ZeContext, hDevice->ZeDevice, ZeImage));
- UR_CALL(createUrMemFromZeImage(hContext, ZeImage, /*OwnZeMemHandle*/ true,
- ZeImageDesc, phMem));
} else {
ZeImage = UrImage->ZeImage;
- *phMem = nullptr;
}
} else if (MemAllocProperties.type == ZE_MEMORY_TYPE_DEVICE) {
ze_image_pitched_exp_desc_t PitchedDesc;
@@ -710,8 +706,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp(
&ZeImageDesc, &ZeImage));
ZE2UR_CALL(zeContextMakeImageResident,
(hContext->ZeContext, hDevice->ZeDevice, ZeImage));
- UR_CALL(createUrMemFromZeImage(hContext, ZeImage, /*OwnZeMemHandle*/ true,
- ZeImageDesc, phMem));
} else {
return UR_RESULT_ERROR_INVALID_VALUE;
}
@@ -741,10 +735,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
ur_exp_image_mem_handle_t hImageMem, const ur_image_format_t *pImageFormat,
const ur_image_desc_t *pImageDesc, ur_sampler_handle_t hSampler,
- ur_mem_handle_t *phMem, ur_exp_image_handle_t *phImage) {
+ ur_exp_image_handle_t *phImage) {
UR_CALL(urBindlessImagesUnsampledImageCreateExp(
- hContext, hDevice, hImageMem, pImageFormat, pImageDesc, phMem, phImage));
+ hContext, hDevice, hImageMem, pImageFormat, pImageDesc, phImage));
struct combined_sampled_image_handle {
uint64_t RawImageHandle;
@@ -790,8 +784,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
- UR_CALL(hQueue->Context->getAvailableCommandList(hQueue, CommandList,
- UseCopyEngine, OkToBatch));
+ UR_CALL(hQueue->Context->getAvailableCommandList(
+ hQueue, CommandList, UseCopyEngine, numEventsInWaitList, phEventWaitList,
+ OkToBatch));
ze_event_handle_t ZeEvent = nullptr;
ur_event_handle_t InternalEvent;
@@ -800,7 +795,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
UR_CALL(createEventAndAssociateQueue(hQueue, Event, UR_COMMAND_MEM_IMAGE_COPY,
CommandList, IsInternal,
/*IsMultiDevice*/ false));
- ZeEvent = (*Event)->ZeEvent;
+ UR_CALL(setSignalEvent(hQueue, UseCopyEngine, &ZeEvent, Event,
+ numEventsInWaitList, phEventWaitList,
+ CommandList->second.ZeQueue));
(*Event)->WaitList = TmpWaitList;
const auto &ZeCommandList = CommandList->first;
diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp
index d96351dd5d..40d6260ac9 100644
--- a/source/adapters/level_zero/kernel.cpp
+++ b/source/adapters/level_zero/kernel.cpp
@@ -201,7 +201,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
// Get a new command list to be used on this call
ur_command_list_ptr_t CommandList{};
UR_CALL(Queue->Context->getAvailableCommandList(
- Queue, CommandList, UseCopyEngine, true /* AllowBatching */));
+ Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList,
+ true /* AllowBatching */));
ze_event_handle_t ZeEvent = nullptr;
ur_event_handle_t InternalEvent{};
@@ -210,7 +211,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH,
CommandList, IsInternal, false));
- ZeEvent = (*Event)->ZeEvent;
+ UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event,
+ NumEventsInWaitList, EventWaitList,
+ CommandList->second.ZeQueue));
(*Event)->WaitList = TmpWaitList;
// Save the kernel in the event, so that when the event is signalled
@@ -268,13 +271,264 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
}
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
- ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
- const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
- const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
- const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
- return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
- pGlobalWorkSize, pLocalWorkSize,
- numEventsInWaitList, phEventWaitList, phEvent);
+ ur_queue_handle_t Queue, ///< [in] handle of the queue object
+ ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object
+ uint32_t WorkDim, ///< [in] number of dimensions, from 1 to 3, to specify
+ ///< the global and work-group work-items
+ const size_t
+ *GlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned
+ ///< values that specify the offset used to
+ ///< calculate the global ID of a work-item
+ const size_t *GlobalWorkSize, ///< [in] pointer to an array of workDim
+ ///< unsigned values that specify the number
+ ///< of global work-items in workDim that
+ ///< will execute the kernel function
+ const size_t
+ *LocalWorkSize, ///< [in][optional] pointer to an array of workDim
+ ///< unsigned values that specify the number of local
+ ///< work-items forming a work-group that will execute
+ ///< the kernel function. If nullptr, the runtime
+ ///< implementation will choose the work-group size.
+ uint32_t NumEventsInWaitList, ///< [in] size of the event wait list
+ const ur_event_handle_t
+ *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)]
+ ///< pointer to a list of events that must be complete
+ ///< before the kernel execution. If nullptr, the
+ ///< numEventsInWaitList must be 0, indicating that no
+ ///< wait event.
+ ur_event_handle_t
+ *OutEvent ///< [in,out][optional] return an event object that identifies
+ ///< this particular kernel execution instance.
+) {
+ auto ZeDevice = Queue->Device->ZeDevice;
+
+ ze_kernel_handle_t ZeKernel{};
+ if (Kernel->ZeKernelMap.empty()) {
+ ZeKernel = Kernel->ZeKernel;
+ } else {
+ auto It = Kernel->ZeKernelMap.find(ZeDevice);
+ if (It == Kernel->ZeKernelMap.end()) {
+ /* kernel and queue don't match */
+ return UR_RESULT_ERROR_INVALID_QUEUE;
+ }
+ ZeKernel = It->second;
+ }
+ // Lock automatically releases when this goes out of scope.
+  std::scoped_lock<ur_shared_mutex, ur_shared_mutex, ur_shared_mutex> Lock(
+ Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex);
+ if (GlobalWorkOffset != NULL) {
+ if (!Queue->Device->Platform->ZeDriverGlobalOffsetExtensionFound) {
+ logger::error("No global offset extension found on this driver");
+ return UR_RESULT_ERROR_INVALID_VALUE;
+ }
+
+ ZE2UR_CALL(zeKernelSetGlobalOffsetExp,
+ (ZeKernel, GlobalWorkOffset[0], GlobalWorkOffset[1],
+ GlobalWorkOffset[2]));
+ }
+
+ // If there are any pending arguments set them now.
+ for (auto &Arg : Kernel->PendingArguments) {
+ // The ArgValue may be a NULL pointer in which case a NULL value is used for
+ // the kernel argument declared as a pointer to global or constant memory.
+ char **ZeHandlePtr = nullptr;
+ if (Arg.Value) {
+ UR_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode,
+ Queue->Device));
+ }
+ ZE2UR_CALL(zeKernelSetArgumentValue,
+ (ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr));
+ }
+ Kernel->PendingArguments.clear();
+
+ ze_group_count_t ZeThreadGroupDimensions{1, 1, 1};
+ uint32_t WG[3]{};
+
+ // New variable needed because GlobalWorkSize parameter might not be of size 3
+ size_t GlobalWorkSize3D[3]{1, 1, 1};
+ std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D);
+
+ if (LocalWorkSize) {
+ // L0
+    UR_ASSERT(LocalWorkSize[0] < (std::numeric_limits<uint32_t>::max)(),
+              UR_RESULT_ERROR_INVALID_VALUE);
+    UR_ASSERT(LocalWorkSize[1] < (std::numeric_limits<uint32_t>::max)(),