diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000..176a458f94 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +* text=auto diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d53176133b..56f72c49e9 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,16 +2,21 @@ # Level Zero adapter source/adapters/level_zero @oneapi-src/unified-runtime-level-zero-write +test/adapters/level_zero @oneapi-src/unified-runtime-level-zero-write # CUDA and HIP adapters source/adapters/cuda @oneapi-src/unified-runtime-cuda-write +test/adapters/cuda @oneapi-src/unified-runtime-cuda-write source/adapters/hip @oneapi-src/unified-runtime-hip-write +test/adapters/hip @oneapi-src/unified-runtime-hip-write # OpenCL adapter source/adapters/opencl @oneapi-src/unified-runtime-opencl-write +test/adapters/opencl @oneapi-src/unified-runtime-opencl-write # Native CPU adapter source/adapters/native_cpu @oneapi-src/unified-runtime-native-cpu-write +test/adapters/native_cpu @oneapi-src/unified-runtime-native-cpu-write # Command-buffer experimental feature source/adapters/**/command_buffer.* @oneapi-src/unified-runtime-command-buffer-write @@ -20,6 +25,7 @@ scripts/core/exp-command-buffer.yml @oneapi-src/unified-runtime-command-buff test/conformance/exp_command_buffer** @oneapi-src/unified-runtime-command-buffer-write # Bindless Images experimental feature -scripts/core/EXP-BINDLESS-IMAGES.rst @oneapi-src/unified-runtime-bindless-images-write -scripts/core/exp-bindless-images.yml @oneapi-src/unified-runtime-bindless-images-write -source/adapters/**/image.* @oneapi-src/unified-runtime-bindless-images-write +source/adapters/**/image.* @oneapi-src/unified-runtime-bindless-images-write +scripts/core/EXP-BINDLESS-IMAGES.rst @oneapi-src/unified-runtime-bindless-images-write +scripts/core/exp-bindless-images.yml @oneapi-src/unified-runtime-bindless-images-write +test/conformance/exp_bindless_images** @oneapi-src/unified-runtime-bindless-images-write diff --git 
a/.github/docker/ubuntu-22.04.Dockerfile b/.github/docker/ubuntu-22.04.Dockerfile index e8e88caa72..09eaab03a8 100644 --- a/.github/docker/ubuntu-22.04.Dockerfile +++ b/.github/docker/ubuntu-22.04.Dockerfile @@ -67,7 +67,8 @@ RUN /opt/install_dpcpp.sh COPY install_libbacktrace.sh /opt/install_libbacktrace.sh RUN /opt/install_libbacktrace.sh -# Add a new (non-root) 'user' -ENV USER user +# Add a new (non-root) 'test_user' and switch to it +ENV USER test_user ENV USERPASS pass RUN useradd -m "${USER}" -g sudo -p "$(mkpasswd ${USERPASS})" +USER test_user diff --git a/.github/scripts/compute_benchmarks.py b/.github/scripts/compute_benchmarks.py new file mode 100755 index 0000000000..01f1b90c40 --- /dev/null +++ b/.github/scripts/compute_benchmarks.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import os +import subprocess # nosec B404 +import csv +import argparse +import io +import json +from pathlib import Path + +# Function to run the benchmark with the given parameters and environment variables +def run_benchmark(directory, ioq, env_vars): + env = os.environ.copy() + env.update(env_vars) + command = [ + f"{directory}/api_overhead_benchmark_sycl", + "--test=SubmitKernel", + f"--Ioq={ioq}", + "--DiscardEvents=0", + "--MeasureCompletion=0", + "--iterations=10000", + "--Profiling=0", + "--NumKernels=10", + "--KernelExecTime=1", + "--csv", + "--noHeaders" + ] + result = subprocess.run(command, capture_output=True, text=True, env=env) # nosec B603 + return command, result.stdout + +# Function to parse the CSV output and extract the mean execution time +def parse_output(output): + # Use StringIO to turn the string output into a file-like object for the csv reader + csv_file = io.StringIO(output) + reader = csv.reader(csv_file) + + # Skip the header row + 
next(reader, None) + data_row = next(reader, None) + if data_row is None: + raise ValueError("Benchmark output does not contain data.") + try: + name = data_row[0] # Name of the benchmark is the first value + mean = float(data_row[1]) # Mean is the second value + return (name, mean) + except ValueError: + raise ValueError(f"Could not convert mean execution time to float: '{data_row[1]}'") + except IndexError: + raise ValueError("Data row does not contain enough values.") + +# Function to generate the mermaid bar chart script +def generate_mermaid_script(labels, chart_data): + mermaid_script=f""" +--- +config: + gantt: + rightPadding: 10 + leftPadding: 120 + sectionFontSize: 10 + numberSectionStyles: 2 +--- +gantt + title api_overhead_benchmark_sycl, mean execution time per 10 kernels (μs) + todayMarker off + dateFormat X + axisFormat %s +""" + for label in labels: + nbars = 0 + print_label = label.replace(" ", "
") + mermaid_script += f""" + section {print_label} +""" + for (name, data) in chart_data: + if data is not None: + if label in data: + nbars += 1 + mean = data[label] + crit = "crit," if name == "This PR" else "" + mermaid_script += f""" + {name} ({mean} us) : {crit} 0, {int(mean)} +""" + padding = 4 - nbars + if padding > 0: + for _ in range(padding): + mermaid_script += f""" + - : 0, 0 +""" + + return mermaid_script + +# Function to generate the markdown collapsible sections for each variant +def generate_markdown_details(variant_details): + markdown_sections = [] + for label, command, env_vars, output in variant_details: + env_vars_str = '\n'.join(f"{key}={value}" for key, value in env_vars.items()) + markdown_sections.append(f""" +
+{label} + +#### Environment Variables: +{env_vars_str} + +#### Command: +{' '.join(command)} + +#### Output: +{output} + +
+""") + return "\n".join(markdown_sections) + +# Function to generate the full markdown +def generate_markdown_with_mermaid_chart(mermaid_script, variant_details): + return f""" +# Benchmark Results +```mermaid +{mermaid_script} +``` +## Details +{generate_markdown_details(variant_details)} +""" + +def save_benchmark_results(save_name, benchmark_data): + benchmarks_dir = Path.home() / 'benchmarks' + benchmarks_dir.mkdir(exist_ok=True) + file_path = benchmarks_dir / f"{save_name}.json" + with file_path.open('w') as file: + json.dump(benchmark_data, file, indent=4) + print(f"Benchmark results saved to {file_path}") + +def load_benchmark_results(compare_name): + benchmarks_dir = Path.home() / 'benchmarks' + file_path = benchmarks_dir / f"{compare_name}.json" + if file_path.exists(): + with file_path.open('r') as file: + return json.load(file) + else: + return None + +def main(directory, additional_env_vars, save_name, compare_names): + variants = [ + (1, {'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '0'}, "Imm-CmdLists-OFF"), + (0, {'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '0'}, "Imm-CmdLists-OFF"), + (1, {'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '1'}, ""), + (0, {'UR_L0_USE_IMMEDIATE_COMMANDLISTS': '1'}, ""), + ] + + # Run benchmarks and collect means, labels, and variant details + means = [] + labels = [] + variant_details = [] + for ioq, env_vars, extra_label in variants: + merged_env_vars = {**env_vars, **additional_env_vars} + command, output = run_benchmark(directory, ioq, merged_env_vars) + (label, mean) = parse_output(output) + label += f" {extra_label}" + means.append(mean) + labels.append(label) + variant_details.append((label, command, merged_env_vars, output)) + + benchmark_data = {label: mean for label, mean in zip(labels, means)} + + chart_data = [("This PR", benchmark_data)] + for name in compare_names: + chart_data.append((name, load_benchmark_results(name))) + + if save_name: + save_benchmark_results(save_name, benchmark_data) + + mermaid_script = 
generate_mermaid_script(labels, chart_data) + + markdown_content = generate_markdown_with_mermaid_chart(mermaid_script, variant_details) + + with open('benchmark_results.md', 'w') as file: + file.write(markdown_content) + + print("Markdown with benchmark results has been written to benchmark_results.md") + +def validate_and_parse_env_args(env_args): + env_vars = {} + for arg in env_args: + if '=' not in arg: + raise ValueError(f"Environment variable argument '{arg}' is not in the form Variable=Value.") + key, value = arg.split('=', 1) + env_vars[key] = value + return env_vars + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Run benchmarks and generate a Mermaid bar chart script.') + parser.add_argument('benchmark_directory', type=str, help='The directory where the benchmarks are located.') + parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[]) + parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.') + parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=["baseline"]) + + args = parser.parse_args() + + additional_env_vars = validate_and_parse_env_args(args.env) + + main(args.benchmark_directory, additional_env_vars, args.save, args.compare) diff --git a/.github/scripts/get_system_info.sh b/.github/scripts/get_system_info.sh index 6ca38a33ef..8301c99099 100755 --- a/.github/scripts/get_system_info.sh +++ b/.github/scripts/get_system_info.sh @@ -53,7 +53,7 @@ function system_info { echo "**********/proc/meminfo**********" cat /proc/meminfo echo "**********build/bin/urinfo**********" - $(dirname "$(readlink -f "$0")")/../../build/bin/urinfo || true + $(dirname "$(readlink -f "$0")")/../../build/bin/urinfo --no-linear-ids --verbose || true echo "******OpenCL*******" # The driver version of OpenCL Graphics is the compute-runtime version clinfo || echo "OpenCL not 
installed" diff --git a/.github/workflows/benchmarks_core.yml b/.github/workflows/benchmarks_core.yml new file mode 100644 index 0000000000..34ffb7134e --- /dev/null +++ b/.github/workflows/benchmarks_core.yml @@ -0,0 +1,199 @@ +name: Compute Benchmarks + +on: + # this workflow can by only triggered by other workflows + # for example by: e2e_cuda.yml or e2e_opencl.yml + workflow_call: + # acceptable input from adapter-specific workflows + inputs: + name: + description: Adapter name + type: string + required: true + str_name: + description: Formatted adapter name + type: string + required: true + config: + description: Params for sycl configuration + type: string + required: true + unit: + description: Test unit (cpu/gpu) + type: string + required: true + runner_tag: + description: Tag defined for the runner + type: string + required: true + trigger: + description: Type of workflow trigger + type: string + required: true + comment: + description: Text if triggered by a comment + type: string + required: false + +permissions: + contents: read + pull-requests: write + +jobs: + e2e-build-hw: + if: github.repository == 'oneapi-src/unified-runtime' # run only on upstream; forks will not have the HW + name: Build SYCL, UR, run Compute Benchmarks + strategy: + matrix: + adapter: [ + {name: "${{inputs.name}}", + str_name: "${{inputs.str_name}}", + config: "${{inputs.config}}", + unit: "${{inputs.unit}}"} + ] + build_type: [Release] + compiler: [{c: clang, cxx: clang++}] + + runs-on: ${{inputs.runner_tag}} + + steps: + # Workspace on self-hosted runners is not cleaned automatically. + # We have to delete the files created outside of using actions. 
+ - name: Cleanup self-hosted workspace + if: always() + run: | + ls -la ./ + rm -rf ./* || true + + - name: Add comment to PR + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + if: ${{ always() && inputs.trigger != 'schedule' }} + with: + script: | + const adapter = '${{ matrix.adapter.name }}'; + const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}'; + const body = `Compute Benchmarks ${adapter} run: \n${url}`; + + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: body + }) + + - name: Checkout UR + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + path: ur-repo + + # On issue_comment trigger (for PRs) we need to fetch special ref for + # proper PR's merge commit. Note, this ref may be absent if the PR is already merged. + - name: Fetch PR's merge commit + if: ${{ inputs.trigger != 'schedule' }} + working-directory: ${{github.workspace}}/ur-repo + env: + PR_NO: ${{github.event.issue.number}} + run: | + git fetch -- https://github.com/${{github.repository}} +refs/pull/${PR_NO}/*:refs/remotes/origin/pr/${PR_NO}/* + git checkout origin/pr/${PR_NO}/merge + git rev-parse origin/pr/${PR_NO}/merge + + - name: Checkout SYCL + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + repository: intel/llvm + ref: refs/heads/sycl + path: sycl-repo + fetch-depth: 1 + fetch-tags: false + + - name: Set CUDA env vars + if: matrix.adapter.name == 'CUDA' + run: | + echo "CUDA_LIB_PATH=/usr/local/cuda/lib64/stubs" >> $GITHUB_ENV + echo "LD_LIBRARY_PATH=/usr/local/cuda/compat/:/usr/local/cuda/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV + + - name: Configure SYCL + run: > + python3 sycl-repo/buildbot/configure.py + -t ${{matrix.build_type}} + -o ${{github.workspace}}/sycl_build + --cmake-gen "Ninja" + --ci-defaults ${{matrix.adapter.config}} + 
--cmake-opt="-DLLVM_INSTALL_UTILS=ON" + --cmake-opt="-DSYCL_PI_TESTS=OFF" + --cmake-opt="-DSYCL_PI_UR_USE_FETCH_CONTENT=OFF" + --cmake-opt="-DSYCL_PI_UR_SOURCE_DIR=${{github.workspace}}/ur-repo/" + --cmake-opt=-DCMAKE_C_COMPILER_LAUNCHER=ccache + --cmake-opt=-DCMAKE_CXX_COMPILER_LAUNCHER=ccache + + - name: Build SYCL + run: cmake --build ${{github.workspace}}/sycl_build -j + + - name: Set additional env. vars + run: | + echo "${{github.workspace}}/sycl_build/bin" >> $GITHUB_PATH + echo "LD_LIBRARY_PATH=${{github.workspace}}/sycl_build/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV + + # Running (newly built) sycl-ls sets up some extra variables + - name: Setup SYCL variables + run: | + which clang++ sycl-ls + SYCL_PI_TRACE=-1 sycl-ls + + - name: Checkout Compute Benchmarks + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + repository: intel/compute-benchmarks + path: compute-benchmarks-repo + submodules: recursive + + - name: Configure Compute Benchmarks + run: > + cmake + -B ${{github.workspace}}/compute-benchmarks-build/ + -S ${{github.workspace}}/compute-benchmarks-repo/ + -DCMAKE_BUILD_TYPE=Release + -DBUILD_SYCL=ON + -DSYCL_COMPILER_ROOT=${{github.workspace}}/sycl_build + -DALLOW_WARNINGS=ON + + - name: Build Compute Benchmarks + run: cmake --build ${{github.workspace}}/compute-benchmarks-build/ -j + + - name: Set oneAPI Device Selector + run: | + echo "ONEAPI_DEVICE_SELECTOR=${{ matrix.adapter.str_name }}:${{ matrix.adapter.unit }}" >> $GITHUB_ENV + + - name: Extract arguments from comment + id: args + run: echo "ARGS=$(echo '${{ inputs.comment }}' | sed -n 's/.*\/benchmarks-[^ ]* \(.*\)/\1/p')" >> $GITHUB_ENV + + - name: Run SYCL API Overhead benchmark + id: benchmarks + run: ${{github.workspace}}/ur-repo/.github/scripts/compute_benchmarks.py ${{github.workspace}}/compute-benchmarks-build/bin/ $ARGS + + - name: Add comment to PR + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + if: ${{ always() && 
inputs.trigger != 'schedule' }} + with: + script: | + let markdown = "" + try { + const fs = require('fs'); + markdown = fs.readFileSync('benchmark_results.md', 'utf8'); + } catch(err) { + } + + const adapter = '${{ matrix.adapter.name }}'; + const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}'; + const test_status = '${{ steps.benchmarks.outcome }}'; + const job_status = '${{ job.status }}'; + const body = `Compute Benchmarks ${adapter} run:\n${url}\nJob status: ${job_status}. Test status: ${test_status}.\n ${markdown}`; + + github.rest.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: body + }) diff --git a/.github/workflows/benchmarks_level_zero.yml b/.github/workflows/benchmarks_level_zero.yml new file mode 100644 index 0000000000..6b38aaeedd --- /dev/null +++ b/.github/workflows/benchmarks_level_zero.yml @@ -0,0 +1,26 @@ +name: Compute Benchmarks Level Zero + +on: + issue_comment: + types: [created, edited] + +permissions: + contents: read + pull-requests: write + +jobs: + e2e-build-hw: + # trigger only if PR comment contains "benchmarks-level-zero" + if: ${{ (github.event.issue.pull_request && contains(github.event.comment.body, '/benchmarks-level-zero'))}} + name: Start benchmarks job + # use core flow, run it with L0 specific parameters + uses: ./.github/workflows/benchmarks_core.yml + # parameters that we pass to the core flow + with: + name: "L0" + runner_tag: "L0_PERF" + str_name: "level_zero" + config: "" + unit: "gpu" + trigger: "${{github.event_name}}" + comment: ${{github.event.comment.body}} diff --git a/.github/workflows/build-hw-reusable.yml b/.github/workflows/build-hw-reusable.yml index a7945e2ac2..8c8f28a868 100644 --- a/.github/workflows/build-hw-reusable.yml +++ b/.github/workflows/build-hw-reusable.yml @@ -15,6 +15,13 @@ on: permissions: contents: read +env: + UR_LOG_CUDA: "level:error;flush:error" + UR_LOG_HIP: 
"level:error;flush:error" + UR_LOG_LEVEL_ZERO: "level:error;flush:error" + UR_LOG_NATIVE_CPU: "level:error;flush:error" + UR_LOG_OPENCL: "level:error;flush:error" + jobs: adapter-build-hw: name: Build & Test HW diff --git a/.github/workflows/e2e_core.yml b/.github/workflows/e2e_core.yml index 8554c5c476..7f2050d83d 100644 --- a/.github/workflows/e2e_core.yml +++ b/.github/workflows/e2e_core.yml @@ -34,6 +34,22 @@ on: description: Type of workflow trigger type: string required: true + xfail: + description: Allow test failures + type: string + required: false + xfail_not: + description: Not xfail + type: string + required: false + filter_out: + description: Tests to filter out completely + type: string + required: false + extra_lit_flags: + description: Additional llvm-lit flags to use + type: string + required: false permissions: contents: read @@ -50,7 +66,8 @@ jobs: str_name: "${{inputs.str_name}}", prefix: "${{inputs.prefix}}", config: "${{inputs.config}}", - unit: "${{inputs.unit}}"} + unit: "${{inputs.unit}}", + extra_lit_flags: "${{inputs.extra_lit_flags}}"}, ] build_type: [Release] compiler: [{c: clang, cxx: clang++}] @@ -103,7 +120,7 @@ jobs: uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: repository: intel/llvm - ref: sycl + ref: refs/heads/sycl path: sycl-repo - name: Set CUDA env vars @@ -112,11 +129,6 @@ jobs: echo "CUDA_LIB_PATH=/usr/local/cuda/lib64/stubs" >> $GITHUB_ENV echo "LD_LIBRARY_PATH=/usr/local/cuda/compat/:/usr/local/cuda/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV - - name: Run pre setup - run: | - source /opt/intel/oneapi/setvars.sh --force - sycl-ls - - name: Configure SYCL run: > python3 sycl-repo/buildbot/configure.py @@ -134,11 +146,15 @@ jobs: - name: Build SYCL run: cmake --build ${{github.workspace}}/sycl_build -j + - name: Set extra llvm-lit options + if: matrix.adapter.extra_lit_flags != '' + run: echo "LIT_OPTS=${{matrix.adapter.extra_lit_flags}}" >> $GITHUB_ENV + - name: Run check-sycl # Remove after 
fixing SYCL test :: abi/layout_handler.cpp # This issue does not affect further execution of e2e with UR. continue-on-error: true - run: cmake --build ${{github.workspace}}/sycl_build --target check-sycl -j6 + run: cmake --build ${{github.workspace}}/sycl_build --target check-sycl - name: Set additional env. vars run: | @@ -161,6 +177,13 @@ jobs: -DCMAKE_CXX_COMPILER="$(which clang++)" -DLLVM_LIT="${{github.workspace}}/sycl-repo/llvm/utils/lit/lit.py" + - name: Set test filters for L0 + if: matrix.adapter.name == 'L0' + run: | + echo "LIT_XFAIL_NOT=${{inputs.xfail_not}}" >> $GITHUB_ENV + echo "LIT_XFAIL=${{inputs.xfail}}" >> $GITHUB_ENV + echo "LIT_FILTER_OUT=${{inputs.filter_out}}" >> $GITHUB_ENV + - name: Run e2e tests id: tests run: ninja -C build-e2e check-sycl-e2e diff --git a/.github/workflows/e2e_level_zero.yml b/.github/workflows/e2e_level_zero.yml new file mode 100644 index 0000000000..66c23715ea --- /dev/null +++ b/.github/workflows/e2e_level_zero.yml @@ -0,0 +1,36 @@ +name: E2E Level Zero + +on: + schedule: + # Run every day at 23:00 UTC + - cron: '0 23 * * *' + issue_comment: + types: [created, edited] + +permissions: + contents: read + pull-requests: write + +jobs: + e2e-build-hw: + # trigger only if PR comment contains "e2e-level-zero" + if: ${{ (github.event.issue.pull_request && contains(github.event.comment.body, '/e2e-level-zero')) || (github.event_name == 'schedule') }} + name: Start e2e job + # use core flow, run it with L0 specific parameters + uses: ./.github/workflows/e2e_core.yml + # parameters that we pass to the core flow + with: + name: "L0" + runner_tag: "L0_E2E" + str_name: "level_zero" + prefix: "ext_oneapi_" + config: "" + unit: "gpu" + trigger: "${{github.event_name}}" + # Failing tests + xfail: 
"ESIMD/preemption.cpp;syclcompat/atomic/atomic_class.cpp;ProgramManager/uneven_kernel_split.cpp;Plugin/level_zero_ext_intel_queue_index.cpp;Plugin/level_zero_ext_intel_cslice.cpp;Matrix/joint_matrix_rowmajorA_rowmajorB.cpp;Matrix/element_wise_ops.cpp;Matrix/element_wise_all_ops.cpp;Matrix/SG32/element_wise_all_ops.cpp" + # Flaky tests + filter_out: "GroupAlgorithm/root_group.cpp|Basic/exceptions-SYCL-2020.cpp|Graph/UnsupportedDevice/device_query.cpp|Graph/RecordReplay/exception_inconsistent_contexts.cpp" + # These runners by default spawn upwards of 260 workers. That's too much for the GPU. + # We also add a time out just in case some test hangs + extra_lit_flags: "-sv -j 50 --max-time 600" diff --git a/.github/workflows/hadolint.yml b/.github/workflows/hadolint.yml deleted file mode 100644 index a1063aefe8..0000000000 --- a/.github/workflows/hadolint.yml +++ /dev/null @@ -1,34 +0,0 @@ -# Runs linter for Docker files -name: Hadolint - -on: - workflow_dispatch: - push: - pull_request: - paths: - - '.github/docker/*Dockerfile' - - '.github/workflows/hadolint.yml' - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - linux: - name: Hadolint - runs-on: ubuntu-latest - - steps: - - name: Clone the git repo - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - name: Run Hadolint - uses: hadolint/hadolint-action@54c9adbab1582c2ef04b2016b760714a4bfde3cf # v3.1.0 - with: - recursive: true - dockerfile: ".github/docker/*Dockerfile" - # ignore pinning apt packages to versions - ignore: DL3008 diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 97362d4f1a..d0cb335d96 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -10,6 +10,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true +permissions: + contents: read + jobs: labeler: 
permissions: @@ -17,4 +20,4 @@ jobs: pull-requests: write runs-on: ubuntu-latest steps: - - uses: actions/labeler@v5 + - uses: actions/labeler@8558fd74291d67161a8a78ce36a881fa63b766a9 # v5.0.0 diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml new file mode 100644 index 0000000000..2efb04c86a --- /dev/null +++ b/.github/workflows/trivy.yml @@ -0,0 +1,50 @@ +# Runs linter for Docker files +name: Trivy + +on: + workflow_dispatch: + push: + pull_request: + paths: + - '.github/docker/*Dockerfile' + - '.github/workflows/trivy.yml' + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read + +jobs: + linux: + name: Trivy + runs-on: ubuntu-latest + permissions: + security-events: write + + steps: + - name: Clone repo + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + + - name: Run Trivy + uses: aquasecurity/trivy-action@84384bd6e777ef152729993b8145ea352e9dd3ef # v0.17.0 + with: + scan-type: 'config' + hide-progress: false + format: 'sarif' + output: 'trivy-results.sarif' + exit-code: 1 # Fail if issue found + # file with suppressions: .trivyignore (in root dir) + + - name: Print report and trivyignore file + run: | + echo "### Trivy ignore content:" + cat .trivyignore + echo "### Trivy report:" + cat trivy-results.sarif + + - name: Upload results + uses: github/codeql-action/upload-sarif@e8893c57a1f3a2b659b6b55564fdfdbbd2982911 # v3.24.0 + with: + sarif_file: 'trivy-results.sarif' diff --git a/.trivyignore b/.trivyignore new file mode 100644 index 0000000000..63749166bc --- /dev/null +++ b/.trivyignore @@ -0,0 +1,6 @@ +# Docs: https://aquasecurity.github.io/trivy/latest/docs/configuration/filtering/#trivyignore + +# In docker files: +# HEALTHCHECK is not required for development, nor in CI (failed docker = failed CI). +# We're not hosting any application with usage of the dockers. 
+AVD-DS-0026 diff --git a/CMakeLists.txt b/CMakeLists.txt index 4fcd74e729..cfd8752657 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,8 +3,8 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -cmake_minimum_required(VERSION 3.14.0 FATAL_ERROR) -project(unified-runtime VERSION 0.9.0) +cmake_minimum_required(VERSION 3.20.0 FATAL_ERROR) +project(unified-runtime VERSION 0.10.0) include(GNUInstallDirs) include(CheckCXXSourceCompiles) @@ -47,12 +47,15 @@ option(UR_BUILD_ADAPTER_ALL "Build all currently supported adapters" OFF) option(UR_BUILD_EXAMPLE_CODEGEN "Build the codegen example." OFF) option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace for linux" OFF) option(UR_ENABLE_ASSERTIONS "Enable assertions for all build types" OFF) +option(UR_BUILD_XPTI_LIBS "Build the XPTI libraries when tracing is enabled" ON) set(UR_DPCXX "" CACHE FILEPATH "Path of the DPC++ compiler executable") +set(UR_DPCXX_BUILD_FLAGS "" CACHE STRING "Build flags to pass to DPC++ when compiling device programs") set(UR_SYCL_LIBRARY_DIR "" CACHE PATH "Path of the SYCL runtime library directory") set(UR_CONFORMANCE_TARGET_TRIPLES "" CACHE STRING "List of sycl targets to build CTS device binaries for") set(UR_CONFORMANCE_AMD_ARCH "" CACHE STRING "AMD device target ID to build CTS binaries for") +option(UR_CONFORMANCE_ENABLE_MATCH_FILES "Enable CTS match files" ON) set(UR_ADAPTER_LEVEL_ZERO_SOURCE_DIR "" CACHE PATH "Path to external 'level_zero' adapter source dir") set(UR_ADAPTER_OPENCL_SOURCE_DIR "" CACHE PATH @@ -107,31 +110,34 @@ endif() if(UR_ENABLE_TRACING) add_compile_definitions(UR_ENABLE_TRACING) - # fetch xpti proxy library for the tracing layer - FetchContentSparse_Declare(xpti https://github.com/intel/llvm.git "sycl-nightly/20230703" "xpti") - FetchContent_MakeAvailable(xpti) + if (UR_BUILD_XPTI_LIBS) + # fetch xpti proxy library for the tracing layer + FetchContentSparse_Declare(xpti https://github.com/intel/llvm.git 
"sycl-nightly/20230703" "xpti") + FetchContent_MakeAvailable(xpti) - # set -fPIC for xpti since we are linking it with a shared library - set_target_properties(xpti PROPERTIES POSITION_INDEPENDENT_CODE ON) + # set -fPIC for xpti since we are linking it with a shared library + set_target_properties(xpti PROPERTIES POSITION_INDEPENDENT_CODE ON) - # fetch the xptifw dispatcher, mostly used for testing - # these variables need to be set for xptifw to compile - set(XPTI_SOURCE_DIR ${xpti_SOURCE_DIR}) - set(XPTI_DIR ${xpti_SOURCE_DIR}) - set(XPTI_ENABLE_TESTS OFF CACHE INTERNAL "Turn off xptifw tests") + # fetch the xptifw dispatcher, mostly used for testing + # these variables need to be set for xptifw to compile + set(XPTI_SOURCE_DIR ${xpti_SOURCE_DIR}) + set(XPTI_DIR ${xpti_SOURCE_DIR}) + set(XPTI_ENABLE_TESTS OFF CACHE INTERNAL "Turn off xptifw tests") - FetchContentSparse_Declare(xptifw https://github.com/intel/llvm.git "sycl-nightly/20230703" "xptifw") + FetchContentSparse_Declare(xptifw https://github.com/intel/llvm.git "sycl-nightly/20230703" "xptifw") - FetchContent_MakeAvailable(xptifw) + FetchContent_MakeAvailable(xptifw) - check_cxx_compiler_flag("-Wno-error=maybe-uninitialized" HAS_MAYBE_UNINIT) - if (HAS_MAYBE_UNINIT) - target_compile_options(xptifw PRIVATE -Wno-error=maybe-uninitialized) + check_cxx_compiler_flag("-Wno-error=maybe-uninitialized" HAS_MAYBE_UNINIT) + if (HAS_MAYBE_UNINIT) + target_compile_options(xptifw PRIVATE -Wno-error=maybe-uninitialized) + endif() + + set_target_properties(xptifw PROPERTIES + LIBRARY_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY} + ) endif() - set_target_properties(xptifw PROPERTIES - LIBRARY_OUTPUT_DIRECTORY ${CMAKE_LIBRARY_OUTPUT_DIRECTORY} - ) if (MSVC) set(TARGET_XPTI $,xpti,xptid>) else() diff --git a/README.md b/README.md index cb43c380b9..3d53d2117e 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ [![Build and 
test](https://github.com/oneapi-src/unified-runtime/actions/workflows/cmake.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/cmake.yml) [![E2E Cuda](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_cuda.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_cuda.yml) [![E2E OpenCL](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_opencl.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_opencl.yml) +[![E2E Level Zero](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_level_zero.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/e2e_level_zero.yml) [![CodeQL](https://github.com/oneapi-src/unified-runtime/actions/workflows/codeql.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/codeql.yml) [![Bandit](https://github.com/oneapi-src/unified-runtime/actions/workflows/bandit.yml/badge.svg)](https://github.com/oneapi-src/unified-runtime/actions/workflows/bandit.yml) [![Coverity](https://scan.coverity.com/projects/28213/badge.svg)](https://scan.coverity.com/projects/oneapi-src-unified-runtime) @@ -13,7 +14,6 @@ ## Table of contents - [Unified Runtime](#unified-runtime) - - [Adapters](#adapters) - [Table of contents](#table-of-contents) - [Contents of the repo](#contents-of-the-repo) - [Integration](#integration) @@ -29,7 +29,7 @@ - [Adapter naming convention](#adapter-naming-convention) - [Source code generation](#source-code-generation) - [Documentation](#documentation) -6. [Release Process](#release-process) + - [Release Process](#release-process) ## Contents of the repo @@ -88,7 +88,7 @@ for more detailed instructions on the correct setup. 
Required packages: - C++ compiler with C++17 support -- [CMake](https://cmake.org/) >= 3.14.0 +- [CMake](https://cmake.org/) >= 3.20.0 - Python v3.6.6 or later ### Windows @@ -131,6 +131,7 @@ List of options provided by CMake: | UR_ENABLE_SANITIZER | Enable device sanitizer layer | ON/OFF | ON | | UR_CONFORMANCE_TARGET_TRIPLES | SYCL triples to build CTS device binaries for | Comma-separated list | spir64 | | UR_CONFORMANCE_AMD_ARCH | AMD device target ID to build CTS binaries for | string | `""` | +| UR_CONFORMANCE_ENABLE_MATCH_FILES | Enable CTS match files | ON/OFF | ON | | UR_BUILD_ADAPTER_L0 | Build the Level-Zero adapter | ON/OFF | OFF | | UR_BUILD_ADAPTER_OPENCL | Build the OpenCL adapter | ON/OFF | OFF | | UR_BUILD_ADAPTER_CUDA | Build the CUDA adapter | ON/OFF | OFF | @@ -140,6 +141,8 @@ List of options provided by CMake: | UR_HIP_PLATFORM | Build HIP adapter for AMD or NVIDIA platform | AMD/NVIDIA | AMD | | UR_ENABLE_COMGR | Enable comgr lib usage | AMD/NVIDIA | AMD | | UR_DPCXX | Path of the DPC++ compiler executable to build CTS device binaries | File path | `""` | +| UR_DEVICE_CODE_EXTRACTOR | Path of the `clang-offload-extract` executable from the DPC++ package, required for CTS device binaries | File path | `"${dirname(UR_DPCXX)}/clang-offload-extract"` | +| UR_DPCXX_BUILD_FLAGS | Build flags to pass to DPC++ when compiling device programs | Space-separated options list | `""` | | UR_SYCL_LIBRARY_DIR | Path of the SYCL runtime library directory to build CTS device binaries | Directory path | `""` | | UR_HIP_ROCM_DIR | Path of the default ROCm HIP installation | Directory path | `/opt/rocm` | | UR_HIP_INCLUDE_DIR | Path of the ROCm HIP include directory | Directory path | `${UR_HIP_ROCM_DIR}/include` | diff --git a/include/ur_api.h b/include/ur_api.h index 8680dbeffb..96a58e04ea 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -7,7 +7,7 @@ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * * @file ur_api.h - * @version v0.9-r0 + * 
@version v0.10-r0 * */ #ifndef UR_API_H_INCLUDED @@ -222,6 +222,7 @@ typedef enum ur_function_t { UR_FUNCTION_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_EXP = 220, ///< Enumerator for ::urCommandBufferUpdateKernelLaunchExp UR_FUNCTION_COMMAND_BUFFER_GET_INFO_EXP = 221, ///< Enumerator for ::urCommandBufferGetInfoExp UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP = 222, ///< Enumerator for ::urCommandBufferCommandGetInfoExp + UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP = 223, ///< Enumerator for ::urEnqueueTimestampRecordingExp /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1097,11 +1098,12 @@ urPlatformGetInfo( /// - API versions contain major and minor attributes, use /// ::UR_MAJOR_VERSION and ::UR_MINOR_VERSION typedef enum ur_api_version_t { - UR_API_VERSION_0_6 = UR_MAKE_VERSION(0, 6), ///< version 0.6 - UR_API_VERSION_0_7 = UR_MAKE_VERSION(0, 7), ///< version 0.7 - UR_API_VERSION_0_8 = UR_MAKE_VERSION(0, 8), ///< version 0.8 - UR_API_VERSION_0_9 = UR_MAKE_VERSION(0, 9), ///< version 0.9 - UR_API_VERSION_CURRENT = UR_MAKE_VERSION(0, 9), ///< latest known version + UR_API_VERSION_0_6 = UR_MAKE_VERSION(0, 6), ///< version 0.6 + UR_API_VERSION_0_7 = UR_MAKE_VERSION(0, 7), ///< version 0.7 + UR_API_VERSION_0_8 = UR_MAKE_VERSION(0, 8), ///< version 0.8 + UR_API_VERSION_0_9 = UR_MAKE_VERSION(0, 9), ///< version 0.9 + UR_API_VERSION_0_10 = UR_MAKE_VERSION(0, 10), ///< version 0.10 + UR_API_VERSION_CURRENT = UR_MAKE_VERSION(0, 10), ///< latest known version /// @cond UR_API_VERSION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1434,201 +1436,214 @@ urDeviceGetSelected( /////////////////////////////////////////////////////////////////////////////// /// @brief Supported device info typedef enum ur_device_info_t { - UR_DEVICE_INFO_TYPE = 0, ///< [::ur_device_type_t] type of the device - UR_DEVICE_INFO_VENDOR_ID = 1, ///< [uint32_t] vendor Id of the device - UR_DEVICE_INFO_DEVICE_ID = 2, ///< [uint32_t] Id of the device - UR_DEVICE_INFO_MAX_COMPUTE_UNITS = 3, 
///< [uint32_t] the number of compute units - UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS = 4, ///< [uint32_t] max work item dimensions - UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES = 5, ///< [size_t[]] return an array of max work item sizes - UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE = 6, ///< [size_t] max work group size - UR_DEVICE_INFO_SINGLE_FP_CONFIG = 7, ///< [::ur_device_fp_capability_flags_t] single precision floating point - ///< capability - UR_DEVICE_INFO_HALF_FP_CONFIG = 8, ///< [::ur_device_fp_capability_flags_t] half precision floating point - ///< capability - UR_DEVICE_INFO_DOUBLE_FP_CONFIG = 9, ///< [::ur_device_fp_capability_flags_t] double precision floating point - ///< capability - UR_DEVICE_INFO_QUEUE_PROPERTIES = 10, ///< [::ur_queue_flags_t] command queue properties supported by the device - UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR = 11, ///< [uint32_t] preferred vector width for char - UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT = 12, ///< [uint32_t] preferred vector width for short - UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT = 13, ///< [uint32_t] preferred vector width for int - UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG = 14, ///< [uint32_t] preferred vector width for long - UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT = 15, ///< [uint32_t] preferred vector width for float - UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE = 16, ///< [uint32_t] preferred vector width for double - UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF = 17, ///< [uint32_t] preferred vector width for half float - UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR = 18, ///< [uint32_t] native vector width for char - UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT = 19, ///< [uint32_t] native vector width for short - UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT = 20, ///< [uint32_t] native vector width for int - UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG = 21, ///< [uint32_t] native vector width for long - UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT = 22, ///< [uint32_t] native vector width for float - 
UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE = 23, ///< [uint32_t] native vector width for double - UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF = 24, ///< [uint32_t] native vector width for half float - UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY = 25, ///< [uint32_t] max clock frequency in MHz - UR_DEVICE_INFO_MEMORY_CLOCK_RATE = 26, ///< [uint32_t] memory clock frequency in MHz - UR_DEVICE_INFO_ADDRESS_BITS = 27, ///< [uint32_t] address bits - UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE = 28, ///< [uint64_t] max memory allocation size - UR_DEVICE_INFO_IMAGE_SUPPORTED = 29, ///< [::ur_bool_t] images are supported - UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS = 30, ///< [uint32_t] max number of image objects arguments of a kernel declared - ///< with the read_only qualifier - UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS = 31, ///< [uint32_t] max number of image objects arguments of a kernel declared - ///< with the write_only qualifier - UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS = 32, ///< [uint32_t] max number of image objects arguments of a kernel declared - ///< with the read_write qualifier - UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH = 33, ///< [size_t] max width of Image2D object - UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT = 34, ///< [size_t] max height of Image2D object - UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH = 35, ///< [size_t] max width of Image3D object - UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT = 36, ///< [size_t] max height of Image3D object - UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH = 37, ///< [size_t] max depth of Image3D object - UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE = 38, ///< [size_t] max image buffer size - UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE = 39, ///< [size_t] max image array size - UR_DEVICE_INFO_MAX_SAMPLERS = 40, ///< [uint32_t] max number of samplers that can be used in a kernel - UR_DEVICE_INFO_MAX_PARAMETER_SIZE = 41, ///< [size_t] max size in bytes of all arguments passed to a kernel - UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN = 42, ///< [uint32_t] memory base address alignment - UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE = 43, ///< 
[::ur_device_mem_cache_type_t] global memory cache type - UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE = 44, ///< [uint32_t] global memory cache line size in bytes - UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE = 45, ///< [uint64_t] size of global memory cache in bytes - UR_DEVICE_INFO_GLOBAL_MEM_SIZE = 46, ///< [uint64_t] size of global memory in bytes - UR_DEVICE_INFO_GLOBAL_MEM_FREE = 47, ///< [uint64_t] size of global memory which is free in bytes - UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE = 48, ///< [uint64_t] max constant buffer size in bytes - UR_DEVICE_INFO_MAX_CONSTANT_ARGS = 49, ///< [uint32_t] max number of __const declared arguments in a kernel - UR_DEVICE_INFO_LOCAL_MEM_TYPE = 50, ///< [::ur_device_local_mem_type_t] local memory type - UR_DEVICE_INFO_LOCAL_MEM_SIZE = 51, ///< [uint64_t] local memory size in bytes - UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT = 52, ///< [::ur_bool_t] support error correction to global and local memory - UR_DEVICE_INFO_HOST_UNIFIED_MEMORY = 53, ///< [::ur_bool_t] unified host device memory - UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION = 54, ///< [size_t] profiling timer resolution in nanoseconds - UR_DEVICE_INFO_ENDIAN_LITTLE = 55, ///< [::ur_bool_t] little endian byte order - UR_DEVICE_INFO_AVAILABLE = 56, ///< [::ur_bool_t] device is available - UR_DEVICE_INFO_COMPILER_AVAILABLE = 57, ///< [::ur_bool_t] device compiler is available - UR_DEVICE_INFO_LINKER_AVAILABLE = 58, ///< [::ur_bool_t] device linker is available - UR_DEVICE_INFO_EXECUTION_CAPABILITIES = 59, ///< [::ur_device_exec_capability_flags_t] device kernel execution - ///< capability bit-field - UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES = 60, ///< [::ur_queue_flags_t] device command queue property bit-field - UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES = 61, ///< [::ur_queue_flags_t] host queue property bit-field - UR_DEVICE_INFO_BUILT_IN_KERNELS = 62, ///< [char[]] a semi-colon separated list of built-in kernels - UR_DEVICE_INFO_PLATFORM = 63, ///< [::ur_platform_handle_t] the 
platform associated with the device - UR_DEVICE_INFO_REFERENCE_COUNT = 64, ///< [uint32_t] Reference count of the device object. - ///< The reference count returned should be considered immediately stale. - ///< It is unsuitable for general use in applications. This feature is - ///< provided for identifying memory leaks. - UR_DEVICE_INFO_IL_VERSION = 65, ///< [char[]] IL version - UR_DEVICE_INFO_NAME = 66, ///< [char[]] Device name - UR_DEVICE_INFO_VENDOR = 67, ///< [char[]] Device vendor - UR_DEVICE_INFO_DRIVER_VERSION = 68, ///< [char[]] Driver version - UR_DEVICE_INFO_PROFILE = 69, ///< [char[]] Device profile - UR_DEVICE_INFO_VERSION = 70, ///< [char[]] Device version - UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION = 71, ///< [char[]] Version of backend runtime - UR_DEVICE_INFO_EXTENSIONS = 72, ///< [char[]] Return a space separated list of extension names - UR_DEVICE_INFO_PRINTF_BUFFER_SIZE = 73, ///< [size_t] Maximum size in bytes of internal printf buffer - UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC = 74, ///< [::ur_bool_t] prefer user synchronization when sharing object with - ///< other API - UR_DEVICE_INFO_PARENT_DEVICE = 75, ///< [::ur_device_handle_t] return parent device handle - UR_DEVICE_INFO_SUPPORTED_PARTITIONS = 76, ///< [::ur_device_partition_t[]] Returns an array of partition types - ///< supported by the device - UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES = 77, ///< [uint32_t] maximum number of sub-devices when the device is - ///< partitioned - UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN = 78, ///< [::ur_device_affinity_domain_flags_t] Returns a bit-field of the - ///< supported affinity domains for partitioning. - ///< If the device does not support any affinity domains, then 0 will be returned. 
- UR_DEVICE_INFO_PARTITION_TYPE = 79, ///< [::ur_device_partition_property_t[]] return an array of - ///< ::ur_device_partition_property_t for properties specified in - ///< ::urDevicePartition - UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS = 80, ///< [uint32_t] max number of sub groups - UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS = 81, ///< [::ur_bool_t] support sub group independent forward progress - UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL = 82, ///< [uint32_t[]] return an array of sub group sizes supported on Intel - ///< device - UR_DEVICE_INFO_USM_HOST_SUPPORT = 83, ///< [::ur_device_usm_access_capability_flags_t] support USM host memory - ///< access - UR_DEVICE_INFO_USM_DEVICE_SUPPORT = 84, ///< [::ur_device_usm_access_capability_flags_t] support USM device memory - ///< access - UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT = 85, ///< [::ur_device_usm_access_capability_flags_t] support USM single device - ///< shared memory access - UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT = 86, ///< [::ur_device_usm_access_capability_flags_t] support USM cross device - ///< shared memory access - UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT = 87, ///< [::ur_device_usm_access_capability_flags_t] support USM system wide - ///< shared memory access - UR_DEVICE_INFO_UUID = 88, ///< [uint8_t[]] return device UUID - UR_DEVICE_INFO_PCI_ADDRESS = 89, ///< [char[]] return device PCI address - UR_DEVICE_INFO_GPU_EU_COUNT = 90, ///< [uint32_t] return Intel GPU EU count - UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH = 91, ///< [uint32_t] return Intel GPU EU SIMD width - UR_DEVICE_INFO_GPU_EU_SLICES = 92, ///< [uint32_t] return Intel GPU number of slices - UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 93, ///< [uint32_t] return Intel GPU EU count per subslice - UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 94, ///< [uint32_t] return Intel GPU number of subslices per slice - UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU = 95, ///< [uint32_t] return Intel GPU number of threads per EU - UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH = 96, 
///< [uint32_t] return max memory bandwidth in Mb/s - UR_DEVICE_INFO_IMAGE_SRGB = 97, ///< [::ur_bool_t] device supports sRGB images - UR_DEVICE_INFO_BUILD_ON_SUBDEVICE = 98, ///< [::ur_bool_t] Return true if sub-device should do its own program - ///< build - UR_DEVICE_INFO_ATOMIC_64 = 99, ///< [::ur_bool_t] support 64 bit atomics - UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 100, ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic - ///< memory order capabilities - UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 101, ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic - ///< memory scope capabilities - UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES = 102, ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic - ///< memory fence order capabilities - UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES = 103, ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic - ///< memory fence scope capabilities - UR_DEVICE_INFO_BFLOAT16 = 104, ///< [::ur_bool_t] support for bfloat16 - UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES = 105, ///< [uint32_t] Returns 1 if the device doesn't have a notion of a - ///< queue index. Otherwise, returns the number of queue indices that are - ///< available for this device. - UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS = 106, ///< [::ur_bool_t] support the ::urKernelSetSpecializationConstants entry - ///< point - UR_DEVICE_INFO_MEMORY_BUS_WIDTH = 107, ///< [uint32_t] return the width in bits of the memory bus interface of the - ///< device. 
- UR_DEVICE_INFO_MAX_WORK_GROUPS_3D = 108, ///< [size_t[3]] return max 3D work groups - UR_DEVICE_INFO_ASYNC_BARRIER = 109, ///< [::ur_bool_t] return true if Async Barrier is supported - UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT = 110, ///< [::ur_bool_t] return true if specifying memory channels is supported - UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED = 111, ///< [::ur_bool_t] Return true if the device supports enqueueing commands - ///< to read and write pipes from the host. - UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP = 112, ///< [uint32_t] The maximum number of registers available per block. - UR_DEVICE_INFO_IP_VERSION = 113, ///< [uint32_t] The device IP version. The meaning of the device IP version - ///< is implementation-defined, but newer devices should have a higher - ///< version than older devices. - UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT = 114, ///< [::ur_bool_t] return true if the device supports virtual memory. - UR_DEVICE_INFO_ESIMD_SUPPORT = 115, ///< [::ur_bool_t] return true if the device supports ESIMD. - UR_DEVICE_INFO_COMPONENT_DEVICES = 116, ///< [::ur_device_handle_t[]] The set of component devices contained by - ///< this composite device. - UR_DEVICE_INFO_COMPOSITE_DEVICE = 117, ///< [::ur_device_handle_t] The composite device containing this component - ///< device. - UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP = 0x1000, ///< [::ur_bool_t] Returns true if the device supports the use of - ///< command-buffers. - UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP = 0x1001, ///< [::ur_bool_t] Returns true if the device supports updating the kernel - ///< commands in a command-buffer. 
- UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP = 0x2000, ///< [::ur_bool_t] returns true if the device supports the creation of - ///< bindless images - UR_DEVICE_INFO_BINDLESS_IMAGES_SHARED_USM_SUPPORT_EXP = 0x2001, ///< [::ur_bool_t] returns true if the device supports the creation of - ///< bindless images backed by shared USM - UR_DEVICE_INFO_BINDLESS_IMAGES_1D_USM_SUPPORT_EXP = 0x2002, ///< [::ur_bool_t] returns true if the device supports the creation of 1D - ///< bindless images backed by USM - UR_DEVICE_INFO_BINDLESS_IMAGES_2D_USM_SUPPORT_EXP = 0x2003, ///< [::ur_bool_t] returns true if the device supports the creation of 2D - ///< bindless images backed by USM - UR_DEVICE_INFO_IMAGE_PITCH_ALIGN_EXP = 0x2004, ///< [uint32_t] returns the required alignment of the pitch between two - ///< rows of an image in bytes - UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP = 0x2005, ///< [size_t] returns the maximum linear width allowed for images allocated - ///< using USM - UR_DEVICE_INFO_MAX_IMAGE_LINEAR_HEIGHT_EXP = 0x2006, ///< [size_t] returns the maximum linear height allowed for images - ///< allocated using USM - UR_DEVICE_INFO_MAX_IMAGE_LINEAR_PITCH_EXP = 0x2007, ///< [size_t] returns the maximum linear pitch allowed for images allocated - ///< using USM - UR_DEVICE_INFO_MIPMAP_SUPPORT_EXP = 0x2008, ///< [::ur_bool_t] returns true if the device supports allocating mipmap - ///< resources - UR_DEVICE_INFO_MIPMAP_ANISOTROPY_SUPPORT_EXP = 0x2009, ///< [::ur_bool_t] returns true if the device supports sampling mipmap - ///< images with anisotropic filtering - UR_DEVICE_INFO_MIPMAP_MAX_ANISOTROPY_EXP = 0x200A, ///< [uint32_t] returns the maximum anisotropic ratio supported by the - ///< device - UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP = 0x200B, ///< [::ur_bool_t] returns true if the device supports using images created - ///< from individual mipmap levels - UR_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP = 0x200C, ///< [::ur_bool_t] returns true if the device 
supports importing external - ///< memory resources - UR_DEVICE_INFO_INTEROP_MEMORY_EXPORT_SUPPORT_EXP = 0x200D, ///< [::ur_bool_t] returns true if the device supports exporting internal - ///< memory resources - UR_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP = 0x200E, ///< [::ur_bool_t] returns true if the device supports importing external - ///< semaphore resources - UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP = 0x200F, ///< [::ur_bool_t] returns true if the device supports exporting internal - ///< event resources - UR_DEVICE_INFO_CUBEMAP_SUPPORT_EXP = 0x2010, ///< [::ur_bool_t] returns true if the device supports allocating and - ///< accessing cubemap resources - UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP = 0x2011, ///< [::ur_bool_t] returns true if the device supports sampling cubemapped - ///< images across face boundaries + UR_DEVICE_INFO_TYPE = 0, ///< [::ur_device_type_t] type of the device + UR_DEVICE_INFO_VENDOR_ID = 1, ///< [uint32_t] vendor Id of the device + UR_DEVICE_INFO_DEVICE_ID = 2, ///< [uint32_t] Id of the device + UR_DEVICE_INFO_MAX_COMPUTE_UNITS = 3, ///< [uint32_t] the number of compute units + UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS = 4, ///< [uint32_t] max work item dimensions + UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES = 5, ///< [size_t[]] return an array of max work item sizes + UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE = 6, ///< [size_t] max work group size + UR_DEVICE_INFO_SINGLE_FP_CONFIG = 7, ///< [::ur_device_fp_capability_flags_t] single precision floating point + ///< capability + UR_DEVICE_INFO_HALF_FP_CONFIG = 8, ///< [::ur_device_fp_capability_flags_t] half precision floating point + ///< capability + UR_DEVICE_INFO_DOUBLE_FP_CONFIG = 9, ///< [::ur_device_fp_capability_flags_t] double precision floating point + ///< capability + UR_DEVICE_INFO_QUEUE_PROPERTIES = 10, ///< [::ur_queue_flags_t] command queue properties supported by the device + UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR = 11, ///< [uint32_t] preferred 
vector width for char + UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_SHORT = 12, ///< [uint32_t] preferred vector width for short + UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_INT = 13, ///< [uint32_t] preferred vector width for int + UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_LONG = 14, ///< [uint32_t] preferred vector width for long + UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_FLOAT = 15, ///< [uint32_t] preferred vector width for float + UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_DOUBLE = 16, ///< [uint32_t] preferred vector width for double + UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_HALF = 17, ///< [uint32_t] preferred vector width for half float + UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_CHAR = 18, ///< [uint32_t] native vector width for char + UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_SHORT = 19, ///< [uint32_t] native vector width for short + UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_INT = 20, ///< [uint32_t] native vector width for int + UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_LONG = 21, ///< [uint32_t] native vector width for long + UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_FLOAT = 22, ///< [uint32_t] native vector width for float + UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_DOUBLE = 23, ///< [uint32_t] native vector width for double + UR_DEVICE_INFO_NATIVE_VECTOR_WIDTH_HALF = 24, ///< [uint32_t] native vector width for half float + UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY = 25, ///< [uint32_t] max clock frequency in MHz + UR_DEVICE_INFO_MEMORY_CLOCK_RATE = 26, ///< [uint32_t] memory clock frequency in MHz + UR_DEVICE_INFO_ADDRESS_BITS = 27, ///< [uint32_t] address bits + UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE = 28, ///< [uint64_t] max memory allocation size + UR_DEVICE_INFO_IMAGE_SUPPORTED = 29, ///< [::ur_bool_t] images are supported + UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS = 30, ///< [uint32_t] max number of image objects arguments of a kernel declared + ///< with the read_only qualifier + UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS = 31, ///< [uint32_t] max number of image objects arguments of a kernel declared + ///< with the write_only 
qualifier + UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS = 32, ///< [uint32_t] max number of image objects arguments of a kernel declared + ///< with the read_write qualifier + UR_DEVICE_INFO_IMAGE2D_MAX_WIDTH = 33, ///< [size_t] max width of Image2D object + UR_DEVICE_INFO_IMAGE2D_MAX_HEIGHT = 34, ///< [size_t] max height of Image2D object + UR_DEVICE_INFO_IMAGE3D_MAX_WIDTH = 35, ///< [size_t] max width of Image3D object + UR_DEVICE_INFO_IMAGE3D_MAX_HEIGHT = 36, ///< [size_t] max height of Image3D object + UR_DEVICE_INFO_IMAGE3D_MAX_DEPTH = 37, ///< [size_t] max depth of Image3D object + UR_DEVICE_INFO_IMAGE_MAX_BUFFER_SIZE = 38, ///< [size_t] max image buffer size + UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE = 39, ///< [size_t] max image array size + UR_DEVICE_INFO_MAX_SAMPLERS = 40, ///< [uint32_t] max number of samplers that can be used in a kernel + UR_DEVICE_INFO_MAX_PARAMETER_SIZE = 41, ///< [size_t] max size in bytes of all arguments passed to a kernel + UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN = 42, ///< [uint32_t] memory base address alignment + UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE = 43, ///< [::ur_device_mem_cache_type_t] global memory cache type + UR_DEVICE_INFO_GLOBAL_MEM_CACHELINE_SIZE = 44, ///< [uint32_t] global memory cache line size in bytes + UR_DEVICE_INFO_GLOBAL_MEM_CACHE_SIZE = 45, ///< [uint64_t] size of global memory cache in bytes + UR_DEVICE_INFO_GLOBAL_MEM_SIZE = 46, ///< [uint64_t] size of global memory in bytes + UR_DEVICE_INFO_GLOBAL_MEM_FREE = 47, ///< [uint64_t] size of global memory which is free in bytes + UR_DEVICE_INFO_MAX_CONSTANT_BUFFER_SIZE = 48, ///< [uint64_t] max constant buffer size in bytes + UR_DEVICE_INFO_MAX_CONSTANT_ARGS = 49, ///< [uint32_t] max number of __const declared arguments in a kernel + UR_DEVICE_INFO_LOCAL_MEM_TYPE = 50, ///< [::ur_device_local_mem_type_t] local memory type + UR_DEVICE_INFO_LOCAL_MEM_SIZE = 51, ///< [uint64_t] local memory size in bytes + UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT = 52, ///< [::ur_bool_t] 
support error correction to global and local memory + UR_DEVICE_INFO_HOST_UNIFIED_MEMORY = 53, ///< [::ur_bool_t] unified host device memory + UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION = 54, ///< [size_t] profiling timer resolution in nanoseconds + UR_DEVICE_INFO_ENDIAN_LITTLE = 55, ///< [::ur_bool_t] little endian byte order + UR_DEVICE_INFO_AVAILABLE = 56, ///< [::ur_bool_t] device is available + UR_DEVICE_INFO_COMPILER_AVAILABLE = 57, ///< [::ur_bool_t] device compiler is available + UR_DEVICE_INFO_LINKER_AVAILABLE = 58, ///< [::ur_bool_t] device linker is available + UR_DEVICE_INFO_EXECUTION_CAPABILITIES = 59, ///< [::ur_device_exec_capability_flags_t] device kernel execution + ///< capability bit-field + UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES = 60, ///< [::ur_queue_flags_t] device command queue property bit-field + UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES = 61, ///< [::ur_queue_flags_t] host queue property bit-field + UR_DEVICE_INFO_BUILT_IN_KERNELS = 62, ///< [char[]] a semi-colon separated list of built-in kernels + UR_DEVICE_INFO_PLATFORM = 63, ///< [::ur_platform_handle_t] the platform associated with the device + UR_DEVICE_INFO_REFERENCE_COUNT = 64, ///< [uint32_t] Reference count of the device object. + ///< The reference count returned should be considered immediately stale. + ///< It is unsuitable for general use in applications. This feature is + ///< provided for identifying memory leaks. 
+ UR_DEVICE_INFO_IL_VERSION = 65, ///< [char[]] IL version + UR_DEVICE_INFO_NAME = 66, ///< [char[]] Device name + UR_DEVICE_INFO_VENDOR = 67, ///< [char[]] Device vendor + UR_DEVICE_INFO_DRIVER_VERSION = 68, ///< [char[]] Driver version + UR_DEVICE_INFO_PROFILE = 69, ///< [char[]] Device profile + UR_DEVICE_INFO_VERSION = 70, ///< [char[]] Device version + UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION = 71, ///< [char[]] Version of backend runtime + UR_DEVICE_INFO_EXTENSIONS = 72, ///< [char[]] Return a space separated list of extension names + UR_DEVICE_INFO_PRINTF_BUFFER_SIZE = 73, ///< [size_t] Maximum size in bytes of internal printf buffer + UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC = 74, ///< [::ur_bool_t] prefer user synchronization when sharing object with + ///< other API + UR_DEVICE_INFO_PARENT_DEVICE = 75, ///< [::ur_device_handle_t] return parent device handle + UR_DEVICE_INFO_SUPPORTED_PARTITIONS = 76, ///< [::ur_device_partition_t[]] Returns an array of partition types + ///< supported by the device + UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES = 77, ///< [uint32_t] maximum number of sub-devices when the device is + ///< partitioned + UR_DEVICE_INFO_PARTITION_AFFINITY_DOMAIN = 78, ///< [::ur_device_affinity_domain_flags_t] Returns a bit-field of the + ///< supported affinity domains for partitioning. + ///< If the device does not support any affinity domains, then 0 will be returned. 
+ UR_DEVICE_INFO_PARTITION_TYPE = 79, ///< [::ur_device_partition_property_t[]] return an array of + ///< ::ur_device_partition_property_t for properties specified in + ///< ::urDevicePartition + UR_DEVICE_INFO_MAX_NUM_SUB_GROUPS = 80, ///< [uint32_t] max number of sub groups + UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS = 81, ///< [::ur_bool_t] support sub group independent forward progress + UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL = 82, ///< [uint32_t[]] return an array of sub group sizes supported on Intel + ///< device + UR_DEVICE_INFO_USM_HOST_SUPPORT = 83, ///< [::ur_device_usm_access_capability_flags_t] support USM host memory + ///< access + UR_DEVICE_INFO_USM_DEVICE_SUPPORT = 84, ///< [::ur_device_usm_access_capability_flags_t] support USM device memory + ///< access + UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT = 85, ///< [::ur_device_usm_access_capability_flags_t] support USM single device + ///< shared memory access + UR_DEVICE_INFO_USM_CROSS_SHARED_SUPPORT = 86, ///< [::ur_device_usm_access_capability_flags_t] support USM cross device + ///< shared memory access + UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT = 87, ///< [::ur_device_usm_access_capability_flags_t] support USM system wide + ///< shared memory access + UR_DEVICE_INFO_UUID = 88, ///< [uint8_t[]] return device UUID + UR_DEVICE_INFO_PCI_ADDRESS = 89, ///< [char[]] return device PCI address + UR_DEVICE_INFO_GPU_EU_COUNT = 90, ///< [uint32_t] return Intel GPU EU count + UR_DEVICE_INFO_GPU_EU_SIMD_WIDTH = 91, ///< [uint32_t] return Intel GPU EU SIMD width + UR_DEVICE_INFO_GPU_EU_SLICES = 92, ///< [uint32_t] return Intel GPU number of slices + UR_DEVICE_INFO_GPU_EU_COUNT_PER_SUBSLICE = 93, ///< [uint32_t] return Intel GPU EU count per subslice + UR_DEVICE_INFO_GPU_SUBSLICES_PER_SLICE = 94, ///< [uint32_t] return Intel GPU number of subslices per slice + UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU = 95, ///< [uint32_t] return Intel GPU number of threads per EU + UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH = 96, 
///< [uint32_t] return max memory bandwidth in Mb/s + UR_DEVICE_INFO_IMAGE_SRGB = 97, ///< [::ur_bool_t] device supports sRGB images + UR_DEVICE_INFO_BUILD_ON_SUBDEVICE = 98, ///< [::ur_bool_t] Return true if sub-device should do its own program + ///< build + UR_DEVICE_INFO_ATOMIC_64 = 99, ///< [::ur_bool_t] support 64 bit atomics + UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES = 100, ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic + ///< memory order capabilities + UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES = 101, ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic + ///< memory scope capabilities + UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES = 102, ///< [::ur_memory_order_capability_flags_t] return a bit-field of atomic + ///< memory fence order capabilities + UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES = 103, ///< [::ur_memory_scope_capability_flags_t] return a bit-field of atomic + ///< memory fence scope capabilities + UR_DEVICE_INFO_BFLOAT16 = 104, ///< [::ur_bool_t] support for bfloat16 + UR_DEVICE_INFO_MAX_COMPUTE_QUEUE_INDICES = 105, ///< [uint32_t] Returns 1 if the device doesn't have a notion of a + ///< queue index. Otherwise, returns the number of queue indices that are + ///< available for this device. + UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS = 106, ///< [::ur_bool_t] support the ::urKernelSetSpecializationConstants entry + ///< point + UR_DEVICE_INFO_MEMORY_BUS_WIDTH = 107, ///< [uint32_t] return the width in bits of the memory bus interface of the + ///< device. 
+ UR_DEVICE_INFO_MAX_WORK_GROUPS_3D = 108, ///< [size_t[3]] return max 3D work groups + UR_DEVICE_INFO_ASYNC_BARRIER = 109, ///< [::ur_bool_t] return true if Async Barrier is supported + UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT = 110, ///< [::ur_bool_t] return true if specifying memory channels is supported + UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED = 111, ///< [::ur_bool_t] Return true if the device supports enqueueing commands + ///< to read and write pipes from the host. + UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP = 112, ///< [uint32_t] The maximum number of registers available per block. + UR_DEVICE_INFO_IP_VERSION = 113, ///< [uint32_t] The device IP version. The meaning of the device IP version + ///< is implementation-defined, but newer devices should have a higher + ///< version than older devices. + UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT = 114, ///< [::ur_bool_t] return true if the device supports virtual memory. + UR_DEVICE_INFO_ESIMD_SUPPORT = 115, ///< [::ur_bool_t] return true if the device supports ESIMD. + UR_DEVICE_INFO_COMPONENT_DEVICES = 116, ///< [::ur_device_handle_t[]] The set of component devices contained by + ///< this composite device. + UR_DEVICE_INFO_COMPOSITE_DEVICE = 117, ///< [::ur_device_handle_t] The composite device containing this component + ///< device. + UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP = 0x1000, ///< [::ur_bool_t] Returns true if the device supports the use of + ///< command-buffers. + UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP = 0x1001, ///< [::ur_bool_t] Returns true if the device supports updating the kernel + ///< commands in a command-buffer. 
+ UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP = 0x2000, ///< [::ur_bool_t] returns true if the device supports the creation of + ///< bindless images + UR_DEVICE_INFO_BINDLESS_IMAGES_SHARED_USM_SUPPORT_EXP = 0x2001, ///< [::ur_bool_t] returns true if the device supports the creation of + ///< bindless images backed by shared USM + UR_DEVICE_INFO_BINDLESS_IMAGES_1D_USM_SUPPORT_EXP = 0x2002, ///< [::ur_bool_t] returns true if the device supports the creation of 1D + ///< bindless images backed by USM + UR_DEVICE_INFO_BINDLESS_IMAGES_2D_USM_SUPPORT_EXP = 0x2003, ///< [::ur_bool_t] returns true if the device supports the creation of 2D + ///< bindless images backed by USM + UR_DEVICE_INFO_IMAGE_PITCH_ALIGN_EXP = 0x2004, ///< [uint32_t] returns the required alignment of the pitch between two + ///< rows of an image in bytes + UR_DEVICE_INFO_MAX_IMAGE_LINEAR_WIDTH_EXP = 0x2005, ///< [size_t] returns the maximum linear width allowed for images allocated + ///< using USM + UR_DEVICE_INFO_MAX_IMAGE_LINEAR_HEIGHT_EXP = 0x2006, ///< [size_t] returns the maximum linear height allowed for images + ///< allocated using USM + UR_DEVICE_INFO_MAX_IMAGE_LINEAR_PITCH_EXP = 0x2007, ///< [size_t] returns the maximum linear pitch allowed for images allocated + ///< using USM + UR_DEVICE_INFO_MIPMAP_SUPPORT_EXP = 0x2008, ///< [::ur_bool_t] returns true if the device supports allocating mipmap + ///< resources + UR_DEVICE_INFO_MIPMAP_ANISOTROPY_SUPPORT_EXP = 0x2009, ///< [::ur_bool_t] returns true if the device supports sampling mipmap + ///< images with anisotropic filtering + UR_DEVICE_INFO_MIPMAP_MAX_ANISOTROPY_EXP = 0x200A, ///< [uint32_t] returns the maximum anisotropic ratio supported by the + ///< device + UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP = 0x200B, ///< [::ur_bool_t] returns true if the device supports using images created + ///< from individual mipmap levels + UR_DEVICE_INFO_INTEROP_MEMORY_IMPORT_SUPPORT_EXP = 0x200C, ///< [::ur_bool_t] returns true if the device 
supports importing external + ///< memory resources + UR_DEVICE_INFO_INTEROP_MEMORY_EXPORT_SUPPORT_EXP = 0x200D, ///< [::ur_bool_t] returns true if the device supports exporting internal + ///< memory resources + UR_DEVICE_INFO_INTEROP_SEMAPHORE_IMPORT_SUPPORT_EXP = 0x200E, ///< [::ur_bool_t] returns true if the device supports importing external + ///< semaphore resources + UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP = 0x200F, ///< [::ur_bool_t] returns true if the device supports exporting internal + ///< event resources + UR_DEVICE_INFO_CUBEMAP_SUPPORT_EXP = 0x2010, ///< [::ur_bool_t] returns true if the device supports allocating and + ///< accessing cubemap resources + UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP = 0x2011, ///< [::ur_bool_t] returns true if the device supports sampling cubemapped + ///< images across face boundaries + UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP = 0x2012, ///< [::ur_bool_t] returns true if the device is capable of fetching USM + ///< backed 1D sampled image data. + UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP = 0x2013, ///< [::ur_bool_t] returns true if the device is capable of fetching + ///< non-USM backed 1D sampled image data. + UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP = 0x2014, ///< [::ur_bool_t] returns true if the device is capable of fetching USM + ///< backed 2D sampled image data. + UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP = 0x2015, ///< [::ur_bool_t] returns true if the device is capable of fetching + ///< non-USM backed 2D sampled image data. + UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP = 0x2016, ///< [::ur_bool_t] returns true if the device is capable of fetching USM + ///< backed 3D sampled image data. + UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP = 0x2017, ///< [::ur_bool_t] returns true if the device is capable of fetching + ///< non-USM backed 3D sampled image data. 
+ UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP = 0x2018, ///< [::ur_bool_t] returns true if the device supports timestamp recording /// @cond UR_DEVICE_INFO_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1654,7 +1669,7 @@ typedef enum ur_device_info_t { /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP < propName` +/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -4757,7 +4772,8 @@ typedef enum ur_kernel_group_info_t { UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE = 0, ///< [size_t[3]] Return Work Group maximum global size UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE = 1, ///< [size_t] Return maximum Work Group size UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE = 2, ///< [size_t[3]] Return Work Group size required by the source code, such - ///< as __attribute__((required_work_group_size(X,Y,Z)) + ///< as __attribute__((required_work_group_size(X,Y,Z)), or (0, 0, 0) if + ///< unspecified UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE = 3, ///< [size_t] Return local memory required by the Kernel UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE = 4, ///< [size_t] Return preferred multiple of Work Group size for launch UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE = 5, ///< [size_t] Return minimum amount of private memory in bytes used by each @@ -4773,7 +4789,8 @@ typedef enum ur_kernel_group_info_t { typedef enum ur_kernel_sub_group_info_t { UR_KERNEL_SUB_GROUP_INFO_MAX_SUB_GROUP_SIZE = 0, ///< [uint32_t] Return maximum SubGroup size UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS = 1, ///< [uint32_t] Return maximum number of SubGroup - UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS = 2, ///< [uint32_t] Return number of SubGroup required by the source code + UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS = 2, ///< 
[uint32_t] Return number of SubGroup required by the source code or 0 + ///< if unspecified UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL = 3, ///< [uint32_t] Return SubGroup size required by Intel /// @cond UR_KERNEL_SUB_GROUP_INFO_FORCE_UINT32 = 0x7fffffff @@ -5606,6 +5623,7 @@ typedef enum ur_command_t { UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP = 0x1000, ///< Event created by ::urCommandBufferEnqueueExp UR_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP = 0x2000, ///< Event created by ::urBindlessImagesWaitExternalSemaphoreExp UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP = 0x2001, ///< Event created by ::urBindlessImagesSignalExternalSemaphoreExp + UR_COMMAND_TIMESTAMP_RECORDING_EXP = 0x2002, ///< Event created by ::urEnqueueTimestampRecordingExp /// @cond UR_COMMAND_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -5973,6 +5991,7 @@ urEventSetCallback( /// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION /// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE /// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGS - "The kernel argument values have not been specified." 
/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES UR_APIEXPORT ur_result_t UR_APICALL @@ -7535,7 +7554,6 @@ urBindlessImagesImageFreeExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pImageFormat` /// + `NULL == pImageDesc` -/// + `NULL == phMem` /// + `NULL == phImage` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE @@ -7550,7 +7568,6 @@ urBindlessImagesUnsampledImageCreateExp( ur_exp_image_mem_handle_t hImageMem, ///< [in] handle to memory from which to create the image const ur_image_format_t *pImageFormat, ///< [in] pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description - ur_mem_handle_t *phMem, ///< [out] pointer to handle of image object created ur_exp_image_handle_t *phImage ///< [out] pointer to handle of image object created ); @@ -7574,7 +7591,6 @@ urBindlessImagesUnsampledImageCreateExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pImageFormat` /// + `NULL == pImageDesc` -/// + `NULL == phMem` /// + `NULL == phImage` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE @@ -7591,7 +7607,6 @@ urBindlessImagesSampledImageCreateExp( const ur_image_format_t *pImageFormat, ///< [in] pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description ur_sampler_handle_t hSampler, ///< [in] sampler to be used - ur_mem_handle_t *phMem, ///< [out] pointer to handle of image object created ur_exp_image_handle_t *phImage ///< [out] pointer to handle of image object created ); @@ -8878,6 +8893,48 @@ urKernelSuggestMaxCooperativeGroupCountExp( uint32_t *pGroupCountRet ///< [out] pointer to maximum number of groups ); +#if !defined(__GNUC__) +#pragma endregion +#endif +// Intel 'oneAPI' Unified Runtime Experimental APIs for enqueuing timestamp recordings +#if !defined(__GNUC__) +#pragma region enqueue timestamp recording(experimental) +#endif 
+/////////////////////////////////////////////////////////////////////////////// +/// @brief Enqueue a command for recording the device timestamp +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == phEvent` +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +UR_APIEXPORT ur_result_t UR_APICALL +urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool blocking, ///< [in] indicates whether the call to this function should block until + ///< the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t *phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t *phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device.
+); + #if !defined(__GNUC__) #pragma endregion #endif @@ -10588,6 +10645,18 @@ typedef struct ur_enqueue_cooperative_kernel_launch_exp_params_t { ur_event_handle_t **pphEvent; } ur_enqueue_cooperative_kernel_launch_exp_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urEnqueueTimestampRecordingExp +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_enqueue_timestamp_recording_exp_params_t { + ur_queue_handle_t *phQueue; + bool *pblocking; + uint32_t *pnumEventsInWaitList; + const ur_event_handle_t **pphEventWaitList; + ur_event_handle_t **pphEvent; +} ur_enqueue_timestamp_recording_exp_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urBindlessImagesUnsampledImageHandleDestroyExp /// @details Each entry is a pointer to the parameter passed to the function; @@ -10640,7 +10709,6 @@ typedef struct ur_bindless_images_unsampled_image_create_exp_params_t { ur_exp_image_mem_handle_t *phImageMem; const ur_image_format_t **ppImageFormat; const ur_image_desc_t **ppImageDesc; - ur_mem_handle_t **pphMem; ur_exp_image_handle_t **pphImage; } ur_bindless_images_unsampled_image_create_exp_params_t; @@ -10655,7 +10723,6 @@ typedef struct ur_bindless_images_sampled_image_create_exp_params_t { const ur_image_format_t **ppImageFormat; const ur_image_desc_t **ppImageDesc; ur_sampler_handle_t *phSampler; - ur_mem_handle_t **pphMem; ur_exp_image_handle_t **pphImage; } ur_bindless_images_sampled_image_create_exp_params_t; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index 5a069f0881..52ae226c2a 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -7,7 +7,7 @@ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * * @file ur_ddi.h - * @version v0.9-r0 + * @version v0.10-r0 * */ #ifndef UR_DDI_H_INCLUDED @@ 
-1448,10 +1448,20 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueCooperativeKernelLaunchExp_t)( const ur_event_handle_t *, ur_event_handle_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urEnqueueTimestampRecordingExp +typedef ur_result_t(UR_APICALL *ur_pfnEnqueueTimestampRecordingExp_t)( + ur_queue_handle_t, + bool, + uint32_t, + const ur_event_handle_t *, + ur_event_handle_t *); + /////////////////////////////////////////////////////////////////////////////// /// @brief Table of EnqueueExp functions pointers typedef struct ur_enqueue_exp_dditable_t { ur_pfnEnqueueCooperativeKernelLaunchExp_t pfnCooperativeKernelLaunchExp; + ur_pfnEnqueueTimestampRecordingExp_t pfnTimestampRecordingExp; } ur_enqueue_exp_dditable_t; /////////////////////////////////////////////////////////////////////////////// @@ -1513,7 +1523,6 @@ typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesUnsampledImageCreateExp_t)( ur_exp_image_mem_handle_t, const ur_image_format_t *, const ur_image_desc_t *, - ur_mem_handle_t *, ur_exp_image_handle_t *); /////////////////////////////////////////////////////////////////////////////// @@ -1525,7 +1534,6 @@ typedef ur_result_t(UR_APICALL *ur_pfnBindlessImagesSampledImageCreateExp_t)( const ur_image_format_t *, const ur_image_desc_t *, ur_sampler_handle_t, - ur_mem_handle_t *, ur_exp_image_handle_t *); /////////////////////////////////////////////////////////////////////////////// diff --git a/include/ur_print.h b/include/ur_print.h index 3126c1714b..3377980ce7 100644 --- a/include/ur_print.h +++ b/include/ur_print.h @@ -1954,6 +1954,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueWriteHostPipeParams(const stru /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueCooperativeKernelLaunchExpParams(const struct ur_enqueue_cooperative_kernel_launch_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); 
+/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_enqueue_timestamp_recording_exp_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintEnqueueTimestampRecordingExpParams(const struct ur_enqueue_timestamp_recording_exp_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_bindless_images_unsampled_image_handle_destroy_exp_params_t struct /// @returns diff --git a/include/ur_print.hpp b/include/ur_print.hpp index a5074c5da1..bed5f01670 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -7,7 +7,7 @@ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * * @file ur_print.hpp - * @version v0.9-r0 + * @version v0.10-r0 * */ #ifndef UR_PRINT_HPP @@ -916,6 +916,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP: os << "UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP"; break; + case UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP: + os << "UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP"; + break; default: os << "unknown enumerator"; break; @@ -2553,6 +2556,27 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_device_info_t value) { case UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP: os << "UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP"; break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP: + os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP"; + break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP: + os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP"; + break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP: + os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP"; + break; + case 
UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP: + os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP"; + break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP: + os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP"; + break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP: + os << "UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP"; + break; + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: + os << "UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP"; + break; default: os << "unknown enumerator"; break; @@ -4190,6 +4214,90 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_device_info os << ")"; } break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return 
UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { + const ur_bool_t *tptr = (const ur_bool_t *)ptr; + if (sizeof(ur_bool_t) > size) { + os << "invalid size (is: " << size << ", expected: >=" << sizeof(ur_bool_t) << ")"; + return UR_RESULT_ERROR_INVALID_SIZE; + } + os << (const void *)(tptr) << " ("; + + os << *tptr; + + os << ")"; + } break; default: os << "unknown enumerator"; return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -8698,6 +8806,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_command_t value) { case UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP: os << "UR_COMMAND_INTEROP_SEMAPHORE_SIGNAL_EXP"; break; + case UR_COMMAND_TIMESTAMP_RECORDING_EXP: + os << "UR_COMMAND_TIMESTAMP_RECORDING_EXP"; + break; default: os << "unknown enumerator"; break; @@ -14014,6 +14125,48 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_enqueue_timestamp_recording_exp_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const 
struct ur_enqueue_timestamp_recording_exp_params_t *params) { + + os << ".hQueue = "; + + ur::details::printPtr(os, + *(params->phQueue)); + + os << ", "; + os << ".blocking = "; + + os << *(params->pblocking); + + os << ", "; + os << ".numEventsInWaitList = "; + + os << *(params->pnumEventsInWaitList); + + os << ", "; + os << ".phEventWaitList = {"; + for (size_t i = 0; *(params->pphEventWaitList) != NULL && i < *params->pnumEventsInWaitList; ++i) { + if (i != 0) { + os << ", "; + } + + ur::details::printPtr(os, + (*(params->pphEventWaitList))[i]); + } + os << "}"; + + os << ", "; + os << ".phEvent = "; + + ur::details::printPtr(os, + *(params->pphEvent)); + + return os; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_bindless_images_unsampled_image_handle_destroy_exp_params_t type /// @returns @@ -14165,12 +14318,6 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->ppImageDesc)); - os << ", "; - os << ".phMem = "; - - ur::details::printPtr(os, - *(params->pphMem)); - os << ", "; os << ".phImage = "; @@ -14221,12 +14368,6 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->phSampler)); - os << ", "; - os << ".phMem = "; - - ur::details::printPtr(os, - *(params->pphMem)); - os << ", "; os << ".phImage = "; @@ -17036,6 +17177,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_ case UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP: { os << (const struct ur_enqueue_cooperative_kernel_launch_exp_params_t *)params; } break; + case UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP: { + os << (const struct ur_enqueue_timestamp_recording_exp_params_t *)params; + } break; case UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_HANDLE_DESTROY_EXP: { os << (const struct ur_bindless_images_unsampled_image_handle_destroy_exp_params_t *)params; } 
break; diff --git a/scripts/Doxyfile b/scripts/Doxyfile index 0329aeffff..64f0ac58a6 100644 --- a/scripts/Doxyfile +++ b/scripts/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "Intel One API Unified Runtime API" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = v0.9 +PROJECT_NUMBER = v0.10 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/scripts/core/EXP-BINDLESS-IMAGES.rst b/scripts/core/EXP-BINDLESS-IMAGES.rst index af90c1ea0f..ee54c69291 100644 --- a/scripts/core/EXP-BINDLESS-IMAGES.rst +++ b/scripts/core/EXP-BINDLESS-IMAGES.rst @@ -91,6 +91,12 @@ Enums * ${X}_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP * ${X}_DEVICE_INFO_CUBEMAP_SUPPORT_EXP * ${X}_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP + * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP + * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP + * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP + * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP + * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP + * ${X}_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP * ${x}_command_t * ${X}_COMMAND_INTEROP_SEMAPHORE_WAIT_EXP @@ -198,6 +204,8 @@ Changelog +------------------------------------------------------------------------+ | 10.0 | Added cubemap image type, sampling properties, and device | | | queries. | ++------------------------------------------------------------------------+ +| 11.0 | Added device queries for sampled image fetch capabilities. 
| +----------+-------------------------------------------------------------+ Contributors diff --git a/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst b/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst new file mode 100644 index 0000000000..3cf46095b2 --- /dev/null +++ b/scripts/core/EXP-ENQUEUE-TIMESTAMP-RECORDING.rst @@ -0,0 +1,70 @@ +<% + OneApi=tags['$OneApi'] + x=tags['$x'] + X=x.upper() +%> + +.. _experimental-enqueue-timestamp-recording: + +================================================================================ +Enqueue Timestamp Recording +================================================================================ + +.. warning:: + + Experimental features: + + * May be replaced, updated, or removed at any time. + * Do not require maintaining API/ABI stability of their own additions over + time. + * Do not require conformance testing of their own additions. + + +Motivation +-------------------------------------------------------------------------------- +Currently, the only way to get timestamp information is through enabling +profiling on a queue and retrieving the information from events coming from +commands submitted to it. However, not all systems give full control of the +queue construction to the programmer wanting the profiling information. To amend +this, this extension adds the ability to enqueue a timestamp recording on any +queue, with or without profiling enabled. This event can in turn be queried for +the usual profiling information. 
+ + +API +-------------------------------------------------------------------------------- + +Enums +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* ${x}_device_info_t + * ${X}_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP + +* ${x}_command_t + * ${X}_COMMAND_TIMESTAMP_RECORDING_EXP + +Functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* ${x}EnqueueTimestampRecordingExp + +Changelog +-------------------------------------------------------------------------------- + ++-----------+------------------------+ +| Revision | Changes | ++===========+========================+ +| 1.0 | Initial Draft | ++-----------+------------------------+ + + +Support +-------------------------------------------------------------------------------- + +Adapters which support this experimental feature *must* return true for the new +`${X}_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP` device info query. + + +Contributors +-------------------------------------------------------------------------------- + +* Steffen Larsen `steffen.larsen@intel.com `_ diff --git a/scripts/core/INTRO.rst b/scripts/core/INTRO.rst index 52e78153ae..a81c282070 100644 --- a/scripts/core/INTRO.rst +++ b/scripts/core/INTRO.rst @@ -182,6 +182,8 @@ Unified Runtime loader implements tracing support through the `XPTI framework 0`" + - "`phEventWaitList != NULL && numEventsInWaitList == 0`" diff --git a/scripts/core/kernel.yml b/scripts/core/kernel.yml index 4ce4f9c70a..4a0bf0bab1 100644 --- a/scripts/core/kernel.yml +++ b/scripts/core/kernel.yml @@ -135,7 +135,7 @@ etors: - name: WORK_GROUP_SIZE desc: "[size_t] Return maximum Work Group size" - name: COMPILE_WORK_GROUP_SIZE - desc: "[size_t[3]] Return Work Group size required by the source code, such as __attribute__((required_work_group_size(X,Y,Z))" + desc: "[size_t[3]] Return Work Group size required by the source code, such as __attribute__((required_work_group_size(X,Y,Z)), or (0, 0, 0) if 
unspecified" - name: LOCAL_MEM_SIZE desc: "[size_t] Return local memory required by the Kernel" - name: PREFERRED_WORK_GROUP_SIZE_MULTIPLE @@ -154,7 +154,7 @@ etors: - name: MAX_NUM_SUB_GROUPS desc: "[uint32_t] Return maximum number of SubGroup" - name: COMPILE_NUM_SUB_GROUPS - desc: "[uint32_t] Return number of SubGroup required by the source code" + desc: "[uint32_t] Return number of SubGroup required by the source code or 0 if unspecified" - name: SUB_GROUP_SIZE_INTEL desc: "[uint32_t] Return SubGroup size required by Intel" --- #-------------------------------------------------------------------------- diff --git a/scripts/core/platform.yml b/scripts/core/platform.yml index 510a7e16b3..c35b71622d 100644 --- a/scripts/core/platform.yml +++ b/scripts/core/platform.yml @@ -140,6 +140,9 @@ etors: - name: "0_9" value: "$X_MAKE_VERSION( 0, 9 )" desc: "version 0.9" + - name: "0_10" + value: "$X_MAKE_VERSION( 0, 10 )" + desc: "version 0.10" --- #-------------------------------------------------------------------------- type: function desc: "Returns the API version supported by the specified platform" diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml index 5ae70f7bd1..6a551d5821 100644 --- a/scripts/core/registry.yml +++ b/scripts/core/registry.yml @@ -580,6 +580,9 @@ etors: - name: COMMAND_BUFFER_COMMAND_GET_INFO_EXP desc: Enumerator for $xCommandBufferCommandGetInfoExp value: '222' +- name: ENQUEUE_TIMESTAMP_RECORDING_EXP + desc: Enumerator for $xEnqueueTimestampRecordingExp + value: '223' --- type: enum desc: Defines structure types diff --git a/scripts/ctest_parser.py b/scripts/ctest_parser.py index bd1a84f3ee..5074a5c9b9 100755 --- a/scripts/ctest_parser.py +++ b/scripts/ctest_parser.py @@ -24,7 +24,7 @@ def get_cts_test_suite_names(working_directory): ] def percent(amount, total): - return round((amount / total) * 100, 2) + return round((amount / (total or 1)) * 100, 2) def summarize_results(results): total = results['Total'] diff --git 
a/scripts/generate_docs.py b/scripts/generate_docs.py index 5c14305898..9b29123e40 100644 --- a/scripts/generate_docs.py +++ b/scripts/generate_docs.py @@ -10,6 +10,7 @@ import subprocess import util import re +from parse_specs import Version RE_ENABLE = r"^\#\#\s*\-\-validate\s*\=\s*on$" RE_DISABLE = r"^\#\#\s*\-\-validate\s*\=\s*off$" @@ -87,7 +88,7 @@ def _make_ref(symbol, symbol_type, meta): generate a valid reStructuredText file """ def _generate_valid_rst(fin, fout, namespace, tags, ver, rev, meta, fast_mode): - ver=float(ver) + ver = Version(ver) enable = True code_block = False diff --git a/scripts/generate_kernel_header.py b/scripts/generate_kernel_header.py index f765189334..46e78d6ed7 100755 --- a/scripts/generate_kernel_header.py +++ b/scripts/generate_kernel_header.py @@ -12,9 +12,7 @@ import subprocess import sys -from mako.template import Template - -HEADER_TEMPLATE = Template("""/* +HEADER_TEMPLATE = """/* * * Copyright (C) 2023 Intel Corporation * @@ -22,7 +20,7 @@ * See LICENSE.TXT * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * - * @file ${file_name}.h + * @file %s.h * */ @@ -33,66 +31,63 @@ namespace uur { namespace device_binaries { std::map> program_kernel_map = { -% for program, entry_points in kernel_name_dict.items(): - {"${program}", { - % for entry_point in entry_points: - "${entry_point}", - % endfor - }}, -% endfor +%s }; } } -""") +""" +PROGRAM_TEMPLATE = """\ + {"%s", { +%s + }}, +""" + +ENTRY_POINT_TEMPLATE = """\ + "%s", +""" def generate_header(output_file, kernel_name_dict): """Render the template and write it to the output file.""" file_name = os.path.basename(output_file) - rendered = HEADER_TEMPLATE.render(file_name=file_name, - kernel_name_dict=kernel_name_dict) + device_binaries = "" + for program, entry_points in kernel_name_dict.items(): + content = "" + for entry_point in entry_points: + content += ENTRY_POINT_TEMPLATE % entry_point + device_binaries += PROGRAM_TEMPLATE % (program, content) + rendered = 
HEADER_TEMPLATE % (file_name, device_binaries) rendered = re.sub(r"\r\n", r"\n", rendered) - with open(output_file, "w") as fout: fout.write(rendered) -def get_mangled_names(dpcxx_path, source_file, output_header): +def get_mangled_names(source_file, output_header): """Return a list of all the entry point names from a given sycl source file. Filters out wrapper and offset handler entry points. """ output_dir = os.path.dirname(output_header) - il_file = os.path.join(output_dir, os.path.basename(source_file) + ".ll") - generate_il_command = f"""\ - {dpcxx_path} -S -fsycl -fsycl-device-code-split=off \ - -fsycl-device-only -o {il_file} {source_file}""" - subprocess.run(generate_il_command, shell=True) - kernel_line_regex = re.compile("define.*spir_kernel") - definition_lines = [] - with open(il_file) as f: + name = os.path.splitext(os.path.basename(source_file))[0] + ih_file = os.path.join(output_dir, name, name + ".ih") + definitions = [] + writing = False + with open(ih_file) as f: lines = f.readlines() for line in lines: - if kernel_line_regex.search(line) is not None: - definition_lines.append(line) - - entry_point_names = [] - kernel_name_regex = re.compile(r"@(.*?)\(") - for line in definition_lines: - if kernel_name_regex.search(line) is None: - continue - kernel_name = kernel_name_regex.search(line).group(1) - if "kernel_wrapper" not in kernel_name and "with_offset" not in kernel_name: - entry_point_names.append(kernel_name) + if "}" in line and writing: + break + # __pf_kernel_wrapper seems to be an internal function used by dpcpp + if writing and "19__pf_kernel_wrapper" not in line: + definitions.append(line.replace(",", "").strip()[1:-1]) + if "const char* const kernel_names[] = {" in line: + writing = True - os.remove(il_file) - return entry_point_names + return definitions def main(): parser = argparse.ArgumentParser() - parser.add_argument("--dpcxx_path", - help="Full path to dpc++ compiler executable.") parser.add_argument( "-o", "--output", @@ -105,7 
+100,7 @@ def main(): for source_file in args.source_files: program_name = os.path.splitext(os.path.basename(source_file))[0] mangled_names[program_name] = get_mangled_names( - args.dpcxx_path, source_file, args.output) + source_file, args.output) generate_header(args.output, mangled_names) diff --git a/scripts/parse_specs.py b/scripts/parse_specs.py index 1bff8887f4..c54e0adbf8 100644 --- a/scripts/parse_specs.py +++ b/scripts/parse_specs.py @@ -6,20 +6,23 @@ SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception """ + import os import generate_ids import util import re import hashlib import json -import yaml import copy from templates.helper import param_traits, type_traits, value_traits import ctypes import itertools +from typing import Dict, List, Optional +from version import Version + -default_version = "0.9" -all_versions = ["0.6", "0.7", "0.8", "0.9"] +default_version = Version("0.10") +all_versions = [Version(ver) for ver in ["0.6", "0.7", "0.8", "0.9", "0.10"]] """ preprocess object @@ -105,14 +108,14 @@ def __validate_version(d, prefix="", base_version=default_version): raise Exception(prefix+"'version' must be a string: '%s'"%type(d['version'])) try: - version = str(float(d['version'])) + version = str(d['version']) except: version = None if version != d['version']: raise Exception(prefix+"'version' invalid value: '%s'"%d['version']) - return float(d.get('version', base_version)) + return Version(d.get('version', base_version)) def __validate_tag(d, key, tags, case): for x in tags: @@ -127,7 +130,7 @@ def __validate_desc(desc): raise Exception(prefix+"'version' must be a string: '%s'"%type(k)) try: - version = str(float(k)) + version = str(k) except: version = None @@ -212,8 +215,8 @@ def __validate_etors(d, tags): typed = d.get('typed_etors', False) value = -1 - d_ver = d.get('version', default_version) - max_ver = float(d_ver) + d_ver = Version(d.get('version', default_version)) + max_ver = d_ver for i, item in enumerate(d['etors']): 
prefix="'etors'[%s] "%i if not isinstance(item, dict): @@ -300,8 +303,8 @@ def __validate_members(d, tags, meta): if not isinstance(d['members'], list): raise Exception("'members' must be a sequence: '%s'"%type(d['members'])) - d_ver = d.get('version', default_version) - max_ver = float(d_ver) + d_ver = Version(d.get('version', default_version)) + max_ver = d_ver for i, item in enumerate(d['members']): prefix="'members'[%s] "%i if not isinstance(item, dict): @@ -342,8 +345,8 @@ def __validate_params(d, tags, meta): if not isinstance(d['params'], list): raise Exception("'params' must be a sequence: '%s'"%type(d['params'])) - d_ver = d.get('version', default_version) - max_ver = float(d_ver) + d_ver = Version(d.get('version', default_version)) + max_ver = d_ver min = {'[in]': None, '[out]': None, '[in,out]': None} for i, item in enumerate(d['params']): prefix="'params'[%s] "%i @@ -501,24 +504,23 @@ def __validate_union_tag(d): """ filters object by version """ -def _filter_version(d, max_ver): - ver = float(d.get('version', default_version)) +def _filter_version(d, max_ver: Version) -> Optional[Dict]: + ver = Version(d.get('version', default_version)) if ver > max_ver: return None - def __filter_desc(d): + def __filter_desc(d) -> dict: if 'desc' in d and isinstance(d['desc'], dict): for k, v in d['desc'].items(): - if float(k) <= max_ver: - desc = v - d['desc'] = desc + if Version(k) <= max_ver: + d['desc'] = v return d flt = [] type = d['type'] if 'enum' == type: for e in d['etors']: - ver = float(e.get('version', default_version)) + ver = Version(e.get('version', default_version)) if ver <= max_ver: flt.append(__filter_desc(e)) if d['name'].endswith('version_t'): @@ -531,14 +533,14 @@ def __filter_desc(d): elif 'function' == type: for p in d['params']: - ver = float(p.get('version', default_version)) + ver = Version(p.get('version', default_version)) if ver <= max_ver: flt.append(__filter_desc(p)) d['params'] = flt elif 'struct' == type or 'union' == type or 
'class' == type: for m in d.get('members',[]): - ver = float(m.get('version', default_version)) + ver = Version(m.get('version', default_version)) if ver <= max_ver: flt.append(__filter_desc(m)) d['members'] = flt @@ -548,15 +550,15 @@ def __filter_desc(d): """ creates docs per version """ -def _make_versions(d, max_ver): +def _make_versions(d, max_ver : Version) -> List[Version]: docs = [] type = d['type'] if 'function' == type or 'struct' == type: for ver in all_versions: - if float(ver) > max_ver: + if ver > max_ver: break - dv = _filter_version(copy.deepcopy(d), float(ver)) + dv = _filter_version(copy.deepcopy(d), ver) if not dv: continue @@ -936,7 +938,7 @@ def parse(section, version, tags, meta, ref): if not _validate_doc(f, d, tags, line_nums[i], meta): continue - d = _filter_version(d, float(version)) + d = _filter_version(d, version) if not d: continue @@ -948,7 +950,10 @@ def parse(section, version, tags, meta, ref): # extract header from objects if re.match(r"header", d['type']): header = d - header['ordinal'] = int(int(header.get('ordinal',"1000")) * float(header.get('version',"1.0"))) + header["ordinal"] = int( + int(header.get("ordinal", "1000")) + * Version(header.get("version", "1.0")).major + ) header['ordinal'] *= 1000 if re.match(r"extension", header.get('desc',"").lower()) else 1 header['ordinal'] *= 1000 if re.match(r"experimental", header.get('desc',"").lower()) else 1 basename = os.path.splitext(os.path.basename(f))[0] @@ -961,7 +966,7 @@ def parse(section, version, tags, meta, ref): for c in '_-': name = name.replace(c, ' ') elif header: - for d in _make_versions(d, float(version)): + for d in _make_versions(d, version): objects.append(d) meta = _generate_meta(d, header['ordinal'], meta) diff --git a/scripts/run.py b/scripts/run.py index b5a9ccc8b8..5d9b475007 100755 --- a/scripts/run.py +++ b/scripts/run.py @@ -13,9 +13,12 @@ import parse_specs import generate_code import generate_docs -import os, sys, platform +import os +import sys 
+import platform import time import subprocess +from version import Version """ helper for adding mutually-exclusive boolean arguments "--name" and "--!name" @@ -95,7 +98,7 @@ def get_version_from_cmakelists(): for line in cmakelists_file.readlines(): line = line.strip() if line.startswith('project('): - return re.findall(r'\d+\.\d+', line)[0] + return Version(re.findall(r'\d+\.\d+', line)[0]) raise Exception(f'unable to read project version from {cmakelists_path}') @@ -117,8 +120,13 @@ def main(): add_argument(parser, "pdf", "generation of PDF file.") add_argument(parser, "rst", "generation of reStructuredText files.", True) parser.add_argument("--update_spec", type=str, help="root of integrated spec directory to update") - parser.add_argument("--ver", type=str, default=get_version_from_cmakelists(), - required=False, help="specification version to generate.") + parser.add_argument( + "--ver", + type=parse_specs.Version, + default=get_version_from_cmakelists(), + required=False, + help="specification version to generate.", + ) parser.add_argument("--api-json", type=str, default="unified_runtime.json", required=False, help="json output file for the spec") parser.add_argument("--clang-format", type=str, default="clang-format", required=False, help="path to clang-format executable") parser.add_argument('--fast-mode', action='store_true', help='Disable sections which are slow to render') @@ -136,82 +144,75 @@ def main(): 'meta' : {}, 'ref' : {} } - - try: - for section in configParser.sections(): - input['configs'].append({ - 'name' : section, - 'namespace': configParser.get(section,'namespace'), - 'tags' : {'$'+key : configParser.get(section,key) for key in configParser.get(section,'tags').split(",")}, - }) - - # phase 2: parse specs - for config in input['configs']: - specs, input['meta'], input['ref'] = parse_specs.parse(config['name'], args['ver'], config['tags'], input['meta'], input['ref']) - input['specs'].append(specs) + for section in configParser.sections(): 
+ input['configs'].append({ + 'name' : section, + 'namespace': configParser.get(section,'namespace'), + 'tags' : {'$'+key : configParser.get(section,key) for key in configParser.get(section,'tags').split(",")}, + }) - util.jsonWrite(args['api_json'], input) + # phase 2: parse specs + for config in input['configs']: + specs, input['meta'], input['ref'] = parse_specs.parse(config['name'], args['ver'], config['tags'], input['meta'], input['ref']) + input['specs'].append(specs) - # phase 3: generate files - if args['clean']: - clean() + util.jsonWrite(args['api_json'], input) - incpath = os.path.join("../include/") - srcpath = os.path.join("../source/") - docpath = os.path.join("../docs/") + # phase 3: generate files + if args['clean']: + clean() - generate_docs.prepare(docpath, args['rst'], args['html'], args['ver']) + incpath = os.path.join("../include/") + srcpath = os.path.join("../source/") + docpath = os.path.join("../docs/") - for idx, specs in enumerate(input['specs']): - config = input['configs'][idx] - if args[config['name']]: + generate_docs.prepare(docpath, args['rst'], args['html'], args['ver']) - generate_code.generate_api(incpath, srcpath, config['namespace'], config['tags'], args['ver'], args['rev'], specs, input['meta']) + for idx, specs in enumerate(input['specs']): + config = input['configs'][idx] + if args[config['name']]: - # clang-format ur_api.h - proc = subprocess.run([args['clang_format'], "--style=file", "-i" , "ur_api.h"], stderr=subprocess.PIPE, cwd=incpath) - if proc.returncode != 0: - print("-- clang-format failed with non-zero return code. 
--") - print(proc.stderr.decode()) - raise Exception("Failed to format ur_api.h") + generate_code.generate_api(incpath, srcpath, config['namespace'], config['tags'], args['ver'], args['rev'], specs, input['meta']) - if args['rst']: - generate_docs.generate_rst(docpath, config['name'], config['namespace'], config['tags'], args['ver'], args['rev'], specs, input['meta'], args['fast_mode']) + # clang-format ur_api.h + proc = subprocess.run([args['clang_format'], "--style=file", "-i" , "ur_api.h"], stderr=subprocess.PIPE, cwd=incpath) + if proc.returncode != 0: + print("-- clang-format failed with non-zero return code. --") + print(proc.stderr.decode()) + raise Exception("Failed to format ur_api.h") - if util.makeErrorCount(): - print("\n%s Errors found during generation, stopping execution!"%util.makeErrorCount()) - return + if args['rst']: + generate_docs.generate_rst(docpath, config['name'], config['namespace'], config['tags'], args['ver'], args['rev'], specs, input['meta'], args['fast_mode']) - if args['debug']: - util.makoFileListWrite("generated.json") + if util.makeErrorCount(): + print("\n%s Errors found during generation, stopping execution!"%util.makeErrorCount()) + return - # phase 4: build code - if args['build']: - if not build(): - print("\nBuild failed, stopping execution!") - return + if args['debug']: + util.makoFileListWrite("generated.json") - # phase 5: prep for publication of html or pdf - if args['html'] or args['pdf']: - generate_docs.generate_common(docpath, configParser.sections(), args['ver'], args['rev']) + # phase 4: build code + if args['build']: + if not build(): + print("\nBuild failed, stopping execution!") + return - # phase 5: publish documentation - if args['html']: - generate_docs.generate_html(docpath) + # phase 5: prep for publication of html or pdf + if args['html'] or args['pdf']: + generate_docs.generate_common(docpath, configParser.sections(), args['ver'], args['rev']) - if args['pdf']: - generate_docs.generate_pdf(docpath) + # 
phase 5: publish documentation + if args['html']: + generate_docs.generate_html(docpath) - if args['update_spec']: - update_spec(args['update_spec']) + if args['pdf']: + generate_docs.generate_pdf(docpath) - print("\nCompleted in %.1f seconds!"%(time.time() - start)) + if args['update_spec']: + update_spec(args['update_spec']) - except BaseException as e: - print("Failed to generate specification.") - print(e) - return sys.exit(1) + print("\nCompleted in %.1f seconds!"%(time.time() - start)) if __name__ == '__main__': diff --git a/scripts/templates/helper.py b/scripts/templates/helper.py index 3b8e5693b7..1d539d70fe 100644 --- a/scripts/templates/helper.py +++ b/scripts/templates/helper.py @@ -12,6 +12,7 @@ # allow imports from top-level scripts directory sys.path.append("..") +from version import Version """ Extracts traits from a spec object @@ -1081,15 +1082,20 @@ def make_param_checks(namespace, tags, obj, cpp=False, meta=None): def get_class_function_objs(specs, cname, version = None): objects = [] for s in specs: - for obj in s['objects']: + for obj in s["objects"]: is_function = obj_traits.is_function(obj) match_cls = cname == obj_traits.class_name(obj) if is_function and match_cls: if version is None: objects.append(obj) - elif float(obj.get('version',"1.0")) <= version: + elif Version(obj.get("version", "1.0")) <= version: objects.append(obj) - return sorted(objects, key=lambda obj: (float(obj.get('version',"1.0"))*10000) + int(obj.get('ordinal',"100"))) + return sorted( + objects, + key=lambda obj: (Version(obj.get("version", "1.0")).major * 10000) + + int(obj.get("ordinal", "100")), + ) + """ Public: @@ -1107,8 +1113,16 @@ def get_class_function_objs_exp(specs, cname): exp_objects.append(obj) else: objects.append(obj) - objects = sorted(objects, key=lambda obj: (float(obj.get('version',"1.0"))*10000) + int(obj.get('ordinal',"100"))) - exp_objects = sorted(exp_objects, key=lambda obj: (float(obj.get('version',"1.0"))*10000) + 
int(obj.get('ordinal',"100"))) + objects = sorted( + objects, + key=lambda obj: (Version(obj.get("version", "1.0")).major * 10000) + + int(obj.get("ordinal", "100")), + ) + exp_objects = sorted( + exp_objects, + key=lambda obj: (Version(obj.get("version", "1.0")).major * 10000) + + int(obj.get("ordinal", "100")), + ) return objects, exp_objects """ @@ -1222,7 +1236,7 @@ def _filter_param_list(params, filters1=["[in]", "[in,out]", "[out]"], filters2= def get_pfncbtables(specs, meta, namespace, tags): tables = [] for cname in sorted(meta['class'], key=lambda x: meta['class'][x]['ordinal']): - objs = get_class_function_objs(specs, cname, 1.0) + objs = get_class_function_objs(specs, cname, Version('1.0')) if len(objs) > 0: name = get_table_name(namespace, tags, {'class': cname}) print(name) diff --git a/scripts/templates/trcddi.cpp.mako b/scripts/templates/trcddi.cpp.mako index 2ace43072b..6f6579d5ac 100644 --- a/scripts/templates/trcddi.cpp.mako +++ b/scripts/templates/trcddi.cpp.mako @@ -45,10 +45,16 @@ namespace ur_tracing_layer ${th.make_pfncb_param_type(n, tags, obj)} params = { &${",&".join(th.make_param_lines(n, tags, obj, format=["name"]))} }; uint64_t instance = context.notify_begin(${th.make_func_etor(n, tags, obj)}, "${th.make_func_name(n, tags, obj)}", ¶ms); + context.logger.info("---> ${th.make_func_name(n, tags, obj)}"); + ${x}_result_t result = ${th.make_pfn_name(n, tags, obj)}( ${", ".join(th.make_param_lines(n, tags, obj, format=["name"]))} ); context.notify_end(${th.make_func_etor(n, tags, obj)}, "${th.make_func_name(n, tags, obj)}", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, ${th.make_func_etor(n, tags, obj)}, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } %if 'condition' in obj: @@ -112,6 +118,10 @@ namespace ur_tracing_layer return result; } + // Recreate the logger in case env variables have been modified between + // program launch and the call to 
`urLoaderInit` + logger = logger::create_logger("tracing", true, true); + ur_tracing_layer::context.codelocData = codelocData; %for tbl in th.get_pfntables(specs, meta, n, tags): diff --git a/scripts/version.py b/scripts/version.py new file mode 100644 index 0000000000..8fa72d65a7 --- /dev/null +++ b/scripts/version.py @@ -0,0 +1,41 @@ +""" +Copyright (C) 2024 Intel Corporation + +Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +See LICENSE.TXT +SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +""" + +import functools +import re + + +@functools.total_ordering +class Version: + def __init__(self, version: str): + assert isinstance(version, (str, Version)) + if isinstance(version, str): + self.str = version + match = re.match(r"^(\d+)\.(\d+)$", self.str) + assert isinstance(match, re.Match) + self.major = int(match.groups()[0]) + self.minor = int(match.groups()[1]) + else: + self.str = version.str + self.major = version.major + self.minor = version.minor + + def __eq__(self, other) -> bool: + assert isinstance(other, Version) + return self.major == other.major and self.minor == other.minor + + def __lt__(self, other) -> bool: + if not isinstance(other, Version): + import ipdb; ipdb.set_trace() + return self.major < other.major or ( + self.major == other.major and self.minor < other.minor + ) + + def __str__(self) -> str: + return self.str diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index 8f1ede3010..8d21a93c75 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -203,9 +203,10 @@ static ur_result_t enqueueCommandBufferFillHelper( } } - UR_CHECK_ERROR(cuGraphAddMemsetNode( - &GraphNode, CommandBuffer->CudaGraph, DepsList.data(), - DepsList.size(), &NodeParams, CommandBuffer->Device->getContext())); + UR_CHECK_ERROR( + cuGraphAddMemsetNode(&GraphNode, CommandBuffer->CudaGraph, + DepsList.data(), DepsList.size(), 
&NodeParams, + CommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. *SyncPoint = @@ -237,7 +238,7 @@ static ur_result_t enqueueCommandBufferFillHelper( UR_CHECK_ERROR(cuGraphAddMemsetNode( &GraphNodeFirst, CommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParamsStepFirst, - CommandBuffer->Device->getContext())); + CommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. *SyncPoint = CommandBuffer->addSyncPoint( @@ -269,7 +270,7 @@ static ur_result_t enqueueCommandBufferFillHelper( UR_CHECK_ERROR(cuGraphAddMemsetNode( &GraphNode, CommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParamsStep, - CommandBuffer->Device->getContext())); + CommandBuffer->Device->getNativeContext())); GraphNodePtr = std::make_shared(GraphNode); // Get sync point and register the cuNode with it. @@ -478,7 +479,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( UR_CHECK_ERROR(cuGraphAddMemcpyNode( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getContext())); + &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. 
*pSyncPoint = @@ -513,8 +514,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( } try { - auto Src = std::get(hSrcMem->Mem).get() + srcOffset; - auto Dst = std::get(hDstMem->Mem).get() + dstOffset; + auto Src = std::get(hSrcMem->Mem) + .getPtrWithOffset(hCommandBuffer->Device, srcOffset); + auto Dst = std::get(hDstMem->Mem) + .getPtrWithOffset(hCommandBuffer->Device, dstOffset); CUDA_MEMCPY3D NodeParams = {}; setCopyParams(&Src, CU_MEMORYTYPE_DEVICE, &Dst, CU_MEMORYTYPE_DEVICE, size, @@ -522,7 +525,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( UR_CHECK_ERROR(cuGraphAddMemcpyNode( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getContext())); + &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. *pSyncPoint = @@ -553,8 +556,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( } try { - CUdeviceptr SrcPtr = std::get(hSrcMem->Mem).get(); - CUdeviceptr DstPtr = std::get(hDstMem->Mem).get(); + auto SrcPtr = + std::get(hSrcMem->Mem).getPtr(hCommandBuffer->Device); + auto DstPtr = + std::get(hDstMem->Mem).getPtr(hCommandBuffer->Device); CUDA_MEMCPY3D NodeParams = {}; setCopyRectParams(region, &SrcPtr, CU_MEMORYTYPE_DEVICE, srcOrigin, @@ -563,7 +568,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( UR_CHECK_ERROR(cuGraphAddMemcpyNode( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getContext())); + &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. 
*pSyncPoint = @@ -593,7 +598,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( } try { - auto Dst = std::get(hBuffer->Mem).get() + offset; + auto Dst = std::get(hBuffer->Mem) + .getPtrWithOffset(hCommandBuffer->Device, offset); CUDA_MEMCPY3D NodeParams = {}; setCopyParams(pSrc, CU_MEMORYTYPE_HOST, &Dst, CU_MEMORYTYPE_DEVICE, size, @@ -601,7 +607,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( UR_CHECK_ERROR(cuGraphAddMemcpyNode( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getContext())); + &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. *pSyncPoint = @@ -630,7 +636,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( } try { - auto Src = std::get(hBuffer->Mem).get() + offset; + auto Src = std::get(hBuffer->Mem) + .getPtrWithOffset(hCommandBuffer->Device, offset); CUDA_MEMCPY3D NodeParams = {}; setCopyParams(&Src, CU_MEMORYTYPE_DEVICE, pDst, CU_MEMORYTYPE_HOST, size, @@ -638,7 +645,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( UR_CHECK_ERROR(cuGraphAddMemcpyNode( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getContext())); + &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. 
*pSyncPoint = @@ -670,7 +677,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( } try { - CUdeviceptr DstPtr = std::get(hBuffer->Mem).get(); + auto DstPtr = + std::get(hBuffer->Mem).getPtr(hCommandBuffer->Device); CUDA_MEMCPY3D NodeParams = {}; setCopyRectParams(region, pSrc, CU_MEMORYTYPE_HOST, hostOffset, @@ -680,7 +688,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( UR_CHECK_ERROR(cuGraphAddMemcpyNode( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getContext())); + &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. *pSyncPoint = @@ -712,7 +720,8 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( } try { - CUdeviceptr SrcPtr = std::get(hBuffer->Mem).get(); + auto SrcPtr = + std::get(hBuffer->Mem).getPtr(hCommandBuffer->Device); CUDA_MEMCPY3D NodeParams = {}; setCopyRectParams(region, &SrcPtr, CU_MEMORYTYPE_DEVICE, bufferOffset, @@ -722,7 +731,7 @@ ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( UR_CHECK_ERROR(cuGraphAddMemcpyNode( &GraphNode, hCommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), - &NodeParams, hCommandBuffer->Device->getContext())); + &NodeParams, hCommandBuffer->Device->getNativeContext())); // Get sync point and register the cuNode with it. 
*pSyncPoint = @@ -821,7 +830,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( PatternSizeIsValid, UR_RESULT_ERROR_INVALID_SIZE); - auto DstDevice = std::get(hBuffer->Mem).get() + offset; + auto DstDevice = std::get(hBuffer->Mem) + .getPtrWithOffset(hCommandBuffer->Device, offset); return enqueueCommandBufferFillHelper( hCommandBuffer, &DstDevice, CU_MEMORYTYPE_DEVICE, pPattern, patternSize, @@ -854,7 +864,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( try { std::unique_ptr RetImplEvent{nullptr}; - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); uint32_t StreamToken; ur_stream_guard_ Guard; CUstream CuStream = hQueue->getNextComputeStream( @@ -972,7 +982,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( if (ArgValue == nullptr) { Kernel->setKernelArg(ArgIndex, 0, nullptr); } else { - CUdeviceptr CuPtr = std::get(ArgValue->Mem).get(); + CUdeviceptr CuPtr = + std::get(ArgValue->Mem).getPtr(CommandBuffer->Device); Kernel->setKernelArg(ArgIndex, sizeof(CUdeviceptr), (void *)&CuPtr); } } catch (ur_result_t Err) { diff --git a/source/adapters/cuda/command_buffer.hpp b/source/adapters/cuda/command_buffer.hpp index 84a9e0405b..d83269f2ae 100644 --- a/source/adapters/cuda/command_buffer.hpp +++ b/source/adapters/cuda/command_buffer.hpp @@ -10,155 +10,13 @@ #include #include +#include #include "context.hpp" #include "logger/ur_logger.hpp" #include #include -static inline const char *getUrResultString(ur_result_t Result) { - switch (Result) { - case UR_RESULT_SUCCESS: - return "UR_RESULT_SUCCESS"; - case UR_RESULT_ERROR_INVALID_OPERATION: - return "UR_RESULT_ERROR_INVALID_OPERATION"; - case UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES: - return "UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES"; - case UR_RESULT_ERROR_INVALID_QUEUE: - return "UR_RESULT_ERROR_INVALID_QUEUE"; - case UR_RESULT_ERROR_INVALID_VALUE: - return "UR_RESULT_ERROR_INVALID_VALUE"; - case 
UR_RESULT_ERROR_INVALID_CONTEXT: - return "UR_RESULT_ERROR_INVALID_CONTEXT"; - case UR_RESULT_ERROR_INVALID_PLATFORM: - return "UR_RESULT_ERROR_INVALID_PLATFORM"; - case UR_RESULT_ERROR_INVALID_BINARY: - return "UR_RESULT_ERROR_INVALID_BINARY"; - case UR_RESULT_ERROR_INVALID_PROGRAM: - return "UR_RESULT_ERROR_INVALID_PROGRAM"; - case UR_RESULT_ERROR_INVALID_SAMPLER: - return "UR_RESULT_ERROR_INVALID_SAMPLER"; - case UR_RESULT_ERROR_INVALID_BUFFER_SIZE: - return "UR_RESULT_ERROR_INVALID_BUFFER_SIZE"; - case UR_RESULT_ERROR_INVALID_MEM_OBJECT: - return "UR_RESULT_ERROR_INVALID_MEM_OBJECT"; - case UR_RESULT_ERROR_INVALID_EVENT: - return "UR_RESULT_ERROR_INVALID_EVENT"; - case UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: - return "UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST"; - case UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET: - return "UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET"; - case UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE: - return "UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE"; - case UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE: - return "UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE"; - case UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE: - return "UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE"; - case UR_RESULT_ERROR_DEVICE_NOT_FOUND: - return "UR_RESULT_ERROR_DEVICE_NOT_FOUND"; - case UR_RESULT_ERROR_INVALID_DEVICE: - return "UR_RESULT_ERROR_INVALID_DEVICE"; - case UR_RESULT_ERROR_DEVICE_LOST: - return "UR_RESULT_ERROR_DEVICE_LOST"; - case UR_RESULT_ERROR_DEVICE_REQUIRES_RESET: - return "UR_RESULT_ERROR_DEVICE_REQUIRES_RESET"; - case UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE: - return "UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE"; - case UR_RESULT_ERROR_DEVICE_PARTITION_FAILED: - return "UR_RESULT_ERROR_DEVICE_PARTITION_FAILED"; - case UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT: - return "UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT"; - case UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE: - return "UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE"; - case UR_RESULT_ERROR_INVALID_WORK_DIMENSION: - 
return "UR_RESULT_ERROR_INVALID_WORK_DIMENSION"; - case UR_RESULT_ERROR_INVALID_KERNEL_ARGS: - return "UR_RESULT_ERROR_INVALID_KERNEL_ARGS"; - case UR_RESULT_ERROR_INVALID_KERNEL: - return "UR_RESULT_ERROR_INVALID_KERNEL"; - case UR_RESULT_ERROR_INVALID_KERNEL_NAME: - return "UR_RESULT_ERROR_INVALID_KERNEL_NAME"; - case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX: - return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX"; - case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE: - return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE"; - case UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE: - return "UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE"; - case UR_RESULT_ERROR_INVALID_IMAGE_SIZE: - return "UR_RESULT_ERROR_INVALID_IMAGE_SIZE"; - case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR: - return "UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR"; - case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED: - return "UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED"; - case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE: - return "UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE"; - case UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE: - return "UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE"; - case UR_RESULT_ERROR_UNINITIALIZED: - return "UR_RESULT_ERROR_UNINITIALIZED"; - case UR_RESULT_ERROR_OUT_OF_HOST_MEMORY: - return "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY"; - case UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY: - return "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY"; - case UR_RESULT_ERROR_OUT_OF_RESOURCES: - return "UR_RESULT_ERROR_OUT_OF_RESOURCES"; - case UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE: - return "UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE"; - case UR_RESULT_ERROR_PROGRAM_LINK_FAILURE: - return "UR_RESULT_ERROR_PROGRAM_LINK_FAILURE"; - case UR_RESULT_ERROR_UNSUPPORTED_VERSION: - return "UR_RESULT_ERROR_UNSUPPORTED_VERSION"; - case UR_RESULT_ERROR_UNSUPPORTED_FEATURE: - return "UR_RESULT_ERROR_UNSUPPORTED_FEATURE"; - case UR_RESULT_ERROR_INVALID_ARGUMENT: - return "UR_RESULT_ERROR_INVALID_ARGUMENT"; - case 
UR_RESULT_ERROR_INVALID_NULL_HANDLE: - return "UR_RESULT_ERROR_INVALID_NULL_HANDLE"; - case UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE: - return "UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE"; - case UR_RESULT_ERROR_INVALID_NULL_POINTER: - return "UR_RESULT_ERROR_INVALID_NULL_POINTER"; - case UR_RESULT_ERROR_INVALID_SIZE: - return "UR_RESULT_ERROR_INVALID_SIZE"; - case UR_RESULT_ERROR_UNSUPPORTED_SIZE: - return "UR_RESULT_ERROR_UNSUPPORTED_SIZE"; - case UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT: - return "UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT"; - case UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT: - return "UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT"; - case UR_RESULT_ERROR_INVALID_ENUMERATION: - return "UR_RESULT_ERROR_INVALID_ENUMERATION"; - case UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION: - return "UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION"; - case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT: - return "UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT"; - case UR_RESULT_ERROR_INVALID_NATIVE_BINARY: - return "UR_RESULT_ERROR_INVALID_NATIVE_BINARY"; - case UR_RESULT_ERROR_INVALID_GLOBAL_NAME: - return "UR_RESULT_ERROR_INVALID_GLOBAL_NAME"; - case UR_RESULT_ERROR_INVALID_FUNCTION_NAME: - return "UR_RESULT_ERROR_INVALID_FUNCTION_NAME"; - case UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION: - return "UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION"; - case UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION: - return "UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION"; - case UR_RESULT_ERROR_PROGRAM_UNLINKED: - return "UR_RESULT_ERROR_PROGRAM_UNLINKED"; - case UR_RESULT_ERROR_OVERLAPPING_REGIONS: - return "UR_RESULT_ERROR_OVERLAPPING_REGIONS"; - case UR_RESULT_ERROR_INVALID_HOST_PTR: - return "UR_RESULT_ERROR_INVALID_HOST_PTR"; - case UR_RESULT_ERROR_INVALID_USM_SIZE: - return "UR_RESULT_ERROR_INVALID_USM_SIZE"; - case UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE: - return "UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE"; - case UR_RESULT_ERROR_ADAPTER_SPECIFIC: - return "UR_RESULT_ERROR_ADAPTER_SPECIFIC"; - default: 
- return "UR_RESULT_ERROR_UNKNOWN"; - } -} - // Trace an internal UR call #define UR_TRACE(Call) \ { \ @@ -173,7 +31,7 @@ static inline const char *getUrResultString(ur_result_t Result) { logger::always("UR ---> {}", #Call); \ Result = (Call); \ if (PrintTrace) \ - logger::always("UR <--- {}({})", #Call, getUrResultString(Result)); \ + logger::always("UR <--- {}({})", #Call, Result); \ } // Handle to a kernel command. diff --git a/source/adapters/cuda/context.cpp b/source/adapters/cuda/context.cpp index 40ae0ce4ad..0d08dab76c 100644 --- a/source/adapters/cuda/context.cpp +++ b/source/adapters/cuda/context.cpp @@ -46,23 +46,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreate(uint32_t DeviceCount, const ur_device_handle_t *phDevices, const ur_context_properties_t *pProperties, ur_context_handle_t *phContext) { - std::ignore = DeviceCount; std::ignore = pProperties; - assert(DeviceCount == 1); - ur_result_t RetErr = UR_RESULT_SUCCESS; - std::unique_ptr ContextPtr{nullptr}; try { ContextPtr = std::unique_ptr( - new ur_context_handle_t_{*phDevices}); + new ur_context_handle_t_{phDevices, DeviceCount}); *phContext = ContextPtr.release(); } catch (ur_result_t Err) { - RetErr = Err; + return Err; } catch (...) 
{ - RetErr = UR_RESULT_ERROR_OUT_OF_RESOURCES; + return UR_RESULT_ERROR_OUT_OF_RESOURCES; } - return RetErr; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( @@ -72,9 +68,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( switch (static_cast(ContextInfoType)) { case UR_CONTEXT_INFO_NUM_DEVICES: - return ReturnValue(1); + return ReturnValue(static_cast(hContext->getDevices().size())); case UR_CONTEXT_INFO_DEVICES: - return ReturnValue(hContext->getDevice()); + return ReturnValue(hContext->getDevices().data(), + hContext->getDevices().size()); case UR_CONTEXT_INFO_REFERENCE_COUNT: return ReturnValue(hContext->getReferenceCount()); case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { @@ -88,7 +85,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextGetInfo( int Major = 0; UR_CHECK_ERROR(cuDeviceGetAttribute( &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - hContext->getDevice()->get())); + hContext->getDevices()[0]->get())); uint32_t Capabilities = (Major >= 7) ? UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | @@ -137,7 +134,10 @@ urContextRetain(ur_context_handle_t hContext) { UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle( ur_context_handle_t hContext, ur_native_handle_t *phNativeContext) { - *phNativeContext = reinterpret_cast(hContext->get()); + // FIXME: this entry point has been deprecated in the SYCL RT and should be + // changed to unsupoorted once deprecation period has elapsed. 
+ *phNativeContext = reinterpret_cast( + hContext->getDevices()[0]->getNativeContext()); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/cuda/context.hpp b/source/adapters/cuda/context.hpp index 4ddcaf3a87..f28e58afe7 100644 --- a/source/adapters/cuda/context.hpp +++ b/source/adapters/cuda/context.hpp @@ -33,27 +33,26 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data); /// /// One of the main differences between the UR API and the CUDA driver API is /// that the second modifies the state of the threads by assigning -/// `CUcontext` objects to threads. `CUcontext` objects store data associated +/// \c CUcontext objects to threads. \c CUcontext objects store data associated /// with a given device and control access to said device from the user side. /// UR API context are objects that are passed to functions, and not bound /// to threads. -/// The ur_context_handle_t_ object doesn't implement this behavior. It only -/// holds the CUDA context data. The RAII object \ref ScopedContext implements -/// the active context behavior. /// -/// Primary vs User-defined context +/// Since the \c ur_context_handle_t can contain multiple devices, and a \c +/// CUcontext refers to only a single device, the \c CUcontext is more tightly +/// coupled to a \c ur_device_handle_t than a \c ur_context_handle_t. In order +/// to remove some ambiguities about the different semantics of \c +/// \c ur_context_handle_t and native \c CUcontext, we access the native \c +/// CUcontext solely through the \c ur_device_handle_t class, by using the +/// object \ref ScopedContext, which sets the active device (by setting the +/// active native \c CUcontext). /// -/// CUDA has two different types of context, the Primary context, -/// which is usable by all threads on a given process for a given device, and -/// the aforementioned custom contexts. -/// The CUDA documentation, confirmed with performance analysis, suggest using -/// the Primary context whenever possible. 
-/// The Primary context is also used by the CUDA Runtime API. -/// For UR applications to interop with CUDA Runtime API, they have to use -/// the primary context - and make that active in the thread. -/// The `ur_context_handle_t_` object can be constructed with a `kind` parameter -/// that allows to construct a Primary or `user-defined` context, so that -/// the UR object interface is always the same. +/// Primary vs User-defined \c CUcontext +/// +/// CUDA has two different types of \c CUcontext, the Primary context, which is +/// usable by all threads on a given process for a given device, and the +/// aforementioned custom \c CUcontext s. The CUDA documentation, confirmed with +/// performance analysis, suggest using the Primary context whenever possible. /// /// Destructor callback /// @@ -63,6 +62,18 @@ typedef void (*ur_context_extended_deleter_t)(void *user_data); /// See proposal for details. /// https://github.com/codeplaysoftware/standards-proposals/blob/master/extended-context-destruction/index.md /// +/// +/// Memory Management for Devices in a Context <\b> +/// +/// A \c ur_mem_handle_t is associated with a \c ur_context_handle_t_, which +/// may refer to multiple devices. Therefore the \c ur_mem_handle_t must +/// handle a native allocation for each device in the context. UR is +/// responsible for automatically handling event dependencies for kernels +/// writing to or reading from the same \c ur_mem_handle_t and migrating memory +/// between native allocations for devices in the same \c ur_context_handle_t_ +/// if necessary. 
+/// +/// struct ur_context_handle_t_ { struct deleter_data { @@ -72,18 +83,21 @@ struct ur_context_handle_t_ { void operator()() { Function(UserData); } }; - using native_type = CUcontext; - - native_type CUContext; - ur_device_handle_t DeviceID; + std::vector Devices; std::atomic_uint32_t RefCount; - ur_context_handle_t_(ur_device_handle_t_ *DevID) - : CUContext{DevID->getContext()}, DeviceID{DevID}, RefCount{1} { - urDeviceRetain(DeviceID); + ur_context_handle_t_(const ur_device_handle_t *Devs, uint32_t NumDevices) + : Devices{Devs, Devs + NumDevices}, RefCount{1} { + for (auto &Dev : Devices) { + urDeviceRetain(Dev); + } }; - ~ur_context_handle_t_() { urDeviceRelease(DeviceID); } + ~ur_context_handle_t_() { + for (auto &Dev : Devices) { + urDeviceRelease(Dev); + } + } void invokeExtendedDeleters() { std::lock_guard Guard(Mutex); @@ -98,9 +112,9 @@ struct ur_context_handle_t_ { ExtendedDeleters.emplace_back(deleter_data{Function, UserData}); } - ur_device_handle_t getDevice() const noexcept { return DeviceID; } - - native_type get() const noexcept { return CUContext; } + const std::vector &getDevices() const noexcept { + return Devices; + } uint32_t incrementReferenceCount() noexcept { return ++RefCount; } @@ -123,12 +137,11 @@ struct ur_context_handle_t_ { namespace { class ScopedContext { public: - ScopedContext(ur_context_handle_t Context) { - if (!Context) { - throw UR_RESULT_ERROR_INVALID_CONTEXT; + ScopedContext(ur_device_handle_t Device) { + if (!Device) { + throw UR_RESULT_ERROR_INVALID_DEVICE; } - - setContext(Context->get()); + setContext(Device->getNativeContext()); } ScopedContext(CUcontext NativeContext) { setContext(NativeContext); } diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp index da11f3f1bf..3a94587d1f 100644 --- a/source/adapters/cuda/device.cpp +++ b/source/adapters/cuda/device.cpp @@ -47,7 +47,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, static constexpr uint32_t 
MaxWorkItemDimensions = 3u; - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); switch ((uint32_t)propName) { case UR_DEVICE_INFO_TYPE: { @@ -926,6 +926,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // CUDA supports cubemap seamless filtering. return ReturnValue(true); } + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP: { + // CUDA does support fetching 1D USM sampled image data. + return ReturnValue(true); + } + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP: { + // CUDA does not support fetching 1D non-USM sampled image data. + return ReturnValue(false); + } + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP: { + // CUDA does support fetching 2D USM sampled image data. + return ReturnValue(true); + } + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP: { + // CUDA does support fetching 2D non-USM sampled image data. + return ReturnValue(true); + } + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP: { + // CUDA does not support 3D USM sampled textures + return ReturnValue(false); + } + case UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP: { + // CUDA does support fetching 3D non-USM sampled image data. + return ReturnValue(true); + } + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { + // CUDA supports recording timestamp events. 
+ return ReturnValue(true); + } case UR_DEVICE_INFO_DEVICE_ID: { int Value = 0; UR_CHECK_ERROR(cuDeviceGetAttribute( @@ -1206,7 +1234,7 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, uint64_t *pDeviceTimestamp, uint64_t *pHostTimestamp) { CUevent Event; - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); if (pDeviceTimestamp) { UR_CHECK_ERROR(cuEventCreate(&Event, CU_EVENT_DEFAULT)); diff --git a/source/adapters/cuda/device.hpp b/source/adapters/cuda/device.hpp index 373c8a8ab9..0a40329026 100644 --- a/source/adapters/cuda/device.hpp +++ b/source/adapters/cuda/device.hpp @@ -22,6 +22,7 @@ struct ur_device_handle_t_ { CUevent EvBase; // CUDA event used as base counter std::atomic_uint32_t RefCount; ur_platform_handle_t Platform; + uint32_t DeviceIndex; static constexpr uint32_t MaxWorkItemDimensions = 3u; size_t MaxWorkItemSizes[MaxWorkItemDimensions]; @@ -34,9 +35,9 @@ struct ur_device_handle_t_ { public: ur_device_handle_t_(native_type cuDevice, CUcontext cuContext, CUevent evBase, - ur_platform_handle_t platform) + ur_platform_handle_t platform, uint32_t DevIndex) : CuDevice(cuDevice), CuContext(cuContext), EvBase(evBase), RefCount{1}, - Platform(platform) { + Platform(platform), DeviceIndex{DevIndex} { UR_CHECK_ERROR(cuDeviceGetAttribute( &MaxRegsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, @@ -79,12 +80,16 @@ struct ur_device_handle_t_ { native_type get() const noexcept { return CuDevice; }; - CUcontext getContext() const noexcept { return CuContext; }; + CUcontext getNativeContext() const noexcept { return CuContext; }; uint32_t getReferenceCount() const noexcept { return RefCount; } ur_platform_handle_t getPlatform() const noexcept { return Platform; }; + // Returns the index of the device relative to the other devices in the same + // platform + uint32_t getIndex() const noexcept { return DeviceIndex; } + uint64_t getElapsedTime(CUevent) const; size_t getMaxWorkItemSizes(int index) 
const noexcept { diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 6f99941095..9627fc6da2 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -26,7 +26,7 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream, UR_ASSERT(EventWaitList, UR_RESULT_SUCCESS); try { - ScopedContext Active(CommandQueue->getContext()); + ScopedContext Active(CommandQueue->getDevice()); auto Result = forLatestEvents( EventWaitList, NumEventsInWaitList, @@ -188,7 +188,7 @@ bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device, // @param [out] ThreadsPerBlock Number of threads per block we should run // @param [out] BlocksPerGrid Number of blocks per grid we should run ur_result_t -setKernelParams(const ur_context_handle_t Context, +setKernelParams([[maybe_unused]] const ur_context_handle_t Context, const ur_device_handle_t Device, const uint32_t WorkDim, const size_t *GlobalWorkOffset, const size_t *GlobalWorkSize, const size_t *LocalWorkSize, ur_kernel_handle_t &Kernel, @@ -201,7 +201,7 @@ setKernelParams(const ur_context_handle_t Context, try { // Set the active context here as guessLocalWorkSize needs an active context - ScopedContext Active(Context); + ScopedContext Active(Device); { size_t *ReqdThreadsPerBlock = Kernel->ReqdThreadsPerBlock; MaxWorkGroupSize = Device->getMaxWorkGroupSize(); @@ -271,7 +271,6 @@ setKernelParams(const ur_context_handle_t Context, CudaImplicitOffset); } - auto Device = Context->getDevice(); if (LocalSize > static_cast(Device->getMaxCapacityLocalMem())) { setErrorMessage("Excessive allocation of local memory on the device", UR_RESULT_ERROR_ADAPTER_SPECIFIC); @@ -329,7 +328,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( // This function makes one stream work on the previous work (or work // represented by input events) and then all future work waits on that stream. 
try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); uint32_t StreamToken; ur_stream_guard_ Guard; CUstream CuStream = hQueue->getNextComputeStream( @@ -410,11 +409,43 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { // Preconditions - UR_ASSERT(hQueue->getContext() == hKernel->getContext(), + UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(), UR_RESULT_ERROR_INVALID_KERNEL); UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + std::vector DepEvents( + phEventWaitList, phEventWaitList + numEventsInWaitList); + std::vector> MemMigrationLocks; + + // phEventWaitList only contains events that are handed to UR by the SYCL + // runtime. However since UR handles memory dependencies within a context + // we may need to add more events to our dependent events list if the UR + // context contains multiple devices + if (hQueue->getContext()->Devices.size() > 1) { + MemMigrationLocks.reserve(hKernel->Args.MemObjArgs.size()); + for (auto &MemArg : hKernel->Args.MemObjArgs) { + bool PushBack = false; + if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj; + MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(), + MemDepEvent) == DepEvents.end()) { + DepEvents.push_back(MemDepEvent); + PushBack = true; + } + if ((MemArg.AccessFlags & + (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) || + PushBack) { + if (std::find_if(MemMigrationLocks.begin(), MemMigrationLocks.end(), + [MemArg](auto &Lock) { + return Lock.first == MemArg.Mem; + }) == MemMigrationLocks.end()) + MemMigrationLocks.emplace_back( + std::pair{MemArg.Mem, ur_lock{MemArg.Mem->MemoryMigrationMutex}}); + } + } + } + + // Early exit for zero size kernel if (*pGlobalWorkSize == 0) { return urEnqueueEventsWaitWithBarrier(hQueue, 
numEventsInWaitList, phEventWaitList, phEvent); @@ -426,26 +457,37 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( size_t BlocksPerGrid[3] = {1u, 1u, 1u}; uint32_t LocalSize = hKernel->getLocalSize(); - ur_result_t Result = UR_RESULT_SUCCESS; CUfunction CuFunc = hKernel->get(); - Result = setKernelParams(hQueue->getContext(), hQueue->Device, workDim, - pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + // This might return UR_RESULT_ERROR_ADAPTER_SPECIFIC, which cannot be handled + // using the standard UR_CHECK_ERROR + if (ur_result_t Ret = + setKernelParams(hQueue->getContext(), hQueue->Device, workDim, + pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid); + Ret != UR_RESULT_SUCCESS) + return Ret; try { std::unique_ptr RetImplEvent{nullptr}; + ScopedContext Active(hQueue->getDevice()); uint32_t StreamToken; ur_stream_guard_ Guard; CUstream CuStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); + if (DepEvents.size()) { + UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, DepEvents.size(), + DepEvents.data())); + } + + // For memory migration across devices in the same context + if (hQueue->getContext()->Devices.size() > 1) { + for (auto &MemArg : hKernel->Args.MemObjArgs) { + migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice()); + } + } if (phEvent) { RetImplEvent = @@ -454,6 +496,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(RetImplEvent->start()); } + // Once event has been started we can unlock MemoryMigrationMutex + if (hQueue->getContext()->Devices.size() > 1) { + for (auto &MemArg : hKernel->Args.MemObjArgs) { + // Telling the ur_mem_handle_t that it will need to wait on this kernel + // if it has been 
written to + if (phEvent && (MemArg.AccessFlags & + (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))) { + MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get()); + } + } + // We can release the MemoryMigrationMutexes now + MemMigrationLocks.clear(); + } + auto &ArgIndices = hKernel->getArgIndices(); UR_CHECK_ERROR(cuLaunchKernel( CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], @@ -469,9 +525,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( @@ -565,35 +621,52 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( size_t hostRowPitch, size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; - CUdeviceptr DevPtr = std::get(hBuffer->Mem).get(); std::unique_ptr RetImplEvent{nullptr}; - try { - ScopedContext Active(hQueue->getContext()); - CUstream CuStream = hQueue->getNextTransferStream(); + ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex}; + auto Device = hQueue->getDevice(); + ScopedContext Active(Device); + CUstream Stream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); + try { + // Note that this entry point may be called on a queue that may not be the + // last queue to write to the MemBuffer, meaning we must perform the copy + // from a different device + if (hBuffer->LastEventWritingToMemObj && + hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() != + hQueue->getDevice()) { + hQueue = hBuffer->LastEventWritingToMemObj->getQueue(); + Device = hQueue->getDevice(); + ScopedContext Active(Device); + Stream = CUstream{0}; // Default stream for different device + // We may have to wait for an event on another queue if it is the 
last + // event writing to mem obj + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1, + &hBuffer->LastEventWritingToMemObj)); + } + + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, CuStream)); + UR_COMMAND_MEM_BUFFER_READ_RECT, hQueue, Stream)); UR_CHECK_ERROR(RetImplEvent->start()); } - Result = commonEnqueueMemBufferCopyRect( - CuStream, region, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, + auto DevPtr = std::get(hBuffer->Mem).getPtr(Device); + UR_CHECK_ERROR(commonEnqueueMemBufferCopyRect( + Stream, region, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, bufferRowPitch, bufferSlicePitch, pDst, CU_MEMORYTYPE_HOST, hostOrigin, - hostRowPitch, hostSlicePitch); + hostRowPitch, hostSlicePitch)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); } if (blockingRead) { - UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + UR_CHECK_ERROR(cuStreamSynchronize(Stream)); } if (phEvent) { @@ -601,9 +674,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( @@ -613,15 +686,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; - CUdeviceptr DevPtr = std::get(hBuffer->Mem).get(); + CUdeviceptr DevPtr = + std::get(hBuffer->Mem).getPtr(hQueue->getDevice()); std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream cuStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, - 
phEventWaitList); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = @@ -630,10 +703,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( UR_CHECK_ERROR(RetImplEvent->start()); } - Result = commonEnqueueMemBufferCopyRect( + UR_CHECK_ERROR(commonEnqueueMemBufferCopyRect( cuStream, region, pSrc, CU_MEMORYTYPE_HOST, hostOrigin, hostRowPitch, hostSlicePitch, &DevPtr, CU_MEMORYTYPE_DEVICE, bufferOrigin, - bufferRowPitch, bufferSlicePitch); + bufferRowPitch, bufferSlicePitch)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); @@ -648,9 +721,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( @@ -666,7 +739,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); ur_result_t Result = UR_RESULT_SUCCESS; auto Stream = hQueue->getNextTransferStream(); @@ -680,8 +753,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( UR_CHECK_ERROR(RetImplEvent->start()); } - auto Src = std::get(hBufferSrc->Mem).get() + srcOffset; - auto Dst = std::get(hBufferDst->Mem).get() + dstOffset; + auto Src = std::get(hBufferSrc->Mem) + .getPtrWithOffset(hQueue->getDevice(), srcOffset); + auto Dst = std::get(hBufferDst->Mem) + .getPtrWithOffset(hQueue->getDevice(), dstOffset); UR_CHECK_ERROR(cuMemcpyDtoDAsync(Dst, Src, size, Stream)); @@ -706,12 +781,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { ur_result_t Result = UR_RESULT_SUCCESS; - CUdeviceptr SrcPtr = std::get(hBufferSrc->Mem).get(); - CUdeviceptr DstPtr = 
std::get(hBufferDst->Mem).get(); + CUdeviceptr SrcPtr = + std::get(hBufferSrc->Mem).getPtr(hQueue->getDevice()); + CUdeviceptr DstPtr = + std::get(hBufferDst->Mem).getPtr(hQueue->getDevice()); std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream CuStream = hQueue->getNextTransferStream(); Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); @@ -784,7 +861,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); auto Stream = hQueue->getNextTransferStream(); ur_result_t Result = @@ -797,7 +874,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( UR_CHECK_ERROR(RetImplEvent->start()); } - auto DstDevice = std::get(hBuffer->Mem).get() + offset; + auto DstDevice = std::get(hBuffer->Mem) + .getPtrWithOffset(hQueue->getDevice(), offset); auto N = size / patternSize; // pattern size in bytes @@ -933,18 +1011,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( std::ignore = rowPitch; std::ignore = slicePitch; - UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_result_t Result = UR_RESULT_SUCCESS; + ur_lock MemoryMigrationLock{hImage->MemoryMigrationMutex}; + auto Device = hQueue->getDevice(); + CUstream Stream = hQueue->getNextTransferStream(); try { - ScopedContext Active(hQueue->getContext()); - CUstream CuStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); + // Note that this entry point may be called on a queue that may not be the + // last queue to write to the Image, meaning we must perform the copy + // from a different device + if (hImage->LastEventWritingToMemObj && + 
hImage->LastEventWritingToMemObj->getQueue()->getDevice() != + hQueue->getDevice()) { + hQueue = hImage->LastEventWritingToMemObj->getQueue(); + Device = hQueue->getDevice(); + ScopedContext Active(Device); + Stream = CUstream{0}; // Default stream for different device + // We may have to wait for an event on another queue if it is the last + // event writing to mem obj + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1, + &hImage->LastEventWritingToMemObj)); + } + + ScopedContext Active(Device); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, + phEventWaitList)); - CUarray Array = std::get(hImage->Mem).getArray(); + CUarray Array = std::get(hImage->Mem).getArray(Device); CUDA_ARRAY_DESCRIPTOR ArrayDesc; UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array)); @@ -954,29 +1048,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( size_t ByteOffsetX = origin.x * ElementByteSize * ArrayDesc.NumChannels; size_t BytesToCopy = ElementByteSize * ArrayDesc.NumChannels * region.width; - ur_mem_type_t ImgType = std::get(hImage->Mem).getImageType(); + ur_mem_type_t ImgType = std::get(hImage->Mem).getType(); std::unique_ptr RetImplEvent{nullptr}; if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_IMAGE_READ, hQueue, CuStream)); + UR_COMMAND_MEM_IMAGE_READ, hQueue, Stream)); UR_CHECK_ERROR(RetImplEvent->start()); } if (ImgType == UR_MEM_TYPE_IMAGE1D) { UR_CHECK_ERROR( - cuMemcpyAtoHAsync(pDst, Array, ByteOffsetX, BytesToCopy, CuStream)); + cuMemcpyAtoHAsync(pDst, Array, ByteOffsetX, BytesToCopy, Stream)); } else { ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, region.depth}; ur_rect_offset_t SrcOffset = {ByteOffsetX, origin.y, origin.z}; - Result = commonEnqueueMemImageNDCopy( - CuStream, ImgType, AdjustedRegion, &Array, CU_MEMORYTYPE_ARRAY, - SrcOffset, pDst, CU_MEMORYTYPE_HOST, ur_rect_offset_t{}); - if (Result != UR_RESULT_SUCCESS) { - return Result; - } + 
UR_CHECK_ERROR(commonEnqueueMemImageNDCopy( + Stream, ImgType, AdjustedRegion, &Array, CU_MEMORYTYPE_ARRAY, + SrcOffset, pDst, CU_MEMORYTYPE_HOST, ur_rect_offset_t{})); } if (phEvent) { @@ -985,7 +1076,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( } if (blockingRead) { - UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + UR_CHECK_ERROR(cuStreamSynchronize(Stream)); } } catch (ur_result_t Err) { return Err; @@ -993,7 +1084,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( return UR_RESULT_ERROR_UNKNOWN; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( @@ -1005,18 +1096,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( std::ignore = rowPitch; std::ignore = slicePitch; - UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + auto &Image = std::get(hImage->Mem); + // FIXME: We are assuming that the lifetime of host ptr lives as long as the + // image + if (!Image.HostPtr) + Image.HostPtr = pSrc; ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream CuStream = hQueue->getNextTransferStream(); Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); - CUarray Array = std::get(hImage->Mem).getArray(); + CUarray Array = Image.getArray(hQueue->getDevice()); CUDA_ARRAY_DESCRIPTOR ArrayDesc; UR_CHECK_ERROR(cuArrayGetDescriptor(&ArrayDesc, Array)); @@ -1034,7 +1129,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( UR_CHECK_ERROR(RetImplEvent->start()); } - ur_mem_type_t ImgType = std::get(hImage->Mem).getImageType(); + ur_mem_type_t ImgType = Image.getType(); if (ImgType == UR_MEM_TYPE_IMAGE1D) { UR_CHECK_ERROR( cuMemcpyHtoAAsync(Array, ByteOffsetX, pSrc, BytesToCopy, CuStream)); @@ -1071,24 +1166,24 @@ UR_APIEXPORT ur_result_t 
UR_APICALL urEnqueueMemImageCopy( ur_rect_offset_t dstOrigin, ur_rect_region_t region, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(hImageSrc->MemType == ur_mem_handle_t_::Type::Surface, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(hImageDst->MemType == ur_mem_handle_t_::Type::Surface, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(std::get(hImageSrc->Mem).getImageType() == - std::get(hImageDst->Mem).getImageType(), + UR_ASSERT(hImageSrc->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hImageDst->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(std::get(hImageSrc->Mem).getType() == + std::get(hImageDst->Mem).getType(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream CuStream = hQueue->getNextTransferStream(); Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); - CUarray SrcArray = std::get(hImageSrc->Mem).getArray(); - CUarray DstArray = std::get(hImageDst->Mem).getArray(); + CUarray SrcArray = + std::get(hImageSrc->Mem).getArray(hQueue->getDevice()); + CUarray DstArray = + std::get(hImageDst->Mem).getArray(hQueue->getDevice()); CUDA_ARRAY_DESCRIPTOR SrcArrayDesc; UR_CHECK_ERROR(cuArrayGetDescriptor(&SrcArrayDesc, SrcArray)); @@ -1117,7 +1212,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( UR_CHECK_ERROR(RetImplEvent->start()); } - ur_mem_type_t ImgType = std::get(hImageSrc->Mem).getImageType(); + ur_mem_type_t ImgType = std::get(hImageSrc->Mem).getType(); ur_rect_region_t AdjustedRegion = {BytesToCopy, region.height, region.depth}; @@ -1154,8 +1249,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( ur_map_flags_t mapFlags, size_t offset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, void **ppRetMap) { - 
UR_ASSERT(hBuffer->MemType == ur_mem_handle_t_::Type::Buffer, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hBuffer->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); UR_ASSERT(offset + size <= std::get(hBuffer->Mem).getSize(), UR_RESULT_ERROR_INVALID_SIZE); @@ -1177,7 +1271,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( MapPtr, numEventsInWaitList, phEventWaitList, phEvent); } else { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); if (IsPinned) { Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList, @@ -1208,8 +1302,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( ur_queue_handle_t hQueue, ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - UR_ASSERT(hMem->MemType == ur_mem_handle_t_::Type::Buffer, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); + UR_ASSERT(hMem->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); auto &BufferImpl = std::get(hMem->Mem); auto *Map = BufferImpl.getMapDetails(pMappedPtr); @@ -1225,7 +1318,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( hQueue, hMem, true, Map->getMapOffset(), Map->getMapSize(), pMappedPtr, numEventsInWaitList, phEventWaitList, phEvent); } else { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); if (IsPinned) { Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList, @@ -1256,7 +1349,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( std::unique_ptr EventPtr{nullptr}; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); uint32_t StreamToken; ur_stream_guard_ Guard; CUstream CuStream = hQueue->getNextComputeStream( @@ -1310,7 +1403,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( std::unique_ptr EventPtr{nullptr}; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); 
CUstream CuStream = hQueue->getNextTransferStream(); Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); @@ -1347,7 +1440,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( UR_CHECK_ERROR(cuPointerGetAttribute( &PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem)); UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE); - ur_device_handle_t Device = hQueue->getContext()->getDevice(); + ur_device_handle_t Device = hQueue->getDevice(); // Certain cuda devices and Windows do not have support for some Unified // Memory features. cuMemPrefetchAsync requires concurrent memory access @@ -1373,7 +1466,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( std::unique_ptr EventPtr{nullptr}; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream CuStream = hQueue->getNextTransferStream(); Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, phEventWaitList); @@ -1414,7 +1507,7 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, (advice & UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE) || (advice & UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE) || (advice & UR_USM_ADVICE_FLAG_DEFAULT)) { - ur_device_handle_t Device = hQueue->getContext()->getDevice(); + ur_device_handle_t Device = hQueue->getDevice(); if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) { setErrorMessage("Mem advise ignored as device does not support " "concurrent managed access", @@ -1441,7 +1534,7 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, std::unique_ptr EventPtr{nullptr}; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); if (phEvent) { EventPtr = @@ -1453,16 +1546,16 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, if (advice & UR_USM_ADVICE_FLAG_DEFAULT) { UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, 
CU_MEM_ADVISE_UNSET_READ_MOSTLY, - hQueue->getContext()->getDevice()->get())); + hQueue->getDevice()->get())); UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION, - hQueue->getContext()->getDevice()->get())); + hQueue->getDevice()->get())); UR_CHECK_ERROR(cuMemAdvise((CUdeviceptr)pMem, size, CU_MEM_ADVISE_UNSET_ACCESSED_BY, - hQueue->getContext()->getDevice()->get())); + hQueue->getDevice()->get())); } else { Result = setCuMemAdvise((CUdeviceptr)pMem, size, advice, - hQueue->getContext()->getDevice()->get()); + hQueue->getDevice()->get()); } if (phEvent) { @@ -1493,7 +1586,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( ur_result_t result = UR_RESULT_SUCCESS; try { - ScopedContext active(hQueue->getContext()); + ScopedContext active(hQueue->getDevice()); CUstream cuStream = hQueue->getNextTransferStream(); result = enqueueEventsWait(hQueue, cuStream, numEventsInWaitList, phEventWaitList); @@ -1543,33 +1636,50 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( UR_ASSERT(!hBuffer->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); UR_ASSERT(offset + size <= std::get(hBuffer->Mem).Size, UR_RESULT_ERROR_INVALID_SIZE); - - ur_result_t Result = UR_RESULT_SUCCESS; - CUdeviceptr DevPtr = std::get(hBuffer->Mem).get(); std::unique_ptr RetImplEvent{nullptr}; + ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex}; + auto Device = hQueue->getDevice(); + ScopedContext Active(Device); + CUstream Stream = hQueue->getNextTransferStream(); try { - ScopedContext Active(hQueue->getContext()); - CUstream CuStream = hQueue->getNextTransferStream(); - - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); + // Note that this entry point may be called on a queue that may not be the + // last queue to write to the MemBuffer, meaning we must perform the copy + // from a different device + if (hBuffer->LastEventWritingToMemObj && + hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() 
!= + hQueue->getDevice()) { + hQueue = hBuffer->LastEventWritingToMemObj->getQueue(); + Device = hQueue->getDevice(); + ScopedContext Active(Device); + Stream = CUstream{0}; // Default stream for different device + // We may have to wait for an event on another queue if it is the last + // event writing to mem obj + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1, + &hBuffer->LastEventWritingToMemObj)); + } + + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_BUFFER_READ, hQueue, CuStream)); + UR_COMMAND_MEM_BUFFER_READ, hQueue, Stream)); UR_CHECK_ERROR(RetImplEvent->start()); } - UR_CHECK_ERROR(cuMemcpyDtoHAsync(pDst, DevPtr + offset, size, CuStream)); + UR_CHECK_ERROR(cuMemcpyDtoHAsync( + pDst, + std::get(hBuffer->Mem).getPtrWithOffset(Device, offset), + size, Stream)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); } if (blockingRead) { - UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + UR_CHECK_ERROR(cuStreamSynchronize(Stream)); } if (phEvent) { @@ -1577,10 +1687,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( } } catch (ur_result_t Err) { - Result = Err; + return Err; } - - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( @@ -1592,11 +1701,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( UR_RESULT_ERROR_INVALID_SIZE); ur_result_t Result = UR_RESULT_SUCCESS; - CUdeviceptr DevPtr = std::get(hBuffer->Mem).get(); + CUdeviceptr DevPtr = + std::get(hBuffer->Mem).getPtr(hQueue->getDevice()); std::unique_ptr RetImplEvent{nullptr}; try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream CuStream = hQueue->getNextTransferStream(); Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, @@ -1709,3 +1819,33 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueWriteHostPipe( return 
UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr RetImplEvent{nullptr}; + try { + ScopedContext Active(hQueue->getDevice()); + CUstream CuStream = hQueue->getNextComputeStream(); + + UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)); + + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_TIMESTAMP_RECORDING_EXP, hQueue, CuStream)); + UR_CHECK_ERROR(RetImplEvent->start()); + UR_CHECK_ERROR(RetImplEvent->record()); + + if (blocking) { + UR_CHECK_ERROR(cuStreamSynchronize(CuStream)); + } + + *phEvent = RetImplEvent.release(); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index ac66bf479e..f9889a3f46 100644 --- a/source/adapters/cuda/event.cpp +++ b/source/adapters/cuda/event.cpp @@ -54,7 +54,7 @@ ur_result_t ur_event_handle_t_::start() { ur_result_t Result = UR_RESULT_SUCCESS; try { - if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) { // NOTE: This relies on the default stream to be unused. 
UR_CHECK_ERROR(cuEventRecord(EvQueued, 0)); UR_CHECK_ERROR(cuEventRecord(EvStart, Stream)); @@ -149,7 +149,7 @@ ur_result_t ur_event_handle_t_::release() { UR_CHECK_ERROR(cuEventDestroy(EvEnd)); - if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) { UR_CHECK_ERROR(cuEventDestroy(EvQueued)); UR_CHECK_ERROR(cuEventDestroy(EvStart)); } @@ -190,7 +190,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); ur_queue_handle_t Queue = hEvent->getQueue(); - if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + if (Queue == nullptr || (!(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) && + !hEvent->isTimestampEvent())) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } @@ -220,7 +221,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t, UR_APIEXPORT ur_result_t UR_APICALL urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { try { - ScopedContext Active(phEventWaitList[0]->getContext()); + ScopedContext Active(phEventWaitList[0]->getQueue()->getDevice()); auto WaitFunc = [](ur_event_handle_t Event) -> ur_result_t { UR_ASSERT(Event, UR_RESULT_ERROR_INVALID_EVENT); @@ -255,7 +256,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { std::unique_ptr event_ptr{hEvent}; ur_result_t Result = UR_RESULT_ERROR_INVALID_EVENT; try { - ScopedContext Active(hEvent->getContext()); Result = hEvent->release(); } catch (...) 
{ Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; diff --git a/source/adapters/cuda/event.hpp b/source/adapters/cuda/event.hpp index 390fd7833a..5ed68f0f25 100644 --- a/source/adapters/cuda/event.hpp +++ b/source/adapters/cuda/event.hpp @@ -57,6 +57,10 @@ struct ur_event_handle_t_ { return UR_EVENT_STATUS_COMPLETE; } + bool isTimestampEvent() const noexcept { + return getCommandType() == UR_COMMAND_TIMESTAMP_RECORDING_EXP; + } + ur_context_handle_t getContext() const noexcept { return Context; }; uint32_t incrementReferenceCount() { return ++RefCount; } @@ -83,13 +87,14 @@ struct ur_event_handle_t_ { static ur_event_handle_t makeNative(ur_command_t Type, ur_queue_handle_t Queue, CUstream Stream, uint32_t StreamToken = std::numeric_limits::max()) { - const bool ProfilingEnabled = - Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE; + const bool RequiresTimings = + Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || + Type == UR_COMMAND_TIMESTAMP_RECORDING_EXP; native_type EvEnd = nullptr, EvQueued = nullptr, EvStart = nullptr; UR_CHECK_ERROR(cuEventCreate( - &EvEnd, ProfilingEnabled ? CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); + &EvEnd, RequiresTimings ? 
CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); - if (ProfilingEnabled) { + if (RequiresTimings) { UR_CHECK_ERROR(cuEventCreate(&EvQueued, CU_EVENT_DEFAULT)); UR_CHECK_ERROR(cuEventCreate(&EvStart, CU_EVENT_DEFAULT)); } diff --git a/source/adapters/cuda/image.cpp b/source/adapters/cuda/image.cpp index 3dd1a7f34b..95dc2e258e 100644 --- a/source/adapters/cuda/image.cpp +++ b/source/adapters/cuda/image.cpp @@ -331,7 +331,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPitchedAllocExp( const ur_usm_desc_t *pUSMDesc, ur_usm_pool_handle_t pool, size_t widthInBytes, size_t height, size_t elementSizeBytes, void **ppMem, size_t *pResultPitch) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); std::ignore = pUSMDesc; std::ignore = pool; @@ -350,7 +352,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPitchedAllocExp( UR_RESULT_ERROR_INVALID_VALUE); ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); UR_CHECK_ERROR(cuMemAllocPitch((CUdeviceptr *)ppMem, pResultPitch, widthInBytes, height, elementSizeBytes)); } catch (ur_result_t error) { @@ -366,7 +368,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageHandleDestroyExp(ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_image_handle_t hImage) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); UR_CHECK_ERROR(cuSurfObjectDestroy((CUsurfObject)hImage)); @@ -377,7 +381,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageHandleDestroyExp(ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_image_handle_t hImage) { - UR_ASSERT((hContext->getDevice()->get() == 
hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); UR_CHECK_ERROR(cuTexObjectDestroy((CUtexObject)hImage)); @@ -388,7 +394,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, ur_exp_image_mem_handle_t *phImageMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); // Populate descriptor @@ -435,7 +443,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); // Allocate a cuArray if (pImageDesc->numMipLevel == 1) { @@ -475,10 +483,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageFreeExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_image_mem_handle_t hImageMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); try { UR_CHECK_ERROR(cuArrayDestroy((CUarray)hImageMem)); } catch (ur_result_t Err) { @@ -492,9 +502,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageFreeExp( UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_image_mem_handle_t hImageMem, const ur_image_format_t *pImageFormat, - const ur_image_desc_t 
*pImageDesc, ur_mem_handle_t *phMem, + [[maybe_unused]] const ur_image_desc_t *pImageDesc, ur_exp_image_handle_t *phImage) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); unsigned int NumChannels = 0; @@ -509,7 +521,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( try { - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); CUDA_RESOURCE_DESC image_res_desc = {}; @@ -523,15 +535,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( UR_CHECK_ERROR(cuSurfObjectCreate(&surface, &image_res_desc)); *phImage = (ur_exp_image_handle_t)surface; - auto urMemObj = std::unique_ptr(new ur_mem_handle_t_{ - hContext, (CUarray)hImageMem, surface, pImageDesc->type}); - - if (urMemObj == nullptr) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } - - *phMem = urMemObj.release(); - } catch (ur_result_t Err) { return Err; } catch (...) 
{ @@ -545,11 +548,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_image_mem_handle_t hImageMem, const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, ur_sampler_handle_t hSampler, - ur_mem_handle_t *phMem, ur_exp_image_handle_t *phImage) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + ur_exp_image_handle_t *phImage) { + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); unsigned int NumChannels = 0; UR_CHECK_ERROR( @@ -611,15 +616,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( UR_CHECK_ERROR( urTextureCreate(hSampler, pImageDesc, image_res_desc, phImage)); - auto urMemObj = std::unique_ptr(new ur_mem_handle_t_{ - hContext, (CUarray)hImageMem, (CUtexObject)*phImage, hSampler, - pImageDesc->type}); - - if (urMemObj == nullptr) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } - - *phMem = urMemObj.release(); } catch (ur_result_t Err) { return Err; } catch (...) 
{ @@ -654,7 +650,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( &PixelSizeBytes)); try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream Stream = hQueue->getNextTransferStream(); enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); @@ -986,11 +982,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapGetLevelExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_image_mem_handle_t hImageMem, uint32_t mipmapLevel, ur_exp_image_mem_handle_t *phImageMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); try { - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); CUarray ImageArray; UR_CHECK_ERROR(cuMipmappedArrayGetLevel( &ImageArray, (CUmipmappedArray)hImageMem, mipmapLevel)); @@ -1007,10 +1005,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapGetLevelExp( UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMipmapFreeExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_image_mem_handle_t hMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); try { UR_CHECK_ERROR(cuMipmappedArrayDestroy((CUmipmappedArray)hMem)); } catch (ur_result_t Err) { @@ -1025,11 +1025,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImportOpaqueFDExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, ur_exp_interop_mem_desc_t *pInteropMemDesc, ur_exp_interop_mem_handle_t *phInteropMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + 
UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); try { - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); CUDA_EXTERNAL_MEMORY_HANDLE_DESC extMemDesc = {}; extMemDesc.size = size; @@ -1068,7 +1070,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, ur_exp_interop_mem_handle_t hInteropMem, ur_exp_image_mem_handle_t *phImageMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); unsigned int NumChannels = 0; @@ -1080,7 +1084,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( pImageFormat->channelType, pImageFormat->channelOrder, &format, nullptr)); try { - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); CUDA_ARRAY3D_DESCRIPTOR ArrayDesc = {}; ArrayDesc.Width = pImageDesc->width; @@ -1119,11 +1123,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_interop_mem_handle_t hInteropMem) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); try { - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); UR_CHECK_ERROR(cuDestroyExternalMemory((CUexternalMemory)hInteropMem)); } catch (ur_result_t Err) { return Err; @@ -1138,11 +1144,13 @@ urBindlessImagesImportExternalSemaphoreOpaqueFDExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, 
ur_exp_interop_semaphore_desc_t *pInteropSemaphoreDesc, ur_exp_interop_semaphore_handle_t *phInteropSemaphoreHandle) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); try { - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC extSemDesc = {}; @@ -1177,11 +1185,13 @@ urBindlessImagesImportExternalSemaphoreOpaqueFDExp( UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesDestroyExternalSemaphoreExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_interop_semaphore_handle_t hInteropSemaphore) { - UR_ASSERT((hContext->getDevice()->get() == hDevice->get()), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); try { - ScopedContext Active(hDevice->getContext()); + ScopedContext Active(hDevice); UR_CHECK_ERROR( cuDestroyExternalSemaphore((CUexternalSemaphore)hInteropSemaphore)); } catch (ur_result_t Err) { @@ -1198,7 +1208,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( ur_event_handle_t *phEvent) { try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream Stream = hQueue->getNextTransferStream(); enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); @@ -1230,7 +1240,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( ur_event_handle_t *phEvent) { try { - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); CUstream Stream = hQueue->getNextTransferStream(); enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp index c9334add15..675fdbe0a3 100644 
--- a/source/adapters/cuda/kernel.cpp +++ b/source/adapters/cuda/kernel.cpp @@ -19,7 +19,7 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, std::unique_ptr Kernel{nullptr}; try { - ScopedContext Active(hProgram->getContext()); + ScopedContext Active(hProgram->getDevice()); CUfunction CuFunc; CUresult FunctionResult = @@ -293,8 +293,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, const ur_kernel_arg_mem_obj_properties_t *Properties, ur_mem_handle_t hArgValue) { - std::ignore = Properties; - // Below sets kernel arg when zero-sized buffers are handled. // In such case the corresponding memory is null. if (hArgValue == nullptr) { @@ -304,10 +302,14 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, ur_result_t Result = UR_RESULT_SUCCESS; try { - if (hArgValue->MemType == ur_mem_handle_t_::Type::Surface) { + auto Device = hKernel->getProgram()->getDevice(); + ur_mem_flags_t MemAccess = + Properties ? 
Properties->memoryAccess : UR_MEM_FLAG_READ_WRITE; + hKernel->Args.addMemObjArg(argIndex, hArgValue, MemAccess); + if (hArgValue->isImage()) { CUDA_ARRAY3D_DESCRIPTOR arrayDesc; UR_CHECK_ERROR(cuArray3DGetDescriptor( - &arrayDesc, std::get(hArgValue->Mem).getArray())); + &arrayDesc, std::get(hArgValue->Mem).getArray(Device))); if (arrayDesc.Format != CU_AD_FORMAT_UNSIGNED_INT32 && arrayDesc.Format != CU_AD_FORMAT_SIGNED_INT32 && arrayDesc.Format != CU_AD_FORMAT_HALF && @@ -317,10 +319,11 @@ urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, UR_RESULT_ERROR_ADAPTER_SPECIFIC); return UR_RESULT_ERROR_ADAPTER_SPECIFIC; } - CUsurfObject CuSurf = std::get(hArgValue->Mem).getSurface(); + CUsurfObject CuSurf = + std::get(hArgValue->Mem).getSurface(Device); hKernel->setKernelArg(argIndex, sizeof(CuSurf), (void *)&CuSurf); } else { - CUdeviceptr CuPtr = std::get(hArgValue->Mem).get(); + CUdeviceptr CuPtr = std::get(hArgValue->Mem).getPtr(Device); hKernel->setKernelArg(argIndex, sizeof(CUdeviceptr), (void *)&CuPtr); } } catch (ur_result_t Err) { diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp index 9e65066ba7..b7a7358b27 100644 --- a/source/adapters/cuda/kernel.hpp +++ b/source/adapters/cuda/kernel.hpp @@ -63,6 +63,14 @@ struct ur_kernel_handle_t_ { args_size_t ParamSizes; args_index_t Indices; args_size_t OffsetPerIndex; + // A struct to keep track of memargs so that we can do dependency analysis + // at urEnqueueKernelLaunch + struct mem_obj_arg { + ur_mem_handle_t_ *Mem; + int Index; + ur_mem_flags_t AccessFlags; + }; + std::vector MemObjArgs; std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0}; @@ -116,6 +124,20 @@ struct ur_kernel_handle_t_ { Size + (AlignedLocalOffset - LocalOffset)); } + void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) { + assert(hMem && "Invalid mem handle"); + // To avoid redundancy we are not storing mem obj with index i at index + // i in the vec of MemObjArgs. 
+ for (auto &Arg : MemObjArgs) { + if (Arg.Index == Index) { + // Overwrite the mem obj with the same index + Arg = arguments::mem_obj_arg{hMem, Index, Flags}; + return; + } + } + MemObjArgs.push_back(arguments::mem_obj_arg{hMem, Index, Flags}); + } + void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) { assert(Size == sizeof(std::uint32_t) * 3); std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); @@ -142,7 +164,7 @@ struct ur_kernel_handle_t_ { urContextRetain(Context); /// Note: this code assumes that there is only one device per context ur_result_t RetError = urKernelGetGroupInfo( - this, Context->getDevice(), + this, Program->getDevice(), UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, sizeof(ReqdThreadsPerBlock), ReqdThreadsPerBlock, nullptr); (void)RetError; @@ -165,6 +187,7 @@ struct ur_kernel_handle_t_ { uint32_t getReferenceCount() const noexcept { return RefCount; } native_type get() const noexcept { return Function; }; + ur_program_handle_t getProgram() const noexcept { return Program; }; native_type get_with_offset_parameter() const noexcept { return FunctionWithOffsetParam; diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index f097d2474e..c28ce98748 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -36,59 +36,50 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( const bool PerformInitialCopy = (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && !EnableUseHostPtr); - ur_result_t Result = UR_RESULT_SUCCESS; ur_mem_handle_t MemObj = nullptr; try { - ScopedContext Active(hContext); - CUdeviceptr Ptr = 0; auto HostPtr = pProperties ? 
pProperties->pHost : nullptr; - BufferMem::AllocMode AllocMode = BufferMem::AllocMode::Classic; if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && EnableUseHostPtr) { UR_CHECK_ERROR( cuMemHostRegister(HostPtr, size, CU_MEMHOSTREGISTER_DEVICEMAP)); - UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0)); AllocMode = BufferMem::AllocMode::UseHostPtr; } else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { UR_CHECK_ERROR(cuMemAllocHost(&HostPtr, size)); - UR_CHECK_ERROR(cuMemHostGetDevicePointer(&Ptr, HostPtr, 0)); AllocMode = BufferMem::AllocMode::AllocHostPtr; - } else { - UR_CHECK_ERROR(cuMemAlloc(&Ptr, size)); - if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { - AllocMode = BufferMem::AllocMode::CopyIn; - } + } else if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { + AllocMode = BufferMem::AllocMode::CopyIn; } - ur_mem_handle_t parentBuffer = nullptr; + auto URMemObj = std::unique_ptr( + new ur_mem_handle_t_{hContext, flags, AllocMode, HostPtr, size}); + if (URMemObj == nullptr) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } - auto URMemObj = std::unique_ptr(new ur_mem_handle_t_{ - hContext, parentBuffer, flags, AllocMode, Ptr, HostPtr, size}); - if (URMemObj != nullptr) { - MemObj = URMemObj.release(); - if (PerformInitialCopy) { - // Operates on the default stream of the current CUDA context. + // First allocation will be made at urMemBufferCreate if context only + // has one device + if (PerformInitialCopy && HostPtr) { + // Perform initial copy to every device in context + for (auto &Device : hContext->getDevices()) { + ScopedContext Active(Device); + // getPtr may allocate mem if not already allocated + const auto &Ptr = std::get(URMemObj->Mem).getPtr(Device); UR_CHECK_ERROR(cuMemcpyHtoD(Ptr, HostPtr, size)); - // Synchronize with default stream implicitly used by cuMemcpyHtoD - // to make buffer data available on device before any other UR call - // uses it. 
- CUstream defaultStream = 0; - UR_CHECK_ERROR(cuStreamSynchronize(defaultStream)); } - } else { - Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } + MemObj = URMemObj.release(); } catch (ur_result_t Err) { - Result = Err; + return Err; } catch (...) { - Result = UR_RESULT_ERROR_OUT_OF_RESOURCES; + return UR_RESULT_ERROR_OUT_OF_RESOURCES; } *phBuffer = MemObj; - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { @@ -117,26 +108,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { return UR_RESULT_SUCCESS; } - ScopedContext Active(MemObjPtr->getContext()); - - if (hMem->MemType == ur_mem_handle_t_::Type::Buffer) { - auto &BufferImpl = std::get(MemObjPtr->Mem); - switch (BufferImpl.MemAllocMode) { - case BufferMem::AllocMode::CopyIn: - case BufferMem::AllocMode::Classic: - UR_CHECK_ERROR(cuMemFree(BufferImpl.Ptr)); - break; - case BufferMem::AllocMode::UseHostPtr: - UR_CHECK_ERROR(cuMemHostUnregister(BufferImpl.HostPtr)); - break; - case BufferMem::AllocMode::AllocHostPtr: - UR_CHECK_ERROR(cuMemFreeHost(BufferImpl.HostPtr)); - }; - } else if (hMem->MemType == ur_mem_handle_t_::Type::Surface) { - auto &SurfaceImpl = std::get(MemObjPtr->Mem); - UR_CHECK_ERROR(cuSurfObjectDestroy(SurfaceImpl.getSurface())); - UR_CHECK_ERROR(cuArrayDestroy(SurfaceImpl.getArray())); - } + UR_CHECK_ERROR(hMem->clear()); } catch (ur_result_t Err) { Result = Err; @@ -161,10 +133,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { /// \param[out] phNativeMem Set to the native handle of the UR mem object. 
/// /// \return UR_RESULT_SUCCESS -UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle( - ur_mem_handle_t hMem, ur_device_handle_t, ur_native_handle_t *phNativeMem) { - *phNativeMem = reinterpret_cast( - std::get(hMem->Mem).get()); +UR_APIEXPORT ur_result_t UR_APICALL +urMemGetNativeHandle(ur_mem_handle_t hMem, ur_device_handle_t Device, + ur_native_handle_t *phNativeMem) { + try { + *phNativeMem = reinterpret_cast( + std::get(hMem->Mem).getPtr(Device)); + } catch (ur_result_t Err) { + return Err; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } return UR_RESULT_SUCCESS; } @@ -177,14 +156,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet); - ScopedContext Active(hMemory->getContext()); + // Any device in context will do + auto Device = hMemory->getContext()->getDevices()[0]; + ScopedContext Active(Device); switch (MemInfoType) { case UR_MEM_INFO_SIZE: { try { size_t AllocSize = 0; UR_CHECK_ERROR(cuMemGetAddressRange( - nullptr, &AllocSize, std::get(hMemory->Mem).Ptr)); + nullptr, &AllocSize, + std::get(hMemory->Mem).getPtr(Device))); return ReturnValue(AllocSize); } catch (ur_result_t Err) { return Err; @@ -242,160 +224,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); } - ur_result_t Result = UR_RESULT_SUCCESS; - // We only support RBGA channel order // TODO: check SYCL CTS and spec. May also have to support BGRA UR_ASSERT(pImageFormat->channelOrder == UR_IMAGE_CHANNEL_ORDER_RGBA, UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION); - // We have to use cuArray3DCreate, which has some caveats. The height and - // depth parameters must be set to 0 produce 1D or 2D arrays. pImageDesc gives - // a minimum value of 1, so we need to convert the answer. 
- CUDA_ARRAY3D_DESCRIPTOR ArrayDesc; - ArrayDesc.NumChannels = 4; // Only support 4 channel image - ArrayDesc.Flags = 0; // No flags required - ArrayDesc.Width = pImageDesc->width; - if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { - ArrayDesc.Height = 0; - ArrayDesc.Depth = 0; - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { - ArrayDesc.Height = pImageDesc->height; - ArrayDesc.Depth = 0; - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { - ArrayDesc.Height = pImageDesc->height; - ArrayDesc.Depth = pImageDesc->depth; - } - - // We need to get this now in bytes for calculating the total image size later - size_t PixelTypeSizeBytes; - - switch (pImageFormat->channelType) { - case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8: - case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: - ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT8; - PixelTypeSizeBytes = 1; - break; - case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8: - ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT8; - PixelTypeSizeBytes = 1; - break; - case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16: - case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: - ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT16; - PixelTypeSizeBytes = 2; - break; - case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16: - ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT16; - PixelTypeSizeBytes = 2; - break; - case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT: - ArrayDesc.Format = CU_AD_FORMAT_HALF; - PixelTypeSizeBytes = 2; - break; - case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: - ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT32; - PixelTypeSizeBytes = 4; - break; - case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32: - ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT32; - PixelTypeSizeBytes = 4; - break; - case UR_IMAGE_CHANNEL_TYPE_FLOAT: - ArrayDesc.Format = CU_AD_FORMAT_FLOAT; - PixelTypeSizeBytes = 4; - break; - default: - detail::ur::die( - "urMemImageCreate given unsupported image_channel_data_type"); - } - - // When a dimension isn't used pImageDesc has the size set to 1 - size_t PixelSizeBytes = - PixelTypeSizeBytes * 4; // 4 is the 
only number of channels we support - size_t ImageSizeBytes = PixelSizeBytes * pImageDesc->width * - pImageDesc->height * pImageDesc->depth; - - ScopedContext Active(hContext); - CUarray ImageArray = nullptr; - try { - UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &ArrayDesc)); - } catch (ur_result_t Err) { - if (Err == UR_RESULT_ERROR_INVALID_VALUE) { - return UR_RESULT_ERROR_INVALID_IMAGE_SIZE; - } - return Err; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } + auto URMemObj = std::unique_ptr( + new ur_mem_handle_t_{hContext, flags, *pImageFormat, *pImageDesc, pHost}); try { if (PerformInitialCopy) { - // We have to use a different copy function for each image dimensionality - if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { - UR_CHECK_ERROR(cuMemcpyHtoA(ImageArray, 0, pHost, ImageSizeBytes)); - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { - CUDA_MEMCPY2D CpyDesc; - memset(&CpyDesc, 0, sizeof(CpyDesc)); - CpyDesc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - CpyDesc.srcHost = pHost; - CpyDesc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - CpyDesc.dstArray = ImageArray; - CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width; - CpyDesc.Height = pImageDesc->height; - UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc)); - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { - CUDA_MEMCPY3D CpyDesc; - memset(&CpyDesc, 0, sizeof(CpyDesc)); - CpyDesc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; - CpyDesc.srcHost = pHost; - CpyDesc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; - CpyDesc.dstArray = ImageArray; - CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width; - CpyDesc.Height = pImageDesc->height; - CpyDesc.Depth = pImageDesc->depth; - UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc)); + for (const auto &Device : hContext->getDevices()) { + UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(URMemObj.get(), Device)); } } - // CUDA_RESOURCE_DESC is a union of different structs, shown here - // 
https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html - // We need to fill it as described here to use it for a surface or texture - // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html - // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and - // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA array - // handle. - // CUDA_RESOURCE_DESC::flags must be set to zero - - CUDA_RESOURCE_DESC ImageResDesc; - ImageResDesc.res.array.hArray = ImageArray; - ImageResDesc.resType = CU_RESOURCE_TYPE_ARRAY; - ImageResDesc.flags = 0; - - CUsurfObject Surface; - UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc)); - - auto MemObj = std::unique_ptr(new ur_mem_handle_t_( - hContext, ImageArray, Surface, flags, pImageDesc->type, phMem)); - - if (MemObj == nullptr) { + if (URMemObj == nullptr) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } - *phMem = MemObj.release(); + *phMem = URMemObj.release(); } catch (ur_result_t Err) { - if (ImageArray) { - cuArrayDestroy(ImageArray); - } + (*phMem)->clear(); return Err; } catch (...) 
{ - if (ImageArray) { - cuArrayDestroy(ImageArray); - } + (*phMem)->clear(); return UR_RESULT_ERROR_UNKNOWN; } - - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t hMemory, @@ -407,14 +263,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t hMemory, auto Context = hMemory->getContext(); - ScopedContext Active(Context); + // Any device will do + auto Device = Context->getDevices()[0]; + ScopedContext Active(Device); UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); try { CUDA_ARRAY3D_DESCRIPTOR ArrayInfo; UR_CHECK_ERROR(cuArray3DGetDescriptor( - &ArrayInfo, std::get(hMemory->Mem).getArray())); + &ArrayInfo, std::get(hMemory->Mem).getArray(Device))); const auto cuda2urFormat = [](CUarray_format CUFormat, ur_image_channel_type_t *ChannelType) { @@ -544,27 +402,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( UR_ASSERT(pRegion->size != 0u, UR_RESULT_ERROR_INVALID_BUFFER_SIZE); auto &BufferImpl = std::get(hBuffer->Mem); - - assert((pRegion->origin <= (pRegion->origin + pRegion->size)) && "Overflow"); UR_ASSERT(((pRegion->origin + pRegion->size) <= BufferImpl.getSize()), UR_RESULT_ERROR_INVALID_BUFFER_SIZE); - // Retained indirectly due to retaining parent buffer below. 
- ur_context_handle_t Context = hBuffer->Context; - - BufferMem::AllocMode AllocMode = BufferMem::AllocMode::Classic; - assert(BufferImpl.Ptr != BufferMem::native_type{0}); - BufferMem::native_type Ptr = BufferImpl.Ptr + pRegion->origin; - - void *HostPtr = nullptr; - if (BufferImpl.HostPtr) { - HostPtr = static_cast(BufferImpl.HostPtr) + pRegion->origin; - } - - std::unique_ptr MemObj{nullptr}; + std::unique_ptr RetMemObj{nullptr}; try { - MemObj = std::unique_ptr{new ur_mem_handle_t_{ - Context, hBuffer, flags, AllocMode, Ptr, HostPtr, pRegion->size}}; + for (auto Device : hBuffer->Context->getDevices()) { + BufferImpl.getPtr( + Device); // This is allocating a dev ptr behind the scenes + // which is necessary before SubBuffer partition + } + RetMemObj = std::unique_ptr{ + new ur_mem_handle_t_{hBuffer, pRegion->origin}}; } catch (ur_result_t Err) { *phMem = nullptr; return Err; @@ -573,6 +422,189 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } - *phMem = MemObj.release(); + *phMem = RetMemObj.release(); + return UR_RESULT_SUCCESS; +} + +ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, + const ur_device_handle_t hDevice) { + ScopedContext Active(hDevice); + ur_lock LockGuard(Mem->MemoryAllocationMutex); + + if (Mem->isBuffer()) { + auto &Buffer = std::get(Mem->Mem); + auto &DevPtr = Buffer.Ptrs[hDevice->getIndex() % Buffer.Ptrs.size()]; + + // Allocation has already been made + if (DevPtr != BufferMem::native_type{0}) { + return UR_RESULT_SUCCESS; + } + + if (Buffer.MemAllocMode == BufferMem::AllocMode::AllocHostPtr) { + // Host allocation has already been made + UR_CHECK_ERROR(cuMemHostGetDevicePointer(&DevPtr, Buffer.HostPtr, 0)); + } else if (Buffer.MemAllocMode == BufferMem::AllocMode::UseHostPtr) { + UR_CHECK_ERROR(cuMemHostRegister(Buffer.HostPtr, Buffer.Size, + CU_MEMHOSTALLOC_DEVICEMAP)); + UR_CHECK_ERROR(cuMemHostGetDevicePointer(&DevPtr, Buffer.HostPtr, 0)); + } else { + 
UR_CHECK_ERROR(cuMemAlloc(&DevPtr, Buffer.Size)); + } + } else { + CUarray ImageArray; + CUsurfObject Surface; + try { + auto &Image = std::get(Mem->Mem); + // Allocation has already been made + if (Image.Arrays[hDevice->getIndex() % Image.Arrays.size()]) { + return UR_RESULT_SUCCESS; + } + UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &Image.ArrayDesc)); + Image.Arrays[hDevice->getIndex() % Image.Arrays.size()] = ImageArray; + + // CUDA_RESOURCE_DESC is a union of different structs, shown here + // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html + // We need to fill it as described here to use it for a surface or texture + // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__SURFOBJECT.html + // CUDA_RESOURCE_DESC::resType must be CU_RESOURCE_TYPE_ARRAY and + // CUDA_RESOURCE_DESC::res::array::hArray must be set to a valid CUDA + // array handle. CUDA_RESOURCE_DESC::flags must be set to zero + CUDA_RESOURCE_DESC ImageResDesc; + ImageResDesc.res.array.hArray = ImageArray; + ImageResDesc.resType = CU_RESOURCE_TYPE_ARRAY; + ImageResDesc.flags = 0; + + UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc)); + Image.SurfObjs[hDevice->getIndex() % Image.SurfObjs.size()] = Surface; + } catch (ur_result_t Err) { + if (ImageArray) { + UR_CHECK_ERROR(cuArrayDestroy(ImageArray)); + } + return Err; + } catch (...) 
{ + if (ImageArray) { + UR_CHECK_ERROR(cuArrayDestroy(ImageArray)); + } + return UR_RESULT_ERROR_UNKNOWN; + } + } + return UR_RESULT_SUCCESS; +} + +namespace { +ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem, + ur_device_handle_t hDevice) { + auto &Buffer = std::get(Mem->Mem); + if (Mem->LastEventWritingToMemObj == nullptr) { + // Device allocation being initialized from host for the first time + if (Buffer.HostPtr) { + UR_CHECK_ERROR( + cuMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size)); + } + } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() != + hDevice) { + UR_CHECK_ERROR(cuMemcpyDtoD( + Buffer.getPtr(hDevice), + Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()), + Buffer.Size)); + } + return UR_RESULT_SUCCESS; +} + +ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, + ur_device_handle_t hDevice) { + auto &Image = std::get(Mem->Mem); + // When a dimension isn't used image_desc has the size set to 1 + size_t PixelSizeBytes = Image.PixelTypeSizeBytes * + 4; // 4 is the only number of channels we support + size_t ImageSizeBytes = PixelSizeBytes * Image.ImageDesc.width * + Image.ImageDesc.height * Image.ImageDesc.depth; + + CUarray ImageArray = Image.getArray(hDevice); + + CUDA_MEMCPY2D CpyDesc2D; + CUDA_MEMCPY3D CpyDesc3D; + // We have to use a different copy function for each image + // dimensionality + if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { + memset(&CpyDesc2D, 0, sizeof(CpyDesc2D)); + CpyDesc2D.srcHost = Image.HostPtr; + CpyDesc2D.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; + CpyDesc2D.dstArray = ImageArray; + CpyDesc2D.WidthInBytes = PixelSizeBytes * Image.ImageDesc.width; + CpyDesc2D.Height = Image.ImageDesc.height; + } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { + memset(&CpyDesc3D, 0, sizeof(CpyDesc3D)); + CpyDesc3D.srcHost = Image.HostPtr; + CpyDesc3D.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY; + CpyDesc3D.dstArray = ImageArray; + 
CpyDesc3D.WidthInBytes = PixelSizeBytes * Image.ImageDesc.width; + CpyDesc3D.Height = Image.ImageDesc.height; + CpyDesc3D.Depth = Image.ImageDesc.depth; + } + + if (Mem->LastEventWritingToMemObj == nullptr) { + if (Image.HostPtr) { + if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { + UR_CHECK_ERROR( + cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); + } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { + CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + CpyDesc2D.srcHost = Image.HostPtr; + UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D)); + } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { + CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; + CpyDesc3D.srcHost = Image.HostPtr; + UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D)); + } + } + } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() != + hDevice) { + if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { + // FIXME: 1D memcpy from DtoD going through the host. + UR_CHECK_ERROR(cuMemcpyAtoH( + Image.HostPtr, + Image.getArray( + Mem->LastEventWritingToMemObj->getQueue()->getDevice()), + 0 /*srcOffset*/, ImageSizeBytes)); + UR_CHECK_ERROR( + cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); + } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { + CpyDesc2D.srcArray = Image.getArray( + Mem->LastEventWritingToMemObj->getQueue()->getDevice()); + UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D)); + } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { + CpyDesc3D.srcArray = Image.getArray( + Mem->LastEventWritingToMemObj->getQueue()->getDevice()); + UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D)); + } + } + return UR_RESULT_SUCCESS; +} +} // namespace + +// If calling this entry point it is necessary to lock the memoryMigrationMutex +// beforehand +ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem, + const ur_device_handle_t hDevice) { + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + // Device allocation has already been initialized with most 
up to date + // data in buffer + if (Mem->HaveMigratedToDeviceSinceLastWrite + [hDevice->getIndex() % + Mem->HaveMigratedToDeviceSinceLastWrite.size()]) { + return UR_RESULT_SUCCESS; + } + + ScopedContext Active(hDevice); + if (Mem->isBuffer()) { + UR_CHECK_ERROR(migrateBufferToDevice(Mem, hDevice)); + } else { + UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice)); + } + + Mem->HaveMigratedToDeviceSinceLastWrite + [hDevice->getIndex() % Mem->HaveMigratedToDeviceSinceLastWrite.size()] = + true; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp index 3f09552038..6b7e9d0156 100644 --- a/source/adapters/cuda/memory.hpp +++ b/source/adapters/cuda/memory.hpp @@ -11,10 +11,19 @@ #include #include +#include #include #include #include "common.hpp" +#include "context.hpp" +#include "device.hpp" +#include "event.hpp" + +ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t); +ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t); // Handler for plain, pointer-based CUDA allocations struct BufferMem { @@ -27,7 +36,7 @@ struct BufferMem { /// Original flags for the mapped region ur_map_flags_t MapFlags; /// Allocated host memory used exclusively for this map. 
- std::unique_ptr MapMem; + std::shared_ptr MapMem; BufferMap(size_t MapSize, size_t MapOffset, ur_map_flags_t MapFlags) : MapSize(MapSize), MapOffset(MapOffset), MapFlags(MapFlags), @@ -61,11 +70,16 @@ struct BufferMem { using native_type = CUdeviceptr; +private: + /// CUDA handler for the pointer + std::vector Ptrs; + +public: /// If this allocation is a sub-buffer (i.e., a view on an existing /// allocation), this is the pointer to the parent handler structure - ur_mem_handle_t Parent; - /// CUDA handler for the pointer - native_type Ptr; + ur_mem_handle_t Parent = nullptr; + /// Outer UR mem holding this BufferMem in variant + ur_mem_handle_t OuterMemStruct; /// Pointer associated with this device on the host void *HostPtr; /// Size of the allocation in bytes @@ -75,12 +89,34 @@ struct BufferMem { AllocMode MemAllocMode; - BufferMem(ur_mem_handle_t Parent, BufferMem::AllocMode Mode, CUdeviceptr Ptr, - void *HostPtr, size_t Size) - : Parent{Parent}, Ptr{Ptr}, HostPtr{HostPtr}, Size{Size}, - PtrToBufferMap{}, MemAllocMode{Mode} {}; + BufferMem(ur_context_handle_t Context, ur_mem_handle_t OuterMemStruct, + AllocMode Mode, void *HostPtr, size_t Size) + : Ptrs(Context->getDevices().size(), native_type{0}), + OuterMemStruct{OuterMemStruct}, HostPtr{HostPtr}, Size{Size}, + MemAllocMode{Mode} {}; - native_type get() const noexcept { return Ptr; } + BufferMem(const BufferMem &Buffer) = default; + + native_type getPtrWithOffset(const ur_device_handle_t Device, size_t Offset) { + if (ur_result_t Err = + allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device); + Err != UR_RESULT_SUCCESS) { + throw Err; + } + return reinterpret_cast( + reinterpret_cast(Ptrs[Device->getIndex() % Ptrs.size()]) + + Offset); + } + + native_type getPtr(const ur_device_handle_t Device) { + return getPtrWithOffset(Device, 0); + } + + void *getVoid(const ur_device_handle_t Device) { + return reinterpret_cast(getPtrWithOffset(Device, 0)); + } + + bool isSubBuffer() const noexcept { return Parent != 
nullptr; } size_t getSize() const noexcept { return Size; } @@ -120,104 +156,315 @@ struct BufferMem { assert(MapPtr != nullptr); PtrToBufferMap.erase(MapPtr); } -}; -// Handler data for surface object (i.e. Images) -struct SurfaceMem { - CUarray Array; - CUsurfObject SurfObj; - ur_mem_type_t ImageType; + ur_result_t clear() { + if (Parent != nullptr) { + return UR_RESULT_SUCCESS; + } - SurfaceMem(CUarray Array, CUsurfObject Surf, ur_mem_type_t ImageType, - void *HostPtr) - : Array{Array}, SurfObj{Surf}, ImageType{ImageType} { - (void)HostPtr; + switch (MemAllocMode) { + case AllocMode::CopyIn: + case AllocMode::Classic: + for (auto &DevPtr : Ptrs) { + if (DevPtr != native_type{0}) { + UR_CHECK_ERROR(cuMemFree(DevPtr)); + } + } + break; + case AllocMode::UseHostPtr: + UR_CHECK_ERROR(cuMemHostUnregister(HostPtr)); + break; + case AllocMode::AllocHostPtr: + UR_CHECK_ERROR(cuMemFreeHost(HostPtr)); + } + return UR_RESULT_SUCCESS; } - CUarray getArray() const noexcept { return Array; } + friend struct ur_mem_handle_t_; + friend ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t); +}; - CUsurfObject getSurface() const noexcept { return SurfObj; } +// Handler data for surface object (i.e. 
Images) +struct SurfaceMem { +private: + std::vector Arrays; + std::vector SurfObjs; - ur_mem_type_t getImageType() const noexcept { return ImageType; } -}; +public: + ur_mem_handle_t OuterMemStruct; -// For sampled/unsampled images -struct ImageMem { - CUarray Array; - void *Handle; - ur_mem_type_t ImageType; - ur_sampler_handle_t Sampler; + ur_image_format_t ImageFormat; + ur_image_desc_t ImageDesc; + CUDA_ARRAY3D_DESCRIPTOR ArrayDesc; + size_t PixelTypeSizeBytes; + void *HostPtr; - ImageMem(CUarray Array, void *Handle, ur_mem_type_t ImageType, - ur_sampler_handle_t Sampler) - : Array{Array}, Handle{Handle}, ImageType{ImageType}, Sampler{Sampler} {}; + SurfaceMem(ur_context_handle_t Context, ur_mem_handle_t OuterMemStruct, + ur_image_format_t ImageFormat, ur_image_desc_t ImageDesc, + void *HostPtr) + : Arrays(Context->Devices.size(), CUarray{0}), + SurfObjs(Context->Devices.size(), CUsurfObject{0}), + OuterMemStruct{OuterMemStruct}, + ImageFormat{ImageFormat}, ImageDesc{ImageDesc}, HostPtr{HostPtr} { + // We have to use hipArray3DCreate, which has some caveats. The height and + // depth parameters must be set to 0 produce 1D or 2D arrays. image_desc + // gives a minimum value of 1, so we need to convert the answer. 
+ ArrayDesc.NumChannels = 4; // Only support 4 channel image + ArrayDesc.Flags = 0; // No flags required + ArrayDesc.Width = ImageDesc.width; + if (ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { + ArrayDesc.Height = 0; + ArrayDesc.Depth = 0; + } else if (ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { + ArrayDesc.Height = ImageDesc.height; + ArrayDesc.Depth = 0; + } else if (ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { + ArrayDesc.Height = ImageDesc.height; + ArrayDesc.Depth = ImageDesc.depth; + } - CUarray get_array() const noexcept { return Array; } + // We need to get PixelTypeSizeBytes for calculating the total image size + // later + switch (ImageFormat.channelType) { + + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8: + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: + ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT8; + PixelTypeSizeBytes = 1; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8: + ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT8; + PixelTypeSizeBytes = 1; + break; + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16: + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: + ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT16; + PixelTypeSizeBytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16: + ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT16; + PixelTypeSizeBytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT: + ArrayDesc.Format = CU_AD_FORMAT_HALF; + PixelTypeSizeBytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: + ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT32; + PixelTypeSizeBytes = 4; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32: + ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT32; + PixelTypeSizeBytes = 4; + break; + case UR_IMAGE_CHANNEL_TYPE_FLOAT: + ArrayDesc.Format = CU_AD_FORMAT_FLOAT; + PixelTypeSizeBytes = 4; + break; + default: + detail::ur::die( + "urMemImageCreate given unsupported image_channel_data_type"); + } + } - void *get_handle() const noexcept { return Handle; } + // Will allocate a new array on device if not already allocated + CUarray getArray(const 
ur_device_handle_t Device) { + if (ur_result_t Err = + allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device); + Err != UR_RESULT_SUCCESS) { + throw Err; + } + return Arrays[Device->getIndex() % Arrays.size()]; + } + // Will allocate a new surface on device if not already allocated + CUsurfObject getSurface(const ur_device_handle_t Device) { + if (ur_result_t Err = + allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device); + Err != UR_RESULT_SUCCESS) { + throw Err; + } + return SurfObjs[Device->getIndex() % SurfObjs.size()]; + } - ur_mem_type_t get_image_type() const noexcept { return ImageType; } + ur_mem_type_t getType() { return ImageDesc.type; } - ur_sampler_handle_t get_sampler() const noexcept { return Sampler; } + ur_result_t clear() { + for (auto Array : Arrays) { + if (Array) { + UR_CHECK_ERROR(cuArrayDestroy(Array)); + } + } + for (auto Surf : SurfObjs) { + if (Surf != CUsurfObject{0}) { + UR_CHECK_ERROR(cuSurfObjectDestroy(Surf)); + } + } + return UR_RESULT_SUCCESS; + } + friend ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t); }; /// UR Mem mapping to CUDA memory allocations, both data and texture/surface. /// \brief Represents non-SVM allocations on the CUDA backend. /// Keeps tracks of all mapped regions used for Map/Unmap calls. /// Only one region can be active at the same time per allocation. +/// +/// The ur_mem_handle_t is responsible for memory allocation and migration +/// across devices in the same ur_context_handle_t. If a kernel writes to a +/// ur_mem_handle_t then it will write to LastEventWritingToMemObj. Then all +/// subsequent operations that want to read from the ur_mem_handle_t must wait +/// on the event referring to the last write. +/// +/// Since urMemBufferCreate/urMemImageCreate do not take a queue or device +/// object, only a ur_context_handle_t, at mem obj creation we don't know which +/// device we must make a native image/allocation on. 
Therefore no allocations +/// are made at urMemBufferCreate/urMemImageCreate. Instead device +/// images/allocations are made lazily. These allocations are made implicitly +/// with a call to getPtr/getArray which will allocate a new allocation/image on +/// device if need be. +/// +/// Memory migration between native allocations for devices in the same +/// ur_context_handle_t will occur at: +/// +/// 1. urEnqueueKernelLaunch +/// 2. urEnqueueMem(Buffer|Image)Read(Rect) +/// +/// Migrations will occur in both cases if the most recent version of data +/// is on a different device, marked by +/// LastEventWritingToMemObj->getQueue()->getDevice() +/// +/// Example trace: +/// ~~~~~~~~~~~~~~ +/// +/// =====> urContextCreate([device0, device1], ...) // associated with [q0, q1] +/// -> OUT: hContext +/// +/// =====> urMemBufferCreate(hContext,...); +/// -> No native allocations made +/// -> OUT: hBuffer +/// +/// =====> urEnqueueMemBufferWrite(q0, hBuffer,...); +/// -> Allocation made on q0 ie device0 +/// -> New allocation initialized with host data. +/// +/// =====> urKernelSetArgMemObj(hKernel0, hBuffer, ...); +/// -> ur_kernel_handle_t associated with a ur_program_handle_t, +/// which is in turn unique to a device. So we can set the kernel +/// arg with the ptr of the device specific allocation. +/// -> hKernel0->getProgram()->getDevice() == device0 +/// -> allocateMemObjOnDeviceIfNeeded(device0); +/// -> Native allocation already made on device0, continue. +/// +/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...); +/// -> Suppose that hKernel0 writes to hBuffer. 
+/// -> Call hBuffer->setLastEventWritingToMemObj with return event +/// from this operation +/// -> Enqueue native kernel launch +/// +/// =====> urKernelSetArgMemObj(hKernel1, hBuffer, ...); +/// -> hKernel1->getProgram()->getDevice() == device1 +/// -> New allocation will be made on device1 when calling +/// getPtr(device1) +/// -> No native allocation on device1 +/// -> Make native allocation on device1 +/// +/// =====> urEnqueueKernelLaunch(q1, hKernel1, ...); +/// -> Suppose hKernel1 wants to read from hBuffer and not write. +/// -> migrateMemoryToDeviceIfNeeded(device1); +/// -> hBuffer->LastEventWritingToMemObj is not nullptr +/// -> Check if memory has been migrated to device1 since the +/// last write +/// -> Hasn't been migrated +/// -> Wait on LastEventWritingToMemObj. +/// -> Migrate memory from device0's native allocation to +/// device1's native allocation. +/// -> Enqueue native kernel launch +/// +/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...); +/// -> migrateMemoryToDeviceIfNeeded(device0); +/// -> hBuffer->LastEventWritingToMemObj refers to an event +/// from q0 +/// -> Migration not necessary +/// -> Enqueue native kernel launch +/// struct ur_mem_handle_t_ { // Context where the memory object is accessible ur_context_handle_t Context; /// Reference counting of the handler std::atomic_uint32_t RefCount; - enum class Type { Buffer, Surface, Texture } MemType; // Original mem flags passed ur_mem_flags_t MemFlags; + // If we make a ur_mem_handle_t_ from a native allocation, it can be useful to + // associate it with the device that holds the native allocation. + ur_device_handle_t DeviceWithNativeAllocation{nullptr}; + + // Has the memory been migrated to a device since the last write? 
+ std::vector HaveMigratedToDeviceSinceLastWrite; + + // We should wait on this event prior to migrating memory across allocations + // in this ur_mem_handle_t_ + ur_event_handle_t LastEventWritingToMemObj{nullptr}; + + // Enumerates all possible types of accesses. + enum access_mode_t { unknown, read_write, read_only, write_only }; + + ur_mutex MemoryAllocationMutex; // A mutex for allocations + ur_mutex MemoryMigrationMutex; // A mutex for memory transfers + /// A UR Memory object represents either plain memory allocations ("Buffers" /// in OpenCL) or typed allocations ("Images" in OpenCL). /// In CUDA their API handlers are different. Whereas "Buffers" are allocated /// as pointer-like structs, "Images" are stored in Textures or Surfaces. - /// This union allows implementation to use either from the same handler. - std::variant Mem; + /// This variant allows implementation to use either from the same handler. + std::variant Mem; /// Constructs the UR mem handler for a non-typed allocation ("buffer") - ur_mem_handle_t_(ur_context_handle_t Context, ur_mem_handle_t Parent, - ur_mem_flags_t MemFlags, BufferMem::AllocMode Mode, - CUdeviceptr Ptr, void *HostPtr, size_t Size) - : Context{Context}, RefCount{1}, MemType{Type::Buffer}, - MemFlags{MemFlags}, Mem{BufferMem{Parent, Mode, Ptr, HostPtr, Size}} { - if (isSubBuffer()) { - urMemRetain(std::get(Mem).Parent); - } else { - urContextRetain(Context); - } - }; - - /// Constructs the UR allocation for an Image object (surface in CUDA) - ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array, - CUsurfObject Surf, ur_mem_flags_t MemFlags, - ur_mem_type_t ImageType, void *HostPtr) - : Context{Context}, RefCount{1}, MemType{Type::Surface}, - MemFlags{MemFlags}, Mem{SurfaceMem{Array, Surf, ImageType, HostPtr}} { + ur_mem_handle_t_(ur_context_handle_t Ctxt, ur_mem_flags_t MemFlags, + BufferMem::AllocMode Mode, void *HostPtr, size_t Size) + : Context{Ctxt}, RefCount{1}, MemFlags{MemFlags}, + 
HaveMigratedToDeviceSinceLastWrite(Context->Devices.size(), false), + Mem{std::in_place_type, Ctxt, this, Mode, HostPtr, Size} { urContextRetain(Context); - } + }; - /// Constructs the UR allocation for an unsampled image object - ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array, - CUsurfObject Surf, ur_mem_type_t ImageType) - : Context{Context}, RefCount{1}, MemType{Type::Surface}, MemFlags{0}, - Mem{ImageMem{Array, (void *)Surf, ImageType, nullptr}} { - urContextRetain(Context); - } + // Subbuffer constructor + ur_mem_handle_t_(ur_mem_handle_t Parent, size_t SubBufferOffset) + : Context{Parent->Context}, RefCount{1}, MemFlags{Parent->MemFlags}, + HaveMigratedToDeviceSinceLastWrite(Parent->Context->Devices.size(), + false), + Mem{BufferMem{std::get(Parent->Mem)}} { + auto &SubBuffer = std::get(Mem); + SubBuffer.Parent = Parent; + SubBuffer.OuterMemStruct = this; + if (SubBuffer.HostPtr) { + SubBuffer.HostPtr = + static_cast(SubBuffer.HostPtr) + SubBufferOffset; + } + for (auto &DevPtr : SubBuffer.Ptrs) { + if (DevPtr) { + DevPtr += SubBufferOffset; + } + } + urMemRetain(Parent); + }; - /// Constructs the UR allocation for a sampled image object - ur_mem_handle_t_(ur_context_handle_t Context, CUarray Array, CUtexObject Tex, - ur_sampler_handle_t Sampler, ur_mem_type_t ImageType) - : Context{Context}, RefCount{1}, MemType{Type::Texture}, MemFlags{0}, - Mem{ImageMem{Array, (void *)Tex, ImageType, Sampler}} { + /// Constructs the UR mem handler for an Image object + ur_mem_handle_t_(ur_context_handle_t Ctxt, ur_mem_flags_t MemFlags, + ur_image_format_t ImageFormat, ur_image_desc_t ImageDesc, + void *HostPtr) + : Context{Ctxt}, RefCount{1}, MemFlags{MemFlags}, + HaveMigratedToDeviceSinceLastWrite(Context->Devices.size(), false), + Mem{std::in_place_type, + Ctxt, + this, + ImageFormat, + ImageDesc, + HostPtr} { urContextRetain(Context); } @@ -229,13 +476,24 @@ struct ur_mem_handle_t_ { urContextRelease(Context); } - bool isBuffer() const noexcept { return 
MemType == Type::Buffer; } + bool isBuffer() const noexcept { + return std::holds_alternative(Mem); + } bool isSubBuffer() const noexcept { return (isBuffer() && (std::get(Mem).Parent != nullptr)); } - bool isImage() const noexcept { return MemType == Type::Surface; } + bool isImage() const noexcept { + return std::holds_alternative(Mem); + } + + ur_result_t clear() { + if (isBuffer()) { + return std::get(Mem).clear(); + } + return std::get(Mem).clear(); + } ur_context_handle_t getContext() const noexcept { return Context; } @@ -244,4 +502,22 @@ struct ur_mem_handle_t_ { uint32_t decrementReferenceCount() noexcept { return --RefCount; } uint32_t getReferenceCount() const noexcept { return RefCount; } + + void setLastEventWritingToMemObj(ur_event_handle_t NewEvent) { + assert(NewEvent && "Invalid event!"); + // This entry point should only ever be called when using multi device ctx + assert(Context->Devices.size() > 1); + urEventRetain(NewEvent); + if (LastEventWritingToMemObj != nullptr) { + urEventRelease(LastEventWritingToMemObj); + } + LastEventWritingToMemObj = NewEvent; + for (const auto &Device : Context->getDevices()) { + // This event is never an interop event so will always have an associated + // queue + HaveMigratedToDeviceSinceLastWrite + [Device->getIndex() % HaveMigratedToDeviceSinceLastWrite.size()] = + Device == NewEvent->getQueue()->getDevice(); + } + } }; diff --git a/source/adapters/cuda/physical_mem.cpp b/source/adapters/cuda/physical_mem.cpp index e2c46696a0..c8d4adf0ef 100644 --- a/source/adapters/cuda/physical_mem.cpp +++ b/source/adapters/cuda/physical_mem.cpp @@ -23,7 +23,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urPhysicalMemCreate( CUmemAllocationProp AllocProps = {}; AllocProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE; AllocProps.type = CU_MEM_ALLOCATION_TYPE_PINNED; - UR_CHECK_ERROR(GetDeviceOrdinal(hDevice, AllocProps.location.id)); + AllocProps.location.id = hDevice->getIndex(); CUmemGenericAllocationHandle ResHandle; switch (auto 
Result = cuMemCreate(&ResHandle, size, &AllocProps, 0)) { @@ -32,7 +32,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urPhysicalMemCreate( default: UR_CHECK_ERROR(Result); } - *phPhysicalMem = new ur_physical_mem_handle_t_(ResHandle, hContext); + *phPhysicalMem = new ur_physical_mem_handle_t_(ResHandle, hContext, hDevice); return UR_RESULT_SUCCESS; } @@ -51,7 +51,7 @@ urPhysicalMemRelease(ur_physical_mem_handle_t hPhysicalMem) { try { std::unique_ptr PhysicalMemGuard(hPhysicalMem); - ScopedContext Active(hPhysicalMem->getContext()); + ScopedContext Active(hPhysicalMem->getDevice()); UR_CHECK_ERROR(cuMemRelease(hPhysicalMem->get())); return UR_RESULT_SUCCESS; } catch (ur_result_t err) { diff --git a/source/adapters/cuda/physical_mem.hpp b/source/adapters/cuda/physical_mem.hpp index 0ce332e112..c77ad0d547 100644 --- a/source/adapters/cuda/physical_mem.hpp +++ b/source/adapters/cuda/physical_mem.hpp @@ -26,41 +26,29 @@ struct ur_physical_mem_handle_t_ { std::atomic_uint32_t RefCount; native_type PhysicalMem; ur_context_handle_t_ *Context; + ur_device_handle_t Device; - ur_physical_mem_handle_t_(native_type PhysMem, ur_context_handle_t_ *Ctx) - : RefCount(1), PhysicalMem(PhysMem), Context(Ctx) { + ur_physical_mem_handle_t_(native_type PhysMem, ur_context_handle_t_ *Ctx, + ur_device_handle_t Device) + : RefCount(1), PhysicalMem(PhysMem), Context(Ctx), Device(Device) { urContextRetain(Context); + urDeviceRetain(Device); } - ~ur_physical_mem_handle_t_() { urContextRelease(Context); } + ~ur_physical_mem_handle_t_() { + urContextRelease(Context); + urDeviceRelease(Device); + } native_type get() const noexcept { return PhysicalMem; } ur_context_handle_t_ *getContext() const noexcept { return Context; } + ur_device_handle_t_ *getDevice() const noexcept { return Device; } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } uint32_t decrementReferenceCount() noexcept { return --RefCount; } uint32_t getReferenceCount() const noexcept { return RefCount; } }; - -// Find 
a device ordinal of a device. -inline ur_result_t GetDeviceOrdinal(ur_device_handle_t Device, int &Ordinal) { - ur_adapter_handle_t AdapterHandle = &adapter; - // Get list of platforms - uint32_t NumPlatforms; - UR_CHECK_ERROR(urPlatformGet(&AdapterHandle, 1, 0, nullptr, &NumPlatforms)); - UR_ASSERT(NumPlatforms, UR_RESULT_ERROR_UNKNOWN); - - std::vector Platforms{NumPlatforms}; - UR_CHECK_ERROR(urPlatformGet(&AdapterHandle, 1, NumPlatforms, - Platforms.data(), nullptr)); - - // Ordinal corresponds to the platform ID as each device has its own platform. - CUdevice NativeDevice = Device->get(); - for (Ordinal = 0; size_t(Ordinal) < Platforms.size(); ++Ordinal) - if (Platforms[Ordinal]->Devices[0]->get() == NativeDevice) - return UR_RESULT_SUCCESS; - return UR_RESULT_ERROR_INVALID_DEVICE; -} diff --git a/source/adapters/cuda/platform.cpp b/source/adapters/cuda/platform.cpp index 4c730f997a..27b94f756f 100644 --- a/source/adapters/cuda/platform.cpp +++ b/source/adapters/cuda/platform.cpp @@ -52,9 +52,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo( /// There is only one CUDA platform, and contains all devices on the system. /// Triggers the CUDA Driver initialization (cuInit) the first time, so this /// must be the first PI API called. -/// -/// However because multiple devices in a context is not currently supported, -/// place each device in a separate platform. 
UR_APIEXPORT ur_result_t UR_APICALL urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries, ur_platform_handle_t *phPlatforms, uint32_t *pNumPlatforms) { @@ -62,7 +59,7 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries, try { static std::once_flag InitFlag; static uint32_t NumPlatforms = 1; - static std::vector Platforms; + static ur_platform_handle_t_ Platform; UR_ASSERT(phPlatforms || pNumPlatforms, UR_RESULT_ERROR_INVALID_VALUE); UR_ASSERT(!phPlatforms || NumEntries > 0, UR_RESULT_ERROR_INVALID_SIZE); @@ -76,39 +73,34 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries, int NumDevices = 0; UR_CHECK_ERROR(cuDeviceGetCount(&NumDevices)); try { - // make one platform per device - NumPlatforms = NumDevices; - Platforms.resize(NumDevices); - for (int i = 0; i < NumDevices; ++i) { CUdevice Device; UR_CHECK_ERROR(cuDeviceGet(&Device, i)); CUcontext Context; UR_CHECK_ERROR(cuDevicePrimaryCtxRetain(&Context, Device)); - ScopedContext active(Context); + ScopedContext Active(Context); // Set native ctx as active CUevent EvBase; UR_CHECK_ERROR(cuEventCreate(&EvBase, CU_EVENT_DEFAULT)); // Use default stream to record base event counter UR_CHECK_ERROR(cuEventRecord(EvBase, 0)); - Platforms[i].Devices.emplace_back(new ur_device_handle_t_{ - Device, Context, EvBase, &Platforms[i]}); + Platform.Devices.emplace_back( + new ur_device_handle_t_{Device, Context, EvBase, &Platform, + static_cast(i)}); } } catch (const std::bad_alloc &) { // Signal out-of-memory situation for (int i = 0; i < NumDevices; ++i) { - Platforms[i].Devices.clear(); + Platform.Devices.clear(); } - Platforms.clear(); Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (ur_result_t Err) { // Clear and rethrow to allow retry for (int i = 0; i < NumDevices; ++i) { - Platforms[i].Devices.clear(); + Platform.Devices.clear(); } - Platforms.clear(); Result = Err; throw Err; } catch (...) 
{ @@ -123,9 +115,7 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries, } if (phPlatforms != nullptr) { - for (unsigned i = 0; i < std::min(NumEntries, NumPlatforms); ++i) { - phPlatforms[i] = &Platforms[i]; - } + *phPlatforms = &Platform; } return Result; diff --git a/source/adapters/cuda/program.cpp b/source/adapters/cuda/program.cpp index 390e96b34c..c5cb763ada 100644 --- a/source/adapters/cuda/program.cpp +++ b/source/adapters/cuda/program.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "program.hpp" +#include "ur_util.hpp" bool getMaxRegistersJitOptionValue(const std::string &BuildOptions, unsigned int &Value) { @@ -44,23 +45,6 @@ bool getMaxRegistersJitOptionValue(const std::string &BuildOptions, return true; } -ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Context) - : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, - Context{Context}, KernelReqdWorkGroupSizeMD{} { - urContextRetain(Context); -} - -ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(Context); } - -std::pair -splitMetadataName(const std::string &metadataName) { - size_t splitPos = metadataName.rfind('@'); - if (splitPos == std::string::npos) - return std::make_pair(metadataName, std::string{}); - return std::make_pair(metadataName.substr(0, splitPos), - metadataName.substr(splitPos, metadataName.length())); -} - ur_result_t ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata, size_t Length) { @@ -197,12 +181,14 @@ ur_result_t createProgram(ur_context_handle_t hContext, const uint8_t *pBinary, const ur_program_properties_t *pProperties, ur_program_handle_t *phProgram) { - UR_ASSERT(hContext->getDevice()->get() == hDevice->get(), + UR_ASSERT(std::find(hContext->getDevices().begin(), + hContext->getDevices().end(), + hDevice) != hContext->getDevices().end(), UR_RESULT_ERROR_INVALID_CONTEXT); UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE); std::unique_ptr 
RetProgram{ - new ur_program_handle_t_{hContext}}; + new ur_program_handle_t_{hContext, hDevice}}; if (pProperties) { if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) { @@ -222,18 +208,12 @@ ur_result_t createProgram(ur_context_handle_t hContext, return UR_RESULT_SUCCESS; } -/// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object. -/// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in -/// terms of CUDA adapter. See \ref urProgramCreateWithBinary. +// A program is unique to a device so this entry point cannot be supported with +// a multi device context UR_APIEXPORT ur_result_t UR_APICALL -urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, - size_t length, const ur_program_properties_t *pProperties, - ur_program_handle_t *phProgram) { - ur_device_handle_t hDevice = hContext->getDevice(); - auto pBinary = reinterpret_cast(pIL); - - return createProgram(hContext, hDevice, length, pBinary, pProperties, - phProgram); +urProgramCreateWithIL(ur_context_handle_t, const void *, size_t, + const ur_program_properties_t *, ur_program_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } /// CUDA will handle the PTX/CUBIN binaries internally through a call to @@ -272,7 +252,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext, ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext Active(hProgram->getContext()); + ScopedContext Active(hProgram->getDevice()); hProgram->buildProgram(pOptions); hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE; @@ -297,13 +277,17 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count, const ur_program_handle_t *phPrograms, const char *pOptions, ur_program_handle_t *phProgram) { ur_result_t Result = UR_RESULT_SUCCESS; + // All programs must be associated with the same device + for (auto i = 1u; i < count; ++i) + UR_ASSERT(phPrograms[i]->getDevice() == phPrograms[0]->getDevice(), + 
UR_RESULT_ERROR_INVALID_DEVICE); try { - ScopedContext Active(hContext); + ScopedContext Active(phPrograms[0]->getDevice()); CUlinkState State; std::unique_ptr RetProgram{ - new ur_program_handle_t_{hContext}}; + new ur_program_handle_t_{hContext, phPrograms[0]->getDevice()}}; UR_CHECK_ERROR(cuLinkCreate(0, nullptr, nullptr, &State)); try { @@ -390,7 +374,7 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName, case UR_PROGRAM_INFO_NUM_DEVICES: return ReturnValue(1u); case UR_PROGRAM_INFO_DEVICES: - return ReturnValue(&hProgram->Context->DeviceID, 1); + return ReturnValue(&hProgram->Device, 1); case UR_PROGRAM_INFO_SOURCE: return ReturnValue(hProgram->Binary); case UR_PROGRAM_INFO_BINARY_SIZES: @@ -434,7 +418,7 @@ urProgramRelease(ur_program_handle_t hProgram) { ur_result_t Result = UR_RESULT_ERROR_INVALID_PROGRAM; try { - ScopedContext Active(hProgram->getContext()); + ScopedContext Active(hProgram->getDevice()); auto cuModule = hProgram->get(); // "0" is a valid handle for a cuModule, so the best way to check if we // actually loaded a module and need to unload it is to look at the build @@ -491,8 +475,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer( ur_device_handle_t hDevice, ur_program_handle_t hProgram, const char *pFunctionName, void **ppFunctionPointer) { // Check if device passed is the same the device bound to the context - UR_ASSERT(hDevice == hProgram->getContext()->getDevice(), - UR_RESULT_ERROR_INVALID_DEVICE); + UR_ASSERT(hDevice == hProgram->getDevice(), UR_RESULT_ERROR_INVALID_DEVICE); CUfunction Func; CUresult Ret = cuModuleGetFunction(&Func, hProgram->get(), pFunctionName); diff --git a/source/adapters/cuda/program.hpp b/source/adapters/cuda/program.hpp index feb3a09f31..5d41374d34 100644 --- a/source/adapters/cuda/program.hpp +++ b/source/adapters/cuda/program.hpp @@ -24,6 +24,7 @@ struct ur_program_handle_t_ { size_t BinarySizeInBytes; std::atomic_uint32_t RefCount; ur_context_handle_t Context; + 
ur_device_handle_t Device; /* The ur_program_binary_type_t property is defined individually for every * device in a program. However, since the CUDA adapter only has 1 device per @@ -42,8 +43,17 @@ struct ur_program_handle_t_ { std::string BuildOptions; ur_program_build_status_t BuildStatus = UR_PROGRAM_BUILD_STATUS_NONE; - ur_program_handle_t_(ur_context_handle_t Context); - ~ur_program_handle_t_(); + ur_program_handle_t_(ur_context_handle_t Context, ur_device_handle_t Device) + : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, + Context{Context}, Device{Device}, KernelReqdWorkGroupSizeMD{} { + urContextRetain(Context); + urDeviceRetain(Device); + } + + ~ur_program_handle_t_() { + urContextRelease(Context); + urDeviceRelease(Device); + } ur_result_t setMetadata(const ur_program_metadata_t *Metadata, size_t Length); @@ -51,6 +61,7 @@ struct ur_program_handle_t_ { ur_result_t buildProgram(const char *BuildOptions); ur_context_handle_t getContext() const { return Context; }; + ur_device_handle_t getDevice() const noexcept { return Device; }; native_type get() const noexcept { return Module; }; diff --git a/source/adapters/cuda/queue.cpp b/source/adapters/cuda/queue.cpp index 120d665524..773126f4d5 100644 --- a/source/adapters/cuda/queue.cpp +++ b/source/adapters/cuda/queue.cpp @@ -123,7 +123,8 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, try { std::unique_ptr Queue{nullptr}; - if (hContext->getDevice() != hDevice) { + if (std::find(hContext->getDevices().begin(), hContext->getDevices().end(), + hDevice) == hContext->getDevices().end()) { *phQueue = nullptr; return UR_RESULT_ERROR_INVALID_DEVICE; } @@ -145,10 +146,10 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, IsOutOfOrder = true; } if (URFlags & UR_QUEUE_FLAG_PRIORITY_HIGH) { - ScopedContext Active(hContext); + ScopedContext Active(hDevice); UR_CHECK_ERROR(cuCtxGetStreamPriorityRange(nullptr, &Priority)); } else if (URFlags & 
UR_QUEUE_FLAG_PRIORITY_LOW) { - ScopedContext Active(hContext); + ScopedContext Active(hDevice); UR_CHECK_ERROR(cuCtxGetStreamPriorityRange(&Priority, nullptr)); } } @@ -193,7 +194,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { if (!hQueue->backendHasOwnership()) return UR_RESULT_SUCCESS; - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); hQueue->forEachStream([](CUstream S) { UR_CHECK_ERROR(cuStreamSynchronize(S)); @@ -212,7 +213,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext active(hQueue->getContext()); + ScopedContext active(hQueue->getDevice()); hQueue->syncStreams( [](CUstream s) { UR_CHECK_ERROR(cuStreamSynchronize(s)); }); @@ -242,7 +243,7 @@ urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *pDesc, ur_native_handle_t *phNativeQueue) { std::ignore = pDesc; - ScopedContext Active(hQueue->getContext()); + ScopedContext Active(hQueue->getDevice()); *phNativeQueue = reinterpret_cast(hQueue->getNextComputeStream()); return UR_RESULT_SUCCESS; @@ -276,7 +277,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( new ur_queue_handle_t_{std::move(ComputeCuStreams), std::move(TransferCuStreams), hContext, - hContext->getDevice(), + hDevice, CuFlags, Flags, /*priority*/ 0, diff --git a/source/adapters/cuda/queue.hpp b/source/adapters/cuda/queue.hpp index c79ca18a9b..46e9968fa9 100644 --- a/source/adapters/cuda/queue.hpp +++ b/source/adapters/cuda/queue.hpp @@ -97,6 +97,7 @@ struct ur_queue_handle_t_ { uint32_t *StreamToken = nullptr); native_type getNextTransferStream(); native_type get() { return getNextComputeStream(); }; + ur_device_handle_t getDevice() const noexcept { return Device; }; bool hasBeenSynchronized(uint32_t StreamToken) { // stream token not associated with one of the compute streams diff --git a/source/adapters/cuda/ur_interface_loader.cpp 
b/source/adapters/cuda/ur_interface_loader.cpp index 2ffc0755ee..80fd211863 100644 --- a/source/adapters/cuda/ur_interface_loader.cpp +++ b/source/adapters/cuda/ur_interface_loader.cpp @@ -407,6 +407,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/cuda/usm.cpp b/source/adapters/cuda/usm.cpp index 59d3ba70fd..4e6c6898d5 100644 --- a/source/adapters/cuda/usm.cpp +++ b/source/adapters/cuda/usm.cpp @@ -33,7 +33,7 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, UR_RESULT_ERROR_INVALID_VALUE); if (!hPool) { - return USMHostAllocImpl(ppMem, hContext, nullptr, size, alignment); + return USMHostAllocImpl(ppMem, hContext, /* flags */ 0, size, alignment); } auto UMFPool = hPool->HostMemPool.get(); @@ -57,7 +57,7 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_RESULT_ERROR_INVALID_VALUE); if (!hPool) { - return USMDeviceAllocImpl(ppMem, hContext, hDevice, nullptr, size, + return USMDeviceAllocImpl(ppMem, hContext, hDevice, /* flags */ 0, size, alignment); } @@ -82,8 +82,8 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_RESULT_ERROR_INVALID_VALUE); if (!hPool) { - return USMSharedAllocImpl(ppMem, hContext, hDevice, nullptr, nullptr, size, - alignment); + return USMSharedAllocImpl(ppMem, hContext, hDevice, /*host flags*/ 0, + /*device flags*/ 0, size, alignment); } auto UMFPool = hPool->SharedMemPool.get(); @@ -95,11 +95,10 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, return UR_RESULT_SUCCESS; } -ur_result_t USMFreeImpl(ur_context_handle_t Context, void *Pointer) { +ur_result_t USMFreeImpl(ur_context_handle_t, void *Pointer) { ur_result_t Result = UR_RESULT_SUCCESS; try { - ScopedContext Active(Context); - bool IsManaged; + 
unsigned int IsManaged; unsigned int Type; void *AttributeValues[2] = {&IsManaged, &Type}; CUpointer_attribute Attributes[2] = {CU_POINTER_ATTRIBUTE_IS_MANAGED, @@ -131,11 +130,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, return USMFreeImpl(hContext, pMem); } -ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t, ur_usm_device_mem_flags_t *, - size_t Size, uint32_t Alignment) { +ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t, + ur_device_handle_t Device, + ur_usm_device_mem_flags_t, size_t Size, + uint32_t Alignment) { try { - ScopedContext Active(Context); + ScopedContext Active(Device); UR_CHECK_ERROR(cuMemAlloc((CUdeviceptr *)ResultPtr, Size)); } catch (ur_result_t Err) { return Err; @@ -150,12 +150,13 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, return UR_RESULT_SUCCESS; } -ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_device_handle_t, ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, +ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t, + ur_device_handle_t Device, + ur_usm_host_mem_flags_t, + ur_usm_device_mem_flags_t, size_t Size, uint32_t Alignment) { try { - ScopedContext Active(Context); + ScopedContext Active(Device); UR_CHECK_ERROR(cuMemAllocManaged((CUdeviceptr *)ResultPtr, Size, CU_MEM_ATTACH_GLOBAL)); } catch (ur_result_t Err) { @@ -171,11 +172,10 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, return UR_RESULT_SUCCESS; } -ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_host_mem_flags_t *, size_t Size, +ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t, + ur_usm_host_mem_flags_t, size_t Size, uint32_t Alignment) { try { - ScopedContext Active(Context); UR_CHECK_ERROR(cuMemAllocHost(ResultPtr, Size)); } catch (ur_result_t Err) { return Err; @@ -199,7 
+199,6 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); try { - ScopedContext Active(hContext); switch (propName) { case UR_USM_ALLOC_INFO_TYPE: { unsigned int Value; @@ -358,24 +357,24 @@ umf_result_t USMMemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) { ur_result_t USMSharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size, - Alignment); + return USMSharedAllocImpl(ResultPtr, Context, Device, /*host flags*/ 0, + /*device flags*/ 0, Size, Alignment); } ur_result_t USMDeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size, + return USMDeviceAllocImpl(ResultPtr, Context, Device, /* flags */ 0, Size, Alignment); } ur_result_t USMHostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment); + return USMHostAllocImpl(ResultPtr, Context, /* flags */ 0, Size, Alignment); } ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, ur_usm_pool_desc_t *PoolDesc) - : Context(Context) { + : Context{Context} { const void *pNext = PoolDesc->pNext; while (pNext != nullptr) { const ur_base_desc_t *BaseDesc = static_cast(pNext); @@ -406,25 +405,25 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, &this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Host]) .second; - auto Device = Context->DeviceID; - MemProvider = - umf::memoryProviderMakeUnique(Context, Device) - .second; - DeviceMemPool = - umf::poolMakeUniqueFromOps( - &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), - &this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Device]) - .second; - - MemProvider = - umf::memoryProviderMakeUnique(Context, Device) 
- .second; - SharedMemPool = - umf::poolMakeUniqueFromOps( - &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), - &this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Shared]) - .second; - Context->addPool(this); + for (const auto &Device : Context->getDevices()) { + MemProvider = + umf::memoryProviderMakeUnique(Context, Device) + .second; + DeviceMemPool = umf::poolMakeUniqueFromOps( + &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), + &this->DisjointPoolConfigs + .Configs[usm::DisjointPoolMemType::Device]) + .second; + MemProvider = + umf::memoryProviderMakeUnique(Context, Device) + .second; + SharedMemPool = umf::poolMakeUniqueFromOps( + &UMF_DISJOINT_POOL_OPS, std::move(MemProvider), + &this->DisjointPoolConfigs + .Configs[usm::DisjointPoolMemType::Shared]) + .second; + Context->addPool(this); + } } bool ur_usm_pool_handle_t_::hasUMFPool(umf_memory_pool_t *umf_pool) { diff --git a/source/adapters/cuda/usm.hpp b/source/adapters/cuda/usm.hpp index 2ec3df150f..e5d1f7fbaa 100644 --- a/source/adapters/cuda/usm.hpp +++ b/source/adapters/cuda/usm.hpp @@ -118,15 +118,15 @@ class USMHostMemoryProvider final : public USMMemoryProvider { ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_device_mem_flags_t *Flags, size_t Size, + ur_usm_device_mem_flags_t Flags, size_t Size, uint32_t Alignment); ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, + ur_usm_host_mem_flags_t, + ur_usm_device_mem_flags_t, size_t Size, uint32_t Alignment); ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_host_mem_flags_t *Flags, size_t Size, + ur_usm_host_mem_flags_t Flags, size_t Size, uint32_t Alignment); diff --git a/source/adapters/cuda/usm_p2p.cpp b/source/adapters/cuda/usm_p2p.cpp index 810a11ef84..e09220d730 100644 --- a/source/adapters/cuda/usm_p2p.cpp +++ 
b/source/adapters/cuda/usm_p2p.cpp @@ -14,8 +14,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PEnablePeerAccessExp( ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) { try { - ScopedContext active(commandDevice->getContext()); - UR_CHECK_ERROR(cuCtxEnablePeerAccess(peerDevice->getContext(), 0)); + ScopedContext active(commandDevice); + UR_CHECK_ERROR(cuCtxEnablePeerAccess(peerDevice->getNativeContext(), 0)); } catch (ur_result_t err) { return err; } @@ -25,8 +25,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PEnablePeerAccessExp( UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PDisablePeerAccessExp( ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) { try { - ScopedContext active(commandDevice->getContext()); - UR_CHECK_ERROR(cuCtxDisablePeerAccess(peerDevice->getContext())); + ScopedContext active(commandDevice); + UR_CHECK_ERROR(cuCtxDisablePeerAccess(peerDevice->getNativeContext())); } catch (ur_result_t err) { return err; } @@ -43,7 +43,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( int value; CUdevice_P2PAttribute cuAttr; try { - ScopedContext active(commandDevice->getContext()); + ScopedContext active(commandDevice); switch (propName) { case UR_EXP_PEER_INFO_UR_PEER_ACCESS_SUPPORTED: { cuAttr = CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED; diff --git a/source/adapters/cuda/virtual_mem.cpp b/source/adapters/cuda/virtual_mem.cpp index 9c37dda4fb..29908ad1d4 100644 --- a/source/adapters/cuda/virtual_mem.cpp +++ b/source/adapters/cuda/virtual_mem.cpp @@ -17,12 +17,12 @@ #include UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( - ur_context_handle_t hContext, ur_device_handle_t hDevice, + ur_context_handle_t, ur_device_handle_t hDevice, ur_virtual_mem_granularity_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); - ScopedContext Active(hContext); + ScopedContext Active(hDevice); switch (propName) { case 
UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM: case UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED: { @@ -33,7 +33,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( CUmemAllocationProp AllocProps = {}; AllocProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE; AllocProps.type = CU_MEM_ALLOCATION_TYPE_PINNED; - UR_CHECK_ERROR(GetDeviceOrdinal(hDevice, AllocProps.location.id)); + AllocProps.location.id = hDevice->getIndex(); size_t Granularity; UR_CHECK_ERROR( @@ -50,15 +50,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemReserve(ur_context_handle_t hContext, const void *pStart, size_t size, void **ppStart) { - ScopedContext Active(hContext); + // Reserve the virtual mem. Only need to do once for arbitrary context + ScopedContext Active(hContext->getDevices()[0]); UR_CHECK_ERROR(cuMemAddressReserve((CUdeviceptr *)ppStart, size, 0, (CUdeviceptr)pStart, 0)); return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemFree( - ur_context_handle_t hContext, const void *pStart, size_t size) { - ScopedContext Active(hContext); +UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemFree(ur_context_handle_t, + const void *pStart, + size_t size) { UR_CHECK_ERROR(cuMemAddressFree((CUdeviceptr)pStart, size)); return UR_RESULT_SUCCESS; } @@ -66,22 +67,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemFree( UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemSetAccess(ur_context_handle_t hContext, const void *pStart, size_t size, ur_virtual_mem_access_flags_t flags) { - CUmemAccessDesc AccessDesc = {}; - if (flags & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE) - AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - else if (flags & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_ONLY) - AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READ; - else - AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_NONE; - AccessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - // TODO: When contexts support multiple devices, we 
should create a descriptor - // for each. We may also introduce a variant of this function with a - // specific device. - UR_CHECK_ERROR( - GetDeviceOrdinal(hContext->getDevice(), AccessDesc.location.id)); - - ScopedContext Active(hContext); - UR_CHECK_ERROR(cuMemSetAccess((CUdeviceptr)pStart, size, &AccessDesc, 1)); + // Set access for every device in the context + for (auto &Device : hContext->getDevices()) { + CUmemAccessDesc AccessDesc = {}; + if (flags & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_WRITE) + AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + else if (flags & UR_VIRTUAL_MEM_ACCESS_FLAG_READ_ONLY) + AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READ; + else + AccessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_NONE; + AccessDesc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + AccessDesc.location.id = Device->getIndex(); + ScopedContext Active(Device); + UR_CHECK_ERROR(cuMemSetAccess((CUdeviceptr)pStart, size, &AccessDesc, 1)); + } return UR_RESULT_SUCCESS; } @@ -89,7 +88,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemMap(ur_context_handle_t hContext, const void *pStart, size_t size, ur_physical_mem_handle_t hPhysicalMem, size_t offset, ur_virtual_mem_access_flags_t flags) { - ScopedContext Active(hContext); + // Map the virtual mem. Only need to do once for arbitrary context + ScopedContext Active(hContext->getDevices()[0]); UR_CHECK_ERROR( cuMemMap((CUdeviceptr)pStart, size, offset, hPhysicalMem->get(), 0)); if (flags) @@ -99,7 +99,8 @@ urVirtualMemMap(ur_context_handle_t hContext, const void *pStart, size_t size, UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemUnmap( ur_context_handle_t hContext, const void *pStart, size_t size) { - ScopedContext Active(hContext); + // Unmap the virtual mem. 
Only need to do once for arbitrary context + ScopedContext Active(hContext->getDevices()[0]); UR_CHECK_ERROR(cuMemUnmap((CUdeviceptr)pStart, size)); return UR_RESULT_SUCCESS; } @@ -110,12 +111,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urVirtualMemGetInfo( size_t propSize, void *pPropValue, size_t *pPropSizeRet) { UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); - ScopedContext Active(hContext); + // Set arbitrary context + ScopedContext Active(hContext->getDevices()[0]); switch (propName) { case UR_VIRTUAL_MEM_INFO_ACCESS_MODE: { CUmemLocation MemLocation = {}; MemLocation.type = CU_MEM_LOCATION_TYPE_DEVICE; - UR_CHECK_ERROR(GetDeviceOrdinal(hContext->getDevice(), MemLocation.id)); + MemLocation.id = hContext->getDevices()[0]->getIndex(); unsigned long long CuAccessFlags; UR_CHECK_ERROR( diff --git a/source/adapters/hip/command_buffer.hpp b/source/adapters/hip/command_buffer.hpp index 50fddc5448..751fde3720 100644 --- a/source/adapters/hip/command_buffer.hpp +++ b/source/adapters/hip/command_buffer.hpp @@ -10,154 +10,12 @@ #include #include +#include #include "context.hpp" #include #include -static inline const char *getUrResultString(ur_result_t Result) { - switch (Result) { - case UR_RESULT_SUCCESS: - return "UR_RESULT_SUCCESS"; - case UR_RESULT_ERROR_INVALID_OPERATION: - return "UR_RESULT_ERROR_INVALID_OPERATION"; - case UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES: - return "UR_RESULT_ERROR_INVALID_QUEUE_PROPERTIES"; - case UR_RESULT_ERROR_INVALID_QUEUE: - return "UR_RESULT_ERROR_INVALID_QUEUE"; - case UR_RESULT_ERROR_INVALID_VALUE: - return "UR_RESULT_ERROR_INVALID_VALUE"; - case UR_RESULT_ERROR_INVALID_CONTEXT: - return "UR_RESULT_ERROR_INVALID_CONTEXT"; - case UR_RESULT_ERROR_INVALID_PLATFORM: - return "UR_RESULT_ERROR_INVALID_PLATFORM"; - case UR_RESULT_ERROR_INVALID_BINARY: - return "UR_RESULT_ERROR_INVALID_BINARY"; - case UR_RESULT_ERROR_INVALID_PROGRAM: - return "UR_RESULT_ERROR_INVALID_PROGRAM"; - case UR_RESULT_ERROR_INVALID_SAMPLER: - 
return "UR_RESULT_ERROR_INVALID_SAMPLER"; - case UR_RESULT_ERROR_INVALID_BUFFER_SIZE: - return "UR_RESULT_ERROR_INVALID_BUFFER_SIZE"; - case UR_RESULT_ERROR_INVALID_MEM_OBJECT: - return "UR_RESULT_ERROR_INVALID_MEM_OBJECT"; - case UR_RESULT_ERROR_INVALID_EVENT: - return "UR_RESULT_ERROR_INVALID_EVENT"; - case UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST: - return "UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST"; - case UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET: - return "UR_RESULT_ERROR_MISALIGNED_SUB_BUFFER_OFFSET"; - case UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE: - return "UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE"; - case UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE: - return "UR_RESULT_ERROR_COMPILER_NOT_AVAILABLE"; - case UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE: - return "UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE"; - case UR_RESULT_ERROR_DEVICE_NOT_FOUND: - return "UR_RESULT_ERROR_DEVICE_NOT_FOUND"; - case UR_RESULT_ERROR_INVALID_DEVICE: - return "UR_RESULT_ERROR_INVALID_DEVICE"; - case UR_RESULT_ERROR_DEVICE_LOST: - return "UR_RESULT_ERROR_DEVICE_LOST"; - case UR_RESULT_ERROR_DEVICE_REQUIRES_RESET: - return "UR_RESULT_ERROR_DEVICE_REQUIRES_RESET"; - case UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE: - return "UR_RESULT_ERROR_DEVICE_IN_LOW_POWER_STATE"; - case UR_RESULT_ERROR_DEVICE_PARTITION_FAILED: - return "UR_RESULT_ERROR_DEVICE_PARTITION_FAILED"; - case UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT: - return "UR_RESULT_ERROR_INVALID_DEVICE_PARTITION_COUNT"; - case UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE: - return "UR_RESULT_ERROR_INVALID_WORK_ITEM_SIZE"; - case UR_RESULT_ERROR_INVALID_WORK_DIMENSION: - return "UR_RESULT_ERROR_INVALID_WORK_DIMENSION"; - case UR_RESULT_ERROR_INVALID_KERNEL_ARGS: - return "UR_RESULT_ERROR_INVALID_KERNEL_ARGS"; - case UR_RESULT_ERROR_INVALID_KERNEL: - return "UR_RESULT_ERROR_INVALID_KERNEL"; - case UR_RESULT_ERROR_INVALID_KERNEL_NAME: - return "UR_RESULT_ERROR_INVALID_KERNEL_NAME"; - case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX: - 
return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_INDEX"; - case UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE: - return "UR_RESULT_ERROR_INVALID_KERNEL_ARGUMENT_SIZE"; - case UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE: - return "UR_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE"; - case UR_RESULT_ERROR_INVALID_IMAGE_SIZE: - return "UR_RESULT_ERROR_INVALID_IMAGE_SIZE"; - case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR: - return "UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR"; - case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED: - return "UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED"; - case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE: - return "UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE"; - case UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE: - return "UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE"; - case UR_RESULT_ERROR_UNINITIALIZED: - return "UR_RESULT_ERROR_UNINITIALIZED"; - case UR_RESULT_ERROR_OUT_OF_HOST_MEMORY: - return "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY"; - case UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY: - return "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY"; - case UR_RESULT_ERROR_OUT_OF_RESOURCES: - return "UR_RESULT_ERROR_OUT_OF_RESOURCES"; - case UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE: - return "UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE"; - case UR_RESULT_ERROR_PROGRAM_LINK_FAILURE: - return "UR_RESULT_ERROR_PROGRAM_LINK_FAILURE"; - case UR_RESULT_ERROR_UNSUPPORTED_VERSION: - return "UR_RESULT_ERROR_UNSUPPORTED_VERSION"; - case UR_RESULT_ERROR_UNSUPPORTED_FEATURE: - return "UR_RESULT_ERROR_UNSUPPORTED_FEATURE"; - case UR_RESULT_ERROR_INVALID_ARGUMENT: - return "UR_RESULT_ERROR_INVALID_ARGUMENT"; - case UR_RESULT_ERROR_INVALID_NULL_HANDLE: - return "UR_RESULT_ERROR_INVALID_NULL_HANDLE"; - case UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE: - return "UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE"; - case UR_RESULT_ERROR_INVALID_NULL_POINTER: - return "UR_RESULT_ERROR_INVALID_NULL_POINTER"; - case UR_RESULT_ERROR_INVALID_SIZE: - return "UR_RESULT_ERROR_INVALID_SIZE"; - case 
UR_RESULT_ERROR_UNSUPPORTED_SIZE: - return "UR_RESULT_ERROR_UNSUPPORTED_SIZE"; - case UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT: - return "UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT"; - case UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT: - return "UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT"; - case UR_RESULT_ERROR_INVALID_ENUMERATION: - return "UR_RESULT_ERROR_INVALID_ENUMERATION"; - case UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION: - return "UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION"; - case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT: - return "UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT"; - case UR_RESULT_ERROR_INVALID_NATIVE_BINARY: - return "UR_RESULT_ERROR_INVALID_NATIVE_BINARY"; - case UR_RESULT_ERROR_INVALID_GLOBAL_NAME: - return "UR_RESULT_ERROR_INVALID_GLOBAL_NAME"; - case UR_RESULT_ERROR_INVALID_FUNCTION_NAME: - return "UR_RESULT_ERROR_INVALID_FUNCTION_NAME"; - case UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION: - return "UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION"; - case UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION: - return "UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION"; - case UR_RESULT_ERROR_PROGRAM_UNLINKED: - return "UR_RESULT_ERROR_PROGRAM_UNLINKED"; - case UR_RESULT_ERROR_OVERLAPPING_REGIONS: - return "UR_RESULT_ERROR_OVERLAPPING_REGIONS"; - case UR_RESULT_ERROR_INVALID_HOST_PTR: - return "UR_RESULT_ERROR_INVALID_HOST_PTR"; - case UR_RESULT_ERROR_INVALID_USM_SIZE: - return "UR_RESULT_ERROR_INVALID_USM_SIZE"; - case UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE: - return "UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE"; - case UR_RESULT_ERROR_ADAPTER_SPECIFIC: - return "UR_RESULT_ERROR_ADAPTER_SPECIFIC"; - default: - return "UR_RESULT_ERROR_UNKNOWN"; - } -} - // Trace an internal UR call #define UR_TRACE(Call) \ { \ @@ -169,10 +27,10 @@ static inline const char *getUrResultString(ur_result_t Result) { #define UR_CALL(Call, Result) \ { \ if (PrintTrace) \ - fprintf(stderr, "UR ---> %s\n", #Call); \ + std::cerr << "UR ---> " << #Call << "\n"; \ Result = (Call); \ if 
(PrintTrace) \ - fprintf(stderr, "UR <--- %s(%s)\n", #Call, getUrResultString(Result)); \ + std::cerr << "UR <--- " << #Call << "(" << Result << ")\n"; \ } // Handle to a kernel command. diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp index ae4dbe159e..dd20a4f50f 100644 --- a/source/adapters/hip/device.cpp +++ b/source/adapters/hip/device.cpp @@ -724,11 +724,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_GLOBAL_MEM_FREE: { + // Work around an issue on some (unsupported) architectures, + // where hipMemGetInfo fails internally and returns hipErrorInvalidValue + // when trying to query the amount of available global memory. Since we + // can't distinguish this condition from us doing something wrong, we can't + // handle it gracefully. + hipDeviceProp_t Props; + UR_CHECK_ERROR(hipGetDeviceProperties(&Props, hDevice->get())); + if (strcmp(Props.gcnArchName, "gfx1031") == 0) { + return ReturnValue(size_t{0}); + } + size_t FreeMemory = 0; size_t TotalMemory = 0; - detail::ur::assertion(hipMemGetInfo(&FreeMemory, &TotalMemory) == - hipSuccess, - "failed hipMemGetInfo() API."); + UR_CHECK_ERROR(hipMemGetInfo(&FreeMemory, &TotalMemory)); return ReturnValue(FreeMemory); } @@ -841,6 +850,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_COMPOSITE_DEVICE: // These two are exclusive of L0. return ReturnValue(0); + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: + return ReturnValue(true); // TODO: Investigate if this information is available on HIP. 
case UR_DEVICE_INFO_GPU_EU_COUNT: diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 79522d4c93..4b98a9c491 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -15,12 +15,13 @@ #include "kernel.hpp" #include "memory.hpp" #include "queue.hpp" +#include "ur_api.h" #include extern size_t imageElementByteSize(hipArray_Format ArrayFormat); -ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream, +ur_result_t enqueueEventsWait(ur_queue_handle_t Queue, hipStream_t Stream, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList) { if (!EventWaitList) { @@ -29,8 +30,8 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream, try { auto Result = forLatestEvents( EventWaitList, NumEventsInWaitList, - [Stream](ur_event_handle_t Event) -> ur_result_t { - ScopedContext Active(Event->getDevice()); + [Stream, Queue](ur_event_handle_t Event) -> ur_result_t { + ScopedContext Active(Queue->getDevice()); if (Event->isCompleted() || Event->getStream() == Stream) { return UR_RESULT_SUCCESS; } else { @@ -218,8 +219,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( // last queue to write to the MemBuffer, meaning we must perform the copy // from a different device if (hBuffer->LastEventWritingToMemObj && - hBuffer->LastEventWritingToMemObj->getDevice() != hQueue->getDevice()) { - Device = hBuffer->LastEventWritingToMemObj->getDevice(); + hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() != + hQueue->getDevice()) { + // This event is never created with interop so getQueue is never null + hQueue = hBuffer->LastEventWritingToMemObj->getQueue(); + Device = hQueue->getDevice(); ScopedContext Active(Device); HIPStream = hipStream_t{0}; // Default stream for different device // We may have to wait for an event on another queue if it is the last @@ -584,8 +588,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( // last queue to write to the 
MemBuffer, meaning we must perform the copy // from a different device if (hBuffer->LastEventWritingToMemObj && - hBuffer->LastEventWritingToMemObj->getDevice() != hQueue->getDevice()) { - Device = hBuffer->LastEventWritingToMemObj->getDevice(); + hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() != + hQueue->getDevice()) { + // This event is never created with interop so getQueue is never null + hQueue = hBuffer->LastEventWritingToMemObj->getQueue(); + Device = hQueue->getDevice(); ScopedContext Active(Device); HIPStream = hipStream_t{0}; // Default stream for different device // We may have to wait for an event on another queue if it is the last @@ -1017,8 +1024,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( // last queue to write to the MemBuffer, meaning we must perform the copy // from a different device if (hImage->LastEventWritingToMemObj && - hImage->LastEventWritingToMemObj->getDevice() != hQueue->getDevice()) { - Device = hImage->LastEventWritingToMemObj->getDevice(); + hImage->LastEventWritingToMemObj->getQueue()->getDevice() != + hQueue->getDevice()) { + hQueue = hImage->LastEventWritingToMemObj->getQueue(); + Device = hQueue->getDevice(); ScopedContext Active(Device); HIPStream = hipStream_t{0}; // Default stream for different device // We may have to wait for an event on another queue if it is the last @@ -1231,49 +1240,42 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( UR_ASSERT(offset + size <= BufferImpl.getSize(), UR_RESULT_ERROR_INVALID_SIZE); - ur_result_t Result = UR_RESULT_ERROR_INVALID_OPERATION; - const bool IsPinned = - BufferImpl.MemAllocMode == BufferMem::AllocMode::AllocHostPtr; - - // Currently no support for overlapping regions - if (BufferImpl.getMapPtr() != nullptr) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + auto MapPtr = BufferImpl.mapToPtr(size, offset, mapFlags); + if (!MapPtr) { + return UR_RESULT_ERROR_INVALID_MEM_OBJECT; } - // Allocate a pointer in the host to store the mapped 
information - auto HostPtr = BufferImpl.mapToPtr(size, offset, mapFlags); - *ppRetMap = std::get(hBuffer->Mem).getMapPtr(); - if (HostPtr) { - Result = UR_RESULT_SUCCESS; - } + const bool IsPinned = + BufferImpl.MemAllocMode == BufferMem::AllocMode::AllocHostPtr; - if (!IsPinned && - ((mapFlags & UR_MAP_FLAG_READ) || (mapFlags & UR_MAP_FLAG_WRITE))) { - // Pinned host memory is already on host so it doesn't need to be read. - Result = urEnqueueMemBufferRead(hQueue, hBuffer, blockingMap, offset, size, - HostPtr, numEventsInWaitList, - phEventWaitList, phEvent); - } else { - ScopedContext Active(hQueue->getDevice()); + try { + if (!IsPinned && (mapFlags & (UR_MAP_FLAG_READ | UR_MAP_FLAG_WRITE))) { + // Pinned host memory is already on host so it doesn't need to be read. + UR_CHECK_ERROR(urEnqueueMemBufferRead( + hQueue, hBuffer, blockingMap, offset, size, MapPtr, + numEventsInWaitList, phEventWaitList, phEvent)); + } else { + ScopedContext Active(hQueue->getDevice()); - if (IsPinned) { - Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList, - nullptr); - } + if (IsPinned) { + UR_CHECK_ERROR(urEnqueueEventsWait(hQueue, numEventsInWaitList, + phEventWaitList, nullptr)); + } - if (phEvent) { - try { + if (phEvent) { *phEvent = ur_event_handle_t_::makeNative( UR_COMMAND_MEM_BUFFER_MAP, hQueue, hQueue->getNextTransferStream()); UR_CHECK_ERROR((*phEvent)->start()); UR_CHECK_ERROR((*phEvent)->record()); - } catch (ur_result_t Error) { - Result = Error; } } + } catch (ur_result_t Error) { + return Error; } - return Result; + *ppRetMap = MapPtr; + + return UR_RESULT_SUCCESS; } /// Implements the unmap from the host, using a BufferWrite operation. 
@@ -1284,47 +1286,44 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( ur_queue_handle_t hQueue, ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; UR_ASSERT(hMem->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(std::get(hMem->Mem).getMapPtr() != nullptr, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - UR_ASSERT(std::get(hMem->Mem).getMapPtr() == pMappedPtr, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); + auto &BufferImpl = std::get(hMem->Mem); - const bool IsPinned = std::get(hMem->Mem).MemAllocMode == - BufferMem::AllocMode::AllocHostPtr; - - if (!IsPinned && - ((std::get(hMem->Mem).getMapFlags() & UR_MAP_FLAG_WRITE) || - (std::get(hMem->Mem).getMapFlags() & - UR_MAP_FLAG_WRITE_INVALIDATE_REGION))) { - // Pinned host memory is only on host so it doesn't need to be written to. - Result = urEnqueueMemBufferWrite( - hQueue, hMem, true, std::get(hMem->Mem).getMapOffset(), - std::get(hMem->Mem).getMapSize(), pMappedPtr, - numEventsInWaitList, phEventWaitList, phEvent); - } else { - ScopedContext Active(hQueue->getDevice()); + auto *Map = BufferImpl.getMapDetails(pMappedPtr); + UR_ASSERT(Map != nullptr, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - if (IsPinned) { - Result = urEnqueueEventsWait(hQueue, numEventsInWaitList, phEventWaitList, - nullptr); - } + const bool IsPinned = + BufferImpl.MemAllocMode == BufferMem::AllocMode::AllocHostPtr; - if (phEvent) { - try { + try { + if (!IsPinned && + (Map->getMapFlags() & + (UR_MAP_FLAG_WRITE | UR_MAP_FLAG_WRITE_INVALIDATE_REGION))) { + // Pinned host memory is only on host so it doesn't need to be written to. 
+ UR_CHECK_ERROR(urEnqueueMemBufferWrite( + hQueue, hMem, true, Map->getMapOffset(), Map->getMapSize(), + pMappedPtr, numEventsInWaitList, phEventWaitList, phEvent)); + } else { + ScopedContext Active(hQueue->getDevice()); + + if (IsPinned) { + UR_CHECK_ERROR(urEnqueueEventsWait(hQueue, numEventsInWaitList, + phEventWaitList, nullptr)); + } + + if (phEvent) { *phEvent = ur_event_handle_t_::makeNative( UR_COMMAND_MEM_UNMAP, hQueue, hQueue->getNextTransferStream()); UR_CHECK_ERROR((*phEvent)->start()); UR_CHECK_ERROR((*phEvent)->record()); - } catch (ur_result_t Error) { - Result = Error; } } + } catch (ur_result_t Error) { + return Error; } - std::get(hMem->Mem).unmap(pMappedPtr); - return Result; + BufferImpl.unmap(pMappedPtr); + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( @@ -1842,10 +1841,14 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim, static_cast(Device->getMaxBlockDimY()), static_cast(Device->getMaxBlockDimZ())}; + auto &ReqdThreadsPerBlock = Kernel->ReqdThreadsPerBlock; MaxWorkGroupSize = Device->getMaxWorkGroupSize(); if (LocalWorkSize != nullptr) { auto isValid = [&](int dim) { + UR_ASSERT(ReqdThreadsPerBlock[dim] == 0 || + LocalWorkSize[dim] == ReqdThreadsPerBlock[dim], + UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); UR_ASSERT(LocalWorkSize[dim] <= MaxThreadsPerBlock[dim], UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); // Checks that local work sizes are a divisor of the global work sizes @@ -1967,3 +1970,36 @@ void setCopyRectParams(ur_rect_region_t Region, const void *SrcPtr, : (DstType == hipMemoryTypeDevice ? 
hipMemcpyHostToDevice : hipMemcpyHostToHost)); } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + ur_result_t Result = UR_RESULT_SUCCESS; + std::unique_ptr RetImplEvent{nullptr}; + try { + ScopedContext Active(hQueue->getDevice()); + + uint32_t StreamToken; + ur_stream_quard Guard; + hipStream_t HIPStream = hQueue->getNextComputeStream( + numEventsInWaitList, phEventWaitList, Guard, &StreamToken); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, + phEventWaitList)); + + RetImplEvent = + std::unique_ptr(ur_event_handle_t_::makeNative( + UR_COMMAND_TIMESTAMP_RECORDING_EXP, hQueue, HIPStream)); + UR_CHECK_ERROR(RetImplEvent->start()); + UR_CHECK_ERROR(RetImplEvent->record()); + + if (blocking) { + UR_CHECK_ERROR(hipStreamSynchronize(HIPStream)); + } + + *phEvent = RetImplEvent.release(); + } catch (ur_result_t Err) { + Result = Err; + } + return Result; +} diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp index 7478c0e778..5327c43a3b 100644 --- a/source/adapters/hip/event.cpp +++ b/source/adapters/hip/event.cpp @@ -22,7 +22,8 @@ ur_event_handle_t_::ur_event_handle_t_(ur_command_t Type, StreamToken{StreamToken}, EventId{0}, EvEnd{nullptr}, EvStart{nullptr}, EvQueued{nullptr}, Queue{Queue}, Stream{Stream}, Context{Context} { - bool ProfilingEnabled = Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE; + bool ProfilingEnabled = + Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent(); UR_CHECK_ERROR(hipEventCreateWithFlags( &EvEnd, ProfilingEnabled ? 
hipEventDefault : hipEventDisableTiming)); @@ -58,7 +59,7 @@ ur_result_t ur_event_handle_t_::start() { ur_result_t Result = UR_RESULT_SUCCESS; try { - if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) { // NOTE: This relies on the default stream to be unused. UR_CHECK_ERROR(hipEventRecord(EvQueued, 0)); UR_CHECK_ERROR(hipEventRecord(EvStart, Queue->get())); @@ -177,7 +178,7 @@ ur_result_t ur_event_handle_t_::release() { assert(Queue != nullptr); UR_CHECK_ERROR(hipEventDestroy(EvEnd)); - if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) { + if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) { UR_CHECK_ERROR(hipEventDestroy(EvQueued)); UR_CHECK_ERROR(hipEventDestroy(EvStart)); } @@ -244,7 +245,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( UR_ASSERT(!(pPropValue && propValueSize == 0), UR_RESULT_ERROR_INVALID_VALUE); ur_queue_handle_t Queue = hEvent->getQueue(); - if (Queue == nullptr || !(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + if (Queue == nullptr || (!(Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE) && + !hEvent->isTimestampEvent())) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } diff --git a/source/adapters/hip/event.hpp b/source/adapters/hip/event.hpp index 50de73b14f..5e7c1d7e7d 100644 --- a/source/adapters/hip/event.hpp +++ b/source/adapters/hip/event.hpp @@ -28,8 +28,6 @@ struct ur_event_handle_t_ { ur_queue_handle_t getQueue() const noexcept { return Queue; } - ur_device_handle_t getDevice() const noexcept { return Queue->getDevice(); } - hipStream_t getStream() const noexcept { return Stream; } uint32_t getComputeStreamToken() const noexcept { return StreamToken; } @@ -55,6 +53,10 @@ struct ur_event_handle_t_ { return UR_EVENT_STATUS_COMPLETE; } + bool isTimestampEvent() const noexcept { + return getCommandType() == UR_COMMAND_TIMESTAMP_RECORDING_EXP; + } + ur_context_handle_t getContext() const noexcept { 
return Context; }; uint32_t incrementReferenceCount() { return ++RefCount; } diff --git a/source/adapters/hip/image.cpp b/source/adapters/hip/image.cpp index a8fcd6f465..4085614f31 100644 --- a/source/adapters/hip/image.cpp +++ b/source/adapters/hip/image.cpp @@ -59,7 +59,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( [[maybe_unused]] ur_exp_image_mem_handle_t hImageMem, [[maybe_unused]] const ur_image_format_t *pImageFormat, [[maybe_unused]] const ur_image_desc_t *pImageDesc, - [[maybe_unused]] ur_mem_handle_t *phMem, [[maybe_unused]] ur_exp_image_handle_t *phImage) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -71,7 +70,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( [[maybe_unused]] const ur_image_format_t *pImageFormat, [[maybe_unused]] const ur_image_desc_t *pImageDesc, [[maybe_unused]] ur_sampler_handle_t hSampler, - [[maybe_unused]] ur_mem_handle_t *phMem, [[maybe_unused]] ur_exp_image_handle_t *phImage) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp index ddb8f10595..f35d3957bc 100644 --- a/source/adapters/hip/kernel.cpp +++ b/source/adapters/hip/kernel.cpp @@ -91,14 +91,17 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, return ReturnValue(size_t(MaxThreads)); } case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { - size_t group_size[3] = {0, 0, 0}; - // Returns the work-group size specified in the kernel source or IL. - // If the work-group size is not specified in the kernel source or IL, - // (0, 0, 0) is returned. - // https://www.khronos.org/registry/OpenCL/sdk/2.1/docs/man/xhtml/clGetKernelWorkGroupInfo.html - - // TODO: can we extract the work group size from the PTX? 
- return ReturnValue(group_size, 3); + size_t GroupSize[3] = {0, 0, 0}; + const auto &ReqdWGSizeMDMap = + hKernel->getProgram()->KernelReqdWorkGroupSizeMD; + const auto ReqdWGSizeMD = ReqdWGSizeMDMap.find(hKernel->getName()); + if (ReqdWGSizeMD != ReqdWGSizeMDMap.end()) { + const auto ReqdWGSize = ReqdWGSizeMD->second; + GroupSize[0] = std::get<0>(ReqdWGSize); + GroupSize[1] = std::get<1>(ReqdWGSize); + GroupSize[2] = std::get<2>(ReqdWGSize); + } + return ReturnValue(GroupSize, 3); } case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { // OpenCL LOCAL == HIP SHARED diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp index 83693a3d41..afea69832b 100644 --- a/source/adapters/hip/kernel.hpp +++ b/source/adapters/hip/kernel.hpp @@ -42,6 +42,9 @@ struct ur_kernel_handle_t_ { ur_program_handle_t Program; std::atomic_uint32_t RefCount; + static constexpr uint32_t ReqdThreadsPerBlockDimensions = 3u; + size_t ReqdThreadsPerBlock[ReqdThreadsPerBlockDimensions]; + /// Structure that holds the arguments to the kernel. /// Note earch argument size is known, since it comes /// from the kernel signature. 
@@ -154,6 +157,11 @@ struct ur_kernel_handle_t_ { ur_context_handle_t Ctxt) : Function{Func}, FunctionWithOffsetParam{FuncWithOffsetParam}, Name{Name}, Context{Ctxt}, Program{Program}, RefCount{1} { + assert(Program->getDevice()); + UR_CHECK_ERROR(urKernelGetGroupInfo( + this, Program->getDevice(), + UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, + sizeof(ReqdThreadsPerBlock), ReqdThreadsPerBlock, nullptr)); urProgramRetain(Program); urContextRetain(Context); } diff --git a/source/adapters/hip/memory.cpp b/source/adapters/hip/memory.cpp index dcc3e34fad..ff209884ce 100644 --- a/source/adapters/hip/memory.cpp +++ b/source/adapters/hip/memory.cpp @@ -525,11 +525,12 @@ inline ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem, UR_CHECK_ERROR( hipMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size)); } - } else if (Mem->LastEventWritingToMemObj->getDevice() != hDevice) { - UR_CHECK_ERROR( - hipMemcpyDtoD(Buffer.getPtr(hDevice), - Buffer.getPtr(Mem->LastEventWritingToMemObj->getDevice()), - Buffer.Size)); + } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() != + hDevice) { + UR_CHECK_ERROR(hipMemcpyDtoD( + Buffer.getPtr(hDevice), + Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()), + Buffer.Size)); } return UR_RESULT_SUCCESS; } @@ -577,22 +578,24 @@ inline ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, CpyDesc3D.srcHost = Image.HostPtr; UR_CHECK_ERROR(hipDrvMemcpy3D(&CpyDesc3D)); } - } else if (Mem->LastEventWritingToMemObj->getDevice() != hDevice) { + } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() != + hDevice) { if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { // FIXME: 1D memcpy from DtoD going through the host. 
UR_CHECK_ERROR(hipMemcpyAtoH( Image.HostPtr, - Image.getArray(Mem->LastEventWritingToMemObj->getDevice()), + Image.getArray( + Mem->LastEventWritingToMemObj->getQueue()->getDevice()), 0 /*srcOffset*/, ImageSizeBytes)); UR_CHECK_ERROR( hipMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { - CpyDesc2D.srcArray = - Image.getArray(Mem->LastEventWritingToMemObj->getDevice()); + CpyDesc2D.srcArray = Image.getArray( + Mem->LastEventWritingToMemObj->getQueue()->getDevice()); UR_CHECK_ERROR(hipMemcpyParam2D(&CpyDesc2D)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { - CpyDesc3D.srcArray = - Image.getArray(Mem->LastEventWritingToMemObj->getDevice()); + CpyDesc3D.srcArray = Image.getArray( + Mem->LastEventWritingToMemObj->getQueue()->getDevice()); UR_CHECK_ERROR(hipDrvMemcpy3D(&CpyDesc3D)); } } diff --git a/source/adapters/hip/memory.hpp b/source/adapters/hip/memory.hpp index 7707794b3c..5d2aa6f9a5 100644 --- a/source/adapters/hip/memory.hpp +++ b/source/adapters/hip/memory.hpp @@ -9,12 +9,15 @@ //===----------------------------------------------------------------------===// #pragma once -#include "common.hpp" #include "context.hpp" #include "event.hpp" #include +#include +#include #include +#include "common.hpp" + ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t, const ur_device_handle_t); ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t, @@ -22,6 +25,41 @@ ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t, // Handler for plain, pointer-based HIP allocations struct BufferMem { + struct BufferMap { + /// Size of the active mapped region. + size_t MapSize; + /// Offset of the active mapped region. + size_t MapOffset; + /// Original flags for the mapped region + ur_map_flags_t MapFlags; + /// Allocated host memory used exclusively for this map. 
+ std::shared_ptr MapMem; + + BufferMap(size_t MapSize, size_t MapOffset, ur_map_flags_t MapFlags) + : MapSize(MapSize), MapOffset(MapOffset), MapFlags(MapFlags), + MapMem(nullptr) {} + + BufferMap(size_t MapSize, size_t MapOffset, ur_map_flags_t MapFlags, + std::unique_ptr &&MapMem) + : MapSize(MapSize), MapOffset(MapOffset), MapFlags(MapFlags), + MapMem(std::move(MapMem)) {} + + size_t getMapSize() const noexcept { return MapSize; } + + size_t getMapOffset() const noexcept { return MapOffset; } + + ur_map_flags_t getMapFlags() const noexcept { return MapFlags; } + }; + + /** AllocMode + * Classic: Just a normal buffer allocated on the device via hip malloc + * UseHostPtr: Use an address on the host for the device + * CopyIn: The data for the device comes from the host but the host + pointer is not available later for re-use + * AllocHostPtr: Uses pinned-memory allocation + */ + enum class AllocMode { Classic, UseHostPtr, CopyIn, AllocHostPtr }; + using native_type = hipDeviceptr_t; // If this allocation is a sub-buffer (i.e., a view on an existing @@ -29,33 +67,14 @@ struct BufferMem { ur_mem_handle_t Parent = nullptr; // Outer mem holding this struct in variant ur_mem_handle_t OuterMemStruct; - /// Pointer associated with this device on the host void *HostPtr; /// Size of the allocation in bytes size_t Size; - /// Size of the active mapped region. - size_t MapSize; - /// Offset of the active mapped region. - size_t MapOffset; - /// Pointer to the active mapped region, if any - void *MapPtr; - /// Original flags for the mapped region - ur_map_flags_t MapFlags; + /// A map that contains all the active mappings for this buffer. 
+ std::unordered_map PtrToBufferMap; - /** AllocMode - * Classic: Just a normal buffer allocated on the device via hip malloc - * UseHostPtr: Use an address on the host for the device - * CopyIn: The data for the device comes from the host but the host - pointer is not available later for re-use - * AllocHostPtr: Uses pinned-memory allocation - */ - enum class AllocMode { - Classic, - UseHostPtr, - CopyIn, - AllocHostPtr - } MemAllocMode; + AllocMode MemAllocMode; private: // Vector of HIP pointers @@ -65,10 +84,8 @@ struct BufferMem { BufferMem(ur_context_handle_t Context, ur_mem_handle_t OuterMemStruct, AllocMode Mode, void *HostPtr, size_t Size) : OuterMemStruct{OuterMemStruct}, HostPtr{HostPtr}, Size{Size}, - MapSize{0}, MapOffset{0}, MapPtr{nullptr}, MapFlags{UR_MAP_FLAG_WRITE}, - MemAllocMode{Mode}, Ptrs(Context->Devices.size(), native_type{0}){}; - - BufferMem(const BufferMem &Buffer) = default; + PtrToBufferMap{}, MemAllocMode{Mode}, + Ptrs(Context->Devices.size(), native_type{0}){}; // This will allocate memory on device if there isn't already an active // allocation on the device @@ -98,45 +115,41 @@ struct BufferMem { size_t getSize() const noexcept { return Size; } - void *getMapPtr() const noexcept { return MapPtr; } - - size_t getMapSize() const noexcept { return MapSize; } - - size_t getMapOffset() const noexcept { return MapOffset; } + BufferMap *getMapDetails(void *Map) { + auto details = PtrToBufferMap.find(Map); + if (details != PtrToBufferMap.end()) { + return &details->second; + } + return nullptr; + } /// Returns a pointer to data visible on the host that contains /// the data on the device associated with this allocation. /// The offset is used to index into the HIP allocation. 
/// - void *mapToPtr(size_t Size, size_t Offset, ur_map_flags_t Flags) noexcept { - assert(MapPtr == nullptr); - MapSize = Size; - MapOffset = Offset; - MapFlags = Flags; - if (HostPtr) { - MapPtr = static_cast(HostPtr) + Offset; + void *mapToPtr(size_t MapSize, size_t MapOffset, + ur_map_flags_t MapFlags) noexcept { + void *MapPtr = nullptr; + if (HostPtr == nullptr) { + /// If HostPtr is invalid, we need to create a Mapping that owns its own + /// memory on the host. + auto MapMem = std::make_unique(MapSize); + MapPtr = MapMem.get(); + PtrToBufferMap.insert( + {MapPtr, BufferMap(MapSize, MapOffset, MapFlags, std::move(MapMem))}); } else { - // TODO: Allocate only what is needed based on the offset - MapPtr = static_cast(malloc(this->getSize())); + /// However, if HostPtr already has valid memory (e.g. pinned allocation), + /// we can just use that memory for the mapping. + MapPtr = static_cast(HostPtr) + MapOffset; + PtrToBufferMap.insert({MapPtr, BufferMap(MapSize, MapOffset, MapFlags)}); } return MapPtr; } /// Detach the allocation from the host memory. 
- void unmap(void *) noexcept { + void unmap(void *MapPtr) noexcept { assert(MapPtr != nullptr); - - if (MapPtr != HostPtr) { - free(MapPtr); - } - MapPtr = nullptr; - MapSize = 0; - MapOffset = 0; - } - - ur_map_flags_t getMapFlags() const noexcept { - assert(MapPtr != nullptr); - return MapFlags; + PtrToBufferMap.erase(MapPtr); } ur_result_t clear() { @@ -414,7 +427,7 @@ struct ur_mem_handle_t_ { HaveMigratedToDeviceSinceLastWrite(Context->Devices.size(), false), Mem{std::in_place_type, Ctxt, this, Mode, HostPtr, Size} { urContextRetain(Context); - }; + } // Subbuffer constructor ur_mem_handle_t_(ur_mem Parent, size_t SubBufferOffset) @@ -435,7 +448,7 @@ struct ur_mem_handle_t_ { } } urMemRetain(Parent); - }; + } /// Constructs the UR mem handler for an Image object ur_mem_handle_t_(ur_context Ctxt, ur_mem_flags_t MemFlags, @@ -498,7 +511,7 @@ struct ur_mem_handle_t_ { LastEventWritingToMemObj = NewEvent; for (const auto &Device : Context->getDevices()) { HaveMigratedToDeviceSinceLastWrite[Device->getIndex()] = - Device == NewEvent->getDevice(); + Device == NewEvent->getQueue()->getDevice(); } } }; diff --git a/source/adapters/hip/program.cpp b/source/adapters/hip/program.cpp index 1ee862804a..8e3653ee02 100644 --- a/source/adapters/hip/program.cpp +++ b/source/adapters/hip/program.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "program.hpp" +#include "ur_util.hpp" #ifdef SYCL_ENABLE_KERNEL_FUSION #ifdef UR_COMGR_VERSION4_INCLUDE @@ -78,15 +79,6 @@ void getCoMgrBuildLog(const amd_comgr_data_set_t BuildDataSet, char *BuildLog, } // namespace #endif -std::pair -splitMetadataName(const std::string &metadataName) { - size_t splitPos = metadataName.rfind('@'); - if (splitPos == std::string::npos) - return std::make_pair(metadataName, std::string{}); - return std::make_pair(metadataName.substr(0, splitPos), - metadataName.substr(splitPos, metadataName.length())); -} - ur_result_t 
ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata, size_t Length) { @@ -107,8 +99,29 @@ ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata, const char *MetadataValPtrEnd = MetadataValPtr + MetadataElement.size - sizeof(std::uint64_t); GlobalIDMD[Prefix] = std::string{MetadataValPtr, MetadataValPtrEnd}; + } else if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) { + // If metadata is reqd_work_group_size, record it for the corresponding + // kernel name. + size_t MDElemsSize = MetadataElement.size - sizeof(std::uint64_t); + + // Expect between 1 and 3 32-bit integer values. + UR_ASSERT(MDElemsSize >= sizeof(std::uint32_t) && + MDElemsSize <= sizeof(std::uint32_t) * 3, + UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); + + // Get pointer to data, skipping 64-bit size at the start of the data. + const char *ValuePtr = + reinterpret_cast(MetadataElement.value.pData) + + sizeof(std::uint64_t); + // Read values and pad with 1's for values not present. + std::uint32_t ReqdWorkGroupElements[] = {1, 1, 1}; + std::memcpy(ReqdWorkGroupElements, ValuePtr, MDElemsSize); + KernelReqdWorkGroupSizeMD[Prefix] = + std::make_tuple(ReqdWorkGroupElements[0], ReqdWorkGroupElements[1], + ReqdWorkGroupElements[2]); } } + return UR_RESULT_SUCCESS; } @@ -459,8 +472,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( std::unique_ptr RetProgram{ new ur_program_handle_t_{hContext, hDevice}}; - // TODO: Set metadata here and use reqd_work_group_size information. - // See urProgramCreateWithBinary in CUDA adapter. 
if (pProperties) { if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; @@ -469,8 +480,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( } Result = RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count); + UR_ASSERT(Result == UR_RESULT_SUCCESS, Result); } - UR_ASSERT(Result == UR_RESULT_SUCCESS, Result); auto pBinary_string = reinterpret_cast(pBinary); if (size == 0) { diff --git a/source/adapters/hip/program.hpp b/source/adapters/hip/program.hpp index 651d59a42d..cc5da40af5 100644 --- a/source/adapters/hip/program.hpp +++ b/source/adapters/hip/program.hpp @@ -12,6 +12,7 @@ #include #include +#include #include "context.hpp" @@ -30,6 +31,8 @@ struct ur_program_handle_t_ { bool IsRelocatable = false; std::unordered_map GlobalIDMD; + std::unordered_map> + KernelReqdWorkGroupSizeMD; constexpr static size_t MAX_LOG_SIZE = 8192u; @@ -38,8 +41,8 @@ struct ur_program_handle_t_ { ur_program_build_status_t BuildStatus = UR_PROGRAM_BUILD_STATUS_NONE; ur_program_handle_t_(ur_context_handle_t Ctxt, ur_device_handle_t Device) - : Module{nullptr}, Binary{}, - BinarySizeInBytes{0}, RefCount{1}, Context{Ctxt}, Device{Device} { + : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1}, + Context{Ctxt}, Device{Device}, KernelReqdWorkGroupSizeMD{} { urContextRetain(Context); urDeviceRetain(Device); } diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp index 1cf9431fcb..71979b75b1 100644 --- a/source/adapters/hip/ur_interface_loader.cpp +++ b/source/adapters/hip/ur_interface_loader.cpp @@ -377,6 +377,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/hip/usm.cpp b/source/adapters/hip/usm.cpp index 
4068c1d865..e871f394f2 100644 --- a/source/adapters/hip/usm.cpp +++ b/source/adapters/hip/usm.cpp @@ -27,7 +27,7 @@ urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *pUSMDesc, UR_RESULT_ERROR_INVALID_VALUE); if (!hPool) { - return USMHostAllocImpl(ppMem, hContext, nullptr, size, alignment); + return USMHostAllocImpl(ppMem, hContext, /* flags */ 0, size, alignment); } return umfPoolMallocHelper(hPool, ppMem, size, alignment); @@ -43,7 +43,7 @@ urUSMDeviceAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_RESULT_ERROR_INVALID_VALUE); if (!hPool) { - return USMDeviceAllocImpl(ppMem, hContext, hDevice, nullptr, size, + return USMDeviceAllocImpl(ppMem, hContext, hDevice, /* flags */ 0, size, alignment); } @@ -60,8 +60,8 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_RESULT_ERROR_INVALID_VALUE); if (!hPool) { - return USMSharedAllocImpl(ppMem, hContext, hDevice, nullptr, nullptr, size, - alignment); + return USMSharedAllocImpl(ppMem, hContext, hDevice, /*host flags*/ 0, + /*device flags*/ 0, size, alignment); } return umfPoolMallocHelper(hPool, ppMem, size, alignment); @@ -105,7 +105,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext, ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t, ur_device_handle_t Device, - ur_usm_device_mem_flags_t *, size_t Size, + ur_usm_device_mem_flags_t, size_t Size, [[maybe_unused]] uint32_t Alignment) { try { ScopedContext Active(Device); @@ -120,8 +120,8 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t, ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t, ur_device_handle_t Device, - ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, + ur_usm_host_mem_flags_t, + ur_usm_device_mem_flags_t, size_t Size, [[maybe_unused]] uint32_t Alignment) { try { ScopedContext Active(Device); @@ -136,7 +136,7 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t, 
ur_result_t USMHostAllocImpl(void **ResultPtr, [[maybe_unused]] ur_context_handle_t Context, - ur_usm_host_mem_flags_t *, size_t Size, + ur_usm_host_mem_flags_t, size_t Size, [[maybe_unused]] uint32_t Alignment) { try { UR_CHECK_ERROR(hipHostMalloc(ResultPtr, Size)); @@ -309,19 +309,19 @@ umf_result_t USMMemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) { ur_result_t USMSharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size, - Alignment); + return USMSharedAllocImpl(ResultPtr, Context, Device, /*host flags*/ 0, + /*device flags*/ 0, Size, Alignment); } ur_result_t USMDeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size, + return USMDeviceAllocImpl(ResultPtr, Context, Device, /* flags */ 0, Size, Alignment); } ur_result_t USMHostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment); + return USMHostAllocImpl(ResultPtr, Context, /* flags */ 0, Size, Alignment); } ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, diff --git a/source/adapters/hip/usm.hpp b/source/adapters/hip/usm.hpp index d02145584f..a1c3964263 100644 --- a/source/adapters/hip/usm.hpp +++ b/source/adapters/hip/usm.hpp @@ -118,17 +118,17 @@ class USMHostMemoryProvider final : public USMMemoryProvider { ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_device_mem_flags_t *Flags, size_t Size, + ur_usm_device_mem_flags_t Flags, size_t Size, uint32_t Alignment); ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, + ur_usm_host_mem_flags_t, + ur_usm_device_mem_flags_t, 
size_t Size, uint32_t Alignment); ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_host_mem_flags_t *Flags, size_t Size, + ur_usm_host_mem_flags_t Flags, size_t Size, uint32_t Alignment); bool checkUSMAlignment(uint32_t &alignment, const ur_usm_desc_t *pUSMDesc); diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index a06163c2a9..67415a0de0 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -1048,8 +1048,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( // Create command-list to execute before `CommandListPtr` and will signal // when `EventWaitList` dependencies are complete. ur_command_list_ptr_t WaitCommandList{}; - UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList, - false, false)); + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, WaitCommandList, false, NumEventsInWaitList, EventWaitList, + false)); ZE2UR_CALL(zeCommandListAppendBarrier, (WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent, @@ -1086,7 +1087,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( // Create a command-list to signal RetEvent on completion ur_command_list_ptr_t SignalCommandList{}; UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList, - false, false)); + false, NumEventsInWaitList, + EventWaitList, false)); // Reset the wait-event for the UR command-buffer that is signaled when its // submission dependencies have been satisfied. 
ZE2UR_CALL(zeCommandListAppendEventReset, diff --git a/source/adapters/level_zero/common.cpp b/source/adapters/level_zero/common.cpp index 7ae7272355..926d5f4ba4 100644 --- a/source/adapters/level_zero/common.cpp +++ b/source/adapters/level_zero/common.cpp @@ -88,7 +88,11 @@ bool setEnvVar(const char *name, const char *value) { ZeUSMImportExtension ZeUSMImport; // This will count the calls to Level-Zero +// TODO: remove the ifdef once +// https://github.com/oneapi-src/unified-runtime/issues/1454 is implemented +#ifndef UR_L0_CALL_COUNT_IN_TESTS std::map *ZeCallCount = nullptr; +#endif inline void zeParseError(ze_result_t ZeError, const char *&ErrorString) { switch (ZeError) { diff --git a/source/adapters/level_zero/common.hpp b/source/adapters/level_zero/common.hpp index 93bf407567..e16d767b71 100644 --- a/source/adapters/level_zero/common.hpp +++ b/source/adapters/level_zero/common.hpp @@ -241,6 +241,19 @@ static const uint32_t UrL0QueueSyncNonBlocking = [] { return L0QueueSyncLockingModeValue; }(); +// Controls whether the L0 Adapter creates signal events for commands on +// integrated gpu devices. +static const uint32_t UrL0OutOfOrderIntegratedSignalEvent = [] { + const char *UrL0OutOfOrderIntegratedSignalEventEnv = + std::getenv("UR_L0_OOQ_INTEGRATED_SIGNAL_EVENT"); + uint32_t UrL0OutOfOrderIntegratedSignalEventValue = 1; + if (UrL0OutOfOrderIntegratedSignalEventEnv) { + UrL0OutOfOrderIntegratedSignalEventValue = + std::atoi(UrL0OutOfOrderIntegratedSignalEventEnv); + } + return UrL0OutOfOrderIntegratedSignalEventValue; +}(); + // This class encapsulates actions taken along with a call to Level Zero API. 
class ZeCall { private: diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index 4880c14c4b..85fe582ec3 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -645,7 +645,8 @@ static const size_t CmdListsCleanupThreshold = [] { // Retrieve an available command list to be used in a PI call. ur_result_t ur_context_handle_t_::getAvailableCommandList( ur_queue_handle_t Queue, ur_command_list_ptr_t &CommandList, - bool UseCopyEngine, bool AllowBatching, + bool UseCopyEngine, uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, bool AllowBatching, ze_command_queue_handle_t *ForcedCmdQueue) { // Immediate commandlists have been pre-allocated and are always available. if (Queue->UsingImmCmdLists) { @@ -677,9 +678,22 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList( // for this queue. if (Queue->hasOpenCommandList(UseCopyEngine)) { if (AllowBatching) { - CommandList = CommandBatch.OpenCommandList; - UR_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList)); - return UR_RESULT_SUCCESS; + bool batchingAllowed = true; + if (ForcedCmdQueue && + CommandBatch.OpenCommandList->second.ZeQueue != *ForcedCmdQueue) { + // Current open batch doesn't match the forced command queue + batchingAllowed = false; + } + if (!UrL0OutOfOrderIntegratedSignalEvent && + Queue->Device->isIntegrated()) { + batchingAllowed = eventCanBeBatched(Queue, UseCopyEngine, + NumEventsInWaitList, EventWaitList); + } + if (batchingAllowed) { + CommandList = CommandBatch.OpenCommandList; + UR_CALL(Queue->insertStartBarrierIfDiscardEventsMode(CommandList)); + return UR_RESULT_SUCCESS; + } } // If this command isn't allowed to be batched or doesn't match the forced // command queue, then we need to go ahead and execute what is already in diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp index ff173aa984..4184411de7 100644 --- 
a/source/adapters/level_zero/context.hpp +++ b/source/adapters/level_zero/context.hpp @@ -292,11 +292,11 @@ struct ur_context_handle_t_ : _ur_object { // When using immediate commandlists, retrieves an immediate command list // for executing on this device. Immediate commandlists are created only // once for each SYCL Queue and after that they are reused. - ur_result_t - getAvailableCommandList(ur_queue_handle_t Queue, - ur_command_list_ptr_t &CommandList, - bool UseCopyEngine, bool AllowBatching = false, - ze_command_queue_handle_t *ForcedCmdQueue = nullptr); + ur_result_t getAvailableCommandList( + ur_queue_handle_t Queue, ur_command_list_ptr_t &CommandList, + bool UseCopyEngine, uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, bool AllowBatching = false, + ze_command_queue_handle_t *ForcedCmdQueue = nullptr); // Checks if Device is covered by this context. // For that the Device or its root devices need to be in the context. diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index 7f9e9b499b..7f832f30f2 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -1,6 +1,6 @@ //===--------- device.cpp - Level Zero Adapter ----------------------------===// // -// Copyright (C) 2023 Intel Corporation +// Copyright (C) 2023-2024 Intel Corporation // // Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM // Exceptions. 
See LICENSE.TXT @@ -188,8 +188,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( } case UR_DEVICE_INFO_ATOMIC_64: return ReturnValue( - static_cast(Device->ZeDeviceModuleProperties->flags & - ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS)); + static_cast(Device->ZeDeviceModuleProperties->flags & + ZE_DEVICE_MODULE_FLAG_INT64_ATOMICS)); case UR_DEVICE_INFO_EXTENSIONS: { // Convention adopted from OpenCL: // "Returns a space separated list of extension names (the extension @@ -258,9 +258,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE: return ReturnValue(uint32_t{0}); case UR_DEVICE_INFO_COMPILER_AVAILABLE: - return ReturnValue(static_cast(true)); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_LINKER_AVAILABLE: - return ReturnValue(static_cast(true)); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: { uint32_t MaxComputeUnits = Device->ZeDeviceProperties->numEUsPerSubslice * @@ -337,8 +337,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_DEVICE_INFO_DRIVER_VERSION: case UR_DEVICE_INFO_BACKEND_RUNTIME_VERSION: return ReturnValue(Device->Platform->ZeDriverVersion.c_str()); - case UR_DEVICE_INFO_VERSION: - return ReturnValue(Device->Platform->ZeDriverApiVersion.c_str()); + case UR_DEVICE_INFO_VERSION: { + // from compute-runtime/shared/source/helpers/hw_ip_version.h + typedef struct { + uint32_t revision : 6; + uint32_t reserved : 8; + uint32_t release : 8; + uint32_t architecture : 10; + } version_components_t; + typedef struct { + union { + uint32_t value; + version_components_t components; + }; + } ipVersion_t; + ipVersion_t IpVersion; + IpVersion.value = Device->ZeDeviceIpVersionExt->ipVersion; + std::stringstream S; + S << IpVersion.components.architecture << "." + << IpVersion.components.release << "." 
<< IpVersion.components.revision; + return ReturnValue(S.str().c_str()); + } case UR_DEVICE_INFO_PARTITION_MAX_SUB_DEVICES: { auto Res = Device->Platform->populateDeviceCacheIfNeeded(); if (Res != UR_RESULT_SUCCESS) { @@ -410,7 +429,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION: return ReturnValue(""); case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: - return ReturnValue(static_cast(true)); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE: return ReturnValue( size_t{Device->ZeDeviceModuleProperties->printfBufferSize}); @@ -427,7 +446,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(ur_device_exec_capability_flag_t{ UR_DEVICE_EXEC_CAPABILITY_FLAG_NATIVE_KERNEL}); case UR_DEVICE_INFO_ENDIAN_LITTLE: - return ReturnValue(static_cast(true)); + return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: return ReturnValue(static_cast(Device->ZeDeviceProperties->flags & ZE_DEVICE_PROPERTY_FLAG_ECC)); @@ -604,7 +623,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( } case UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS: { // TODO: Not supported yet. Needs to be updated after support is added. - return ReturnValue(static_cast(false)); + return ReturnValue(static_cast(false)); } case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { // ze_device_compute_properties.subGroupSizes is in uint32_t whereas the @@ -790,7 +809,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return UR_RESULT_ERROR_INVALID_VALUE; case UR_DEVICE_INFO_BFLOAT16: { // bfloat16 math functions are not yet supported on Intel GPUs. 
- return ReturnValue(bool{false}); + return ReturnValue(ur_bool_t{false}); } case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { // There are no explicit restrictions in L0 programming guide, so assume all @@ -839,9 +858,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(capabilities); } case UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT: - return ReturnValue(uint32_t{false}); + return ReturnValue(ur_bool_t{false}); case UR_DEVICE_INFO_IMAGE_SRGB: - return ReturnValue(uint32_t{false}); + return ReturnValue(ur_bool_t{false}); case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { @@ -853,6 +872,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( 0)); //__read_write attribute currently undefinde in opencl } case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT: { + return ReturnValue(static_cast(true)); + } + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { return ReturnValue(static_cast(true)); } @@ -893,9 +915,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( // can know if we are in (a) or (b) by checking if a tile is root device // or not. ur_device_handle_t URDev = Device->Platform->getDeviceFromNativeHandle(d); - if (URDev->isSubDevice()) + if (URDev->isSubDevice()) { // We are in COMPOSITE mode, return an empty list. 
- return ReturnValue(0); + if (pSize) { + *pSize = 0; + } + return UR_RESULT_SUCCESS; + } Res.push_back(URDev); } @@ -1078,7 +1104,7 @@ bool ur_device_handle_t_::useDriverInOrderLists() { static const bool UseDriverInOrderLists = [] { const char *UrRet = std::getenv("UR_L0_USE_DRIVER_INORDER_LISTS"); if (!UrRet) - return true; + return false; return std::atoi(UrRet) != 0; }(); diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp index 484890670b..3cdfcbce7e 100644 --- a/source/adapters/level_zero/device.hpp +++ b/source/adapters/level_zero/device.hpp @@ -176,6 +176,10 @@ struct ur_device_handle_t_ : _ur_object { (ZeDeviceProperties->deviceId & 0xff0) == 0xb60; } + bool isIntegrated() { + return (ZeDeviceProperties->flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED); + } + // Does this device represent a single compute slice? bool isCCS() const { return QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index cd61a8aa23..d91353349f 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -84,8 +84,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; - UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine)); + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList)); ze_event_handle_t ZeEvent = nullptr; ur_event_handle_t InternalEvent; @@ -256,7 +256,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( // Get an arbitrary command-list in the queue. 
ur_command_list_ptr_t CmdList; UR_CALL(Queue->Context->getAvailableCommandList( - Queue, CmdList, false /*UseCopyEngine=*/, OkToBatch)); + Queue, CmdList, false /*UseCopyEngine=*/, NumEventsInWaitList, + EventWaitList, OkToBatch)); // Insert the barrier into the command-list and execute. UR_CALL(insertBarrierIntoCmdList(CmdList, TmpWaitList, *Event, IsInternal)); @@ -311,7 +312,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( if (ZeQueue) { ur_command_list_ptr_t CmdList; UR_CALL(Queue->Context->getAvailableCommandList( - Queue, CmdList, UseCopyEngine, OkToBatch, &ZeQueue)); + Queue, CmdList, UseCopyEngine, NumEventsInWaitList, + EventWaitList, OkToBatch, &ZeQueue)); CmdLists.push_back(CmdList); } } @@ -324,7 +326,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( // Get any available command list. ur_command_list_ptr_t CmdList; UR_CALL(Queue->Context->getAvailableCommandList( - Queue, CmdList, false /*UseCopyEngine=*/, OkToBatch)); + Queue, CmdList, false /*UseCopyEngine=*/, NumEventsInWaitList, + EventWaitList, OkToBatch)); CmdLists.push_back(CmdList); } @@ -365,8 +368,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( } // Execute each command list so the barriers can be encountered. - for (ur_command_list_ptr_t &CmdList : CmdLists) + for (ur_command_list_ptr_t &CmdList : CmdLists) { + bool IsCopy = + CmdList->second.isCopy(reinterpret_cast(Queue)); + const auto &CommandBatch = + (IsCopy) ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch; + // Only batch if the matching CmdList is already open. 
+ OkToBatch = CommandBatch.OpenCommandList == CmdList; + UR_CALL(Queue->executeCommandList(CmdList, false, OkToBatch)); + } UR_CALL(Queue->ActiveBarriers.clear()); auto UREvent = reinterpret_cast(*Event); @@ -469,8 +480,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( ) { std::shared_lock EventLock(Event->Mutex); - if (Event->UrQueue && - (Event->UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) == 0) { + // The event must either have profiling enabled or be recording timestamps. + bool isTimestampedEvent = Event->isTimestamped(); + if (!Event->isProfilingEnabled() && !isTimestampedEvent) { return UR_RESULT_ERROR_PROFILING_INFO_NOT_AVAILABLE; } @@ -483,6 +495,61 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( UrReturnHelper ReturnValue(PropValueSize, PropValue, PropValueSizeRet); + // For timestamped events we have the timestamps ready directly on the event + // handle, so we short-circuit the return. + if (isTimestampedEvent) { + uint64_t ContextStartTime = Event->RecordEventStartTimestamp; + switch (PropName) { + case UR_PROFILING_INFO_COMMAND_QUEUED: + case UR_PROFILING_INFO_COMMAND_SUBMIT: + return ReturnValue(ContextStartTime); + case UR_PROFILING_INFO_COMMAND_END: + case UR_PROFILING_INFO_COMMAND_START: { + // If RecordEventEndTimestamp on the event is non-zero it means it has + // collected the result of the queue already. In that case it has been + // adjusted and is ready for immediate return. + if (Event->RecordEventEndTimestamp) + return ReturnValue(Event->RecordEventEndTimestamp); + + // Otherwise we need to collect it from the queue. + auto Entry = Event->UrQueue->EndTimeRecordings.find(Event); + + // Unexpected state if there is no end-time record. + if (Entry == Event->UrQueue->EndTimeRecordings.end()) + return UR_RESULT_ERROR_UNKNOWN; + auto &EndTimeRecording = Entry->second; + + // End time needs to be adjusted for resolution and valid bits. 
+ uint64_t ContextEndTime = + (EndTimeRecording.RecordEventEndTimestamp & TimestampMaxValue) * + ZeTimerResolution; + + // If the result is 0, we have not yet gotten results back and so we just + // return it. + if (ContextEndTime == 0) + return ReturnValue(ContextEndTime); + + // Handle a possible wrap-around (the underlying HW counter is < 64-bit). + // Note, it will not report correct time if there were multiple wrap + // arounds, and the longer term plan is to enlarge the capacity of the + // HW timestamps. + if (ContextEndTime < ContextStartTime) + ContextEndTime += TimestampMaxValue * ZeTimerResolution; + + // Now that we have the result, there is no need to keep it in the queue + // anymore, so we cache it on the event and evict the record from the + // queue. + Event->RecordEventEndTimestamp = ContextEndTime; + Event->UrQueue->EndTimeRecordings.erase(Entry); + + return ReturnValue(ContextEndTime); + } + default: + logger::error("urEventGetProfilingInfo: not supported ParamName"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + } + ze_kernel_timestamp_result_t tsResult; // A Command-buffer consists of three command-lists for which only a single @@ -591,6 +658,63 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t Queue, ///< [in] handle of the queue object + bool Blocking, ///< [in] blocking or non-blocking enqueue + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before this command can be executed. If nullptr, + ///< the numEventsInWaitList must be 0, indicating + ///< that this command does not wait on any event to + ///< complete. + ur_event_handle_t + *OutEvent ///< [in,out] return an event object that identifies + ///< this particular command instance. 
+) { + // Lock automatically releases when this goes out of scope. + std::scoped_lock lock(Queue->Mutex); + + ur_device_handle_t Device = Queue->Device; + + bool UseCopyEngine = false; + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList, + /* AllowBatching */ false)); + + UR_CALL(createEventAndAssociateQueue( + Queue, OutEvent, UR_COMMAND_TIMESTAMP_RECORDING_EXP, CommandList, + /* IsInternal */ false, /* HostVisible */ true)); + ze_event_handle_t ZeEvent = (*OutEvent)->ZeEvent; + (*OutEvent)->WaitList = TmpWaitList; + + uint64_t DeviceStartTimestamp = 0; + UR_CALL(urDeviceGetGlobalTimestamps(Device, &DeviceStartTimestamp, nullptr)); + (*OutEvent)->RecordEventStartTimestamp = DeviceStartTimestamp; + + // Create a new entry in the queue's recordings. 
+ Queue->EndTimeRecordings[*OutEvent] = + ur_queue_handle_t_::end_time_recording{}; + + ZE2UR_CALL(zeCommandListAppendWriteGlobalTimestamp, + (CommandList->first, + &Queue->EndTimeRecordings[*OutEvent].RecordEventEndTimestamp, + ZeEvent, (*OutEvent)->WaitList.Length, + (*OutEvent)->WaitList.ZeEventList)); + + UR_CALL( + Queue->executeCommandList(CommandList, Blocking, /* OkToBatch */ false)); + + return UR_RESULT_SUCCESS; +} + ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent( ze_event_handle_t &ZeHostVisibleEvent) { @@ -598,6 +722,7 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent( this->Mutex); if (!HostVisibleEvent) { + this->IsCreatingHostProxyEvent = true; if (UrQueue->ZeEventsScope != OnDemandHostVisibleProxy) die("getOrCreateHostVisibleEvent: missing host-visible event"); @@ -612,7 +737,7 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent( ur_command_list_ptr_t CommandList{}; UR_CALL(UrQueue->Context->getAvailableCommandList( - UrQueue, CommandList, false /* UseCopyEngine */, OkToBatch)) + UrQueue, CommandList, false /* UseCopyEngine */, 0, nullptr, OkToBatch)) // Create a "proxy" host-visible event. 
UR_CALL(createEventAndAssociateQueue( @@ -620,12 +745,18 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent( /* IsInternal */ false, /* IsMultiDevice */ false, /* HostVisible */ true)); - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (CommandList->first, 1, &ZeEvent)); + if (this->IsInnerBatchedEvent) { + ZE2UR_CALL(zeCommandListAppendBarrier, + (CommandList->first, ZeEvent, 0, nullptr)); + } else { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (CommandList->first, 1, &ZeEvent)); + } ZE2UR_CALL(zeCommandListAppendSignalEvent, (CommandList->first, HostVisibleEvent->ZeEvent)); UR_CALL(UrQueue->executeCommandList(CommandList, false, OkToBatch)) + this->IsCreatingHostProxyEvent = false; } ZeHostVisibleEvent = HostVisibleEvent->ZeEvent; @@ -682,7 +813,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventWait( ze_event_handle_t ZeEvent = HostVisibleEvent->ZeEvent; logger::debug("ZeEvent = {}", ur_cast(ZeEvent)); - ZE2UR_CALL(zeHostSynchronize, (ZeEvent)); + // If this event was an inner batched event, then sync with + // the Queue instead of waiting on the event. + if (HostVisibleEvent->IsInnerBatchedEvent && Event->ZeBatchedQueue) { + ZE2UR_CALL(zeHostSynchronize, (Event->ZeBatchedQueue)); + } else { + ZE2UR_CALL(zeHostSynchronize, (ZeEvent)); + } Event->Completed = true; } } @@ -877,10 +1014,28 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { } // Save pointer to the queue before deleting/resetting event. + auto Queue = Event->UrQueue; + + // If the event was a timestamp recording, we try to evict its entry in the + // queue. + if (Event->isTimestamped()) { + auto Entry = Queue->EndTimeRecordings.find(Event); + if (Entry != Queue->EndTimeRecordings.end()) { + auto &EndTimeRecording = Entry->second; + if (EndTimeRecording.RecordEventEndTimestamp == 0) { + // If the end time recording has not finished, we tell the queue that + // the event is no longer alive to avoid invalid write-backs. 
+ EndTimeRecording.EventHasDied = true; + } else { + // Otherwise we evict the entry. + Event->UrQueue->EndTimeRecordings.erase(Entry); + } + } + } + // When we add an event to the cache we need to check whether profiling is // enabled or not, so we access properties of the queue and that's why queue // must released later. - auto Queue = Event->UrQueue; if (DisableEventsCaching || !Event->OwnNativeHandle) { delete Event; } else { @@ -938,7 +1093,12 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked, std::list EventsToBeReleased; ur_queue_handle_t AssociatedQueue = nullptr; { - std::scoped_lock EventLock(Event->Mutex); + // If the Event is already locked, then continue with the cleanup, otherwise + // block on locking the event. + std::unique_lock EventLock(Event->Mutex, std::try_to_lock); + if (!EventLock.owns_lock() && !Event->IsCreatingHostProxyEvent) { + EventLock.lock(); + } if (SetEventCompleted) Event->Completed = true; // Exit early of event was already cleanedup. @@ -1268,16 +1428,26 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( } auto Queue = EventList[I]->UrQueue; - if (Queue) { - // The caller of createAndRetainUrZeEventList must already hold - // a lock of the CurQueue. Additionally lock the Queue if it - // is different from CurQueue. - // TODO: rework this to avoid deadlock when another thread is - // locking the same queues but in a different order. - auto Lock = ((Queue == CurQueue) - ? std::unique_lock() - : std::unique_lock(Queue->Mutex)); + auto CurQueueDevice = CurQueue->Device; + std::optional> QueueLock = + std::nullopt; + // The caller of createAndRetainUrZeEventList must already hold + // a lock of the CurQueue. However, if the CurQueue is different + // then the Event's Queue, we need to drop that lock and + // acquire the Event's Queue lock. This is done to avoid a lock + // ordering issue. + // For the rest of this scope, CurQueue cannot be accessed. 
+ // TODO: This solution is very error-prone. This requires a refactor + // to either have fine-granularity locks inside of the queues or + // to move any operations on queues other than CurQueue out + // of this scope. + if (Queue && Queue != CurQueue) { + CurQueue->Mutex.unlock(); + QueueLock = std::unique_lock(Queue->Mutex); + } + + if (Queue) { // If the event that is going to be waited is in an open batch // different from where this next command is going to be added, // then we have to force execute of that open command-list @@ -1320,17 +1490,17 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( } ur_command_list_ptr_t CommandList; - if (Queue && Queue->Device != CurQueue->Device) { + if (Queue && Queue->Device != CurQueueDevice) { // Get a command list prior to acquiring an event lock. // This prevents a potential deadlock with recursive // event locks. - UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, - false, true)); + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, false, 0, nullptr, true)); } std::shared_lock Lock(EventList[I]->Mutex); - if (Queue && Queue->Device != CurQueue->Device && + if (Queue && Queue->Device != CurQueueDevice && !EventList[I]->IsMultiDevice) { ze_event_handle_t MultiDeviceZeEvent = nullptr; ur_event_handle_t MultiDeviceEvent; @@ -1365,6 +1535,10 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( this->UrEventList[TmpListLength]->RefCount.increment(); } + if (QueueLock.has_value()) { + QueueLock.reset(); + CurQueue->Mutex.lock(); + } TmpListLength += 1; } } @@ -1443,3 +1617,12 @@ bool ur_event_handle_t_::isProfilingEnabled() const { return !UrQueue || // tentatively assume user events are profiling enabled (UrQueue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0; } + +// Tells if this event was created as a timestamp event, allowing profiling +// info even if profiling is not enabled. 
+bool ur_event_handle_t_::isTimestamped() const { + // If we are recording, the start time of the event will be non-zero. The + // end time might still be missing, depending on whether the corresponding + // enqueue is still running. + return RecordEventStartTimestamp != 0; +} diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp index 9fa4663ab4..2d1f536e4e 100644 --- a/source/adapters/level_zero/event.hpp +++ b/source/adapters/level_zero/event.hpp @@ -198,6 +198,23 @@ struct ur_event_handle_t_ : _ur_object { // performance bool IsMultiDevice = {false}; + // Indicates inner batched event which was not used as a signal event. + bool IsInnerBatchedEvent = {false}; + + // Queue where the batched command was executed. + ze_command_queue_handle_t ZeBatchedQueue = {nullptr}; + + // Indicates within creation of proxy event. + bool IsCreatingHostProxyEvent = {false}; + + // Indicates the recorded start and end timestamps for the event. These are + // only set for events returned by timestamp recording enqueue functions. + // A non-zero value for RecordEventStartTimestamp indicates the event was the + // result of a timestamp recording. If RecordEventEndTimestamp is non-zero, it + // means the event has fetched the end-timestamp from the queue. + uint64_t RecordEventStartTimestamp = 0; + uint64_t RecordEventEndTimestamp = 0; + // Besides each PI object keeping a total reference count in // _ur_object::RefCount we keep special track of the event *external* // references. This way we are able to tell when the event is not referenced @@ -222,6 +239,10 @@ struct ur_event_handle_t_ : _ur_object { // Tells if this event is with profiling capabilities. bool isProfilingEnabled() const; + // Tells if this event was created as a timestamp event, allowing profiling + // info even if profiling is not enabled. + bool isTimestamped() const; + // Get the host-visible event or create one and enqueue its signal. 
ur_result_t getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent); diff --git a/source/adapters/level_zero/image.cpp b/source/adapters/level_zero/image.cpp index 3b767f9127..b953e69f06 100644 --- a/source/adapters/level_zero/image.cpp +++ b/source/adapters/level_zero/image.cpp @@ -656,13 +656,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageFreeExp( UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_image_mem_handle_t hImageMem, const ur_image_format_t *pImageFormat, - const ur_image_desc_t *pImageDesc, ur_mem_handle_t *phMem, - ur_exp_image_handle_t *phImage) { + const ur_image_desc_t *pImageDesc, ur_exp_image_handle_t *phImage) { std::shared_lock Lock(hContext->Mutex); UR_ASSERT(hContext && hDevice && hImageMem, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(pImageFormat && pImageDesc && phMem && phImage, + UR_ASSERT(pImageFormat && pImageDesc && phImage, UR_RESULT_ERROR_INVALID_NULL_POINTER); ZeStruct ZeImageDesc; @@ -687,11 +686,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( UrImage->ZeImage, &ZeImage)); ZE2UR_CALL(zeContextMakeImageResident, (hContext->ZeContext, hDevice->ZeDevice, ZeImage)); - UR_CALL(createUrMemFromZeImage(hContext, ZeImage, /*OwnZeMemHandle*/ true, - ZeImageDesc, phMem)); } else { ZeImage = UrImage->ZeImage; - *phMem = nullptr; } } else if (MemAllocProperties.type == ZE_MEMORY_TYPE_DEVICE) { ze_image_pitched_exp_desc_t PitchedDesc; @@ -710,8 +706,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( &ZeImageDesc, &ZeImage)); ZE2UR_CALL(zeContextMakeImageResident, (hContext->ZeContext, hDevice->ZeDevice, ZeImage)); - UR_CALL(createUrMemFromZeImage(hContext, ZeImage, /*OwnZeMemHandle*/ true, - ZeImageDesc, phMem)); } else { return UR_RESULT_ERROR_INVALID_VALUE; } @@ -741,10 +735,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( 
ur_context_handle_t hContext, ur_device_handle_t hDevice, ur_exp_image_mem_handle_t hImageMem, const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, ur_sampler_handle_t hSampler, - ur_mem_handle_t *phMem, ur_exp_image_handle_t *phImage) { + ur_exp_image_handle_t *phImage) { UR_CALL(urBindlessImagesUnsampledImageCreateExp( - hContext, hDevice, hImageMem, pImageFormat, pImageDesc, phMem, phImage)); + hContext, hDevice, hImageMem, pImageFormat, pImageDesc, phImage)); struct combined_sampled_image_handle { uint64_t RawImageHandle; @@ -790,8 +784,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; - UR_CALL(hQueue->Context->getAvailableCommandList(hQueue, CommandList, - UseCopyEngine, OkToBatch)); + UR_CALL(hQueue->Context->getAvailableCommandList( + hQueue, CommandList, UseCopyEngine, numEventsInWaitList, phEventWaitList, + OkToBatch)); ze_event_handle_t ZeEvent = nullptr; ur_event_handle_t InternalEvent; @@ -800,7 +795,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( UR_CALL(createEventAndAssociateQueue(hQueue, Event, UR_COMMAND_MEM_IMAGE_COPY, CommandList, IsInternal, /*IsMultiDevice*/ false)); - ZeEvent = (*Event)->ZeEvent; + UR_CALL(setSignalEvent(hQueue, UseCopyEngine, &ZeEvent, Event, + numEventsInWaitList, phEventWaitList, + CommandList->second.ZeQueue)); (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index d96351dd5d..40d6260ac9 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -201,7 +201,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; UR_CALL(Queue->Context->getAvailableCommandList( - Queue, CommandList, UseCopyEngine, true /* AllowBatching 
*/)); + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList, + true /* AllowBatching */)); ze_event_handle_t ZeEvent = nullptr; ur_event_handle_t InternalEvent{}; @@ -210,7 +211,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH, CommandList, IsInternal, false)); - ZeEvent = (*Event)->ZeEvent; + UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, + NumEventsInWaitList, EventWaitList, + CommandList->second.ZeQueue)); (*Event)->WaitList = TmpWaitList; // Save the kernel in the event, so that when the event is signalled @@ -268,13 +271,264 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, phEvent); + ur_queue_handle_t Queue, ///< [in] handle of the queue object + ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object + uint32_t WorkDim, ///< [in] number of dimensions, from 1 to 3, to specify + ///< the global and work-group work-items + const size_t + *GlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned + ///< values that specify the offset used to + ///< calculate the global ID of a work-item + const size_t *GlobalWorkSize, ///< [in] pointer to an array of workDim + ///< unsigned values that specify the number + ///< of global work-items in workDim that + ///< will execute the kernel function + const size_t + *LocalWorkSize, ///< [in][optional] pointer to an array of workDim + ///< unsigned values that specify the number 
of local + ///< work-items forming a work-group that will execute + ///< the kernel function. If nullptr, the runtime + ///< implementation will choose the work-group size. + uint32_t NumEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t + *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] + ///< pointer to a list of events that must be complete + ///< before the kernel execution. If nullptr, the + ///< numEventsInWaitList must be 0, indicating that no + ///< wait event. + ur_event_handle_t + *OutEvent ///< [in,out][optional] return an event object that identifies + ///< this particular kernel execution instance. +) { + auto ZeDevice = Queue->Device->ZeDevice; + + ze_kernel_handle_t ZeKernel{}; + if (Kernel->ZeKernelMap.empty()) { + ZeKernel = Kernel->ZeKernel; + } else { + auto It = Kernel->ZeKernelMap.find(ZeDevice); + if (It == Kernel->ZeKernelMap.end()) { + /* kernel and queue don't match */ + return UR_RESULT_ERROR_INVALID_QUEUE; + } + ZeKernel = It->second; + } + // Lock automatically releases when this goes out of scope. + std::scoped_lock Lock( + Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex); + if (GlobalWorkOffset != NULL) { + if (!Queue->Device->Platform->ZeDriverGlobalOffsetExtensionFound) { + logger::error("No global offset extension found on this driver"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + ZE2UR_CALL(zeKernelSetGlobalOffsetExp, + (ZeKernel, GlobalWorkOffset[0], GlobalWorkOffset[1], + GlobalWorkOffset[2])); + } + + // If there are any pending arguments set them now. + for (auto &Arg : Kernel->PendingArguments) { + // The ArgValue may be a NULL pointer in which case a NULL value is used for + // the kernel argument declared as a pointer to global or constant memory. 
+ char **ZeHandlePtr = nullptr; + if (Arg.Value) { + UR_CALL(Arg.Value->getZeHandlePtr(ZeHandlePtr, Arg.AccessMode, + Queue->Device)); + } + ZE2UR_CALL(zeKernelSetArgumentValue, + (ZeKernel, Arg.Index, Arg.Size, ZeHandlePtr)); + } + Kernel->PendingArguments.clear(); + + ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; + uint32_t WG[3]{}; + + // New variable needed because GlobalWorkSize parameter might not be of size 3 + size_t GlobalWorkSize3D[3]{1, 1, 1}; + std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D); + + if (LocalWorkSize) { + // L0 + UR_ASSERT(LocalWorkSize[0] < (std::numeric_limits::max)(), + UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT(LocalWorkSize[1] < (std::numeric_limits::max)(), + UR_RESULT_ERROR_INVALID_VALUE); + UR_ASSERT(LocalWorkSize[2] < (std::numeric_limits::max)(), + UR_RESULT_ERROR_INVALID_VALUE); + WG[0] = static_cast(LocalWorkSize[0]); + WG[1] = static_cast(LocalWorkSize[1]); + WG[2] = static_cast(LocalWorkSize[2]); + } else { + // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize + // values do not fit to 32-bit that the API only supports currently. + bool SuggestGroupSize = true; + for (int I : {0, 1, 2}) { + if (GlobalWorkSize3D[I] > UINT32_MAX) { + SuggestGroupSize = false; + } + } + if (SuggestGroupSize) { + ZE2UR_CALL(zeKernelSuggestGroupSize, + (ZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1], + GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2])); + } else { + for (int I : {0, 1, 2}) { + // Try to find a I-dimension WG size that the GlobalWorkSize[I] is + // fully divisable with. Start with the max possible size in + // each dimension. 
+ uint32_t GroupSize[] = { + Queue->Device->ZeDeviceComputeProperties->maxGroupSizeX, + Queue->Device->ZeDeviceComputeProperties->maxGroupSizeY, + Queue->Device->ZeDeviceComputeProperties->maxGroupSizeZ}; + GroupSize[I] = (std::min)(size_t(GroupSize[I]), GlobalWorkSize3D[I]); + while (GlobalWorkSize3D[I] % GroupSize[I]) { + --GroupSize[I]; + } + + if (GlobalWorkSize3D[I] / GroupSize[I] > UINT32_MAX) { + logger::error( + "urEnqueueCooperativeKernelLaunchExp: can't find a WG size " + "suitable for global work size > UINT32_MAX"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + WG[I] = GroupSize[I]; + } + logger::debug("urEnqueueCooperativeKernelLaunchExp: using computed WG " + "size = {{{}, {}, {}}}", + WG[0], WG[1], WG[2]); + } + } + + // TODO: assert if sizes do not fit into 32-bit? + + switch (WorkDim) { + case 3: + ZeThreadGroupDimensions.groupCountX = + static_cast(GlobalWorkSize3D[0] / WG[0]); + ZeThreadGroupDimensions.groupCountY = + static_cast(GlobalWorkSize3D[1] / WG[1]); + ZeThreadGroupDimensions.groupCountZ = + static_cast(GlobalWorkSize3D[2] / WG[2]); + break; + case 2: + ZeThreadGroupDimensions.groupCountX = + static_cast(GlobalWorkSize3D[0] / WG[0]); + ZeThreadGroupDimensions.groupCountY = + static_cast(GlobalWorkSize3D[1] / WG[1]); + WG[2] = 1; + break; + case 1: + ZeThreadGroupDimensions.groupCountX = + static_cast(GlobalWorkSize3D[0] / WG[0]); + WG[1] = WG[2] = 1; + break; + + default: + logger::error("urEnqueueCooperativeKernelLaunchExp: unsupported work_dim"); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + // Error handling for non-uniform group size case + if (GlobalWorkSize3D[0] != + size_t(ZeThreadGroupDimensions.groupCountX) * WG[0]) { + logger::error("urEnqueueCooperativeKernelLaunchExp: invalid work_dim. 
The " + "range is not a " + "multiple of the group size in the 1st dimension"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + if (GlobalWorkSize3D[1] != + size_t(ZeThreadGroupDimensions.groupCountY) * WG[1]) { + logger::error("urEnqueueCooperativeKernelLaunchExp: invalid work_dim. The " + "range is not a " + "multiple of the group size in the 2nd dimension"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + if (GlobalWorkSize3D[2] != + size_t(ZeThreadGroupDimensions.groupCountZ) * WG[2]) { + logger::debug("urEnqueueCooperativeKernelLaunchExp: invalid work_dim. The " + "range is not a " + "multiple of the group size in the 3rd dimension"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + + ZE2UR_CALL(zeKernelSetGroupSize, (ZeKernel, WG[0], WG[1], WG[2])); + + bool UseCopyEngine = false; + _ur_ze_event_list_t TmpWaitList; + UR_CALL(TmpWaitList.createAndRetainUrZeEventList( + NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); + + // Get a new command list to be used on this call + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList, + true /* AllowBatching */)); + + ze_event_handle_t ZeEvent = nullptr; + ur_event_handle_t InternalEvent{}; + bool IsInternal = OutEvent == nullptr; + ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; + + UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH, + CommandList, IsInternal, false)); + UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, + NumEventsInWaitList, EventWaitList, + CommandList->second.ZeQueue)); + (*Event)->WaitList = TmpWaitList; + + // Save the kernel in the event, so that when the event is signalled + // the code can do a urKernelRelease on this kernel. + (*Event)->CommandData = (void *)Kernel; + + // Increment the reference count of the Kernel and indicate that the Kernel + // is in use. 
Once the event has been signalled, the code in + // CleanupCompletedEvent(Event) will do a urKernelRelease to update the + // reference count on the kernel, using the kernel saved in CommandData. + UR_CALL(urKernelRetain(Kernel)); + + // Add to list of kernels to be submitted + if (IndirectAccessTrackingEnabled) + Queue->KernelsToBeSubmitted.push_back(Kernel); + + if (Queue->UsingImmCmdLists && IndirectAccessTrackingEnabled) { + // If using immediate commandlists then gathering of indirect + // references and appending to the queue (which means submission) + // must be done together. + std::unique_lock ContextsLock( + Queue->Device->Platform->ContextsMutex, std::defer_lock); + // We are going to submit kernels for execution. If indirect access flag is + // set for a kernel then we need to make a snapshot of existing memory + // allocations in all contexts in the platform. We need to lock the mutex + // guarding the list of contexts in the platform to prevent creation of new + // memory alocations in any context before we submit the kernel for + // execution. + ContextsLock.lock(); + Queue->CaptureIndirectAccesses(); + // Add the command to the command list, which implies submission. + ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel, + (CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent, + (*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList)); + } else { + // Add the command to the command list for later submission. + // No lock is needed here, unlike the immediate commandlist case above, + // because the kernels are not actually submitted yet. Kernels will be + // submitted only when the comamndlist is closed. Then, a lock is held. 
+ ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel, + (CommandList->first, ZeKernel, &ZeThreadGroupDimensions, ZeEvent, + (*Event)->WaitList.Length, (*Event)->WaitList.ZeEventList)); + } + + logger::debug("calling zeCommandListAppendLaunchCooperativeKernel() with" + " ZeEvent {}", + ur_cast(ZeEvent)); + printZeEventList((*Event)->WaitList); + + // Execute command list asynchronously, as the event will be used + // to track down its completion. + UR_CALL(Queue->executeCommandList(CommandList, false, true)); + + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( @@ -583,15 +837,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo( ZeStruct kernelProperties; kernelProperties.pNext = &workGroupProperties; - - auto ZeResult = ZE_CALL_NOCHECK( - zeKernelGetProperties, - (Kernel->ZeKernelMap[Device->ZeDevice], &kernelProperties)); - if (ZeResult || workGroupProperties.maxGroupSize == 0) { + // Set the Kernel to use as the ZeKernel initally for native handle support. + // This makes the assumption that this device is the same device where this + // kernel was created. 
+ auto ZeKernelDevice = Kernel->ZeKernel; + auto It = Kernel->ZeKernelMap.find(Device->ZeDevice); + if (It != Kernel->ZeKernelMap.end()) { + ZeKernelDevice = Kernel->ZeKernelMap[Device->ZeDevice]; + } + if (ZeKernelDevice) { + auto ZeResult = ZE_CALL_NOCHECK(zeKernelGetProperties, + (ZeKernelDevice, &kernelProperties)); + if (ZeResult || workGroupProperties.maxGroupSize == 0) { + return ReturnValue( + uint64_t{Device->ZeDeviceComputeProperties->maxTotalGroupSize}); + } + return ReturnValue(workGroupProperties.maxGroupSize); + } else { return ReturnValue( uint64_t{Device->ZeDeviceComputeProperties->maxTotalGroupSize}); } - return ReturnValue(workGroupProperties.maxGroupSize); } case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { struct { @@ -815,10 +1080,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( ur_kernel_handle_t hKernel, size_t localWorkSize, size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) { - (void)hKernel; (void)localWorkSize; (void)dynamicSharedMemorySize; - *pGroupCountRet = 1; + std::shared_lock Guard(hKernel->Mutex); + uint32_t TotalGroupCount = 0; + ZE2UR_CALL(zeKernelSuggestMaxCooperativeGroupCount, + (hKernel->ZeKernel, &TotalGroupCount)); + *pGroupCountRet = TotalGroupCount; return UR_RESULT_SUCCESS; } @@ -835,6 +1103,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( try { Kernel = new ur_kernel_handle_t_(ZeKernel, Properties->isNativeHandleOwned, Context); + if (Properties->isNativeHandleOwned) { + // If ownership is passed to the adapter we need to pass the kernel + // to this vector which is then used during ZeKernelRelease. 
+ Kernel->ZeKernels.push_back(ZeKernel); + } + *RetKernel = reinterpret_cast(Kernel); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; diff --git a/source/adapters/level_zero/memory.cpp b/source/adapters/level_zero/memory.cpp index 4757a0563d..77cb6abb38 100644 --- a/source/adapters/level_zero/memory.cpp +++ b/source/adapters/level_zero/memory.cpp @@ -62,8 +62,9 @@ ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; - UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine, OkToBatch)); + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList, + OkToBatch)); ze_event_handle_t ZeEvent = nullptr; ur_event_handle_t InternalEvent; @@ -71,7 +72,9 @@ ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, IsInternal, false)); - ZeEvent = (*Event)->ZeEvent; + UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, + NumEventsInWaitList, EventWaitList, + CommandList->second.ZeQueue)); (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; @@ -112,8 +115,9 @@ ur_result_t enqueueMemCopyRectHelper( // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; - UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine, OkToBatch)); + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList, + OkToBatch)); ze_event_handle_t ZeEvent = nullptr; ur_event_handle_t InternalEvent; @@ -121,8 +125,9 @@ ur_result_t enqueueMemCopyRectHelper( ur_event_handle_t *Event = OutEvent ? 
OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, IsInternal, false)); - - ZeEvent = (*Event)->ZeEvent; + UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, + NumEventsInWaitList, EventWaitList, + CommandList->second.ZeQueue)); (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; @@ -219,8 +224,9 @@ static ur_result_t enqueueMemFillHelper(ur_command_t CommandType, ur_command_list_ptr_t CommandList{}; // We want to batch these commands to avoid extra submissions (costly) bool OkToBatch = true; - UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine, OkToBatch)); + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList, + OkToBatch)); ze_event_handle_t ZeEvent = nullptr; ur_event_handle_t InternalEvent; @@ -228,8 +234,9 @@ static ur_result_t enqueueMemFillHelper(ur_command_t CommandType, ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, IsInternal, false)); - - ZeEvent = (*Event)->ZeEvent; + UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, + NumEventsInWaitList, EventWaitList, + CommandList->second.ZeQueue)); (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; @@ -334,8 +341,9 @@ static ur_result_t enqueueMemImageCommandHelper( // Get a new command list to be used on this call ur_command_list_ptr_t CommandList{}; - UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine, OkToBatch)); + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList, + OkToBatch)); ze_event_handle_t ZeEvent = nullptr; ur_event_handle_t InternalEvent; @@ -343,7 +351,9 @@ static ur_result_t enqueueMemImageCommandHelper( ur_event_handle_t *Event = OutEvent ? 
OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, IsInternal, false)); - ZeEvent = (*Event)->ZeEvent; + UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, + NumEventsInWaitList, EventWaitList, + CommandList->second.ZeQueue)); (*Event)->WaitList = TmpWaitList; const auto &ZeCommandList = CommandList->first; @@ -991,8 +1001,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( } else { // For discrete devices we need a command list ur_command_list_ptr_t CommandList{}; - UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine)); + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList)); // Add the event to the command list. CommandList->second.append(reinterpret_cast(*Event)); @@ -1004,6 +1014,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( char *ZeHandleSrc; UR_CALL(Buffer->getZeHandle(ZeHandleSrc, AccessMode, Queue->Device)); + UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, + NumEventsInWaitList, EventWaitList, + CommandList->second.ZeQueue)); + ZE2UR_CALL(zeCommandListAppendMemoryCopy, (ZeCommandList, *RetMap, ZeHandleSrc + Offset, Size, ZeEvent, WaitList.Length, WaitList.ZeEventList)); @@ -1114,7 +1128,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( ur_command_list_ptr_t CommandList{}; UR_CALL(Queue->Context->getAvailableCommandList( - reinterpret_cast(Queue), CommandList, UseCopyEngine)); + reinterpret_cast(Queue), CommandList, UseCopyEngine, + NumEventsInWaitList, EventWaitList)); CommandList->second.append(reinterpret_cast(*Event)); (*Event)->RefCount.increment(); @@ -1131,6 +1146,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, Queue->Device)); + UR_CALL(setSignalEvent(Queue, UseCopyEngine, &ZeEvent, Event, + NumEventsInWaitList, EventWaitList, + 
CommandList->second.ZeQueue)); + ZE2UR_CALL(zeCommandListAppendMemoryCopy, (ZeCommandList, ZeHandleDst + MapInfo.Offset, MappedPtr, MapInfo.Size, ZeEvent, (*Event)->WaitList.Length, @@ -1241,8 +1260,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( ur_command_list_ptr_t CommandList{}; // TODO: Change UseCopyEngine argument to 'true' once L0 backend // support is added - UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine)); + UR_CALL(Queue->Context->getAvailableCommandList( + Queue, CommandList, UseCopyEngine, NumEventsInWaitList, EventWaitList)); // TODO: do we need to create a unique command type for this? ze_event_handle_t ZeEvent = nullptr; @@ -1298,7 +1317,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMAdvise( // TODO: Additional analysis is required to check if this operation will // run faster on copy engines. UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, - UseCopyEngine)); + UseCopyEngine, 0, nullptr)); // TODO: do we need to create a unique command type for this? 
ze_event_handle_t ZeEvent = nullptr; diff --git a/source/adapters/level_zero/program.cpp b/source/adapters/level_zero/program.cpp index e71ef2a8f9..447721f004 100644 --- a/source/adapters/level_zero/program.cpp +++ b/source/adapters/level_zero/program.cpp @@ -891,10 +891,15 @@ void ur_program_handle_t_::ur_release_program_resources(bool deletion) { } if (ZeModule && OwnZeModule) { - for (auto &ZeModulePair : this->ZeModuleMap) { - ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModulePair.second)); + if (ZeModuleMap.empty()) { + // interop api + ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModule)); + } else { + for (auto &ZeModulePair : this->ZeModuleMap) { + ZE_CALL_NOCHECK(zeModuleDestroy, (ZeModulePair.second)); + } + this->ZeModuleMap.clear(); } - this->ZeModuleMap.clear(); } resourcesReleased = true; } diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index 1030b491cf..f47f960cc0 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -1335,6 +1335,7 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList, // in the command list is not empty, otherwise we are going to just create // and remove proxy event right away and dereference deleted object // afterwards. + bool AppendBarrierNeeded = true; if (ZeEventsScope == LastCommandInBatchHostVisible && !CommandList->second.EventList.empty()) { // If there are only internal events in the command list then we don't @@ -1403,6 +1404,7 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList, ZE2UR_CALL(zeCommandListAppendSignalEvent, (CommandList->first, HostVisibleEvent->ZeEvent)); } else { + AppendBarrierNeeded = false; ZE2UR_CALL( zeCommandListAppendBarrier, (CommandList->first, HostVisibleEvent->ZeEvent, 0, nullptr)); @@ -1415,6 +1417,27 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList, // If we don't have host visible proxy then signal event if needed. 
this->signalEventFromCmdListIfLastEventDiscarded(CommandList); } + // Append Signalling of the inner events at the end of the batch if this is + // an integrated gpu and out of order signal events are not allowed. + if (!UrL0OutOfOrderIntegratedSignalEvent && this->Device->isIntegrated()) { + for (auto &Event : CommandList->second.EventList) { + // If the events scope does not apply a barrier already above, then we + // need to apply a barrier to wait on all the previous commands without + // signal events to complete before we can signal the batched events as + // completed. This functionality is only used if this command list is + // out of order and there are events created that were not used as + // signal events. + if (Event->IsInnerBatchedEvent) { + if (AppendBarrierNeeded) { + ZE2UR_CALL(zeCommandListAppendBarrier, + (CommandList->first, nullptr, 0, nullptr)); + AppendBarrierNeeded = false; + } + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CommandList->first, Event->ZeEvent)); + } + } + } // Close the command list and have it ready for dispatch. ZE2UR_CALL(zeCommandListClose, (CommandList->first)); @@ -1489,8 +1512,7 @@ ur_queue_handle_t_::resetDiscardedEvent(ur_command_list_ptr_t CommandList) { } ur_result_t ur_queue_handle_t_::addEventToQueueCache(ur_event_handle_t Event) { - if (!Event->IsMultiDevice && Event->UrQueue) { - auto Device = Event->UrQueue->Device; + if (!Event->IsMultiDevice) { auto EventCachesMap = Event->isHostVisible() ? 
&EventCachesDeviceMap[0] : &EventCachesDeviceMap[1]; if (EventCachesMap->find(Device) == EventCachesMap->end()) { @@ -1517,6 +1539,34 @@ ur_result_t ur_queue_handle_t_::active_barriers::clear() { return UR_RESULT_SUCCESS; } +void ur_queue_handle_t_::clearEndTimeRecordings() { + uint64_t ZeTimerResolution = Device->ZeDeviceProperties->timerResolution; + const uint64_t TimestampMaxValue = + ((1ULL << Device->ZeDeviceProperties->kernelTimestampValidBits) - 1ULL); + + for (auto Entry : EndTimeRecordings) { + auto &Event = Entry.first; + auto &EndTimeRecording = Entry.second; + if (!Entry.second.EventHasDied) { + // Write the result back to the event if it is not dead. + uint64_t ContextEndTime = + (EndTimeRecording.RecordEventEndTimestamp & TimestampMaxValue) * + ZeTimerResolution; + + // Handle a possible wrap-around (the underlying HW counter is < 64-bit). + // Note, it will not report correct time if there were multiple wrap + // arounds, and the longer term plan is to enlarge the capacity of the + // HW timestamps. + if (ContextEndTime < Event->RecordEventStartTimestamp) + ContextEndTime += TimestampMaxValue * ZeTimerResolution; + + // Store it in the event. + Event->RecordEventEndTimestamp = ContextEndTime; + } + } + EndTimeRecordings.clear(); +} + ur_result_t urQueueReleaseInternal(ur_queue_handle_t Queue) { ur_queue_handle_t UrQueue = reinterpret_cast(Queue); @@ -1542,6 +1592,8 @@ ur_result_t urQueueReleaseInternal(ur_queue_handle_t Queue) { } } + Queue->clearEndTimeRecordings(); + logger::debug("urQueueRelease(compute) NumTimesClosedFull {}, " "NumTimesClosedEarly {}", UrQueue->ComputeCommandBatch.NumTimesClosedFull, @@ -1695,6 +1747,11 @@ ur_result_t ur_queue_handle_t_::synchronize() { LastCommandEvent = nullptr; } + // Since all timestamp recordings should have finished with the + // synchronizations, we can clear the map and write the results to the owning + // events. 
+ clearEndTimeRecordings(); + // With the entire queue synchronized, the active barriers must be done so we // can remove them. if (auto Res = ActiveBarriers.clear()) @@ -1732,6 +1789,58 @@ ur_event_handle_t ur_queue_handle_t_::getEventFromQueueCache(bool IsMultiDevice, return RetEvent; } +// This helper function checks to see if an event for a command can be included +// at the end of a command list batch. This will only be true if the event does +// not have dependencies or the dependencies are not for events which exist in +// this batch. +bool eventCanBeBatched(ur_queue_handle_t Queue, bool UseCopyEngine, + uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList) { + auto &CommandBatch = + UseCopyEngine ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch; + // First see if there is an command-list open for batching commands + // for this queue. + if (Queue->hasOpenCommandList(UseCopyEngine)) { + // If this command should be batched, but the command has a dependency on a + // command in the current batch, then the command needs to have an event + // to track its completion so this event cannot be batched to the end of the + // command list. + if (NumEventsInWaitList > 0) { + for (auto &Event : CommandBatch.OpenCommandList->second.EventList) { + for (uint32_t i = 0; i < NumEventsInWaitList; i++) { + if (Event == EventWaitList[i]) { + return false; + } + } + } + } + } + return true; +} + +// This helper function checks to see if a signal event at the end of a command +// should be set. If the Queue is out of order and the command has no +// dependencies, then this command can be enqueued without a signal event set in +// a command list batch. The signal event will be appended at the end of the +// batch to be signalled at the end of the command list. 
+ur_result_t setSignalEvent(ur_queue_handle_t Queue, bool UseCopyEngine, + ze_event_handle_t *ZeEvent, ur_event_handle_t *Event, + uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, + ze_command_queue_handle_t ZeQueue) { + if (!UrL0OutOfOrderIntegratedSignalEvent && Queue->Device->isIntegrated() && + eventCanBeBatched(Queue, UseCopyEngine, NumEventsInWaitList, + EventWaitList) && + !Queue->isInOrderQueue() && !Queue->UsingImmCmdLists) { + ZeEvent = nullptr; + (*Event)->IsInnerBatchedEvent = true; + (*Event)->ZeBatchedQueue = ZeQueue; + } else { + (*ZeEvent) = (*Event)->ZeEvent; + } + return UR_RESULT_SUCCESS; +} + // This helper function creates a ur_event_handle_t and associate a // ur_queue_handle_t. Note that the caller of this function must have acquired // lock on the Queue that is passed in. diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp index 5cb061be5f..799e90e9d9 100644 --- a/source/adapters/level_zero/queue.hpp +++ b/source/adapters/level_zero/queue.hpp @@ -486,6 +486,22 @@ struct ur_queue_handle_t_ : _ur_object { std::vector> EventCachesDeviceMap{2}; + // End-times enqueued are stored on the queue rather than on the event to + // avoid the event objects having been destroyed prior to the write to the + // end-time member. + struct end_time_recording { + // RecordEventEndTimestamp is not adjusted for valid bits nor resolution, as + // it is written asynchronously. + uint64_t RecordEventEndTimestamp = 0; + // The event may die before the recording has been written back. In this + // case the event will mark this for deletion when the queue sees fit. + bool EventHasDied = false; + }; + std::map EndTimeRecordings; + + // Clear the end time recording timestamps entries. + void clearEndTimeRecordings(); + // adjust the queue's batch size, knowing that the current command list // is being closed with a full batch. // For copy commands, IsCopy is set to 'true'. 
@@ -691,6 +707,25 @@ ur_result_t createEventAndAssociateQueue( ur_command_list_ptr_t CommandList, bool IsInternal, bool IsMultiDevice, std::optional HostVisible = std::nullopt); +// This helper function checks to see if an event for a command can be included +// at the end of a command list batch. This will only be true if the event does +// not have dependencies or the dependencies are not for events which exist in +// this batch. +bool eventCanBeBatched(ur_queue_handle_t Queue, bool UseCopyEngine, + uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList); + +// This helper function checks to see if a signal event at the end of a command +// should be set. If the Queue is out of order and the command has no +// dependencies, then this command can be enqueued without a signal event set in +// a command list batch. The signal event will be appended at the end of the +// batch to be signalled at the end of the command list. +ur_result_t setSignalEvent(ur_queue_handle_t Queue, bool UseCopyEngine, + ze_event_handle_t *ZeEvent, ur_event_handle_t *Event, + uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, + ze_command_queue_handle_t ZeQueue); + // Helper function to perform the necessary cleanup of the events from reset cmd // list. 
ur_result_t CleanupEventListFromResetCmdList( diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp index 051db73145..d6d6060ea6 100644 --- a/source/adapters/level_zero/ur_interface_loader.cpp +++ b/source/adapters/level_zero/ur_interface_loader.cpp @@ -456,6 +456,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/ur_level_zero.cpp b/source/adapters/level_zero/ur_level_zero.cpp index 60ad974b8e..9cecb3d427 100644 --- a/source/adapters/level_zero/ur_level_zero.cpp +++ b/source/adapters/level_zero/ur_level_zero.cpp @@ -1,18 +1,18 @@ -//===--------- ur_level_zero.cpp - Level Zero Adapter ---------------------===// -// -// Copyright (C) 2023 Intel Corporation -// -// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM -// Exceptions. See LICENSE.TXT -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include -#include -#include - -#include "ur_level_zero.hpp" - -// Define the static class field -std::mutex ZeCall::GlobalLock; +//===--------- ur_level_zero.cpp - Level Zero Adapter ---------------------===// +// +// Copyright (C) 2023 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include "ur_level_zero.hpp" + +// Define the static class field +std::mutex ZeCall::GlobalLock; diff --git a/source/adapters/level_zero/ur_level_zero.hpp b/source/adapters/level_zero/ur_level_zero.hpp index dd7bbf67b3..2b41db2d4b 100644 --- a/source/adapters/level_zero/ur_level_zero.hpp +++ b/source/adapters/level_zero/ur_level_zero.hpp @@ -1,38 +1,38 @@ -//===--------- ur_level_zero.hpp - Level Zero Adapter ---------------------===// -// -// Copyright (C) 2023 Intel Corporation -// -// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM -// Exceptions. See LICENSE.TXT -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "common.hpp" -#include "context.hpp" -#include "device.hpp" -#include "event.hpp" -#include "image.hpp" -#include "kernel.hpp" -#include "memory.hpp" -#include "physical_mem.hpp" -#include "platform.hpp" -#include "program.hpp" -#include "queue.hpp" -#include "sampler.hpp" -#include "usm.hpp" +//===--------- ur_level_zero.hpp - Level Zero Adapter ---------------------===// +// +// Copyright (C) 2023 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "common.hpp" +#include "context.hpp" +#include "device.hpp" +#include "event.hpp" +#include "image.hpp" +#include "kernel.hpp" +#include "memory.hpp" +#include "physical_mem.hpp" +#include "platform.hpp" +#include "program.hpp" +#include "queue.hpp" +#include "sampler.hpp" +#include "usm.hpp" diff --git a/source/adapters/level_zero/usm.cpp b/source/adapters/level_zero/usm.cpp index 76be0cd3c7..19d31bcb9b 100644 --- a/source/adapters/level_zero/usm.cpp +++ b/source/adapters/level_zero/usm.cpp @@ -171,7 +171,7 @@ static ur_result_t USMAllocationMakeResident( static ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context, ur_device_handle_t Device, - ur_usm_device_mem_flags_t *Flags, + ur_usm_device_mem_flags_t Flags, size_t Size, uint32_t Alignment) { std::ignore = Flags; // TODO: translate PI properties to Level Zero flags @@ -213,12 +213,10 @@ static ur_result_t USMDeviceAllocImpl(void **ResultPtr, return UR_RESULT_SUCCESS; } -static ur_result_t USMSharedAllocImpl(void **ResultPtr, - ur_context_handle_t Context, - ur_device_handle_t Device, - ur_usm_host_mem_flags_t *, - ur_usm_device_mem_flags_t *, size_t Size, - uint32_t Alignment) { +static ur_result_t +USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context, + ur_device_handle_t Device, ur_usm_host_mem_flags_t, + ur_usm_device_mem_flags_t, size_t Size, uint32_t Alignment) { // TODO: translate PI properties to Level Zero flags ZeStruct ZeHostDesc; @@ -263,7 +261,7 @@ static ur_result_t USMSharedAllocImpl(void **ResultPtr, static ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context, - ur_usm_host_mem_flags_t *Flags, size_t Size, + ur_usm_host_mem_flags_t 
Flags, size_t Size, uint32_t Alignment) { std::ignore = Flags; // TODO: translate PI properties to Level Zero flags @@ -308,8 +306,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc( uint32_t Align = USMDesc ? USMDesc->align : 0; // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. - // L0 spec says alignment values that are not powers of 2 are invalid. - if (Align > 65536 || Align && (Align - 1) != 0) + // L0 spec says that alignment values that are not powers of 2 are invalid. + if (Align > 65536 || (Align & (Align - 1)) != 0) return UR_RESULT_ERROR_INVALID_VALUE; ur_platform_handle_t Plt = Context->getPlatform(); @@ -379,7 +377,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. // L0 spec says that alignment values that are not powers of 2 are invalid. - if (Alignment > 65536 || Alignment & (Alignment - 1) != 0) + if (Alignment > 65536 || (Alignment & (Alignment - 1)) != 0) return UR_RESULT_ERROR_INVALID_VALUE; ur_platform_handle_t Plt = Device->Platform; @@ -480,7 +478,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( // L0 supports alignment up to 64KB and silently ignores higher values. // We flag alignment > 64KB as an invalid value. // L0 spec says that alignment values that are not powers of 2 are invalid. 
- if (Alignment > 65536 || Alignment && (Alignment - 1) != 0) + if (Alignment > 65536 || (Alignment & (Alignment - 1)) != 0) return UR_RESULT_ERROR_INVALID_VALUE; ur_platform_handle_t Plt = Device->Platform; @@ -767,8 +765,8 @@ umf_result_t L0MemoryProvider::get_min_page_size(void *Ptr, size_t *PageSize) { ur_result_t L0SharedMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMSharedAllocImpl(ResultPtr, Context, Device, nullptr, nullptr, Size, - Alignment); + return USMSharedAllocImpl(ResultPtr, Context, Device, /*host flags*/ 0, + /*device flags*/ 0, Size, Alignment); } ur_result_t L0SharedReadOnlyMemoryProvider::allocateImpl(void **ResultPtr, @@ -776,20 +774,19 @@ ur_result_t L0SharedReadOnlyMemoryProvider::allocateImpl(void **ResultPtr, uint32_t Alignment) { ur_usm_device_desc_t UsmDeviceDesc{}; UsmDeviceDesc.flags = UR_USM_DEVICE_MEM_FLAG_DEVICE_READ_ONLY; - ur_usm_host_desc_t UsmHostDesc{}; - return USMSharedAllocImpl(ResultPtr, Context, Device, &UsmDeviceDesc.flags, - &UsmHostDesc.flags, Size, Alignment); + return USMSharedAllocImpl(ResultPtr, Context, Device, UsmDeviceDesc.flags, + /*host flags*/ 0, Size, Alignment); } ur_result_t L0DeviceMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMDeviceAllocImpl(ResultPtr, Context, Device, nullptr, Size, + return USMDeviceAllocImpl(ResultPtr, Context, Device, /* flags */ 0, Size, Alignment); } ur_result_t L0HostMemoryProvider::allocateImpl(void **ResultPtr, size_t Size, uint32_t Alignment) { - return USMHostAllocImpl(ResultPtr, Context, nullptr, Size, Alignment); + return USMHostAllocImpl(ResultPtr, Context, /* flags */ 0, Size, Alignment); } ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context, diff --git a/source/adapters/native_cpu/device.cpp b/source/adapters/native_cpu/device.cpp index 1babdb0f10..aabe0a3f13 100644 --- a/source/adapters/native_cpu/device.cpp +++ b/source/adapters/native_cpu/device.cpp @@ 
-112,7 +112,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // '0x8086' : 'Intel HD graphics vendor ID' return ReturnValue(uint32_t{0x8086}); case UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE: - return ReturnValue(size_t{256}); + // TODO: provide a mechanism to estimate/configure this. + return ReturnValue(size_t{2048}); case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: // Imported from level_zero return ReturnValue(uint32_t{8}); @@ -151,7 +152,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES: { struct { size_t Arr[3]; - } MaxGroupSize = {{256, 256, 1}}; + } MaxGroupSize = {{256, 256, 256}}; return ReturnValue(MaxGroupSize); } case UR_DEVICE_INFO_PREFERRED_VECTOR_WIDTH_CHAR: @@ -321,6 +322,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: return ReturnValue(false); + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: + return ReturnValue(false); default: DIE_NO_IMPLEMENTATION; } diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp index 783f7f0ea9..7ab9883ca7 100644 --- a/source/adapters/native_cpu/enqueue.cpp +++ b/source/adapters/native_cpu/enqueue.cpp @@ -31,7 +31,7 @@ struct NDRDescT { for (uint32_t I = 0; I < WorkDim; I++) { GlobalOffset[I] = GlobalWorkOffset[I]; GlobalSize[I] = GlobalWorkSize[I]; - LocalSize[I] = LocalWorkSize[I]; + LocalSize[I] = LocalWorkSize ? 
LocalWorkSize[I] : 1; } for (uint32_t I = WorkDim; I < 3; I++) { GlobalSize[I] = 1; @@ -81,6 +81,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( DIE_NO_IMPLEMENTATION; } + // Check reqd_work_group_size + if (hKernel->hasReqdWGSize() && pLocalWorkSize != nullptr) { + const auto &Reqd = hKernel->getReqdWGSize(); + for (uint32_t Dim = 0; Dim < workDim; Dim++) { + if (pLocalWorkSize[Dim] != Reqd[Dim]) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + } + } + // TODO: add proper error checking // TODO: add proper event dep management native_cpu::NDRDescT ndr(workDim, pGlobalWorkOffset, pGlobalWorkSize, diff --git a/source/adapters/native_cpu/event.cpp b/source/adapters/native_cpu/event.cpp index 112bb553c0..9049e3c1b6 100644 --- a/source/adapters/native_cpu/event.cpp +++ b/source/adapters/native_cpu/event.cpp @@ -87,3 +87,15 @@ urEventSetCallback(ur_event_handle_t hEvent, ur_execution_info_t execStatus, DIE_NO_IMPLEMENTATION; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + std::ignore = hQueue; + std::ignore = blocking; + std::ignore = numEventsInWaitList; + std::ignore = phEventWaitList; + std::ignore = phEvent; + + DIE_NO_IMPLEMENTATION; +} diff --git a/source/adapters/native_cpu/image.cpp b/source/adapters/native_cpu/image.cpp index 01da9b7af8..6cb74fcbbe 100644 --- a/source/adapters/native_cpu/image.cpp +++ b/source/adapters/native_cpu/image.cpp @@ -59,7 +59,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( [[maybe_unused]] ur_exp_image_mem_handle_t hImageMem, [[maybe_unused]] const ur_image_format_t *pImageFormat, [[maybe_unused]] const ur_image_desc_t *pImageDesc, - [[maybe_unused]] ur_mem_handle_t *phMem, [[maybe_unused]] ur_exp_image_handle_t *phImage) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -71,7 +70,6 @@ UR_APIEXPORT 
ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( [[maybe_unused]] const ur_image_format_t *pImageFormat, [[maybe_unused]] const ur_image_desc_t *pImageDesc, [[maybe_unused]] ur_sampler_handle_t hSampler, - [[maybe_unused]] ur_mem_handle_t *phMem, [[maybe_unused]] ur_exp_image_handle_t *phImage) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/source/adapters/native_cpu/kernel.cpp b/source/adapters/native_cpu/kernel.cpp index 5a7a286adc..7ef17b0c28 100644 --- a/source/adapters/native_cpu/kernel.cpp +++ b/source/adapters/native_cpu/kernel.cpp @@ -28,7 +28,16 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName, auto f = reinterpret_cast( const_cast(kernelEntry->second)); - auto kernel = new ur_kernel_handle_t_(pKernelName, *f); + ur_kernel_handle_t_ *kernel; + + // Set reqd_work_group_size for kernel if needed + const auto &ReqdMap = hProgram->KernelReqdWorkGroupSizeMD; + auto ReqdIt = ReqdMap.find(pKernelName); + if (ReqdIt != ReqdMap.end()) { + kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f, ReqdIt->second); + } else { + kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f); + } *phKernel = kernel; @@ -84,13 +93,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel, // case UR_KERNEL_INFO_PROGRAM: // return ReturnValue(ur_program_handle_t{ Kernel->Program }); case UR_KERNEL_INFO_FUNCTION_NAME: - if (hKernel->_name) { - return ReturnValue(hKernel->_name); - } - return UR_RESULT_ERROR_INVALID_FUNCTION_NAME; - // case UR_KERNEL_INFO_NUM_ARGS: - // return ReturnValue(uint32_t{ Kernel->ZeKernelProperties->numKernelArgs - // }); + return ReturnValue(hKernel->_name); case UR_KERNEL_INFO_REFERENCE_COUNT: return ReturnValue(uint32_t{hKernel->getReferenceCount()}); case UR_KERNEL_INFO_ATTRIBUTES: @@ -121,8 +124,16 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice, return returnValue(max_threads); } case UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE: { - 
size_t group_size[3] = {1, 1, 1}; - return returnValue(group_size, 3); + size_t GroupSize[3] = {0, 0, 0}; + const auto &ReqdWGSizeMDMap = hKernel->hProgram->KernelReqdWorkGroupSizeMD; + const auto ReqdWGSizeMD = ReqdWGSizeMDMap.find(hKernel->_name); + if (ReqdWGSizeMD != ReqdWGSizeMDMap.end()) { + const auto ReqdWGSize = ReqdWGSizeMD->second; + GroupSize[0] = std::get<0>(ReqdWGSize); + GroupSize[1] = std::get<1>(ReqdWGSize); + GroupSize[2] = std::get<2>(ReqdWGSize); + } + return returnValue(GroupSize, 3); } case UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE: { int bytes = 0; diff --git a/source/adapters/native_cpu/kernel.hpp b/source/adapters/native_cpu/kernel.hpp index 9023a23fb2..b5728fa8b2 100644 --- a/source/adapters/native_cpu/kernel.hpp +++ b/source/adapters/native_cpu/kernel.hpp @@ -10,6 +10,8 @@ #include "common.hpp" #include "nativecpu_state.hpp" +#include "program.hpp" +#include #include #include @@ -37,13 +39,17 @@ struct local_arg_info_t { struct ur_kernel_handle_t_ : RefCounted { - ur_kernel_handle_t_(const char *name, nativecpu_task_t subhandler) - : _name{name}, _subhandler{std::move(subhandler)} {} + ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, + nativecpu_task_t subhandler) + : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)}, + HasReqdWGSize(false) {} ur_kernel_handle_t_(const ur_kernel_handle_t_ &other) - : _name(other._name), _subhandler(other._subhandler), _args(other._args), + : hProgram(other.hProgram), _name(other._name), + _subhandler(other._subhandler), _args(other._args), _localArgInfo(other._localArgInfo), _localMemPool(other._localMemPool), - _localMemPoolSize(other._localMemPoolSize) { + _localMemPoolSize(other._localMemPoolSize), + HasReqdWGSize(other.HasReqdWGSize), ReqdWGSize(other.ReqdWGSize) { incrementReferenceCount(); } @@ -52,13 +58,22 @@ struct ur_kernel_handle_t_ : RefCounted { free(_localMemPool); } } - - const char *_name; + ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name, 
+ nativecpu_task_t subhandler, + const native_cpu::ReqdWGSize_t &ReqdWGSize) + : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)}, + HasReqdWGSize(true), ReqdWGSize(ReqdWGSize) {} + + ur_program_handle_t hProgram; + std::string _name; nativecpu_task_t _subhandler; std::vector _args; std::vector _localArgInfo; - // To be called before enqueueing the kernel. + bool hasReqdWGSize() const { return HasReqdWGSize; } + + const native_cpu::ReqdWGSize_t &getReqdWGSize() const { return ReqdWGSize; } + void updateMemPool(size_t numParallelThreads) { // compute requested size. size_t reqSize = 0; @@ -88,4 +103,6 @@ struct ur_kernel_handle_t_ : RefCounted { private: char *_localMemPool = nullptr; size_t _localMemPoolSize = 0; + bool HasReqdWGSize; + native_cpu::ReqdWGSize_t ReqdWGSize; }; diff --git a/source/adapters/native_cpu/program.cpp b/source/adapters/native_cpu/program.cpp index f210f210c9..678bdb8074 100644 --- a/source/adapters/native_cpu/program.cpp +++ b/source/adapters/native_cpu/program.cpp @@ -11,7 +11,9 @@ #include "ur_api.h" #include "common.hpp" +#include "common/ur_util.hpp" #include "program.hpp" +#include UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, @@ -26,6 +28,29 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, DIE_NO_IMPLEMENTATION } +static ur_result_t getReqdWGSize(const ur_program_metadata_t &MetadataElement, + native_cpu::ReqdWGSize_t &res) { + size_t MDElemsSize = MetadataElement.size - sizeof(std::uint64_t); + + // Expect between 1 and 3 32-bit integer values. + UR_ASSERT(MDElemsSize == sizeof(std::uint32_t) || + MDElemsSize == sizeof(std::uint32_t) * 2 || + MDElemsSize == sizeof(std::uint32_t) * 3, + UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); + + // Get pointer to data, skipping 64-bit size at the start of the data. 
+ const char *ValuePtr = + reinterpret_cast(MetadataElement.value.pData) + + sizeof(std::uint64_t); + // Read values and pad with 1's for values not present. + std::uint32_t ReqdWorkGroupElements[] = {1, 1, 1}; + std::memcpy(ReqdWorkGroupElements, ValuePtr, MDElemsSize); + std::get<0>(res) = ReqdWorkGroupElements[0]; + std::get<1>(res) = ReqdWorkGroupElements[1]; + std::get<2>(res) = ReqdWorkGroupElements[2]; + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size, const uint8_t *pBinary, const ur_program_properties_t *pProperties, @@ -40,6 +65,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( auto hProgram = new ur_program_handle_t_( hContext, reinterpret_cast(pBinary)); + if (pProperties != nullptr) { + for (uint32_t i = 0; i < pProperties->count; i++) { + auto mdNode = pProperties->pMetadatas[i]; + std::string mdName(mdNode.pName); + auto [Prefix, Tag] = splitMetadataName(mdName); + if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) { + native_cpu::ReqdWGSize_t reqdWGSize; + auto res = getReqdWGSize(mdNode, reqdWGSize); + if (res != UR_RESULT_SUCCESS) { + return res; + } + hProgram->KernelReqdWorkGroupSizeMD[Prefix] = std::move(reqdWGSize); + } + } + } const nativecpu_entry *nativecpu_it = reinterpret_cast(pBinary); diff --git a/source/adapters/native_cpu/program.hpp b/source/adapters/native_cpu/program.hpp index 311eb40992..0673b4122c 100644 --- a/source/adapters/native_cpu/program.hpp +++ b/source/adapters/native_cpu/program.hpp @@ -15,6 +15,10 @@ #include "context.hpp" #include +namespace native_cpu { +using ReqdWGSize_t = std::array; +} + struct ur_program_handle_t_ : RefCounted { ur_program_handle_t_(ur_context_handle_t ctx, const unsigned char *pBinary) : _ctx{ctx}, _ptr{pBinary} {} @@ -30,6 +34,8 @@ struct ur_program_handle_t_ : RefCounted { }; std::map _kernels; + std::unordered_map + KernelReqdWorkGroupSizeMD; }; 
// The nativecpu_entry struct is also defined as LLVM-IR in the diff --git a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp index 1f54a98c80..065012613e 100644 --- a/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/source/adapters/native_cpu/ur_interface_loader.cpp @@ -392,6 +392,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( } pDdiTable->pfnCooperativeKernelLaunchExp = nullptr; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/null/ur_nullddi.cpp b/source/adapters/null/ur_nullddi.cpp index 37d0ceb15a..45dd6a59c5 100644 --- a/source/adapters/null/ur_nullddi.cpp +++ b/source/adapters/null/ur_nullddi.cpp @@ -4292,7 +4292,6 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( const ur_image_format_t *pImageFormat, ///< [in] pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description - ur_mem_handle_t *phMem, ///< [out] pointer to handle of image object created ur_exp_image_handle_t *phImage ///< [out] pointer to handle of image object created ) try { @@ -4303,12 +4302,9 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( d_context.urDdiTable.BindlessImagesExp.pfnUnsampledImageCreateExp; if (nullptr != pfnUnsampledImageCreateExp) { result = pfnUnsampledImageCreateExp(hContext, hDevice, hImageMem, - pImageFormat, pImageDesc, phMem, - phImage); + pImageFormat, pImageDesc, phImage); } else { // generic implementation - *phMem = reinterpret_cast(d_context.get()); - *phImage = reinterpret_cast(d_context.get()); } @@ -4328,7 +4324,6 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( *pImageFormat, ///< [in] pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description ur_sampler_handle_t hSampler, ///< [in] sampler to be used - 
ur_mem_handle_t *phMem, ///< [out] pointer to handle of image object created ur_exp_image_handle_t *phImage ///< [out] pointer to handle of image object created ) try { @@ -4340,11 +4335,9 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( if (nullptr != pfnSampledImageCreateExp) { result = pfnSampledImageCreateExp(hContext, hDevice, hImageMem, pImageFormat, - pImageDesc, hSampler, phMem, phImage); + pImageDesc, hSampler, phImage); } else { // generic implementation - *phMem = reinterpret_cast(d_context.get()); - *phImage = reinterpret_cast(d_context.get()); } @@ -5513,6 +5506,47 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueTimestampRecordingExp +__urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. 
+ ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. + ) try { + ur_result_t result = UR_RESULT_SUCCESS; + + // if the driver has created a custom function, then call it instead of using the generic path + auto pfnTimestampRecordingExp = + d_context.urDdiTable.EnqueueExp.pfnTimestampRecordingExp; + if (nullptr != pfnTimestampRecordingExp) { + result = pfnTimestampRecordingExp(hQueue, blocking, numEventsInWaitList, + phEventWaitList, phEvent); + } else { + // generic implementation + *phEvent = reinterpret_cast(d_context.get()); + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuildExp __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( @@ -6069,6 +6103,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = driver::urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = + driver::urEnqueueTimestampRecordingExp; + return result; } catch (...) 
{ return exceptionToResult(std::current_exception()); diff --git a/source/adapters/opencl/common.cpp b/source/adapters/opencl/common.cpp index 63981187e7..750616235d 100644 --- a/source/adapters/opencl/common.cpp +++ b/source/adapters/opencl/common.cpp @@ -85,6 +85,8 @@ ur_result_t mapCLErrorToUR(cl_int Result) { return UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS; case CL_DEVICE_NOT_AVAILABLE: return UR_RESULT_ERROR_DEVICE_NOT_AVAILABLE; + case CL_INVALID_KERNEL_ARGS: + return UR_RESULT_ERROR_INVALID_KERNEL_ARGS; default: return UR_RESULT_ERROR_UNKNOWN; } diff --git a/source/adapters/opencl/device.cpp b/source/adapters/opencl/device.cpp index d89a9492a5..ac79b71876 100644 --- a/source/adapters/opencl/device.cpp +++ b/source/adapters/opencl/device.cpp @@ -507,7 +507,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, cl_adapter::cast(hDevice), {"cl_khr_fp16"}, Supported)); if (!Supported) { - return UR_RESULT_ERROR_INVALID_ENUMERATION; + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } } @@ -799,6 +799,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT: { return ReturnValue(false); } + case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { + return ReturnValue(false); + } case UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED: { bool Supported = false; CL_RETURN_ON_FAILURE(cl_adapter::checkDeviceExtensions( diff --git a/source/adapters/opencl/event.cpp b/source/adapters/opencl/event.cpp index f5af30734c..5141ce8ff0 100644 --- a/source/adapters/opencl/event.cpp +++ b/source/adapters/opencl/event.cpp @@ -257,3 +257,9 @@ urEventSetCallback(ur_event_handle_t hEvent, ur_execution_info_t execStatus, CallbackType, ClCallback, Callback)); return UR_RESULT_SUCCESS; } + +UR_APIEXPORT ur_result_t UR_APICALL +urEnqueueTimestampRecordingExp(ur_queue_handle_t, bool, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} 
diff --git a/source/adapters/opencl/image.cpp b/source/adapters/opencl/image.cpp index f7cef40194..1f4887019b 100644 --- a/source/adapters/opencl/image.cpp +++ b/source/adapters/opencl/image.cpp @@ -59,7 +59,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( [[maybe_unused]] ur_exp_image_mem_handle_t hImageMem, [[maybe_unused]] const ur_image_format_t *pImageFormat, [[maybe_unused]] const ur_image_desc_t *pImageDesc, - [[maybe_unused]] ur_mem_handle_t *phMem, [[maybe_unused]] ur_exp_image_handle_t *phImage) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -71,7 +70,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( [[maybe_unused]] const ur_image_format_t *pImageFormat, [[maybe_unused]] const ur_image_desc_t *pImageDesc, [[maybe_unused]] ur_sampler_handle_t hSampler, - [[maybe_unused]] ur_mem_handle_t *phMem, [[maybe_unused]] ur_exp_image_handle_t *phImage) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp index 8c2c73d7c8..effb2128c3 100644 --- a/source/adapters/opencl/ur_interface_loader.cpp +++ b/source/adapters/opencl/ur_interface_loader.cpp @@ -398,6 +398,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = urEnqueueTimestampRecordingExp; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/opencl/usm.cpp b/source/adapters/opencl/usm.cpp index ea11bb8752..6661fd4fa1 100644 --- a/source/adapters/opencl/usm.cpp +++ b/source/adapters/opencl/usm.cpp @@ -68,8 +68,7 @@ usmDescToCLMemProperties(const ur_base_desc_t *Desc, return UR_RESULT_ERROR_INVALID_VALUE; } - Next = Next->pNext ? 
static_cast(Next->pNext) - : nullptr; + Next = static_cast(Next->pNext); } while (Next); if (AllocFlags) { diff --git a/source/common/logger/ur_logger.hpp b/source/common/logger/ur_logger.hpp index 9735d0d814..614d13a214 100644 --- a/source/common/logger/ur_logger.hpp +++ b/source/common/logger/ur_logger.hpp @@ -14,7 +14,8 @@ namespace logger { -Logger create_logger(std::string logger_name, bool skip_prefix = false); +Logger create_logger(std::string logger_name, bool skip_prefix = false, + bool skip_linebreak = false); inline Logger &get_logger(std::string name = "common") { static Logger logger = create_logger(std::move(name)); @@ -105,7 +106,8 @@ template inline std::string toHex(T t) { /// - flush level: error, meaning that only error messages are guaranteed /// to be printed immediately as they occur /// - output: stderr -inline Logger create_logger(std::string logger_name, bool skip_prefix) { +inline Logger create_logger(std::string logger_name, bool skip_prefix, + bool skip_linebreak) { std::transform(logger_name.begin(), logger_name.end(), logger_name.begin(), ::toupper); std::stringstream env_var_name; @@ -121,7 +123,7 @@ inline Logger create_logger(std::string logger_name, bool skip_prefix) { auto map = getenv_to_map(env_var_name.str().c_str()); if (!map.has_value()) { return Logger(std::make_unique( - std::move(logger_name), skip_prefix)); + std::move(logger_name), skip_prefix, skip_linebreak)); } auto kv = map->find("level"); @@ -150,19 +152,20 @@ inline Logger create_logger(std::string logger_name, bool skip_prefix) { << map->begin()->first << "'. Default logger options are set."; return Logger(std::make_unique( - std::move(logger_name), skip_prefix)); + std::move(logger_name), skip_prefix, skip_linebreak)); } - sink = - values.size() == 2 - ? sink_from_str(logger_name, values[0], values[1], skip_prefix) - : sink_from_str(logger_name, values[0], "", skip_prefix); + sink = values.size() == 2 + ? 
sink_from_str(logger_name, values[0], values[1], + skip_prefix, skip_linebreak) + : sink_from_str(logger_name, values[0], "", skip_prefix, + skip_linebreak); } catch (const std::invalid_argument &e) { std::cerr << "Error when creating a logger instance from the '" << env_var_name.str() << "' environment variable:\n" << e.what() << std::endl; return Logger(std::make_unique( - std::move(logger_name), skip_prefix)); + std::move(logger_name), skip_prefix, skip_linebreak)); } sink->setFlushLevel(flush_level); diff --git a/source/common/logger/ur_logger_details.hpp b/source/common/logger/ur_logger_details.hpp index 5324f4083f..f17d3b3f64 100644 --- a/source/common/logger/ur_logger_details.hpp +++ b/source/common/logger/ur_logger_details.hpp @@ -25,6 +25,7 @@ class Logger { Logger(logger::Level level, std::unique_ptr sink) : level(level), sink(std::move(sink)) {} + Logger &operator=(Logger &&) = default; ~Logger() = default; void setLevel(logger::Level level) { this->level = level; } diff --git a/source/common/logger/ur_sinks.hpp b/source/common/logger/ur_sinks.hpp index 97c87d3cb4..b2ebf72a7a 100644 --- a/source/common/logger/ur_sinks.hpp +++ b/source/common/logger/ur_sinks.hpp @@ -39,8 +39,10 @@ class Sink { std::ostream *ostream; logger::Level flush_level; - Sink(std::string logger_name, bool skip_prefix = false) - : logger_name(std::move(logger_name)), skip_prefix(skip_prefix) { + Sink(std::string logger_name, bool skip_prefix = false, + bool skip_linebreak = false) + : logger_name(std::move(logger_name)), skip_prefix(skip_prefix), + skip_linebreak(skip_linebreak) { ostream = nullptr; flush_level = logger::Level::ERR; } @@ -56,6 +58,7 @@ class Sink { private: std::string logger_name; bool skip_prefix; + bool skip_linebreak; std::mutex output_mutex; const char *error_prefix = "Log message syntax error: "; @@ -83,7 +86,9 @@ class Sink { } } } - buffer << "\n"; + if (!skip_linebreak) { + buffer << "\n"; + } } template @@ -129,14 +134,15 @@ class Sink { class StdoutSink : 
public Sink { public: - StdoutSink(std::string logger_name, bool skip_prefix = false) - : Sink(std::move(logger_name), skip_prefix) { + StdoutSink(std::string logger_name, bool skip_prefix = false, + bool skip_linebreak = false) + : Sink(std::move(logger_name), skip_prefix, skip_linebreak) { this->ostream = &std::cout; } StdoutSink(std::string logger_name, Level flush_lvl, - bool skip_prefix = false) - : StdoutSink(std::move(logger_name), skip_prefix) { + bool skip_prefix = false, bool skip_linebreak = false) + : StdoutSink(std::move(logger_name), skip_prefix, skip_linebreak) { this->flush_level = flush_lvl; } @@ -145,13 +151,15 @@ class StdoutSink : public Sink { class StderrSink : public Sink { public: - StderrSink(std::string logger_name, bool skip_prefix = false) - : Sink(std::move(logger_name), skip_prefix) { + StderrSink(std::string logger_name, bool skip_prefix = false, + bool skip_linebreak = false) + : Sink(std::move(logger_name), skip_prefix, skip_linebreak) { this->ostream = &std::cerr; } - StderrSink(std::string logger_name, Level flush_lvl, bool skip_prefix) - : StderrSink(std::move(logger_name), skip_prefix) { + StderrSink(std::string logger_name, Level flush_lvl, bool skip_prefix, + bool skip_linebreak) + : StderrSink(std::move(logger_name), skip_prefix, skip_linebreak) { this->flush_level = flush_lvl; } @@ -161,8 +169,8 @@ class StderrSink : public Sink { class FileSink : public Sink { public: FileSink(std::string logger_name, filesystem::path file_path, - bool skip_prefix = false) - : Sink(std::move(logger_name), skip_prefix) { + bool skip_prefix = false, bool skip_linebreak = false) + : Sink(std::move(logger_name), skip_prefix, skip_linebreak) { ofstream = std::ofstream(file_path); if (!ofstream.good()) { std::stringstream ss; @@ -174,8 +182,10 @@ class FileSink : public Sink { } FileSink(std::string logger_name, filesystem::path file_path, - Level flush_lvl, bool skip_prefix = false) - : FileSink(std::move(logger_name), std::move(file_path), 
skip_prefix) { + Level flush_lvl, bool skip_prefix = false, + bool skip_linebreak = false) + : FileSink(std::move(logger_name), std::move(file_path), skip_prefix, + skip_linebreak) { this->flush_level = flush_lvl; } @@ -188,14 +198,17 @@ class FileSink : public Sink { inline std::unique_ptr sink_from_str(std::string logger_name, std::string name, filesystem::path file_path = "", - bool skip_prefix = false) { + bool skip_prefix = false, + bool skip_linebreak = false) { if (name == "stdout" && file_path.empty()) { - return std::make_unique(logger_name, skip_prefix); + return std::make_unique(logger_name, skip_prefix, + skip_linebreak); } else if (name == "stderr" && file_path.empty()) { - return std::make_unique(logger_name, skip_prefix); + return std::make_unique(logger_name, skip_prefix, + skip_linebreak); } else if (name == "file" && !file_path.empty()) { return std::make_unique(logger_name, file_path, - skip_prefix); + skip_prefix, skip_linebreak); } throw std::invalid_argument( diff --git a/source/common/ur_util.hpp b/source/common/ur_util.hpp index 9cecdbec1e..c78aac1d37 100644 --- a/source/common/ur_util.hpp +++ b/source/common/ur_util.hpp @@ -294,7 +294,7 @@ template struct stype_map {}; // stype_map_impl {}; #include "stype_map_helpers.def" -template constexpr int as_stype() { return stype_map::value; }; +template constexpr int as_stype() { return stype_map::value; } /// Walk a generic UR linked list looking for a node of the given type. If it's /// found, its address is returned, othewise `nullptr`. e.g. 
to find out whether @@ -334,4 +334,13 @@ namespace ur { } } // namespace ur +inline std::pair +splitMetadataName(const std::string &metadataName) { + size_t splitPos = metadataName.rfind('@'); + if (splitPos == std::string::npos) { + return std::make_pair(metadataName, std::string{}); + } + return std::make_pair(metadataName.substr(0, splitPos), + metadataName.substr(splitPos, metadataName.length())); +} #endif /* UR_UTIL_H */ diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp index c55d752410..37630e5c29 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan_interceptor.cpp @@ -22,8 +22,6 @@ namespace ur_sanitizer_layer { namespace { -constexpr auto kSPIR_DeviceSanitizerReportMem = "__DeviceSanitizerReportMem"; - uptr MemToShadow_CPU(uptr USM_SHADOW_BASE, uptr UPtr) { return USM_SHADOW_BASE + (UPtr >> 3); } @@ -155,6 +153,35 @@ SanitizerInterceptor::SanitizerInterceptor() { cl_Debug = Value == "1" || Value == "true" ? 1 : 0; } + KV = Options->find("redzone"); + if (KV != Options->end()) { + auto Value = KV->second.front(); + try { + cl_MinRZSize = std::stoul(Value); + if (cl_MinRZSize < 16) { + cl_MinRZSize = 16; + context.logger.warning("Trying to set redzone size to a value " + "less than 16 is ignored"); + } + } catch (...) { + die("[ERROR]: \"redzone\" should be an integer"); + } + } + KV = Options->find("max_redzone"); + if (KV != Options->end()) { + auto Value = KV->second.front(); + try { + cl_MaxRZSize = std::stoul(Value); + if (cl_MaxRZSize > 2048) { + cl_MaxRZSize = 2048; + context.logger.warning("Trying to set max redzone size to a " + "value greater than 2048 is ignored"); + } + } catch (...) 
{ + die("[ERROR]: \"max_redzone\" should be an integer"); + } + } + KV = Options->find("quarantine_size_mb"); if (KV != Options->end()) { auto Value = KV->second.front(); @@ -213,7 +240,7 @@ ur_result_t SanitizerInterceptor::allocateMemory( Alignment = MinAlignment; } - uptr RZLog = ComputeRZLog(Size); + uptr RZLog = ComputeRZLog(Size, cl_MinRZSize, cl_MaxRZSize); uptr RZSize = RZLog2Size(RZLog); uptr RoundedSize = RoundUpTo(Size, Alignment); uptr NeededSize = RoundedSize + RZSize * 2; @@ -348,11 +375,14 @@ ur_result_t SanitizerInterceptor::releaseMemory(ur_context_handle_t Context, ur_result_t SanitizerInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - LaunchInfo &LaunchInfo) { + USMLaunchInfo &LaunchInfo) { auto Context = GetContext(Queue); auto Device = GetDevice(Queue); auto ContextInfo = getContextInfo(Context); auto DeviceInfo = getDeviceInfo(Device); + auto KernelInfo = getKernelInfo(Kernel); + + UR_CALL(LaunchInfo.updateKernelInfo(*KernelInfo.get())); ManagedQueue InternalQueue(Context, Device); if (!InternalQueue) { @@ -370,23 +400,12 @@ ur_result_t SanitizerInterceptor::preLaunchKernel(ur_kernel_handle_t Kernel, ur_result_t SanitizerInterceptor::postLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - ur_event_handle_t &Event, - LaunchInfo &LaunchInfo) { - auto Program = GetProgram(Kernel); - ur_event_handle_t ReadEvent{}; - - // If kernel has defined SPIR_DeviceSanitizerReportMem, then we try to read it - // to host, but it's okay that it isn't defined + USMLaunchInfo &LaunchInfo) { // FIXME: We must use block operation here, until we support urEventSetCallback - auto Result = context.urDdiTable.Enqueue.pfnDeviceGlobalVariableRead( - Queue, Program, kSPIR_DeviceSanitizerReportMem, true, - sizeof(LaunchInfo.SPIR_DeviceSanitizerReportMem), 0, - &LaunchInfo.SPIR_DeviceSanitizerReportMem, 1, &Event, &ReadEvent); + auto Result = context.urDdiTable.Queue.pfnFinish(Queue); if (Result == UR_RESULT_SUCCESS) { - Event 
= ReadEvent; - - const auto &AH = LaunchInfo.SPIR_DeviceSanitizerReportMem; + const auto &AH = LaunchInfo.Data->SanitizerReport; if (!AH.Flag) { return UR_RESULT_SUCCESS; } @@ -627,13 +646,44 @@ ur_result_t SanitizerInterceptor::eraseDevice(ur_device_handle_t Device) { return UR_RESULT_SUCCESS; } +ur_result_t SanitizerInterceptor::insertKernel(ur_kernel_handle_t Kernel) { + std::scoped_lock Guard(m_KernelMapMutex); + if (m_KernelMap.find(Kernel) != m_KernelMap.end()) { + return UR_RESULT_SUCCESS; + } + m_KernelMap.emplace(Kernel, std::make_shared(Kernel)); + return UR_RESULT_SUCCESS; +} + +ur_result_t SanitizerInterceptor::eraseKernel(ur_kernel_handle_t Kernel) { + std::scoped_lock Guard(m_KernelMapMutex); + assert(m_KernelMap.find(Kernel) != m_KernelMap.end()); + m_KernelMap.erase(Kernel); + return UR_RESULT_SUCCESS; +} + ur_result_t SanitizerInterceptor::prepareLaunch( ur_context_handle_t Context, std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, - LaunchInfo &LaunchInfo) { + USMLaunchInfo &LaunchInfo) { auto Program = GetProgram(Kernel); do { + // Set launch info argument + auto ArgNums = GetKernelNumArgs(Kernel); + if (ArgNums) { + context.logger.debug( + "launch_info {} (numLocalArgs={}, localArgs={})", + (void *)LaunchInfo.Data, LaunchInfo.Data->NumLocalArgs, + (void *)LaunchInfo.Data->LocalArgs); + ur_result_t URes = context.urDdiTable.Kernel.pfnSetArgPointer( + Kernel, ArgNums - 1, nullptr, &LaunchInfo.Data); + if (URes != UR_RESULT_SUCCESS) { + context.logger.error("Failed to set launch info: {}", URes); + return URes; + } + } + // Write global variable to program auto EnqueueWriteGlobal = [Queue, Program](const char *Name, const void *Value, @@ -723,15 +773,17 @@ ur_result_t SanitizerInterceptor::prepareLaunch( "LocalShadowMemorySize={})", NumWG, LocalMemorySize, LocalShadowMemorySize); - UR_CALL(EnqueueAllocateDevice(LocalShadowMemorySize, - LaunchInfo.LocalShadowOffset)); + UR_CALL(EnqueueAllocateDevice( + 
LocalShadowMemorySize, LaunchInfo.Data->LocalShadowOffset)); - LaunchInfo.LocalShadowOffsetEnd = - LaunchInfo.LocalShadowOffset + LocalShadowMemorySize - 1; + LaunchInfo.Data->LocalShadowOffsetEnd = + LaunchInfo.Data->LocalShadowOffset + LocalShadowMemorySize - + 1; - context.logger.info("ShadowMemory(Local, {} - {})", - (void *)LaunchInfo.LocalShadowOffset, - (void *)LaunchInfo.LocalShadowOffsetEnd); + context.logger.info( + "ShadowMemory(Local, {} - {})", + (void *)LaunchInfo.Data->LocalShadowOffset, + (void *)LaunchInfo.Data->LocalShadowOffsetEnd); } } } while (false); @@ -749,15 +801,61 @@ SanitizerInterceptor::findAllocInfoByAddress(uptr Address) { return --It; } -LaunchInfo::~LaunchInfo() { +ur_result_t USMLaunchInfo::initialize() { + UR_CALL(context.urDdiTable.Context.pfnRetain(Context)); + UR_CALL(context.urDdiTable.Device.pfnRetain(Device)); + UR_CALL(context.urDdiTable.USM.pfnSharedAlloc( + Context, Device, nullptr, nullptr, sizeof(LaunchInfo), (void **)&Data)); + *Data = LaunchInfo{}; + return UR_RESULT_SUCCESS; +} + +ur_result_t USMLaunchInfo::updateKernelInfo(const KernelInfo &KI) { + auto NumArgs = KI.LocalArgs.size(); + if (NumArgs) { + Data->NumLocalArgs = NumArgs; + UR_CALL(context.urDdiTable.USM.pfnSharedAlloc( + Context, Device, nullptr, nullptr, sizeof(LocalArgsInfo) * NumArgs, + (void **)&Data->LocalArgs)); + uint32_t i = 0; + for (auto [ArgIndex, ArgInfo] : KI.LocalArgs) { + Data->LocalArgs[i++] = ArgInfo; + context.logger.debug( + "local_args (argIndex={}, size={}, sizeWithRZ={})", ArgIndex, + ArgInfo.Size, ArgInfo.SizeWithRedZone); + } + } + return UR_RESULT_SUCCESS; +} + +USMLaunchInfo::~USMLaunchInfo() { [[maybe_unused]] ur_result_t Result; - if (LocalShadowOffset) { - Result = - context.urDdiTable.USM.pfnFree(Context, (void *)LocalShadowOffset); + if (Data) { + auto Type = GetDeviceType(Device); + if (Type == DeviceType::GPU_PVC) { + if (Data->PrivateShadowOffset) { + Result = context.urDdiTable.USM.pfnFree( + Context, (void 
*)Data->PrivateShadowOffset); + assert(Result == UR_RESULT_SUCCESS); + } + if (Data->LocalShadowOffset) { + Result = context.urDdiTable.USM.pfnFree( + Context, (void *)Data->LocalShadowOffset); + assert(Result == UR_RESULT_SUCCESS); + } + } + if (Data->LocalArgs) { + Result = context.urDdiTable.USM.pfnFree(Context, + (void *)Data->LocalArgs); + assert(Result == UR_RESULT_SUCCESS); + } + Result = context.urDdiTable.USM.pfnFree(Context, (void *)Data); assert(Result == UR_RESULT_SUCCESS); } Result = context.urDdiTable.Context.pfnRelease(Context); assert(Result == UR_RESULT_SUCCESS); + Result = context.urDdiTable.Device.pfnRelease(Device); + assert(Result == UR_RESULT_SUCCESS); } } // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan_interceptor.hpp index a691bee7b7..f02ca3b370 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan_interceptor.hpp @@ -79,6 +79,26 @@ struct QueueInfo { } }; +struct KernelInfo { + ur_kernel_handle_t Handle; + + ur_shared_mutex Mutex; + // Need preserve the order of local arguments + std::map LocalArgs; + + explicit KernelInfo(ur_kernel_handle_t Kernel) : Handle(Kernel) { + [[maybe_unused]] auto Result = + context.urDdiTable.Kernel.pfnRetain(Kernel); + assert(Result == UR_RESULT_SUCCESS); + } + + ~KernelInfo() { + [[maybe_unused]] auto Result = + context.urDdiTable.Kernel.pfnRelease(Handle); + assert(Result == UR_RESULT_SUCCESS); + } +}; + struct ContextInfo { ur_context_handle_t Handle; @@ -107,31 +127,30 @@ struct ContextInfo { } }; -struct LaunchInfo { - uptr LocalShadowOffset = 0; - uptr LocalShadowOffsetEnd = 0; - DeviceSanitizerReport SPIR_DeviceSanitizerReportMem; +struct USMLaunchInfo { + LaunchInfo *Data; ur_context_handle_t Context = nullptr; + ur_device_handle_t Device = nullptr; const size_t *GlobalWorkSize = nullptr; const size_t *GlobalWorkOffset = nullptr; std::vector LocalWorkSize; uint32_t 
WorkDim = 0; - LaunchInfo(ur_context_handle_t Context, const size_t *GlobalWorkSize, - const size_t *LocalWorkSize, const size_t *GlobalWorkOffset, - uint32_t WorkDim) - : Context(Context), GlobalWorkSize(GlobalWorkSize), + USMLaunchInfo(ur_context_handle_t Context, ur_device_handle_t Device, + const size_t *GlobalWorkSize, const size_t *LocalWorkSize, + const size_t *GlobalWorkOffset, uint32_t WorkDim) + : Context(Context), Device(Device), GlobalWorkSize(GlobalWorkSize), GlobalWorkOffset(GlobalWorkOffset), WorkDim(WorkDim) { - [[maybe_unused]] auto Result = - context.urDdiTable.Context.pfnRetain(Context); - assert(Result == UR_RESULT_SUCCESS); if (LocalWorkSize) { this->LocalWorkSize = std::vector(LocalWorkSize, LocalWorkSize + WorkDim); } } - ~LaunchInfo(); + ~USMLaunchInfo(); + + ur_result_t initialize(); + ur_result_t updateKernelInfo(const KernelInfo &KI); }; struct DeviceGlobalInfo { @@ -158,12 +177,11 @@ class SanitizerInterceptor { ur_result_t preLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - LaunchInfo &LaunchInfo); + USMLaunchInfo &LaunchInfo); ur_result_t postLaunchKernel(ur_kernel_handle_t Kernel, ur_queue_handle_t Queue, - ur_event_handle_t &Event, - LaunchInfo &LaunchInfo); + USMLaunchInfo &LaunchInfo); ur_result_t insertContext(ur_context_handle_t Context, std::shared_ptr &CI); @@ -173,6 +191,9 @@ class SanitizerInterceptor { std::shared_ptr &CI); ur_result_t eraseDevice(ur_device_handle_t Device); + ur_result_t insertKernel(ur_kernel_handle_t Kernel); + ur_result_t eraseKernel(ur_kernel_handle_t Kernel); + std::optional findAllocInfoByAddress(uptr Address); std::shared_ptr getContextInfo(ur_context_handle_t Context) { @@ -181,6 +202,18 @@ class SanitizerInterceptor { return m_ContextMap[Context]; } + std::shared_ptr getDeviceInfo(ur_device_handle_t Device) { + std::shared_lock Guard(m_DeviceMapMutex); + assert(m_DeviceMap.find(Device) != m_DeviceMap.end()); + return m_DeviceMap[Device]; + } + + std::shared_ptr 
getKernelInfo(ur_kernel_handle_t Kernel) { + std::shared_lock Guard(m_KernelMapMutex); + assert(m_KernelMap.find(Kernel) != m_KernelMap.end()); + return m_KernelMap[Kernel]; + } + private: ur_result_t updateShadowMemory(std::shared_ptr &ContextInfo, std::shared_ptr &DeviceInfo, @@ -195,32 +228,31 @@ class SanitizerInterceptor { std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, - LaunchInfo &LaunchInfo); + USMLaunchInfo &LaunchInfo); ur_result_t allocShadowMemory(ur_context_handle_t Context, std::shared_ptr &DeviceInfo); - std::shared_ptr getDeviceInfo(ur_device_handle_t Device) { - std::shared_lock Guard(m_DeviceMapMutex); - assert(m_DeviceMap.find(Device) != m_DeviceMap.end()); - return m_DeviceMap[Device]; - } - private: std::unordered_map> m_ContextMap; ur_shared_mutex m_ContextMapMutex; - std::unordered_map> m_DeviceMap; ur_shared_mutex m_DeviceMapMutex; + std::unordered_map> + m_KernelMap; + ur_shared_mutex m_KernelMapMutex; + /// Assumption: all USM chunks are allocated in one VA AllocationMap m_AllocationMap; ur_shared_mutex m_AllocationMapMutex; // We use "uint64_t" here because EnqueueWriteGlobal will fail when it's "uint32_t" uint64_t cl_Debug = 0; + uint64_t cl_MinRZSize = 16; + uint64_t cl_MaxRZSize = 2048; uint32_t cl_MaxQuarantineSizeMB = 0; bool cl_DetectLocals = true; diff --git a/source/loader/layers/sanitizer/asan_libdevice.hpp b/source/loader/layers/sanitizer/asan_libdevice.hpp index 46ddee4423..1c8ef24a9d 100644 --- a/source/loader/layers/sanitizer/asan_libdevice.hpp +++ b/source/loader/layers/sanitizer/asan_libdevice.hpp @@ -62,6 +62,23 @@ struct DeviceSanitizerReport { bool IsRecover = false; }; +struct LocalArgsInfo { + uint64_t Size = 0; + uint64_t SizeWithRedZone = 0; +}; + +struct LaunchInfo { + uintptr_t PrivateShadowOffset = + 0; // don't move this field, we use it in AddressSanitizerPass + + uintptr_t LocalShadowOffset = 0; + uintptr_t LocalShadowOffsetEnd = 0; + DeviceSanitizerReport SanitizerReport; + + 
uint32_t NumLocalArgs = 0; + LocalArgsInfo *LocalArgs = nullptr; // ordered by ArgIndex +}; + constexpr unsigned ASAN_SHADOW_SCALE = 3; constexpr unsigned ASAN_SHADOW_GRANULARITY = 1ULL << ASAN_SHADOW_SCALE; diff --git a/source/loader/layers/sanitizer/common.hpp b/source/loader/layers/sanitizer/common.hpp index d5612100aa..5ac39b49fe 100644 --- a/source/loader/layers/sanitizer/common.hpp +++ b/source/loader/layers/sanitizer/common.hpp @@ -16,6 +16,7 @@ #include "ur_ddi.h" #include +#include #include #include @@ -53,7 +54,17 @@ inline constexpr uptr RZLog2Size(uptr rz_log) { return 16 << rz_log; } -inline constexpr uptr ComputeRZLog(uptr user_requested_size) { +inline constexpr uptr RZSize2Log(uptr rz_size) { + assert(rz_size >= 16); + assert(rz_size <= 2048); + assert(IsPowerOfTwo(rz_size)); + uptr res = log2(rz_size) - 4; + assert(rz_size == RZLog2Size(res)); + return res; +} + +inline constexpr uptr ComputeRZLog(uptr user_requested_size, uptr min_size, + uptr max_size) { uptr rz_log = user_requested_size <= 64 - 16 ? 0 : user_requested_size <= 128 - 32 ? 1 : user_requested_size <= 512 - 64 ? 2 @@ -62,7 +73,44 @@ inline constexpr uptr ComputeRZLog(uptr user_requested_size) { : user_requested_size <= (1 << 15) - 512 ? 5 : user_requested_size <= (1 << 16) - 1024 ? 6 : 7; - return rz_log; + uptr min_log = RZSize2Log(min_size); + uptr max_log = RZSize2Log(max_size); + return std::min(std::max(rz_log, min_log), max_log); +} + +/// Returns the next integer (mod 2**64) that is greater than or equal to +/// \p Value and is a multiple of \p Align. \p Align must be non-zero. 
+/// +/// Examples: +/// \code +/// alignTo(5, 8) = 8 +/// alignTo(17, 8) = 24 +/// alignTo(~0LL, 8) = 0 +/// alignTo(321, 255) = 510 +/// \endcode +inline uint64_t AlignTo(uint64_t Value, uint64_t Align) { + assert(Align != 0u && "Align can't be 0."); + return (Value + Align - 1) / Align * Align; +} + +inline uint64_t GetSizeAndRedzoneSizeForLocal(uint64_t Size, + uint64_t Granularity, + uint64_t Alignment) { + uint64_t Res = 0; + if (Size <= 4) { + Res = 16; + } else if (Size <= 16) { + Res = 32; + } else if (Size <= 128) { + Res = Size + 32; + } else if (Size <= 512) { + Res = Size + 64; + } else if (Size <= 4096) { + Res = Size + 128; + } else { + Res = Size + 256; + } + return AlignTo(std::max(Res, 2 * Granularity), Alignment); } // ================================================================ diff --git a/source/loader/layers/sanitizer/ur_sanddi.cpp b/source/loader/layers/sanitizer/ur_sanddi.cpp index 53ce5d1c1e..58f54c9338 100644 --- a/source/loader/layers/sanitizer/ur_sanddi.cpp +++ b/source/loader/layers/sanitizer/ur_sanddi.cpp @@ -272,8 +272,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( context.logger.debug("==== urEnqueueKernelLaunch"); - LaunchInfo LaunchInfo(GetContext(hQueue), pGlobalWorkSize, pLocalWorkSize, - pGlobalWorkOffset, workDim); + USMLaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), + pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, + workDim); + UR_CALL(LaunchInfo.initialize()); UR_CALL(context.interceptor->preLaunchKernel(hKernel, hQueue, LaunchInfo)); @@ -283,8 +285,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( pLocalWorkSize, numEventsInWaitList, phEventWaitList, &hEvent); if (result == UR_RESULT_SUCCESS) { - UR_CALL(context.interceptor->postLaunchKernel(hKernel, hQueue, hEvent, - LaunchInfo)); + UR_CALL( + context.interceptor->postLaunchKernel(hKernel, hQueue, LaunchInfo)); } if (phEvent) { @@ -374,6 +376,90 @@ __urdlllocal ur_result_t UR_APICALL urContextRelease( return result; } 
+/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelCreate +__urdlllocal ur_result_t UR_APICALL urKernelCreate( + ur_program_handle_t hProgram, ///< [in] handle of the program instance + const char *pKernelName, ///< [in] pointer to null-terminated string. + ur_kernel_handle_t + *phKernel ///< [out] pointer to handle of kernel object created. +) { + auto pfnCreate = context.urDdiTable.Kernel.pfnCreate; + + if (nullptr == pfnCreate) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urKernelCreate"); + + UR_CALL(pfnCreate(hProgram, pKernelName, phKernel)); + UR_CALL(context.interceptor->insertKernel(*phKernel)); + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelRelease +__urdlllocal ur_result_t urKernelRelease( + ur_kernel_handle_t hKernel ///< [in] handle for the Kernel to release +) { + auto pfnRelease = context.urDdiTable.Kernel.pfnRelease; + + if (nullptr == pfnRelease) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urKernelRelease"); + UR_CALL(pfnRelease(hKernel)); + + if (auto KernelInfo = context.interceptor->getKernelInfo(hKernel)) { + uint32_t RefCount; + UR_CALL(context.urDdiTable.Kernel.pfnGetInfo( + hKernel, UR_KERNEL_INFO_REFERENCE_COUNT, sizeof(RefCount), + &RefCount, nullptr)); + if (RefCount == 1) { + UR_CALL(context.interceptor->eraseKernel(hKernel)); + } + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelSetArgLocal +__urdlllocal ur_result_t UR_APICALL urKernelSetArgLocal( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + uint32_t argIndex, ///< [in] argument index in range [0, num args - 1] + size_t + argSize, ///< [in] size of the local buffer to be allocated by the 
runtime + const ur_kernel_arg_local_properties_t + *pProperties ///< [in][optional] pointer to local buffer properties. +) { + auto pfnSetArgLocal = context.urDdiTable.Kernel.pfnSetArgLocal; + + if (nullptr == pfnSetArgLocal) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urKernelSetArgLocal (argIndex={}, argSize={})", + argIndex, argSize); + + { + auto KI = context.interceptor->getKernelInfo(hKernel); + std::scoped_lock Guard(KI->Mutex); + // TODO: get local variable alignment + auto argSizeWithRZ = GetSizeAndRedzoneSizeForLocal( + argSize, ASAN_SHADOW_GRANULARITY, ASAN_SHADOW_GRANULARITY); + KI->LocalArgs[argIndex] = LocalArgsInfo{argSize, argSizeWithRZ}; + argSize = argSizeWithRZ; + } + + ur_result_t result = + pfnSetArgLocal(hKernel, argIndex, argSize, pProperties); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Context table /// with current process' addresses @@ -470,6 +556,38 @@ __urdlllocal ur_result_t UR_APICALL urGetProgramExpProcAddrTable( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Exported function for filling application's Kernel table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +__urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_kernel_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + if (nullptr == pDdiTable) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (UR_MAJOR_VERSION(ur_sanitizer_layer::context.version) != + UR_MAJOR_VERSION(version) || + UR_MINOR_VERSION(ur_sanitizer_layer::context.version) > + UR_MINOR_VERSION(version)) { + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + + 
ur_result_t result = UR_RESULT_SUCCESS; + + pDdiTable->pfnCreate = ur_sanitizer_layer::urKernelCreate; + pDdiTable->pfnRelease = ur_sanitizer_layer::urKernelRelease; + pDdiTable->pfnSetArgLocal = ur_sanitizer_layer::urKernelSetArgLocal; + + return result; +} /////////////////////////////////////////////////////////////////////////////// /// @brief Exported function for filling application's Enqueue table /// with current process' addresses @@ -570,6 +688,11 @@ ur_result_t context_t::init(ur_dditable_t *dditable, UR_API_VERSION_CURRENT, &dditable->Context); } + if (UR_RESULT_SUCCESS == result) { + result = ur_sanitizer_layer::urGetKernelProcAddrTable( + UR_API_VERSION_CURRENT, &dditable->Kernel); + } + if (UR_RESULT_SUCCESS == result) { result = ur_sanitizer_layer::urGetProgramProcAddrTable( UR_API_VERSION_CURRENT, &dditable->Program); diff --git a/source/loader/layers/sanitizer/ur_sanitizer_layer.cpp b/source/loader/layers/sanitizer/ur_sanitizer_layer.cpp index 0df123b6c2..98d9ead12f 100644 --- a/source/loader/layers/sanitizer/ur_sanitizer_layer.cpp +++ b/source/loader/layers/sanitizer/ur_sanitizer_layer.cpp @@ -18,8 +18,8 @@ context_t context; /////////////////////////////////////////////////////////////////////////////// context_t::context_t() - : interceptor(std::make_unique()), - logger(logger::create_logger("sanitizer")) {} + : logger(logger::create_logger("sanitizer")), + interceptor(std::make_unique()) {} bool context_t::isAvailable() const { return true; } diff --git a/source/loader/layers/sanitizer/ur_sanitizer_layer.hpp b/source/loader/layers/sanitizer/ur_sanitizer_layer.hpp index 018d9f4a80..32f01103d5 100644 --- a/source/loader/layers/sanitizer/ur_sanitizer_layer.hpp +++ b/source/loader/layers/sanitizer/ur_sanitizer_layer.hpp @@ -32,8 +32,8 @@ enum class SanitizerType { class __urdlllocal context_t : public proxy_layer_context_t { public: ur_dditable_t urDdiTable = {}; - std::unique_ptr interceptor; logger::Logger logger; + std::unique_ptr interceptor; 
SanitizerType enabledType = SanitizerType::None; context_t(); diff --git a/source/loader/layers/tracing/ur_tracing_layer.cpp b/source/loader/layers/tracing/ur_tracing_layer.cpp index dd36b286f2..01243a670d 100644 --- a/source/loader/layers/tracing/ur_tracing_layer.cpp +++ b/source/loader/layers/tracing/ur_tracing_layer.cpp @@ -27,7 +27,7 @@ constexpr auto STREAM_VER_MINOR = UR_MINOR_VERSION(UR_API_VERSION_CURRENT); static thread_local xpti_td *activeEvent; /////////////////////////////////////////////////////////////////////////////// -context_t::context_t() { +context_t::context_t() : logger(logger::create_logger("tracing", true, true)) { xptiFrameworkInitialize(); call_stream_id = xptiRegisterStream(CALL_STREAM_NAME); @@ -37,7 +37,7 @@ context_t::context_t() { streamv.str().data()); } -bool context_t::isAvailable() const { return xptiTraceEnabled(); } +bool context_t::isAvailable() const { return true; } void context_t::notify(uint16_t trace_type, uint32_t id, const char *name, void *args, ur_result_t *resultp, uint64_t instance) { diff --git a/source/loader/layers/tracing/ur_tracing_layer.hpp b/source/loader/layers/tracing/ur_tracing_layer.hpp index ddda493c05..84a109fb4f 100644 --- a/source/loader/layers/tracing/ur_tracing_layer.hpp +++ b/source/loader/layers/tracing/ur_tracing_layer.hpp @@ -13,6 +13,7 @@ #ifndef UR_TRACING_LAYER_H #define UR_TRACING_LAYER_H 1 +#include "logger/ur_logger.hpp" #include "ur_ddi.h" #include "ur_proxy_layer.hpp" #include "ur_util.hpp" @@ -25,6 +26,7 @@ class __urdlllocal context_t : public proxy_layer_context_t { public: ur_dditable_t urDdiTable = {}; codeloc_data codelocData; + logger::Logger logger; context_t(); ~context_t(); diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 771b46c0c0..83987e5d90 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -40,11 +40,17 @@ __urdlllocal ur_result_t UR_APICALL urAdapterGet( 
uint64_t instance = context.notify_begin(UR_FUNCTION_ADAPTER_GET, "urAdapterGet", ¶ms); + context.logger.info("---> urAdapterGet"); + ur_result_t result = pfnAdapterGet(NumEntries, phAdapters, pNumAdapters); context.notify_end(UR_FUNCTION_ADAPTER_GET, "urAdapterGet", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ADAPTER_GET, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -63,11 +69,18 @@ __urdlllocal ur_result_t UR_APICALL urAdapterRelease( uint64_t instance = context.notify_begin(UR_FUNCTION_ADAPTER_RELEASE, "urAdapterRelease", ¶ms); + context.logger.info("---> urAdapterRelease"); + ur_result_t result = pfnAdapterRelease(hAdapter); context.notify_end(UR_FUNCTION_ADAPTER_RELEASE, "urAdapterRelease", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ADAPTER_RELEASE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -86,11 +99,18 @@ __urdlllocal ur_result_t UR_APICALL urAdapterRetain( uint64_t instance = context.notify_begin(UR_FUNCTION_ADAPTER_RETAIN, "urAdapterRetain", ¶ms); + context.logger.info("---> urAdapterRetain"); + ur_result_t result = pfnAdapterRetain(hAdapter); context.notify_end(UR_FUNCTION_ADAPTER_RETAIN, "urAdapterRetain", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ADAPTER_RETAIN, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -117,11 +137,18 @@ __urdlllocal ur_result_t UR_APICALL urAdapterGetLastError( uint64_t instance = context.notify_begin(UR_FUNCTION_ADAPTER_GET_LAST_ERROR, "urAdapterGetLastError", ¶ms); + context.logger.info("---> urAdapterGetLastError"); + ur_result_t result = pfnAdapterGetLastError(hAdapter, ppMessage, pError); context.notify_end(UR_FUNCTION_ADAPTER_GET_LAST_ERROR, "urAdapterGetLastError", ¶ms, 
&result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ADAPTER_GET_LAST_ERROR, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -151,12 +178,19 @@ __urdlllocal ur_result_t UR_APICALL urAdapterGetInfo( uint64_t instance = context.notify_begin(UR_FUNCTION_ADAPTER_GET_INFO, "urAdapterGetInfo", ¶ms); + context.logger.info("---> urAdapterGetInfo"); + ur_result_t result = pfnAdapterGetInfo(hAdapter, propName, propSize, pPropValue, pPropSizeRet); context.notify_end(UR_FUNCTION_ADAPTER_GET_INFO, "urAdapterGetInfo", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ADAPTER_GET_INFO, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -189,12 +223,19 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGet( uint64_t instance = context.notify_begin(UR_FUNCTION_PLATFORM_GET, "urPlatformGet", ¶ms); + context.logger.info("---> urPlatformGet"); + ur_result_t result = pfnGet(phAdapters, NumAdapters, NumEntries, phPlatforms, pNumPlatforms); context.notify_end(UR_FUNCTION_PLATFORM_GET, "urPlatformGet", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PLATFORM_GET, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -224,12 +265,19 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetInfo( uint64_t instance = context.notify_begin(UR_FUNCTION_PLATFORM_GET_INFO, "urPlatformGetInfo", ¶ms); + context.logger.info("---> urPlatformGetInfo"); + ur_result_t result = pfnGetInfo(hPlatform, propName, propSize, pPropValue, pPropSizeRet); context.notify_end(UR_FUNCTION_PLATFORM_GET_INFO, "urPlatformGetInfo", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PLATFORM_GET_INFO, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), 
result); + return result; } @@ -250,11 +298,18 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetApiVersion( context.notify_begin(UR_FUNCTION_PLATFORM_GET_API_VERSION, "urPlatformGetApiVersion", ¶ms); + context.logger.info("---> urPlatformGetApiVersion"); + ur_result_t result = pfnGetApiVersion(hPlatform, pVersion); context.notify_end(UR_FUNCTION_PLATFORM_GET_API_VERSION, "urPlatformGetApiVersion", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PLATFORM_GET_API_VERSION, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -277,11 +332,18 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetNativeHandle( context.notify_begin(UR_FUNCTION_PLATFORM_GET_NATIVE_HANDLE, "urPlatformGetNativeHandle", ¶ms); + context.logger.info("---> urPlatformGetNativeHandle"); + ur_result_t result = pfnGetNativeHandle(hPlatform, phNativePlatform); context.notify_end(UR_FUNCTION_PLATFORM_GET_NATIVE_HANDLE, "urPlatformGetNativeHandle", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PLATFORM_GET_NATIVE_HANDLE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -308,6 +370,8 @@ __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( context.notify_begin(UR_FUNCTION_PLATFORM_CREATE_WITH_NATIVE_HANDLE, "urPlatformCreateWithNativeHandle", ¶ms); + context.logger.info("---> urPlatformCreateWithNativeHandle"); + ur_result_t result = pfnCreateWithNativeHandle(hNativePlatform, pProperties, phPlatform); @@ -315,6 +379,11 @@ __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( "urPlatformCreateWithNativeHandle", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PLATFORM_CREATE_WITH_NATIVE_HANDLE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -340,6 
+409,8 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetBackendOption( context.notify_begin(UR_FUNCTION_PLATFORM_GET_BACKEND_OPTION, "urPlatformGetBackendOption", ¶ms); + context.logger.info("---> urPlatformGetBackendOption"); + ur_result_t result = pfnGetBackendOption(hPlatform, pFrontendOption, ppPlatformOption); @@ -347,6 +418,11 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetBackendOption( "urPlatformGetBackendOption", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PLATFORM_GET_BACKEND_OPTION, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -378,12 +454,18 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGet( uint64_t instance = context.notify_begin(UR_FUNCTION_DEVICE_GET, "urDeviceGet", ¶ms); + context.logger.info("---> urDeviceGet"); + ur_result_t result = pfnGet(hPlatform, DeviceType, NumEntries, phDevices, pNumDevices); context.notify_end(UR_FUNCTION_DEVICE_GET, "urDeviceGet", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_DEVICE_GET, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -414,12 +496,19 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo( uint64_t instance = context.notify_begin(UR_FUNCTION_DEVICE_GET_INFO, "urDeviceGetInfo", ¶ms); + context.logger.info("---> urDeviceGetInfo"); + ur_result_t result = pfnGetInfo(hDevice, propName, propSize, pPropValue, pPropSizeRet); context.notify_end(UR_FUNCTION_DEVICE_GET_INFO, "urDeviceGetInfo", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_DEVICE_GET_INFO, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -439,11 +528,18 @@ __urdlllocal ur_result_t UR_APICALL urDeviceRetain( uint64_t instance = context.notify_begin(UR_FUNCTION_DEVICE_RETAIN, "urDeviceRetain", ¶ms); + 
context.logger.info("---> urDeviceRetain"); + ur_result_t result = pfnRetain(hDevice); context.notify_end(UR_FUNCTION_DEVICE_RETAIN, "urDeviceRetain", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_DEVICE_RETAIN, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -462,11 +558,18 @@ __urdlllocal ur_result_t UR_APICALL urDeviceRelease( uint64_t instance = context.notify_begin(UR_FUNCTION_DEVICE_RELEASE, "urDeviceRelease", ¶ms); + context.logger.info("---> urDeviceRelease"); + ur_result_t result = pfnRelease(hDevice); context.notify_end(UR_FUNCTION_DEVICE_RELEASE, "urDeviceRelease", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_DEVICE_RELEASE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -496,12 +599,19 @@ __urdlllocal ur_result_t UR_APICALL urDevicePartition( uint64_t instance = context.notify_begin(UR_FUNCTION_DEVICE_PARTITION, "urDevicePartition", ¶ms); + context.logger.info("---> urDevicePartition"); + ur_result_t result = pfnPartition(hDevice, pProperties, NumDevices, phSubDevices, pNumDevicesRet); context.notify_end(UR_FUNCTION_DEVICE_PARTITION, "urDevicePartition", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_DEVICE_PARTITION, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -530,12 +640,19 @@ __urdlllocal ur_result_t UR_APICALL urDeviceSelectBinary( uint64_t instance = context.notify_begin(UR_FUNCTION_DEVICE_SELECT_BINARY, "urDeviceSelectBinary", ¶ms); + context.logger.info("---> urDeviceSelectBinary"); + ur_result_t result = pfnSelectBinary(hDevice, pBinaries, NumBinaries, pSelectedBinary); context.notify_end(UR_FUNCTION_DEVICE_SELECT_BINARY, "urDeviceSelectBinary", ¶ms, &result, instance); + std::ostringstream args_str; + 
ur::extras::printFunctionParams(args_str, UR_FUNCTION_DEVICE_SELECT_BINARY, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -557,11 +674,18 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetNativeHandle( context.notify_begin(UR_FUNCTION_DEVICE_GET_NATIVE_HANDLE, "urDeviceGetNativeHandle", ¶ms); + context.logger.info("---> urDeviceGetNativeHandle"); + ur_result_t result = pfnGetNativeHandle(hDevice, phNativeDevice); context.notify_end(UR_FUNCTION_DEVICE_GET_NATIVE_HANDLE, "urDeviceGetNativeHandle", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_DEVICE_GET_NATIVE_HANDLE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -589,6 +713,8 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( context.notify_begin(UR_FUNCTION_DEVICE_CREATE_WITH_NATIVE_HANDLE, "urDeviceCreateWithNativeHandle", ¶ms); + context.logger.info("---> urDeviceCreateWithNativeHandle"); + ur_result_t result = pfnCreateWithNativeHandle(hNativeDevice, hPlatform, pProperties, phDevice); @@ -596,6 +722,11 @@ __urdlllocal ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( "urDeviceCreateWithNativeHandle", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_DEVICE_CREATE_WITH_NATIVE_HANDLE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -623,6 +754,8 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetGlobalTimestamps( context.notify_begin(UR_FUNCTION_DEVICE_GET_GLOBAL_TIMESTAMPS, "urDeviceGetGlobalTimestamps", ¶ms); + context.logger.info("---> urDeviceGetGlobalTimestamps"); + ur_result_t result = pfnGetGlobalTimestamps(hDevice, pDeviceTimestamp, pHostTimestamp); @@ -630,6 +763,11 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetGlobalTimestamps( "urDeviceGetGlobalTimestamps", ¶ms, &result, instance); + std::ostringstream 
args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_DEVICE_GET_GLOBAL_TIMESTAMPS, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -655,12 +793,19 @@ __urdlllocal ur_result_t UR_APICALL urContextCreate( uint64_t instance = context.notify_begin(UR_FUNCTION_CONTEXT_CREATE, "urContextCreate", ¶ms); + context.logger.info("---> urContextCreate"); + ur_result_t result = pfnCreate(DeviceCount, phDevices, pProperties, phContext); context.notify_end(UR_FUNCTION_CONTEXT_CREATE, "urContextCreate", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_CONTEXT_CREATE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -680,11 +825,18 @@ __urdlllocal ur_result_t UR_APICALL urContextRetain( uint64_t instance = context.notify_begin(UR_FUNCTION_CONTEXT_RETAIN, "urContextRetain", ¶ms); + context.logger.info("---> urContextRetain"); + ur_result_t result = pfnRetain(hContext); context.notify_end(UR_FUNCTION_CONTEXT_RETAIN, "urContextRetain", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_CONTEXT_RETAIN, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -703,11 +855,18 @@ __urdlllocal ur_result_t UR_APICALL urContextRelease( uint64_t instance = context.notify_begin(UR_FUNCTION_CONTEXT_RELEASE, "urContextRelease", ¶ms); + context.logger.info("---> urContextRelease"); + ur_result_t result = pfnRelease(hContext); context.notify_end(UR_FUNCTION_CONTEXT_RELEASE, "urContextRelease", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_CONTEXT_RELEASE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -739,12 +898,19 @@ __urdlllocal ur_result_t UR_APICALL urContextGetInfo( uint64_t instance = 
context.notify_begin(UR_FUNCTION_CONTEXT_GET_INFO, "urContextGetInfo", ¶ms); + context.logger.info("---> urContextGetInfo"); + ur_result_t result = pfnGetInfo(hContext, propName, propSize, pPropValue, pPropSizeRet); context.notify_end(UR_FUNCTION_CONTEXT_GET_INFO, "urContextGetInfo", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_CONTEXT_GET_INFO, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -767,11 +933,18 @@ __urdlllocal ur_result_t UR_APICALL urContextGetNativeHandle( context.notify_begin(UR_FUNCTION_CONTEXT_GET_NATIVE_HANDLE, "urContextGetNativeHandle", ¶ms); + context.logger.info("---> urContextGetNativeHandle"); + ur_result_t result = pfnGetNativeHandle(hContext, phNativeContext); context.notify_end(UR_FUNCTION_CONTEXT_GET_NATIVE_HANDLE, "urContextGetNativeHandle", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_CONTEXT_GET_NATIVE_HANDLE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -801,6 +974,8 @@ __urdlllocal ur_result_t UR_APICALL urContextCreateWithNativeHandle( context.notify_begin(UR_FUNCTION_CONTEXT_CREATE_WITH_NATIVE_HANDLE, "urContextCreateWithNativeHandle", ¶ms); + context.logger.info("---> urContextCreateWithNativeHandle"); + ur_result_t result = pfnCreateWithNativeHandle( hNativeContext, numDevices, phDevices, pProperties, phContext); @@ -808,6 +983,11 @@ __urdlllocal ur_result_t UR_APICALL urContextCreateWithNativeHandle( "urContextCreateWithNativeHandle", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_CONTEXT_CREATE_WITH_NATIVE_HANDLE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -833,12 +1013,19 @@ __urdlllocal ur_result_t UR_APICALL urContextSetExtendedDeleter( 
context.notify_begin(UR_FUNCTION_CONTEXT_SET_EXTENDED_DELETER, "urContextSetExtendedDeleter", ¶ms); + context.logger.info("---> urContextSetExtendedDeleter"); + ur_result_t result = pfnSetExtendedDeleter(hContext, pfnDeleter, pUserData); context.notify_end(UR_FUNCTION_CONTEXT_SET_EXTENDED_DELETER, "urContextSetExtendedDeleter", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_CONTEXT_SET_EXTENDED_DELETER, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -864,12 +1051,19 @@ __urdlllocal ur_result_t UR_APICALL urMemImageCreate( uint64_t instance = context.notify_begin(UR_FUNCTION_MEM_IMAGE_CREATE, "urMemImageCreate", ¶ms); + context.logger.info("---> urMemImageCreate"); + ur_result_t result = pfnImageCreate(hContext, flags, pImageFormat, pImageDesc, pHost, phMem); context.notify_end(UR_FUNCTION_MEM_IMAGE_CREATE, "urMemImageCreate", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_IMAGE_CREATE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -895,12 +1089,19 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferCreate( uint64_t instance = context.notify_begin(UR_FUNCTION_MEM_BUFFER_CREATE, "urMemBufferCreate", ¶ms); + context.logger.info("---> urMemBufferCreate"); + ur_result_t result = pfnBufferCreate(hContext, flags, size, pProperties, phBuffer); context.notify_end(UR_FUNCTION_MEM_BUFFER_CREATE, "urMemBufferCreate", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_BUFFER_CREATE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -919,11 +1120,17 @@ __urdlllocal ur_result_t UR_APICALL urMemRetain( uint64_t instance = context.notify_begin(UR_FUNCTION_MEM_RETAIN, "urMemRetain", ¶ms); + context.logger.info("---> urMemRetain"); + ur_result_t result 
= pfnRetain(hMem); context.notify_end(UR_FUNCTION_MEM_RETAIN, "urMemRetain", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_RETAIN, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -942,11 +1149,17 @@ __urdlllocal ur_result_t UR_APICALL urMemRelease( uint64_t instance = context.notify_begin(UR_FUNCTION_MEM_RELEASE, "urMemRelease", ¶ms); + context.logger.info("---> urMemRelease"); + ur_result_t result = pfnRelease(hMem); context.notify_end(UR_FUNCTION_MEM_RELEASE, "urMemRelease", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_RELEASE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -973,12 +1186,19 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferPartition( uint64_t instance = context.notify_begin(UR_FUNCTION_MEM_BUFFER_PARTITION, "urMemBufferPartition", ¶ms); + context.logger.info("---> urMemBufferPartition"); + ur_result_t result = pfnBufferPartition(hBuffer, flags, bufferCreateType, pRegion, phMem); context.notify_end(UR_FUNCTION_MEM_BUFFER_PARTITION, "urMemBufferPartition", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_BUFFER_PARTITION, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1001,11 +1221,18 @@ __urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle( uint64_t instance = context.notify_begin(UR_FUNCTION_MEM_GET_NATIVE_HANDLE, "urMemGetNativeHandle", ¶ms); + context.logger.info("---> urMemGetNativeHandle"); + ur_result_t result = pfnGetNativeHandle(hMem, hDevice, phNativeMem); context.notify_end(UR_FUNCTION_MEM_GET_NATIVE_HANDLE, "urMemGetNativeHandle", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_GET_NATIVE_HANDLE, + ¶ms); + 
context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1033,6 +1260,8 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle( context.notify_begin(UR_FUNCTION_MEM_BUFFER_CREATE_WITH_NATIVE_HANDLE, "urMemBufferCreateWithNativeHandle", ¶ms); + context.logger.info("---> urMemBufferCreateWithNativeHandle"); + ur_result_t result = pfnBufferCreateWithNativeHandle(hNativeMem, hContext, pProperties, phMem); @@ -1040,6 +1269,11 @@ __urdlllocal ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle( "urMemBufferCreateWithNativeHandle", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_MEM_BUFFER_CREATE_WITH_NATIVE_HANDLE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1071,6 +1305,8 @@ __urdlllocal ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( context.notify_begin(UR_FUNCTION_MEM_IMAGE_CREATE_WITH_NATIVE_HANDLE, "urMemImageCreateWithNativeHandle", ¶ms); + context.logger.info("---> urMemImageCreateWithNativeHandle"); + ur_result_t result = pfnImageCreateWithNativeHandle( hNativeMem, hContext, pImageFormat, pImageDesc, pProperties, phMem); @@ -1078,6 +1314,11 @@ __urdlllocal ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( "urMemImageCreateWithNativeHandle", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_MEM_IMAGE_CREATE_WITH_NATIVE_HANDLE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1109,12 +1350,19 @@ __urdlllocal ur_result_t UR_APICALL urMemGetInfo( uint64_t instance = context.notify_begin(UR_FUNCTION_MEM_GET_INFO, "urMemGetInfo", ¶ms); + context.logger.info("---> urMemGetInfo"); + ur_result_t result = pfnGetInfo(hMemory, propName, propSize, pPropValue, pPropSizeRet); context.notify_end(UR_FUNCTION_MEM_GET_INFO, "urMemGetInfo", ¶ms, &result, instance); + std::ostringstream args_str; 
+ ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_GET_INFO, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1145,12 +1393,19 @@ __urdlllocal ur_result_t UR_APICALL urMemImageGetInfo( uint64_t instance = context.notify_begin(UR_FUNCTION_MEM_IMAGE_GET_INFO, "urMemImageGetInfo", ¶ms); + context.logger.info("---> urMemImageGetInfo"); + ur_result_t result = pfnImageGetInfo(hMemory, propName, propSize, pPropValue, pPropSizeRet); context.notify_end(UR_FUNCTION_MEM_IMAGE_GET_INFO, "urMemImageGetInfo", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_MEM_IMAGE_GET_INFO, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1172,11 +1427,18 @@ __urdlllocal ur_result_t UR_APICALL urSamplerCreate( uint64_t instance = context.notify_begin(UR_FUNCTION_SAMPLER_CREATE, "urSamplerCreate", ¶ms); + context.logger.info("---> urSamplerCreate"); + ur_result_t result = pfnCreate(hContext, pDesc, phSampler); context.notify_end(UR_FUNCTION_SAMPLER_CREATE, "urSamplerCreate", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_SAMPLER_CREATE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1196,11 +1458,18 @@ __urdlllocal ur_result_t UR_APICALL urSamplerRetain( uint64_t instance = context.notify_begin(UR_FUNCTION_SAMPLER_RETAIN, "urSamplerRetain", ¶ms); + context.logger.info("---> urSamplerRetain"); + ur_result_t result = pfnRetain(hSampler); context.notify_end(UR_FUNCTION_SAMPLER_RETAIN, "urSamplerRetain", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_SAMPLER_RETAIN, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1220,11 +1489,18 @@ __urdlllocal ur_result_t UR_APICALL urSamplerRelease( uint64_t instance = 
context.notify_begin(UR_FUNCTION_SAMPLER_RELEASE, "urSamplerRelease", ¶ms); + context.logger.info("---> urSamplerRelease"); + ur_result_t result = pfnRelease(hSampler); context.notify_end(UR_FUNCTION_SAMPLER_RELEASE, "urSamplerRelease", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_SAMPLER_RELEASE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1252,12 +1528,19 @@ __urdlllocal ur_result_t UR_APICALL urSamplerGetInfo( uint64_t instance = context.notify_begin(UR_FUNCTION_SAMPLER_GET_INFO, "urSamplerGetInfo", ¶ms); + context.logger.info("---> urSamplerGetInfo"); + ur_result_t result = pfnGetInfo(hSampler, propName, propSize, pPropValue, pPropSizeRet); context.notify_end(UR_FUNCTION_SAMPLER_GET_INFO, "urSamplerGetInfo", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_SAMPLER_GET_INFO, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1280,11 +1563,18 @@ __urdlllocal ur_result_t UR_APICALL urSamplerGetNativeHandle( context.notify_begin(UR_FUNCTION_SAMPLER_GET_NATIVE_HANDLE, "urSamplerGetNativeHandle", ¶ms); + context.logger.info("---> urSamplerGetNativeHandle"); + ur_result_t result = pfnGetNativeHandle(hSampler, phNativeSampler); context.notify_end(UR_FUNCTION_SAMPLER_GET_NATIVE_HANDLE, "urSamplerGetNativeHandle", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_SAMPLER_GET_NATIVE_HANDLE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1312,6 +1602,8 @@ __urdlllocal ur_result_t UR_APICALL urSamplerCreateWithNativeHandle( context.notify_begin(UR_FUNCTION_SAMPLER_CREATE_WITH_NATIVE_HANDLE, "urSamplerCreateWithNativeHandle", ¶ms); + context.logger.info("---> urSamplerCreateWithNativeHandle"); + ur_result_t result = 
pfnCreateWithNativeHandle(hNativeSampler, hContext, pProperties, phSampler); @@ -1319,6 +1611,11 @@ __urdlllocal ur_result_t UR_APICALL urSamplerCreateWithNativeHandle( "urSamplerCreateWithNativeHandle", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_SAMPLER_CREATE_WITH_NATIVE_HANDLE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1345,11 +1642,18 @@ __urdlllocal ur_result_t UR_APICALL urUSMHostAlloc( uint64_t instance = context.notify_begin(UR_FUNCTION_USM_HOST_ALLOC, "urUSMHostAlloc", ¶ms); + context.logger.info("---> urUSMHostAlloc"); + ur_result_t result = pfnHostAlloc(hContext, pUSMDesc, pool, size, ppMem); context.notify_end(UR_FUNCTION_USM_HOST_ALLOC, "urUSMHostAlloc", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_HOST_ALLOC, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1377,12 +1681,19 @@ __urdlllocal ur_result_t UR_APICALL urUSMDeviceAlloc( uint64_t instance = context.notify_begin(UR_FUNCTION_USM_DEVICE_ALLOC, "urUSMDeviceAlloc", ¶ms); + context.logger.info("---> urUSMDeviceAlloc"); + ur_result_t result = pfnDeviceAlloc(hContext, hDevice, pUSMDesc, pool, size, ppMem); context.notify_end(UR_FUNCTION_USM_DEVICE_ALLOC, "urUSMDeviceAlloc", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_DEVICE_ALLOC, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1410,12 +1721,19 @@ __urdlllocal ur_result_t UR_APICALL urUSMSharedAlloc( uint64_t instance = context.notify_begin(UR_FUNCTION_USM_SHARED_ALLOC, "urUSMSharedAlloc", ¶ms); + context.logger.info("---> urUSMSharedAlloc"); + ur_result_t result = pfnSharedAlloc(hContext, hDevice, pUSMDesc, pool, size, ppMem); context.notify_end(UR_FUNCTION_USM_SHARED_ALLOC, "urUSMSharedAlloc", 
¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_SHARED_ALLOC, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1435,11 +1753,17 @@ __urdlllocal ur_result_t UR_APICALL urUSMFree( uint64_t instance = context.notify_begin(UR_FUNCTION_USM_FREE, "urUSMFree", ¶ms); + context.logger.info("---> urUSMFree"); + ur_result_t result = pfnFree(hContext, pMem); context.notify_end(UR_FUNCTION_USM_FREE, "urUSMFree", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_FREE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1469,12 +1793,19 @@ __urdlllocal ur_result_t UR_APICALL urUSMGetMemAllocInfo( uint64_t instance = context.notify_begin(UR_FUNCTION_USM_GET_MEM_ALLOC_INFO, "urUSMGetMemAllocInfo", ¶ms); + context.logger.info("---> urUSMGetMemAllocInfo"); + ur_result_t result = pfnGetMemAllocInfo(hContext, pMem, propName, propSize, pPropValue, pPropSizeRet); context.notify_end(UR_FUNCTION_USM_GET_MEM_ALLOC_INFO, "urUSMGetMemAllocInfo", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_USM_GET_MEM_ALLOC_INFO, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1497,11 +1828,18 @@ __urdlllocal ur_result_t UR_APICALL urUSMPoolCreate( uint64_t instance = context.notify_begin(UR_FUNCTION_USM_POOL_CREATE, "urUSMPoolCreate", ¶ms); + context.logger.info("---> urUSMPoolCreate"); + ur_result_t result = pfnPoolCreate(hContext, pPoolDesc, ppPool); context.notify_end(UR_FUNCTION_USM_POOL_CREATE, "urUSMPoolCreate", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_POOL_CREATE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1520,11 +1858,18 @@ __urdlllocal 
ur_result_t UR_APICALL urUSMPoolRetain( uint64_t instance = context.notify_begin(UR_FUNCTION_USM_POOL_RETAIN, "urUSMPoolRetain", ¶ms); + context.logger.info("---> urUSMPoolRetain"); + ur_result_t result = pfnPoolRetain(pPool); context.notify_end(UR_FUNCTION_USM_POOL_RETAIN, "urUSMPoolRetain", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_POOL_RETAIN, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1543,11 +1888,18 @@ __urdlllocal ur_result_t UR_APICALL urUSMPoolRelease( uint64_t instance = context.notify_begin(UR_FUNCTION_USM_POOL_RELEASE, "urUSMPoolRelease", ¶ms); + context.logger.info("---> urUSMPoolRelease"); + ur_result_t result = pfnPoolRelease(pPool); context.notify_end(UR_FUNCTION_USM_POOL_RELEASE, "urUSMPoolRelease", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_POOL_RELEASE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1574,12 +1926,19 @@ __urdlllocal ur_result_t UR_APICALL urUSMPoolGetInfo( uint64_t instance = context.notify_begin(UR_FUNCTION_USM_POOL_GET_INFO, "urUSMPoolGetInfo", ¶ms); + context.logger.info("---> urUSMPoolGetInfo"); + ur_result_t result = pfnPoolGetInfo(hPool, propName, propSize, pPropValue, pPropSizeRet); context.notify_end(UR_FUNCTION_USM_POOL_GET_INFO, "urUSMPoolGetInfo", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_POOL_GET_INFO, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1615,6 +1974,8 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( context.notify_begin(UR_FUNCTION_VIRTUAL_MEM_GRANULARITY_GET_INFO, "urVirtualMemGranularityGetInfo", ¶ms); + context.logger.info("---> urVirtualMemGranularityGetInfo"); + ur_result_t result = pfnGranularityGetInfo( hContext, 
hDevice, propName, propSize, pPropValue, pPropSizeRet); @@ -1622,6 +1983,11 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGranularityGetInfo( "urVirtualMemGranularityGetInfo", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_VIRTUAL_MEM_GRANULARITY_GET_INFO, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1650,11 +2016,18 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemReserve( uint64_t instance = context.notify_begin(UR_FUNCTION_VIRTUAL_MEM_RESERVE, "urVirtualMemReserve", ¶ms); + context.logger.info("---> urVirtualMemReserve"); + ur_result_t result = pfnReserve(hContext, pStart, size, ppStart); context.notify_end(UR_FUNCTION_VIRTUAL_MEM_RESERVE, "urVirtualMemReserve", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_VIRTUAL_MEM_RESERVE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1676,11 +2049,18 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemFree( uint64_t instance = context.notify_begin(UR_FUNCTION_VIRTUAL_MEM_FREE, "urVirtualMemFree", ¶ms); + context.logger.info("---> urVirtualMemFree"); + ur_result_t result = pfnFree(hContext, pStart, size); context.notify_end(UR_FUNCTION_VIRTUAL_MEM_FREE, "urVirtualMemFree", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_VIRTUAL_MEM_FREE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1709,12 +2089,19 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemMap( uint64_t instance = context.notify_begin(UR_FUNCTION_VIRTUAL_MEM_MAP, "urVirtualMemMap", ¶ms); + context.logger.info("---> urVirtualMemMap"); + ur_result_t result = pfnMap(hContext, pStart, size, hPhysicalMem, offset, flags); context.notify_end(UR_FUNCTION_VIRTUAL_MEM_MAP, "urVirtualMemMap", ¶ms, &result, instance); + 
std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_VIRTUAL_MEM_MAP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1736,11 +2123,18 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemUnmap( uint64_t instance = context.notify_begin(UR_FUNCTION_VIRTUAL_MEM_UNMAP, "urVirtualMemUnmap", ¶ms); + context.logger.info("---> urVirtualMemUnmap"); + ur_result_t result = pfnUnmap(hContext, pStart, size); context.notify_end(UR_FUNCTION_VIRTUAL_MEM_UNMAP, "urVirtualMemUnmap", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_VIRTUAL_MEM_UNMAP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1765,11 +2159,18 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemSetAccess( uint64_t instance = context.notify_begin(UR_FUNCTION_VIRTUAL_MEM_SET_ACCESS, "urVirtualMemSetAccess", ¶ms); + context.logger.info("---> urVirtualMemSetAccess"); + ur_result_t result = pfnSetAccess(hContext, pStart, size, flags); context.notify_end(UR_FUNCTION_VIRTUAL_MEM_SET_ACCESS, "urVirtualMemSetAccess", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_VIRTUAL_MEM_SET_ACCESS, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1803,12 +2204,19 @@ __urdlllocal ur_result_t UR_APICALL urVirtualMemGetInfo( uint64_t instance = context.notify_begin(UR_FUNCTION_VIRTUAL_MEM_GET_INFO, "urVirtualMemGetInfo", ¶ms); + context.logger.info("---> urVirtualMemGetInfo"); + ur_result_t result = pfnGetInfo(hContext, pStart, size, propName, propSize, pPropValue, pPropSizeRet); context.notify_end(UR_FUNCTION_VIRTUAL_MEM_GET_INFO, "urVirtualMemGetInfo", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_VIRTUAL_MEM_GET_INFO, + ¶ms); + context.logger.info("({}) -> {};\n", 
args_str.str(), result); + return result; } @@ -1836,12 +2244,19 @@ __urdlllocal ur_result_t UR_APICALL urPhysicalMemCreate( uint64_t instance = context.notify_begin(UR_FUNCTION_PHYSICAL_MEM_CREATE, "urPhysicalMemCreate", ¶ms); + context.logger.info("---> urPhysicalMemCreate"); + ur_result_t result = pfnCreate(hContext, hDevice, size, pProperties, phPhysicalMem); context.notify_end(UR_FUNCTION_PHYSICAL_MEM_CREATE, "urPhysicalMemCreate", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PHYSICAL_MEM_CREATE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1861,11 +2276,18 @@ __urdlllocal ur_result_t UR_APICALL urPhysicalMemRetain( uint64_t instance = context.notify_begin(UR_FUNCTION_PHYSICAL_MEM_RETAIN, "urPhysicalMemRetain", ¶ms); + context.logger.info("---> urPhysicalMemRetain"); + ur_result_t result = pfnRetain(hPhysicalMem); context.notify_end(UR_FUNCTION_PHYSICAL_MEM_RETAIN, "urPhysicalMemRetain", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PHYSICAL_MEM_RETAIN, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1885,11 +2307,18 @@ __urdlllocal ur_result_t UR_APICALL urPhysicalMemRelease( uint64_t instance = context.notify_begin(UR_FUNCTION_PHYSICAL_MEM_RELEASE, "urPhysicalMemRelease", ¶ms); + context.logger.info("---> urPhysicalMemRelease"); + ur_result_t result = pfnRelease(hPhysicalMem); context.notify_end(UR_FUNCTION_PHYSICAL_MEM_RELEASE, "urPhysicalMemRelease", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PHYSICAL_MEM_RELEASE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1915,12 +2344,19 @@ __urdlllocal ur_result_t UR_APICALL urProgramCreateWithIL( uint64_t instance = context.notify_begin(UR_FUNCTION_PROGRAM_CREATE_WITH_IL, 
"urProgramCreateWithIL", ¶ms); + context.logger.info("---> urProgramCreateWithIL"); + ur_result_t result = pfnCreateWithIL(hContext, pIL, length, pProperties, phProgram); context.notify_end(UR_FUNCTION_PROGRAM_CREATE_WITH_IL, "urProgramCreateWithIL", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PROGRAM_CREATE_WITH_IL, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1949,12 +2385,19 @@ __urdlllocal ur_result_t UR_APICALL urProgramCreateWithBinary( context.notify_begin(UR_FUNCTION_PROGRAM_CREATE_WITH_BINARY, "urProgramCreateWithBinary", ¶ms); + context.logger.info("---> urProgramCreateWithBinary"); + ur_result_t result = pfnCreateWithBinary(hContext, hDevice, size, pBinary, pProperties, phProgram); context.notify_end(UR_FUNCTION_PROGRAM_CREATE_WITH_BINARY, "urProgramCreateWithBinary", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PROGRAM_CREATE_WITH_BINARY, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -1976,11 +2419,18 @@ __urdlllocal ur_result_t UR_APICALL urProgramBuild( uint64_t instance = context.notify_begin(UR_FUNCTION_PROGRAM_BUILD, "urProgramBuild", ¶ms); + context.logger.info("---> urProgramBuild"); + ur_result_t result = pfnBuild(hContext, hProgram, pOptions); context.notify_end(UR_FUNCTION_PROGRAM_BUILD, "urProgramBuild", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_BUILD, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2003,11 +2453,18 @@ __urdlllocal ur_result_t UR_APICALL urProgramCompile( uint64_t instance = context.notify_begin(UR_FUNCTION_PROGRAM_COMPILE, "urProgramCompile", ¶ms); + context.logger.info("---> urProgramCompile"); + ur_result_t result = pfnCompile(hContext, hProgram, pOptions); 
context.notify_end(UR_FUNCTION_PROGRAM_COMPILE, "urProgramCompile", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_COMPILE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2034,12 +2491,19 @@ __urdlllocal ur_result_t UR_APICALL urProgramLink( uint64_t instance = context.notify_begin(UR_FUNCTION_PROGRAM_LINK, "urProgramLink", ¶ms); + context.logger.info("---> urProgramLink"); + ur_result_t result = pfnLink(hContext, count, phPrograms, pOptions, phProgram); context.notify_end(UR_FUNCTION_PROGRAM_LINK, "urProgramLink", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_LINK, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2058,11 +2522,18 @@ __urdlllocal ur_result_t UR_APICALL urProgramRetain( uint64_t instance = context.notify_begin(UR_FUNCTION_PROGRAM_RETAIN, "urProgramRetain", ¶ms); + context.logger.info("---> urProgramRetain"); + ur_result_t result = pfnRetain(hProgram); context.notify_end(UR_FUNCTION_PROGRAM_RETAIN, "urProgramRetain", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_RETAIN, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2081,11 +2552,18 @@ __urdlllocal ur_result_t UR_APICALL urProgramRelease( uint64_t instance = context.notify_begin(UR_FUNCTION_PROGRAM_RELEASE, "urProgramRelease", ¶ms); + context.logger.info("---> urProgramRelease"); + ur_result_t result = pfnRelease(hProgram); context.notify_end(UR_FUNCTION_PROGRAM_RELEASE, "urProgramRelease", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_RELEASE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2116,6 +2594,8 @@ __urdlllocal 
ur_result_t UR_APICALL urProgramGetFunctionPointer( context.notify_begin(UR_FUNCTION_PROGRAM_GET_FUNCTION_POINTER, "urProgramGetFunctionPointer", ¶ms); + context.logger.info("---> urProgramGetFunctionPointer"); + ur_result_t result = pfnGetFunctionPointer(hDevice, hProgram, pFunctionName, ppFunctionPointer); @@ -2123,6 +2603,11 @@ __urdlllocal ur_result_t UR_APICALL urProgramGetFunctionPointer( "urProgramGetFunctionPointer", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PROGRAM_GET_FUNCTION_POINTER, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2155,6 +2640,8 @@ __urdlllocal ur_result_t UR_APICALL urProgramGetGlobalVariablePointer( context.notify_begin(UR_FUNCTION_PROGRAM_GET_GLOBAL_VARIABLE_POINTER, "urProgramGetGlobalVariablePointer", ¶ms); + context.logger.info("---> urProgramGetGlobalVariablePointer"); + ur_result_t result = pfnGetGlobalVariablePointer( hDevice, hProgram, pGlobalVariableName, pGlobalVariableSizeRet, ppGlobalVariablePointerRet); @@ -2163,6 +2650,11 @@ __urdlllocal ur_result_t UR_APICALL urProgramGetGlobalVariablePointer( "urProgramGetGlobalVariablePointer", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PROGRAM_GET_GLOBAL_VARIABLE_POINTER, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2193,12 +2685,19 @@ __urdlllocal ur_result_t UR_APICALL urProgramGetInfo( uint64_t instance = context.notify_begin(UR_FUNCTION_PROGRAM_GET_INFO, "urProgramGetInfo", ¶ms); + context.logger.info("---> urProgramGetInfo"); + ur_result_t result = pfnGetInfo(hProgram, propName, propSize, pPropValue, pPropSizeRet); context.notify_end(UR_FUNCTION_PROGRAM_GET_INFO, "urProgramGetInfo", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_GET_INFO, + ¶ms); + context.logger.info("({}) 
-> {};\n", args_str.str(), result); + return result; } @@ -2231,12 +2730,19 @@ __urdlllocal ur_result_t UR_APICALL urProgramGetBuildInfo( uint64_t instance = context.notify_begin(UR_FUNCTION_PROGRAM_GET_BUILD_INFO, "urProgramGetBuildInfo", ¶ms); + context.logger.info("---> urProgramGetBuildInfo"); + ur_result_t result = pfnGetBuildInfo(hProgram, hDevice, propName, propSize, pPropValue, pPropSizeRet); context.notify_end(UR_FUNCTION_PROGRAM_GET_BUILD_INFO, "urProgramGetBuildInfo", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PROGRAM_GET_BUILD_INFO, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2262,6 +2768,8 @@ __urdlllocal ur_result_t UR_APICALL urProgramSetSpecializationConstants( context.notify_begin(UR_FUNCTION_PROGRAM_SET_SPECIALIZATION_CONSTANTS, "urProgramSetSpecializationConstants", ¶ms); + context.logger.info("---> urProgramSetSpecializationConstants"); + ur_result_t result = pfnSetSpecializationConstants(hProgram, count, pSpecConstants); @@ -2269,6 +2777,11 @@ __urdlllocal ur_result_t UR_APICALL urProgramSetSpecializationConstants( "urProgramSetSpecializationConstants", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PROGRAM_SET_SPECIALIZATION_CONSTANTS, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2291,11 +2804,18 @@ __urdlllocal ur_result_t UR_APICALL urProgramGetNativeHandle( context.notify_begin(UR_FUNCTION_PROGRAM_GET_NATIVE_HANDLE, "urProgramGetNativeHandle", ¶ms); + context.logger.info("---> urProgramGetNativeHandle"); + ur_result_t result = pfnGetNativeHandle(hProgram, phNativeProgram); context.notify_end(UR_FUNCTION_PROGRAM_GET_NATIVE_HANDLE, "urProgramGetNativeHandle", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PROGRAM_GET_NATIVE_HANDLE, ¶ms); + 
context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2323,6 +2843,8 @@ __urdlllocal ur_result_t UR_APICALL urProgramCreateWithNativeHandle( context.notify_begin(UR_FUNCTION_PROGRAM_CREATE_WITH_NATIVE_HANDLE, "urProgramCreateWithNativeHandle", ¶ms); + context.logger.info("---> urProgramCreateWithNativeHandle"); + ur_result_t result = pfnCreateWithNativeHandle(hNativeProgram, hContext, pProperties, phProgram); @@ -2330,6 +2852,11 @@ __urdlllocal ur_result_t UR_APICALL urProgramCreateWithNativeHandle( "urProgramCreateWithNativeHandle", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_PROGRAM_CREATE_WITH_NATIVE_HANDLE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2351,11 +2878,18 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreate( uint64_t instance = context.notify_begin(UR_FUNCTION_KERNEL_CREATE, "urKernelCreate", ¶ms); + context.logger.info("---> urKernelCreate"); + ur_result_t result = pfnCreate(hProgram, pKernelName, phKernel); context.notify_end(UR_FUNCTION_KERNEL_CREATE, "urKernelCreate", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_CREATE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2381,12 +2915,19 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgValue( uint64_t instance = context.notify_begin(UR_FUNCTION_KERNEL_SET_ARG_VALUE, "urKernelSetArgValue", ¶ms); + context.logger.info("---> urKernelSetArgValue"); + ur_result_t result = pfnSetArgValue(hKernel, argIndex, argSize, pProperties, pArgValue); context.notify_end(UR_FUNCTION_KERNEL_SET_ARG_VALUE, "urKernelSetArgValue", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_SET_ARG_VALUE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; 
} @@ -2411,12 +2952,19 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgLocal( uint64_t instance = context.notify_begin(UR_FUNCTION_KERNEL_SET_ARG_LOCAL, "urKernelSetArgLocal", ¶ms); + context.logger.info("---> urKernelSetArgLocal"); + ur_result_t result = pfnSetArgLocal(hKernel, argIndex, argSize, pProperties); context.notify_end(UR_FUNCTION_KERNEL_SET_ARG_LOCAL, "urKernelSetArgLocal", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_SET_ARG_LOCAL, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2448,12 +2996,19 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetInfo( uint64_t instance = context.notify_begin(UR_FUNCTION_KERNEL_GET_INFO, "urKernelGetInfo", ¶ms); + context.logger.info("---> urKernelGetInfo"); + ur_result_t result = pfnGetInfo(hKernel, propName, propSize, pPropValue, pPropSizeRet); context.notify_end(UR_FUNCTION_KERNEL_GET_INFO, "urKernelGetInfo", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_GET_INFO, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2483,12 +3038,19 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetGroupInfo( uint64_t instance = context.notify_begin(UR_FUNCTION_KERNEL_GET_GROUP_INFO, "urKernelGetGroupInfo", ¶ms); + context.logger.info("---> urKernelGetGroupInfo"); + ur_result_t result = pfnGetGroupInfo(hKernel, hDevice, propName, propSize, pPropValue, pPropSizeRet); context.notify_end(UR_FUNCTION_KERNEL_GET_GROUP_INFO, "urKernelGetGroupInfo", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_GET_GROUP_INFO, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2519,12 +3081,19 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetSubGroupInfo( 
context.notify_begin(UR_FUNCTION_KERNEL_GET_SUB_GROUP_INFO, "urKernelGetSubGroupInfo", ¶ms); + context.logger.info("---> urKernelGetSubGroupInfo"); + ur_result_t result = pfnGetSubGroupInfo(hKernel, hDevice, propName, propSize, pPropValue, pPropSizeRet); context.notify_end(UR_FUNCTION_KERNEL_GET_SUB_GROUP_INFO, "urKernelGetSubGroupInfo", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_GET_SUB_GROUP_INFO, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2543,11 +3112,18 @@ __urdlllocal ur_result_t UR_APICALL urKernelRetain( uint64_t instance = context.notify_begin(UR_FUNCTION_KERNEL_RETAIN, "urKernelRetain", ¶ms); + context.logger.info("---> urKernelRetain"); + ur_result_t result = pfnRetain(hKernel); context.notify_end(UR_FUNCTION_KERNEL_RETAIN, "urKernelRetain", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_RETAIN, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2566,11 +3142,18 @@ __urdlllocal ur_result_t UR_APICALL urKernelRelease( uint64_t instance = context.notify_begin(UR_FUNCTION_KERNEL_RELEASE, "urKernelRelease", ¶ms); + context.logger.info("---> urKernelRelease"); + ur_result_t result = pfnRelease(hKernel); context.notify_end(UR_FUNCTION_KERNEL_RELEASE, "urKernelRelease", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_RELEASE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2596,12 +3179,19 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgPointer( uint64_t instance = context.notify_begin(UR_FUNCTION_KERNEL_SET_ARG_POINTER, "urKernelSetArgPointer", ¶ms); + context.logger.info("---> urKernelSetArgPointer"); + ur_result_t result = pfnSetArgPointer(hKernel, argIndex, pProperties, pArgValue); 
context.notify_end(UR_FUNCTION_KERNEL_SET_ARG_POINTER, "urKernelSetArgPointer", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_SET_ARG_POINTER, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2628,12 +3218,19 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetExecInfo( uint64_t instance = context.notify_begin(UR_FUNCTION_KERNEL_SET_EXEC_INFO, "urKernelSetExecInfo", ¶ms); + context.logger.info("---> urKernelSetExecInfo"); + ur_result_t result = pfnSetExecInfo(hKernel, propName, propSize, pProperties, pPropValue); context.notify_end(UR_FUNCTION_KERNEL_SET_EXEC_INFO, "urKernelSetExecInfo", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_KERNEL_SET_EXEC_INFO, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2657,12 +3254,19 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgSampler( uint64_t instance = context.notify_begin(UR_FUNCTION_KERNEL_SET_ARG_SAMPLER, "urKernelSetArgSampler", ¶ms); + context.logger.info("---> urKernelSetArgSampler"); + ur_result_t result = pfnSetArgSampler(hKernel, argIndex, pProperties, hArgValue); context.notify_end(UR_FUNCTION_KERNEL_SET_ARG_SAMPLER, "urKernelSetArgSampler", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_SET_ARG_SAMPLER, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2686,12 +3290,19 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgMemObj( uint64_t instance = context.notify_begin(UR_FUNCTION_KERNEL_SET_ARG_MEM_OBJ, "urKernelSetArgMemObj", ¶ms); + context.logger.info("---> urKernelSetArgMemObj"); + ur_result_t result = pfnSetArgMemObj(hKernel, argIndex, pProperties, hArgValue); context.notify_end(UR_FUNCTION_KERNEL_SET_ARG_MEM_OBJ, "urKernelSetArgMemObj", ¶ms, &result, 
instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_SET_ARG_MEM_OBJ, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2716,6 +3327,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetSpecializationConstants( context.notify_begin(UR_FUNCTION_KERNEL_SET_SPECIALIZATION_CONSTANTS, "urKernelSetSpecializationConstants", ¶ms); + context.logger.info("---> urKernelSetSpecializationConstants"); + ur_result_t result = pfnSetSpecializationConstants(hKernel, count, pSpecConstants); @@ -2723,6 +3336,11 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetSpecializationConstants( "urKernelSetSpecializationConstants", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_SET_SPECIALIZATION_CONSTANTS, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2744,11 +3362,18 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetNativeHandle( context.notify_begin(UR_FUNCTION_KERNEL_GET_NATIVE_HANDLE, "urKernelGetNativeHandle", ¶ms); + context.logger.info("---> urKernelGetNativeHandle"); + ur_result_t result = pfnGetNativeHandle(hKernel, phNativeKernel); context.notify_end(UR_FUNCTION_KERNEL_GET_NATIVE_HANDLE, "urKernelGetNativeHandle", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_GET_NATIVE_HANDLE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2778,6 +3403,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreateWithNativeHandle( context.notify_begin(UR_FUNCTION_KERNEL_CREATE_WITH_NATIVE_HANDLE, "urKernelCreateWithNativeHandle", ¶ms); + context.logger.info("---> urKernelCreateWithNativeHandle"); + ur_result_t result = pfnCreateWithNativeHandle( hNativeKernel, hContext, hProgram, pProperties, phKernel); @@ -2785,6 +3412,11 @@ __urdlllocal ur_result_t UR_APICALL 
urKernelCreateWithNativeHandle( "urKernelCreateWithNativeHandle", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_CREATE_WITH_NATIVE_HANDLE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2812,12 +3444,19 @@ __urdlllocal ur_result_t UR_APICALL urQueueGetInfo( uint64_t instance = context.notify_begin(UR_FUNCTION_QUEUE_GET_INFO, "urQueueGetInfo", ¶ms); + context.logger.info("---> urQueueGetInfo"); + ur_result_t result = pfnGetInfo(hQueue, propName, propSize, pPropValue, pPropSizeRet); context.notify_end(UR_FUNCTION_QUEUE_GET_INFO, "urQueueGetInfo", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_QUEUE_GET_INFO, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2842,11 +3481,18 @@ __urdlllocal ur_result_t UR_APICALL urQueueCreate( uint64_t instance = context.notify_begin(UR_FUNCTION_QUEUE_CREATE, "urQueueCreate", ¶ms); + context.logger.info("---> urQueueCreate"); + ur_result_t result = pfnCreate(hContext, hDevice, pProperties, phQueue); context.notify_end(UR_FUNCTION_QUEUE_CREATE, "urQueueCreate", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_QUEUE_CREATE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2865,11 +3511,18 @@ __urdlllocal ur_result_t UR_APICALL urQueueRetain( uint64_t instance = context.notify_begin(UR_FUNCTION_QUEUE_RETAIN, "urQueueRetain", ¶ms); + context.logger.info("---> urQueueRetain"); + ur_result_t result = pfnRetain(hQueue); context.notify_end(UR_FUNCTION_QUEUE_RETAIN, "urQueueRetain", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_QUEUE_RETAIN, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2888,11 
+3541,18 @@ __urdlllocal ur_result_t UR_APICALL urQueueRelease( uint64_t instance = context.notify_begin(UR_FUNCTION_QUEUE_RELEASE, "urQueueRelease", ¶ms); + context.logger.info("---> urQueueRelease"); + ur_result_t result = pfnRelease(hQueue); context.notify_end(UR_FUNCTION_QUEUE_RELEASE, "urQueueRelease", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_QUEUE_RELEASE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2916,11 +3576,18 @@ __urdlllocal ur_result_t UR_APICALL urQueueGetNativeHandle( uint64_t instance = context.notify_begin( UR_FUNCTION_QUEUE_GET_NATIVE_HANDLE, "urQueueGetNativeHandle", ¶ms); + context.logger.info("---> urQueueGetNativeHandle"); + ur_result_t result = pfnGetNativeHandle(hQueue, pDesc, phNativeQueue); context.notify_end(UR_FUNCTION_QUEUE_GET_NATIVE_HANDLE, "urQueueGetNativeHandle", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_QUEUE_GET_NATIVE_HANDLE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2949,6 +3616,8 @@ __urdlllocal ur_result_t UR_APICALL urQueueCreateWithNativeHandle( context.notify_begin(UR_FUNCTION_QUEUE_CREATE_WITH_NATIVE_HANDLE, "urQueueCreateWithNativeHandle", ¶ms); + context.logger.info("---> urQueueCreateWithNativeHandle"); + ur_result_t result = pfnCreateWithNativeHandle( hNativeQueue, hContext, hDevice, pProperties, phQueue); @@ -2956,6 +3625,11 @@ __urdlllocal ur_result_t UR_APICALL urQueueCreateWithNativeHandle( "urQueueCreateWithNativeHandle", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_QUEUE_CREATE_WITH_NATIVE_HANDLE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2974,11 +3648,18 @@ __urdlllocal ur_result_t UR_APICALL urQueueFinish( uint64_t instance = 
context.notify_begin(UR_FUNCTION_QUEUE_FINISH, "urQueueFinish", ¶ms); + context.logger.info("---> urQueueFinish"); + ur_result_t result = pfnFinish(hQueue); context.notify_end(UR_FUNCTION_QUEUE_FINISH, "urQueueFinish", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_QUEUE_FINISH, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -2997,11 +3678,17 @@ __urdlllocal ur_result_t UR_APICALL urQueueFlush( uint64_t instance = context.notify_begin(UR_FUNCTION_QUEUE_FLUSH, "urQueueFlush", ¶ms); + context.logger.info("---> urQueueFlush"); + ur_result_t result = pfnFlush(hQueue); context.notify_end(UR_FUNCTION_QUEUE_FLUSH, "urQueueFlush", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_QUEUE_FLUSH, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3027,12 +3714,19 @@ __urdlllocal ur_result_t UR_APICALL urEventGetInfo( uint64_t instance = context.notify_begin(UR_FUNCTION_EVENT_GET_INFO, "urEventGetInfo", ¶ms); + context.logger.info("---> urEventGetInfo"); + ur_result_t result = pfnGetInfo(hEvent, propName, propSize, pPropValue, pPropSizeRet); context.notify_end(UR_FUNCTION_EVENT_GET_INFO, "urEventGetInfo", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_EVENT_GET_INFO, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3062,12 +3756,19 @@ __urdlllocal ur_result_t UR_APICALL urEventGetProfilingInfo( context.notify_begin(UR_FUNCTION_EVENT_GET_PROFILING_INFO, "urEventGetProfilingInfo", ¶ms); + context.logger.info("---> urEventGetProfilingInfo"); + ur_result_t result = pfnGetProfilingInfo(hEvent, propName, propSize, pPropValue, pPropSizeRet); context.notify_end(UR_FUNCTION_EVENT_GET_PROFILING_INFO, "urEventGetProfilingInfo", ¶ms, &result, instance); + 
std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_EVENT_GET_PROFILING_INFO, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3089,11 +3790,17 @@ __urdlllocal ur_result_t UR_APICALL urEventWait( uint64_t instance = context.notify_begin(UR_FUNCTION_EVENT_WAIT, "urEventWait", ¶ms); + context.logger.info("---> urEventWait"); + ur_result_t result = pfnWait(numEvents, phEventWaitList); context.notify_end(UR_FUNCTION_EVENT_WAIT, "urEventWait", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_EVENT_WAIT, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3112,11 +3819,18 @@ __urdlllocal ur_result_t UR_APICALL urEventRetain( uint64_t instance = context.notify_begin(UR_FUNCTION_EVENT_RETAIN, "urEventRetain", ¶ms); + context.logger.info("---> urEventRetain"); + ur_result_t result = pfnRetain(hEvent); context.notify_end(UR_FUNCTION_EVENT_RETAIN, "urEventRetain", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_EVENT_RETAIN, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3135,11 +3849,18 @@ __urdlllocal ur_result_t UR_APICALL urEventRelease( uint64_t instance = context.notify_begin(UR_FUNCTION_EVENT_RELEASE, "urEventRelease", ¶ms); + context.logger.info("---> urEventRelease"); + ur_result_t result = pfnRelease(hEvent); context.notify_end(UR_FUNCTION_EVENT_RELEASE, "urEventRelease", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_EVENT_RELEASE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3160,11 +3881,18 @@ __urdlllocal ur_result_t UR_APICALL urEventGetNativeHandle( uint64_t instance = context.notify_begin( UR_FUNCTION_EVENT_GET_NATIVE_HANDLE, "urEventGetNativeHandle", 
¶ms); + context.logger.info("---> urEventGetNativeHandle"); + ur_result_t result = pfnGetNativeHandle(hEvent, phNativeEvent); context.notify_end(UR_FUNCTION_EVENT_GET_NATIVE_HANDLE, "urEventGetNativeHandle", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_EVENT_GET_NATIVE_HANDLE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3192,6 +3920,8 @@ __urdlllocal ur_result_t UR_APICALL urEventCreateWithNativeHandle( context.notify_begin(UR_FUNCTION_EVENT_CREATE_WITH_NATIVE_HANDLE, "urEventCreateWithNativeHandle", ¶ms); + context.logger.info("---> urEventCreateWithNativeHandle"); + ur_result_t result = pfnCreateWithNativeHandle(hNativeEvent, hContext, pProperties, phEvent); @@ -3199,6 +3929,11 @@ __urdlllocal ur_result_t UR_APICALL urEventCreateWithNativeHandle( "urEventCreateWithNativeHandle", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_EVENT_CREATE_WITH_NATIVE_HANDLE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3222,12 +3957,19 @@ __urdlllocal ur_result_t UR_APICALL urEventSetCallback( uint64_t instance = context.notify_begin(UR_FUNCTION_EVENT_SET_CALLBACK, "urEventSetCallback", ¶ms); + context.logger.info("---> urEventSetCallback"); + ur_result_t result = pfnSetCallback(hEvent, execStatus, pfnNotify, pUserData); context.notify_end(UR_FUNCTION_EVENT_SET_CALLBACK, "urEventSetCallback", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_EVENT_SET_CALLBACK, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3280,6 +4022,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( uint64_t instance = context.notify_begin(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH, "urEnqueueKernelLaunch", ¶ms); + context.logger.info("---> urEnqueueKernelLaunch"); + 
ur_result_t result = pfnKernelLaunch( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, numEventsInWaitList, phEventWaitList, phEvent); @@ -3287,6 +4031,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( context.notify_end(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH, "urEnqueueKernelLaunch", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3316,12 +4065,19 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWait( uint64_t instance = context.notify_begin(UR_FUNCTION_ENQUEUE_EVENTS_WAIT, "urEnqueueEventsWait", ¶ms); + context.logger.info("---> urEnqueueEventsWait"); + ur_result_t result = pfnEventsWait(hQueue, numEventsInWaitList, phEventWaitList, phEvent); context.notify_end(UR_FUNCTION_ENQUEUE_EVENTS_WAIT, "urEnqueueEventsWait", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_EVENTS_WAIT, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3353,6 +4109,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( context.notify_begin(UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER, "urEnqueueEventsWaitWithBarrier", ¶ms); + context.logger.info("---> urEnqueueEventsWaitWithBarrier"); + ur_result_t result = pfnEventsWaitWithBarrier(hQueue, numEventsInWaitList, phEventWaitList, phEvent); @@ -3360,6 +4118,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( "urEnqueueEventsWaitWithBarrier", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_EVENTS_WAIT_WITH_BARRIER, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3396,6 +4159,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferRead( uint64_t 
instance = context.notify_begin( UR_FUNCTION_ENQUEUE_MEM_BUFFER_READ, "urEnqueueMemBufferRead", ¶ms); + context.logger.info("---> urEnqueueMemBufferRead"); + ur_result_t result = pfnMemBufferRead(hQueue, hBuffer, blockingRead, offset, size, pDst, numEventsInWaitList, phEventWaitList, phEvent); @@ -3403,6 +4168,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferRead( context.notify_end(UR_FUNCTION_ENQUEUE_MEM_BUFFER_READ, "urEnqueueMemBufferRead", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_READ, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3442,6 +4212,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWrite( context.notify_begin(UR_FUNCTION_ENQUEUE_MEM_BUFFER_WRITE, "urEnqueueMemBufferWrite", ¶ms); + context.logger.info("---> urEnqueueMemBufferWrite"); + ur_result_t result = pfnMemBufferWrite(hQueue, hBuffer, blockingWrite, offset, size, pSrc, numEventsInWaitList, phEventWaitList, phEvent); @@ -3449,6 +4221,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWrite( context.notify_end(UR_FUNCTION_ENQUEUE_MEM_BUFFER_WRITE, "urEnqueueMemBufferWrite", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_WRITE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3508,6 +4285,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferReadRect( context.notify_begin(UR_FUNCTION_ENQUEUE_MEM_BUFFER_READ_RECT, "urEnqueueMemBufferReadRect", ¶ms); + context.logger.info("---> urEnqueueMemBufferReadRect"); + ur_result_t result = pfnMemBufferReadRect( hQueue, hBuffer, blockingRead, bufferOrigin, hostOrigin, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, @@ -3517,6 +4296,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferReadRect( "urEnqueueMemBufferReadRect", 
¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_READ_RECT, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3580,6 +4364,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( context.notify_begin(UR_FUNCTION_ENQUEUE_MEM_BUFFER_WRITE_RECT, "urEnqueueMemBufferWriteRect", ¶ms); + context.logger.info("---> urEnqueueMemBufferWriteRect"); + ur_result_t result = pfnMemBufferWriteRect( hQueue, hBuffer, blockingWrite, bufferOrigin, hostOrigin, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, @@ -3589,6 +4375,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( "urEnqueueMemBufferWriteRect", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_WRITE_RECT, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3625,6 +4416,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopy( uint64_t instance = context.notify_begin( UR_FUNCTION_ENQUEUE_MEM_BUFFER_COPY, "urEnqueueMemBufferCopy", ¶ms); + context.logger.info("---> urEnqueueMemBufferCopy"); + ur_result_t result = pfnMemBufferCopy(hQueue, hBufferSrc, hBufferDst, srcOffset, dstOffset, size, numEventsInWaitList, phEventWaitList, phEvent); @@ -3632,6 +4425,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopy( context.notify_end(UR_FUNCTION_ENQUEUE_MEM_BUFFER_COPY, "urEnqueueMemBufferCopy", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_COPY, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3680,6 +4478,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( context.notify_begin(UR_FUNCTION_ENQUEUE_MEM_BUFFER_COPY_RECT, "urEnqueueMemBufferCopyRect", ¶ms); + 
context.logger.info("---> urEnqueueMemBufferCopyRect"); + ur_result_t result = pfnMemBufferCopyRect( hQueue, hBufferSrc, hBufferDst, srcOrigin, dstOrigin, region, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, @@ -3689,6 +4489,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( "urEnqueueMemBufferCopyRect", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_COPY_RECT, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3730,6 +4535,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferFill( uint64_t instance = context.notify_begin( UR_FUNCTION_ENQUEUE_MEM_BUFFER_FILL, "urEnqueueMemBufferFill", ¶ms); + context.logger.info("---> urEnqueueMemBufferFill"); + ur_result_t result = pfnMemBufferFill(hQueue, hBuffer, pPattern, patternSize, offset, size, numEventsInWaitList, phEventWaitList, phEvent); @@ -3737,6 +4544,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferFill( context.notify_end(UR_FUNCTION_ENQUEUE_MEM_BUFFER_FILL, "urEnqueueMemBufferFill", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_FILL, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3779,6 +4591,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageRead( uint64_t instance = context.notify_begin(UR_FUNCTION_ENQUEUE_MEM_IMAGE_READ, "urEnqueueMemImageRead", ¶ms); + context.logger.info("---> urEnqueueMemImageRead"); + ur_result_t result = pfnMemImageRead( hQueue, hImage, blockingRead, origin, region, rowPitch, slicePitch, pDst, numEventsInWaitList, phEventWaitList, phEvent); @@ -3786,6 +4600,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageRead( context.notify_end(UR_FUNCTION_ENQUEUE_MEM_IMAGE_READ, "urEnqueueMemImageRead", ¶ms, &result, instance); + std::ostringstream args_str; + 
ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_IMAGE_READ, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3829,6 +4648,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageWrite( uint64_t instance = context.notify_begin( UR_FUNCTION_ENQUEUE_MEM_IMAGE_WRITE, "urEnqueueMemImageWrite", ¶ms); + context.logger.info("---> urEnqueueMemImageWrite"); + ur_result_t result = pfnMemImageWrite( hQueue, hImage, blockingWrite, origin, region, rowPitch, slicePitch, pSrc, numEventsInWaitList, phEventWaitList, phEvent); @@ -3836,6 +4657,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageWrite( context.notify_end(UR_FUNCTION_ENQUEUE_MEM_IMAGE_WRITE, "urEnqueueMemImageWrite", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_IMAGE_WRITE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3878,6 +4704,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageCopy( uint64_t instance = context.notify_begin(UR_FUNCTION_ENQUEUE_MEM_IMAGE_COPY, "urEnqueueMemImageCopy", ¶ms); + context.logger.info("---> urEnqueueMemImageCopy"); + ur_result_t result = pfnMemImageCopy(hQueue, hImageSrc, hImageDst, srcOrigin, dstOrigin, region, numEventsInWaitList, phEventWaitList, phEvent); @@ -3885,6 +4713,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemImageCopy( context.notify_end(UR_FUNCTION_ENQUEUE_MEM_IMAGE_COPY, "urEnqueueMemImageCopy", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_IMAGE_COPY, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3923,6 +4756,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferMap( uint64_t instance = context.notify_begin(UR_FUNCTION_ENQUEUE_MEM_BUFFER_MAP, "urEnqueueMemBufferMap", ¶ms); + context.logger.info("---> urEnqueueMemBufferMap"); + 
ur_result_t result = pfnMemBufferMap(hQueue, hBuffer, blockingMap, mapFlags, offset, size, numEventsInWaitList, phEventWaitList, phEvent, ppRetMap); @@ -3930,6 +4765,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferMap( context.notify_end(UR_FUNCTION_ENQUEUE_MEM_BUFFER_MAP, "urEnqueueMemBufferMap", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_MEM_BUFFER_MAP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -3962,6 +4802,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemUnmap( uint64_t instance = context.notify_begin(UR_FUNCTION_ENQUEUE_MEM_UNMAP, "urEnqueueMemUnmap", ¶ms); + context.logger.info("---> urEnqueueMemUnmap"); + ur_result_t result = pfnMemUnmap(hQueue, hMem, pMappedPtr, numEventsInWaitList, phEventWaitList, phEvent); @@ -3969,6 +4811,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueMemUnmap( context.notify_end(UR_FUNCTION_ENQUEUE_MEM_UNMAP, "urEnqueueMemUnmap", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_MEM_UNMAP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4007,6 +4854,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill( uint64_t instance = context.notify_begin(UR_FUNCTION_ENQUEUE_USM_FILL, "urEnqueueUSMFill", ¶ms); + context.logger.info("---> urEnqueueUSMFill"); + ur_result_t result = pfnUSMFill(hQueue, pMem, patternSize, pPattern, size, numEventsInWaitList, phEventWaitList, phEvent); @@ -4014,6 +4863,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill( context.notify_end(UR_FUNCTION_ENQUEUE_USM_FILL, "urEnqueueUSMFill", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_USM_FILL, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4049,6 +4903,8 @@ __urdlllocal 
ur_result_t UR_APICALL urEnqueueUSMMemcpy( uint64_t instance = context.notify_begin(UR_FUNCTION_ENQUEUE_USM_MEMCPY, "urEnqueueUSMMemcpy", ¶ms); + context.logger.info("---> urEnqueueUSMMemcpy"); + ur_result_t result = pfnUSMMemcpy(hQueue, blocking, pDst, pSrc, size, numEventsInWaitList, phEventWaitList, phEvent); @@ -4056,6 +4912,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy( context.notify_end(UR_FUNCTION_ENQUEUE_USM_MEMCPY, "urEnqueueUSMMemcpy", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_USM_MEMCPY, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4089,6 +4950,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMPrefetch( uint64_t instance = context.notify_begin(UR_FUNCTION_ENQUEUE_USM_PREFETCH, "urEnqueueUSMPrefetch", ¶ms); + context.logger.info("---> urEnqueueUSMPrefetch"); + ur_result_t result = pfnUSMPrefetch(hQueue, pMem, size, flags, numEventsInWaitList, phEventWaitList, phEvent); @@ -4096,6 +4959,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMPrefetch( context.notify_end(UR_FUNCTION_ENQUEUE_USM_PREFETCH, "urEnqueueUSMPrefetch", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_USM_PREFETCH, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4122,11 +4990,18 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMAdvise( uint64_t instance = context.notify_begin(UR_FUNCTION_ENQUEUE_USM_ADVISE, "urEnqueueUSMAdvise", ¶ms); + context.logger.info("---> urEnqueueUSMAdvise"); + ur_result_t result = pfnUSMAdvise(hQueue, pMem, size, advice, phEvent); context.notify_end(UR_FUNCTION_ENQUEUE_USM_ADVISE, "urEnqueueUSMAdvise", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_USM_ADVISE, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), 
result); + return result; } @@ -4170,6 +5045,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill2D( uint64_t instance = context.notify_begin(UR_FUNCTION_ENQUEUE_USM_FILL_2D, "urEnqueueUSMFill2D", ¶ms); + context.logger.info("---> urEnqueueUSMFill2D"); + ur_result_t result = pfnUSMFill2D(hQueue, pMem, pitch, patternSize, pPattern, width, height, numEventsInWaitList, phEventWaitList, phEvent); @@ -4177,6 +5054,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMFill2D( context.notify_end(UR_FUNCTION_ENQUEUE_USM_FILL_2D, "urEnqueueUSMFill2D", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_USM_FILL_2D, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4220,6 +5102,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( uint64_t instance = context.notify_begin(UR_FUNCTION_ENQUEUE_USM_MEMCPY_2D, "urEnqueueUSMMemcpy2D", ¶ms); + context.logger.info("---> urEnqueueUSMMemcpy2D"); + ur_result_t result = pfnUSMMemcpy2D(hQueue, blocking, pDst, dstPitch, pSrc, srcPitch, width, height, numEventsInWaitList, phEventWaitList, phEvent); @@ -4227,6 +5111,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( context.notify_end(UR_FUNCTION_ENQUEUE_USM_MEMCPY_2D, "urEnqueueUSMMemcpy2D", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_ENQUEUE_USM_MEMCPY_2D, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4268,6 +5157,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( context.notify_begin(UR_FUNCTION_ENQUEUE_DEVICE_GLOBAL_VARIABLE_WRITE, "urEnqueueDeviceGlobalVariableWrite", ¶ms); + context.logger.info("---> urEnqueueDeviceGlobalVariableWrite"); + ur_result_t result = pfnDeviceGlobalVariableWrite( hQueue, hProgram, name, blockingWrite, count, offset, pSrc, numEventsInWaitList, phEventWaitList, phEvent); @@ -4276,6 
+5167,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( "urEnqueueDeviceGlobalVariableWrite", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_DEVICE_GLOBAL_VARIABLE_WRITE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4317,6 +5213,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( context.notify_begin(UR_FUNCTION_ENQUEUE_DEVICE_GLOBAL_VARIABLE_READ, "urEnqueueDeviceGlobalVariableRead", ¶ms); + context.logger.info("---> urEnqueueDeviceGlobalVariableRead"); + ur_result_t result = pfnDeviceGlobalVariableRead( hQueue, hProgram, name, blockingRead, count, offset, pDst, numEventsInWaitList, phEventWaitList, phEvent); @@ -4325,6 +5223,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableRead( "urEnqueueDeviceGlobalVariableRead", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_DEVICE_GLOBAL_VARIABLE_READ, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4368,6 +5271,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueReadHostPipe( uint64_t instance = context.notify_begin(UR_FUNCTION_ENQUEUE_READ_HOST_PIPE, "urEnqueueReadHostPipe", ¶ms); + context.logger.info("---> urEnqueueReadHostPipe"); + ur_result_t result = pfnReadHostPipe(hQueue, hProgram, pipe_symbol, blocking, pDst, size, numEventsInWaitList, phEventWaitList, phEvent); @@ -4375,6 +5280,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueReadHostPipe( context.notify_end(UR_FUNCTION_ENQUEUE_READ_HOST_PIPE, "urEnqueueReadHostPipe", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_READ_HOST_PIPE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4418,6 +5328,8 @@ __urdlllocal ur_result_t UR_APICALL 
urEnqueueWriteHostPipe( uint64_t instance = context.notify_begin( UR_FUNCTION_ENQUEUE_WRITE_HOST_PIPE, "urEnqueueWriteHostPipe", ¶ms); + context.logger.info("---> urEnqueueWriteHostPipe"); + ur_result_t result = pfnWriteHostPipe(hQueue, hProgram, pipe_symbol, blocking, pSrc, size, numEventsInWaitList, phEventWaitList, phEvent); @@ -4425,6 +5337,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueWriteHostPipe( context.notify_end(UR_FUNCTION_ENQUEUE_WRITE_HOST_PIPE, "urEnqueueWriteHostPipe", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_WRITE_HOST_PIPE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4457,6 +5374,8 @@ __urdlllocal ur_result_t UR_APICALL urUSMPitchedAllocExp( uint64_t instance = context.notify_begin(UR_FUNCTION_USM_PITCHED_ALLOC_EXP, "urUSMPitchedAllocExp", ¶ms); + context.logger.info("---> urUSMPitchedAllocExp"); + ur_result_t result = pfnPitchedAllocExp(hContext, hDevice, pUSMDesc, pool, widthInBytes, height, elementSizeBytes, ppMem, pResultPitch); @@ -4464,6 +5383,11 @@ __urdlllocal ur_result_t UR_APICALL urUSMPitchedAllocExp( context.notify_end(UR_FUNCTION_USM_PITCHED_ALLOC_EXP, "urUSMPitchedAllocExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_PITCHED_ALLOC_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4489,6 +5413,8 @@ urBindlessImagesUnsampledImageHandleDestroyExp( UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_HANDLE_DESTROY_EXP, "urBindlessImagesUnsampledImageHandleDestroyExp", ¶ms); + context.logger.info("---> urBindlessImagesUnsampledImageHandleDestroyExp"); + ur_result_t result = pfnUnsampledImageHandleDestroyExp(hContext, hDevice, hImage); @@ -4497,6 +5423,13 @@ urBindlessImagesUnsampledImageHandleDestroyExp( "urBindlessImagesUnsampledImageHandleDestroyExp", ¶ms, &result, instance); + 
std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, + UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_HANDLE_DESTROY_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4522,6 +5455,8 @@ urBindlessImagesSampledImageHandleDestroyExp( UR_FUNCTION_BINDLESS_IMAGES_SAMPLED_IMAGE_HANDLE_DESTROY_EXP, "urBindlessImagesSampledImageHandleDestroyExp", ¶ms); + context.logger.info("---> urBindlessImagesSampledImageHandleDestroyExp"); + ur_result_t result = pfnSampledImageHandleDestroyExp(hContext, hDevice, hImage); @@ -4530,6 +5465,12 @@ urBindlessImagesSampledImageHandleDestroyExp( "urBindlessImagesSampledImageHandleDestroyExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_SAMPLED_IMAGE_HANDLE_DESTROY_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4557,6 +5498,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( context.notify_begin(UR_FUNCTION_BINDLESS_IMAGES_IMAGE_ALLOCATE_EXP, "urBindlessImagesImageAllocateExp", ¶ms); + context.logger.info("---> urBindlessImagesImageAllocateExp"); + ur_result_t result = pfnImageAllocateExp(hContext, hDevice, pImageFormat, pImageDesc, phImageMem); @@ -4564,6 +5507,11 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( "urBindlessImagesImageAllocateExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_IMAGE_ALLOCATE_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4587,12 +5535,19 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageFreeExp( context.notify_begin(UR_FUNCTION_BINDLESS_IMAGES_IMAGE_FREE_EXP, "urBindlessImagesImageFreeExp", ¶ms); + context.logger.info("---> urBindlessImagesImageFreeExp"); + ur_result_t result = pfnImageFreeExp(hContext, hDevice, 
hImageMem); context.notify_end(UR_FUNCTION_BINDLESS_IMAGES_IMAGE_FREE_EXP, "urBindlessImagesImageFreeExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_IMAGE_FREE_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4606,7 +5561,6 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( const ur_image_format_t *pImageFormat, ///< [in] pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description - ur_mem_handle_t *phMem, ///< [out] pointer to handle of image object created ur_exp_image_handle_t *phImage ///< [out] pointer to handle of image object created ) { @@ -4618,19 +5572,26 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( } ur_bindless_images_unsampled_image_create_exp_params_t params = { - &hContext, &hDevice, &hImageMem, &pImageFormat, - &pImageDesc, &phMem, &phImage}; + &hContext, &hDevice, &hImageMem, &pImageFormat, &pImageDesc, &phImage}; uint64_t instance = context.notify_begin( UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_CREATE_EXP, "urBindlessImagesUnsampledImageCreateExp", ¶ms); + context.logger.info("---> urBindlessImagesUnsampledImageCreateExp"); + ur_result_t result = pfnUnsampledImageCreateExp( - hContext, hDevice, hImageMem, pImageFormat, pImageDesc, phMem, phImage); + hContext, hDevice, hImageMem, pImageFormat, pImageDesc, phImage); context.notify_end(UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_CREATE_EXP, "urBindlessImagesUnsampledImageCreateExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_UNSAMPLED_IMAGE_CREATE_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4645,7 +5606,6 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( *pImageFormat, ///< [in] 
pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description ur_sampler_handle_t hSampler, ///< [in] sampler to be used - ur_mem_handle_t *phMem, ///< [out] pointer to handle of image object created ur_exp_image_handle_t *phImage ///< [out] pointer to handle of image object created ) { @@ -4658,19 +5618,27 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( ur_bindless_images_sampled_image_create_exp_params_t params = { &hContext, &hDevice, &hImageMem, &pImageFormat, - &pImageDesc, &hSampler, &phMem, &phImage}; + &pImageDesc, &hSampler, &phImage}; uint64_t instance = context.notify_begin( UR_FUNCTION_BINDLESS_IMAGES_SAMPLED_IMAGE_CREATE_EXP, "urBindlessImagesSampledImageCreateExp", ¶ms); + context.logger.info("---> urBindlessImagesSampledImageCreateExp"); + ur_result_t result = pfnSampledImageCreateExp(hContext, hDevice, hImageMem, pImageFormat, - pImageDesc, hSampler, phMem, phImage); + pImageDesc, hSampler, phImage); context.notify_end(UR_FUNCTION_BINDLESS_IMAGES_SAMPLED_IMAGE_CREATE_EXP, "urBindlessImagesSampledImageCreateExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_SAMPLED_IMAGE_CREATE_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4731,6 +5699,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageCopyExp( context.notify_begin(UR_FUNCTION_BINDLESS_IMAGES_IMAGE_COPY_EXP, "urBindlessImagesImageCopyExp", ¶ms); + context.logger.info("---> urBindlessImagesImageCopyExp"); + ur_result_t result = pfnImageCopyExp( hQueue, pDst, pSrc, pImageFormat, pImageDesc, imageCopyFlags, srcOffset, dstOffset, copyExtent, hostExtent, numEventsInWaitList, phEventWaitList, @@ -4740,6 +5710,11 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageCopyExp( "urBindlessImagesImageCopyExp", ¶ms, &result, instance); + std::ostringstream args_str; + 
ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_IMAGE_COPY_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4764,6 +5739,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageGetInfoExp( context.notify_begin(UR_FUNCTION_BINDLESS_IMAGES_IMAGE_GET_INFO_EXP, "urBindlessImagesImageGetInfoExp", ¶ms); + context.logger.info("---> urBindlessImagesImageGetInfoExp"); + ur_result_t result = pfnImageGetInfoExp(hImageMem, propName, pPropValue, pPropSizeRet); @@ -4771,6 +5748,11 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageGetInfoExp( "urBindlessImagesImageGetInfoExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_IMAGE_GET_INFO_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4798,6 +5780,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMipmapGetLevelExp( context.notify_begin(UR_FUNCTION_BINDLESS_IMAGES_MIPMAP_GET_LEVEL_EXP, "urBindlessImagesMipmapGetLevelExp", ¶ms); + context.logger.info("---> urBindlessImagesMipmapGetLevelExp"); + ur_result_t result = pfnMipmapGetLevelExp(hContext, hDevice, hImageMem, mipmapLevel, phImageMem); @@ -4805,6 +5789,11 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMipmapGetLevelExp( "urBindlessImagesMipmapGetLevelExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_MIPMAP_GET_LEVEL_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4828,12 +5817,19 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMipmapFreeExp( context.notify_begin(UR_FUNCTION_BINDLESS_IMAGES_MIPMAP_FREE_EXP, "urBindlessImagesMipmapFreeExp", ¶ms); + context.logger.info("---> urBindlessImagesMipmapFreeExp"); + ur_result_t result = pfnMipmapFreeExp(hContext, hDevice, hMem); 
context.notify_end(UR_FUNCTION_BINDLESS_IMAGES_MIPMAP_FREE_EXP, "urBindlessImagesMipmapFreeExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_MIPMAP_FREE_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4861,6 +5857,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportOpaqueFDExp( context.notify_begin(UR_FUNCTION_BINDLESS_IMAGES_IMPORT_OPAQUE_FD_EXP, "urBindlessImagesImportOpaqueFDExp", ¶ms); + context.logger.info("---> urBindlessImagesImportOpaqueFDExp"); + ur_result_t result = pfnImportOpaqueFDExp(hContext, hDevice, size, pInteropMemDesc, phInteropMem); @@ -4868,6 +5866,11 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImportOpaqueFDExp( "urBindlessImagesImportOpaqueFDExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_IMPORT_OPAQUE_FD_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4898,6 +5901,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( context.notify_begin(UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_ARRAY_EXP, "urBindlessImagesMapExternalArrayExp", ¶ms); + context.logger.info("---> urBindlessImagesMapExternalArrayExp"); + ur_result_t result = pfnMapExternalArrayExp( hContext, hDevice, pImageFormat, pImageDesc, hInteropMem, phImageMem); @@ -4905,6 +5910,11 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( "urBindlessImagesMapExternalArrayExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_MAP_EXTERNAL_ARRAY_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4929,12 +5939,19 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesReleaseInteropExp( 
context.notify_begin(UR_FUNCTION_BINDLESS_IMAGES_RELEASE_INTEROP_EXP, "urBindlessImagesReleaseInteropExp", ¶ms); + context.logger.info("---> urBindlessImagesReleaseInteropExp"); + ur_result_t result = pfnReleaseInteropExp(hContext, hDevice, hInteropMem); context.notify_end(UR_FUNCTION_BINDLESS_IMAGES_RELEASE_INTEROP_EXP, "urBindlessImagesReleaseInteropExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_RELEASE_INTEROP_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4963,6 +5980,9 @@ urBindlessImagesImportExternalSemaphoreOpaqueFDExp( UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_SEMAPHORE_OPAQUE_FD_EXP, "urBindlessImagesImportExternalSemaphoreOpaqueFDExp", ¶ms); + context.logger.info( + "---> urBindlessImagesImportExternalSemaphoreOpaqueFDExp"); + ur_result_t result = pfnImportExternalSemaphoreOpaqueFDExp( hContext, hDevice, pInteropSemaphoreDesc, phInteropSemaphore); @@ -4971,6 +5991,13 @@ urBindlessImagesImportExternalSemaphoreOpaqueFDExp( "urBindlessImagesImportExternalSemaphoreOpaqueFDExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, + UR_FUNCTION_BINDLESS_IMAGES_IMPORT_EXTERNAL_SEMAPHORE_OPAQUE_FD_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -4995,6 +6022,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesDestroyExternalSemaphoreExp( UR_FUNCTION_BINDLESS_IMAGES_DESTROY_EXTERNAL_SEMAPHORE_EXP, "urBindlessImagesDestroyExternalSemaphoreExp", ¶ms); + context.logger.info("---> urBindlessImagesDestroyExternalSemaphoreExp"); + ur_result_t result = pfnDestroyExternalSemaphoreExp(hContext, hDevice, hInteropSemaphore); @@ -5003,6 +6032,12 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesDestroyExternalSemaphoreExp( "urBindlessImagesDestroyExternalSemaphoreExp", ¶ms, &result, instance); + std::ostringstream args_str; 
+ ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_DESTROY_EXTERNAL_SEMAPHORE_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5036,6 +6071,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( UR_FUNCTION_BINDLESS_IMAGES_WAIT_EXTERNAL_SEMAPHORE_EXP, "urBindlessImagesWaitExternalSemaphoreExp", ¶ms); + context.logger.info("---> urBindlessImagesWaitExternalSemaphoreExp"); + ur_result_t result = pfnWaitExternalSemaphoreExp( hQueue, hSemaphore, numEventsInWaitList, phEventWaitList, phEvent); @@ -5043,6 +6080,12 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesWaitExternalSemaphoreExp( "urBindlessImagesWaitExternalSemaphoreExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_WAIT_EXTERNAL_SEMAPHORE_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5076,6 +6119,8 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( UR_FUNCTION_BINDLESS_IMAGES_SIGNAL_EXTERNAL_SEMAPHORE_EXP, "urBindlessImagesSignalExternalSemaphoreExp", ¶ms); + context.logger.info("---> urBindlessImagesSignalExternalSemaphoreExp"); + ur_result_t result = pfnSignalExternalSemaphoreExp( hQueue, hSemaphore, numEventsInWaitList, phEventWaitList, phEvent); @@ -5084,6 +6129,12 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSignalExternalSemaphoreExp( "urBindlessImagesSignalExternalSemaphoreExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_BINDLESS_IMAGES_SIGNAL_EXTERNAL_SEMAPHORE_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5109,12 +6160,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCreateExp( context.notify_begin(UR_FUNCTION_COMMAND_BUFFER_CREATE_EXP, "urCommandBufferCreateExp", ¶ms); + 
context.logger.info("---> urCommandBufferCreateExp"); + ur_result_t result = pfnCreateExp(hContext, hDevice, pCommandBufferDesc, phCommandBuffer); context.notify_end(UR_FUNCTION_COMMAND_BUFFER_CREATE_EXP, "urCommandBufferCreateExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_CREATE_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5135,11 +6193,18 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferRetainExp( context.notify_begin(UR_FUNCTION_COMMAND_BUFFER_RETAIN_EXP, "urCommandBufferRetainExp", ¶ms); + context.logger.info("---> urCommandBufferRetainExp"); + ur_result_t result = pfnRetainExp(hCommandBuffer); context.notify_end(UR_FUNCTION_COMMAND_BUFFER_RETAIN_EXP, "urCommandBufferRetainExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_RETAIN_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5160,11 +6225,18 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferReleaseExp( context.notify_begin(UR_FUNCTION_COMMAND_BUFFER_RELEASE_EXP, "urCommandBufferReleaseExp", ¶ms); + context.logger.info("---> urCommandBufferReleaseExp"); + ur_result_t result = pfnReleaseExp(hCommandBuffer); context.notify_end(UR_FUNCTION_COMMAND_BUFFER_RELEASE_EXP, "urCommandBufferReleaseExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_RELEASE_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5185,12 +6257,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferFinalizeExp( context.notify_begin(UR_FUNCTION_COMMAND_BUFFER_FINALIZE_EXP, "urCommandBufferFinalizeExp", ¶ms); + context.logger.info("---> urCommandBufferFinalizeExp"); + ur_result_t result = pfnFinalizeExp(hCommandBuffer); 
context.notify_end(UR_FUNCTION_COMMAND_BUFFER_FINALIZE_EXP, "urCommandBufferFinalizeExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_FINALIZE_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5239,6 +6318,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_EXP, "urCommandBufferAppendKernelLaunchExp", ¶ms); + context.logger.info("---> urCommandBufferAppendKernelLaunchExp"); + ur_result_t result = pfnAppendKernelLaunchExp( hCommandBuffer, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint, @@ -5248,6 +6329,11 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( "urCommandBufferAppendKernelLaunchExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_KERNEL_LAUNCH_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5281,6 +6367,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( context.notify_begin(UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP, "urCommandBufferAppendUSMMemcpyExp", ¶ms); + context.logger.info("---> urCommandBufferAppendUSMMemcpyExp"); + ur_result_t result = pfnAppendUSMMemcpyExp(hCommandBuffer, pDst, pSrc, size, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); @@ -5289,6 +6377,11 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( "urCommandBufferAppendUSMMemcpyExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_MEMCPY_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5325,6 +6418,8 @@ __urdlllocal ur_result_t UR_APICALL 
urCommandBufferAppendUSMFillExp( context.notify_begin(UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_FILL_EXP, "urCommandBufferAppendUSMFillExp", ¶ms); + context.logger.info("---> urCommandBufferAppendUSMFillExp"); + ur_result_t result = pfnAppendUSMFillExp( hCommandBuffer, pMemory, pPattern, patternSize, size, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); @@ -5333,6 +6428,11 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( "urCommandBufferAppendUSMFillExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_FILL_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5375,6 +6475,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_EXP, "urCommandBufferAppendMemBufferCopyExp", ¶ms); + context.logger.info("---> urCommandBufferAppendMemBufferCopyExp"); + ur_result_t result = pfnAppendMemBufferCopyExp( hCommandBuffer, hSrcMem, hDstMem, srcOffset, dstOffset, size, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); @@ -5383,6 +6485,12 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( "urCommandBufferAppendMemBufferCopyExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5424,6 +6532,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_EXP, "urCommandBufferAppendMemBufferWriteExp", ¶ms); + context.logger.info("---> urCommandBufferAppendMemBufferWriteExp"); + ur_result_t result = pfnAppendMemBufferWriteExp( hCommandBuffer, hBuffer, offset, size, pSrc, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); @@ -5432,6 
+6542,12 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( "urCommandBufferAppendMemBufferWriteExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5472,6 +6588,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_EXP, "urCommandBufferAppendMemBufferReadExp", ¶ms); + context.logger.info("---> urCommandBufferAppendMemBufferReadExp"); + ur_result_t result = pfnAppendMemBufferReadExp( hCommandBuffer, hBuffer, offset, size, pDst, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); @@ -5480,6 +6598,12 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( "urCommandBufferAppendMemBufferReadExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5533,6 +6657,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_RECT_EXP, "urCommandBufferAppendMemBufferCopyRectExp", ¶ms); + context.logger.info("---> urCommandBufferAppendMemBufferCopyRectExp"); + ur_result_t result = pfnAppendMemBufferCopyRectExp( hCommandBuffer, hSrcMem, hDstMem, srcOrigin, dstOrigin, region, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, @@ -5543,6 +6669,12 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( "urCommandBufferAppendMemBufferCopyRectExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_COPY_RECT_EXP, + ¶ms); + context.logger.info("({}) -> 
{};\n", args_str.str(), result); + return result; } @@ -5602,6 +6734,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_RECT_EXP, "urCommandBufferAppendMemBufferWriteRectExp", ¶ms); + context.logger.info("---> urCommandBufferAppendMemBufferWriteRectExp"); + ur_result_t result = pfnAppendMemBufferWriteRectExp( hCommandBuffer, hBuffer, bufferOffset, hostOffset, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, @@ -5612,6 +6746,12 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( "urCommandBufferAppendMemBufferWriteRectExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_WRITE_RECT_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5669,6 +6809,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_RECT_EXP, "urCommandBufferAppendMemBufferReadRectExp", ¶ms); + context.logger.info("---> urCommandBufferAppendMemBufferReadRectExp"); + ur_result_t result = pfnAppendMemBufferReadRectExp( hCommandBuffer, hBuffer, bufferOffset, hostOffset, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, @@ -5679,6 +6821,12 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( "urCommandBufferAppendMemBufferReadRectExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_READ_RECT_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5722,6 +6870,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_FILL_EXP, "urCommandBufferAppendMemBufferFillExp", ¶ms); + 
context.logger.info("---> urCommandBufferAppendMemBufferFillExp"); + ur_result_t result = pfnAppendMemBufferFillExp( hCommandBuffer, hBuffer, pPattern, patternSize, offset, size, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); @@ -5730,6 +6880,12 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( "urCommandBufferAppendMemBufferFillExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_MEM_BUFFER_FILL_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5768,6 +6924,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( context.notify_begin(UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_PREFETCH_EXP, "urCommandBufferAppendUSMPrefetchExp", ¶ms); + context.logger.info("---> urCommandBufferAppendUSMPrefetchExp"); + ur_result_t result = pfnAppendUSMPrefetchExp( hCommandBuffer, pMemory, size, flags, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); @@ -5776,6 +6934,11 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( "urCommandBufferAppendUSMPrefetchExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_PREFETCH_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5814,6 +6977,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( context.notify_begin(UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_ADVISE_EXP, "urCommandBufferAppendUSMAdviseExp", ¶ms); + context.logger.info("---> urCommandBufferAppendUSMAdviseExp"); + ur_result_t result = pfnAppendUSMAdviseExp(hCommandBuffer, pMemory, size, advice, numSyncPointsInWaitList, pSyncPointWaitList, pSyncPoint); @@ -5822,6 +6987,11 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( "urCommandBufferAppendUSMAdviseExp", ¶ms, &result, instance); + 
std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_APPEND_USM_ADVISE_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5854,12 +7024,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferEnqueueExp( context.notify_begin(UR_FUNCTION_COMMAND_BUFFER_ENQUEUE_EXP, "urCommandBufferEnqueueExp", ¶ms); + context.logger.info("---> urCommandBufferEnqueueExp"); + ur_result_t result = pfnEnqueueExp( hCommandBuffer, hQueue, numEventsInWaitList, phEventWaitList, phEvent); context.notify_end(UR_FUNCTION_COMMAND_BUFFER_ENQUEUE_EXP, "urCommandBufferEnqueueExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_ENQUEUE_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5881,12 +7058,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferRetainCommandExp( context.notify_begin(UR_FUNCTION_COMMAND_BUFFER_RETAIN_COMMAND_EXP, "urCommandBufferRetainCommandExp", ¶ms); + context.logger.info("---> urCommandBufferRetainCommandExp"); + ur_result_t result = pfnRetainCommandExp(hCommand); context.notify_end(UR_FUNCTION_COMMAND_BUFFER_RETAIN_COMMAND_EXP, "urCommandBufferRetainCommandExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_RETAIN_COMMAND_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5908,12 +7092,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferReleaseCommandExp( context.notify_begin(UR_FUNCTION_COMMAND_BUFFER_RELEASE_COMMAND_EXP, "urCommandBufferReleaseCommandExp", ¶ms); + context.logger.info("---> urCommandBufferReleaseCommandExp"); + ur_result_t result = pfnReleaseCommandExp(hCommand); context.notify_end(UR_FUNCTION_COMMAND_BUFFER_RELEASE_COMMAND_EXP, "urCommandBufferReleaseCommandExp", ¶ms, &result, instance); + 
std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_RELEASE_COMMAND_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5938,6 +7129,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( UR_FUNCTION_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_EXP, "urCommandBufferUpdateKernelLaunchExp", ¶ms); + context.logger.info("---> urCommandBufferUpdateKernelLaunchExp"); + ur_result_t result = pfnUpdateKernelLaunchExp(hCommand, pUpdateKernelLaunch); @@ -5945,6 +7138,11 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( "urCommandBufferUpdateKernelLaunchExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -5975,12 +7173,19 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferGetInfoExp( context.notify_begin(UR_FUNCTION_COMMAND_BUFFER_GET_INFO_EXP, "urCommandBufferGetInfoExp", ¶ms); + context.logger.info("---> urCommandBufferGetInfoExp"); + ur_result_t result = pfnGetInfoExp(hCommandBuffer, propName, propSize, pPropValue, pPropSizeRet); context.notify_end(UR_FUNCTION_COMMAND_BUFFER_GET_INFO_EXP, "urCommandBufferGetInfoExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_GET_INFO_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -6012,6 +7217,8 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferCommandGetInfoExp( context.notify_begin(UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP, "urCommandBufferCommandGetInfoExp", ¶ms); + context.logger.info("---> urCommandBufferCommandGetInfoExp"); + ur_result_t result = pfnCommandGetInfoExp(hCommand, propName, propSize, pPropValue, pPropSizeRet); @@ -6019,6 +7226,11 @@ 
__urdlllocal ur_result_t UR_APICALL urCommandBufferCommandGetInfoExp( "urCommandBufferCommandGetInfoExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -6074,6 +7286,8 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( context.notify_begin(UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP, "urEnqueueCooperativeKernelLaunchExp", ¶ms); + context.logger.info("---> urEnqueueCooperativeKernelLaunchExp"); + ur_result_t result = pfnCooperativeKernelLaunchExp( hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, numEventsInWaitList, phEventWaitList, phEvent); @@ -6082,6 +7296,11 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( "urEnqueueCooperativeKernelLaunchExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_COOPERATIVE_KERNEL_LAUNCH_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -6110,6 +7329,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP, "urKernelSuggestMaxCooperativeGroupCountExp", ¶ms); + context.logger.info("---> urKernelSuggestMaxCooperativeGroupCountExp"); + ur_result_t result = pfnSuggestMaxCooperativeGroupCountExp( hKernel, localWorkSize, dynamicSharedMemorySize, pGroupCountRet); @@ -6118,6 +7339,65 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( "urKernelSuggestMaxCooperativeGroupCountExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_SUGGEST_MAX_COOPERATIVE_GROUP_COUNT_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + + return 
result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueTimestampRecordingExp +__urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. 
+) { + auto pfnTimestampRecordingExp = + context.urDdiTable.EnqueueExp.pfnTimestampRecordingExp; + + if (nullptr == pfnTimestampRecordingExp) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_enqueue_timestamp_recording_exp_params_t params = { + &hQueue, &blocking, &numEventsInWaitList, &phEventWaitList, &phEvent}; + uint64_t instance = + context.notify_begin(UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP, + "urEnqueueTimestampRecordingExp", ¶ms); + + context.logger.info("---> urEnqueueTimestampRecordingExp"); + + ur_result_t result = pfnTimestampRecordingExp( + hQueue, blocking, numEventsInWaitList, phEventWaitList, phEvent); + + context.notify_end(UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP, + "urEnqueueTimestampRecordingExp", ¶ms, &result, + instance); + + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -6142,11 +7422,18 @@ __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( uint64_t instance = context.notify_begin(UR_FUNCTION_PROGRAM_BUILD_EXP, "urProgramBuildExp", ¶ms); + context.logger.info("---> urProgramBuildExp"); + ur_result_t result = pfnBuildExp(hProgram, numDevices, phDevices, pOptions); context.notify_end(UR_FUNCTION_PROGRAM_BUILD_EXP, "urProgramBuildExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_BUILD_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -6172,12 +7459,19 @@ __urdlllocal ur_result_t UR_APICALL urProgramCompileExp( uint64_t instance = context.notify_begin(UR_FUNCTION_PROGRAM_COMPILE_EXP, "urProgramCompileExp", ¶ms); + context.logger.info("---> urProgramCompileExp"); + ur_result_t result = pfnCompileExp(hProgram, numDevices, phDevices, pOptions); context.notify_end(UR_FUNCTION_PROGRAM_COMPILE_EXP, "urProgramCompileExp", ¶ms, &result, 
instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_COMPILE_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -6208,12 +7502,19 @@ __urdlllocal ur_result_t UR_APICALL urProgramLinkExp( uint64_t instance = context.notify_begin(UR_FUNCTION_PROGRAM_LINK_EXP, "urProgramLinkExp", ¶ms); + context.logger.info("---> urProgramLinkExp"); + ur_result_t result = pfnLinkExp(hContext, numDevices, phDevices, count, phPrograms, pOptions, phProgram); context.notify_end(UR_FUNCTION_PROGRAM_LINK_EXP, "urProgramLinkExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_PROGRAM_LINK_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -6234,11 +7535,18 @@ __urdlllocal ur_result_t UR_APICALL urUSMImportExp( uint64_t instance = context.notify_begin(UR_FUNCTION_USM_IMPORT_EXP, "urUSMImportExp", ¶ms); + context.logger.info("---> urUSMImportExp"); + ur_result_t result = pfnImportExp(hContext, pMem, size); context.notify_end(UR_FUNCTION_USM_IMPORT_EXP, "urUSMImportExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_IMPORT_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -6258,11 +7566,18 @@ __urdlllocal ur_result_t UR_APICALL urUSMReleaseExp( uint64_t instance = context.notify_begin(UR_FUNCTION_USM_RELEASE_EXP, "urUSMReleaseExp", ¶ms); + context.logger.info("---> urUSMReleaseExp"); + ur_result_t result = pfnReleaseExp(hContext, pMem); context.notify_end(UR_FUNCTION_USM_RELEASE_EXP, "urUSMReleaseExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams(args_str, UR_FUNCTION_USM_RELEASE_EXP, + ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -6286,12 +7601,19 @@ __urdlllocal ur_result_t 
UR_APICALL urUsmP2PEnablePeerAccessExp( context.notify_begin(UR_FUNCTION_USM_P2P_ENABLE_PEER_ACCESS_EXP, "urUsmP2PEnablePeerAccessExp", ¶ms); + context.logger.info("---> urUsmP2PEnablePeerAccessExp"); + ur_result_t result = pfnEnablePeerAccessExp(commandDevice, peerDevice); context.notify_end(UR_FUNCTION_USM_P2P_ENABLE_PEER_ACCESS_EXP, "urUsmP2PEnablePeerAccessExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_USM_P2P_ENABLE_PEER_ACCESS_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -6315,12 +7637,19 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PDisablePeerAccessExp( context.notify_begin(UR_FUNCTION_USM_P2P_DISABLE_PEER_ACCESS_EXP, "urUsmP2PDisablePeerAccessExp", ¶ms); + context.logger.info("---> urUsmP2PDisablePeerAccessExp"); + ur_result_t result = pfnDisablePeerAccessExp(commandDevice, peerDevice); context.notify_end(UR_FUNCTION_USM_P2P_DISABLE_PEER_ACCESS_EXP, "urUsmP2PDisablePeerAccessExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_USM_P2P_DISABLE_PEER_ACCESS_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } @@ -6356,6 +7685,8 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( context.notify_begin(UR_FUNCTION_USM_P2P_PEER_ACCESS_GET_INFO_EXP, "urUsmP2PPeerAccessGetInfoExp", ¶ms); + context.logger.info("---> urUsmP2PPeerAccessGetInfoExp"); + ur_result_t result = pfnPeerAccessGetInfoExp(commandDevice, peerDevice, propName, propSize, pPropValue, pPropSizeRet); @@ -6364,6 +7695,11 @@ __urdlllocal ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( "urUsmP2PPeerAccessGetInfoExp", ¶ms, &result, instance); + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_USM_P2P_PEER_ACCESS_GET_INFO_EXP, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + return result; } 
@@ -6828,6 +8164,10 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = ur_tracing_layer::urEnqueueCooperativeKernelLaunchExp; + dditable.pfnTimestampRecordingExp = pDdiTable->pfnTimestampRecordingExp; + pDdiTable->pfnTimestampRecordingExp = + ur_tracing_layer::urEnqueueTimestampRecordingExp; + return result; } /////////////////////////////////////////////////////////////////////////////// @@ -7632,6 +8972,10 @@ ur_result_t context_t::init(ur_dditable_t *dditable, return result; } + // Recreate the logger in case env variables have been modified between + // program launch and the call to `urLoaderInit` + logger = logger::create_logger("tracing", true, true); + ur_tracing_layer::context.codelocData = codelocData; if (UR_RESULT_SUCCESS == result) { diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index 1e14552b4e..d2f63921e2 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -496,7 +496,7 @@ __urdlllocal ur_result_t UR_APICALL urDeviceGetInfo( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP < propName) { + if (UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName) { return UR_RESULT_ERROR_INVALID_ENUMERATION; } @@ -6961,7 +6961,6 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( const ur_image_format_t *pImageFormat, ///< [in] pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description - ur_mem_handle_t *phMem, ///< [out] pointer to handle of image object created ur_exp_image_handle_t *phImage ///< [out] pointer to handle of image object created ) { @@ -6993,10 +6992,6 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (NULL == phMem) { - return 
UR_RESULT_ERROR_INVALID_NULL_POINTER; - } - if (NULL == phImage) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } @@ -7017,7 +7012,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( } ur_result_t result = pfnUnsampledImageCreateExp( - hContext, hDevice, hImageMem, pImageFormat, pImageDesc, phMem, phImage); + hContext, hDevice, hImageMem, pImageFormat, pImageDesc, phImage); return result; } @@ -7033,7 +7028,6 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( *pImageFormat, ///< [in] pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description ur_sampler_handle_t hSampler, ///< [in] sampler to be used - ur_mem_handle_t *phMem, ///< [out] pointer to handle of image object created ur_exp_image_handle_t *phImage ///< [out] pointer to handle of image object created ) { @@ -7069,10 +7063,6 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (NULL == phMem) { - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - } - if (NULL == phImage) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } @@ -7099,7 +7089,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( ur_result_t result = pfnSampledImageCreateExp(hContext, hDevice, hImageMem, pImageFormat, - pImageDesc, hSampler, phMem, phImage); + pImageDesc, hSampler, phImage); return result; } @@ -8936,6 +8926,73 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueTimestampRecordingExp +__urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command 
has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. +) { + auto pfnTimestampRecordingExp = + context.urDdiTable.EnqueueExp.pfnTimestampRecordingExp; + + if (nullptr == pfnTimestampRecordingExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (context.enableParameterValidation) { + if (NULL == hQueue) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == phEvent) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (phEventWaitList == NULL && numEventsInWaitList > 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList == 0) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + + if (phEventWaitList != NULL && numEventsInWaitList > 0) { + for (uint32_t i = 0; i < numEventsInWaitList; ++i) { + if (phEventWaitList[i] == NULL) { + return UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST; + } + } + } + } + + if (context.enableLifetimeValidation && + !refCountContext.isReferenceValid(hQueue)) { + refCountContext.logInvalidReference(hQueue); + } + + ur_result_t result = pfnTimestampRecordingExp( + hQueue, blocking, 
numEventsInWaitList, phEventWaitList, phEvent); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuildExp __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( @@ -9745,6 +9802,10 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( pDdiTable->pfnCooperativeKernelLaunchExp = ur_validation_layer::urEnqueueCooperativeKernelLaunchExp; + dditable.pfnTimestampRecordingExp = pDdiTable->pfnTimestampRecordingExp; + pDdiTable->pfnTimestampRecordingExp = + ur_validation_layer::urEnqueueTimestampRecordingExp; + return result; } diff --git a/source/loader/ur_adapter_registry.hpp b/source/loader/ur_adapter_registry.hpp index 3cfac34647..060a5ae8a9 100644 --- a/source/loader/ur_adapter_registry.hpp +++ b/source/loader/ur_adapter_registry.hpp @@ -114,11 +114,13 @@ class AdapterRegistry { // to load the adapter. std::vector> adaptersLoadPaths; - static constexpr std::array knownAdapterNames{ + static constexpr std::array knownAdapterNames{ MAKE_LIBRARY_NAME("ur_adapter_level_zero", "0"), - MAKE_LIBRARY_NAME("ur_adapter_hip", "0"), MAKE_LIBRARY_NAME("ur_adapter_opencl", "0"), - MAKE_LIBRARY_NAME("ur_adapter_cuda", "0")}; + MAKE_LIBRARY_NAME("ur_adapter_cuda", "0"), + MAKE_LIBRARY_NAME("ur_adapter_hip", "0"), + MAKE_LIBRARY_NAME("ur_adapter_native_cpu", "0"), + }; std::optional> getEnvAdapterSearchPaths() { std::optional> pathStringsOpt; diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index ffb7eec027..a8c9dc8dcc 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -5917,7 +5917,6 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( const ur_image_format_t *pImageFormat, ///< [in] pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description - ur_mem_handle_t *phMem, ///< [out] pointer to handle of image object created ur_exp_image_handle_t 
*phImage ///< [out] pointer to handle of image object created ) { @@ -5942,21 +5941,13 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( reinterpret_cast(hImageMem)->handle; // forward to device-platform - result = pfnUnsampledImageCreateExp( - hContext, hDevice, hImageMem, pImageFormat, pImageDesc, phMem, phImage); + result = pfnUnsampledImageCreateExp(hContext, hDevice, hImageMem, + pImageFormat, pImageDesc, phImage); if (UR_RESULT_SUCCESS != result) { return result; } - try { - // convert platform handle to loader handle - *phMem = reinterpret_cast( - ur_mem_factory.getInstance(*phMem, dditable)); - } catch (std::bad_alloc &) { - result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } - try { // convert platform handle to loader handle *phImage = reinterpret_cast( @@ -5979,7 +5970,6 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( *pImageFormat, ///< [in] pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description ur_sampler_handle_t hSampler, ///< [in] sampler to be used - ur_mem_handle_t *phMem, ///< [out] pointer to handle of image object created ur_exp_image_handle_t *phImage ///< [out] pointer to handle of image object created ) { @@ -6009,20 +5999,12 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( // forward to device-platform result = pfnSampledImageCreateExp(hContext, hDevice, hImageMem, pImageFormat, - pImageDesc, hSampler, phMem, phImage); + pImageDesc, hSampler, phImage); if (UR_RESULT_SUCCESS != result) { return result; } - try { - // convert platform handle to loader handle - *phMem = reinterpret_cast( - ur_mem_factory.getInstance(*phMem, dditable)); - } catch (std::bad_alloc &) { - result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } - try { // convert platform handle to loader handle *phImage = reinterpret_cast( @@ -7653,6 +7635,69 @@ __urdlllocal ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return 
result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueTimestampRecordingExp +__urdlllocal ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. 
+) { + ur_result_t result = UR_RESULT_SUCCESS; + + // extract platform's function pointer table + auto dditable = reinterpret_cast(hQueue)->dditable; + auto pfnTimestampRecordingExp = + dditable->ur.EnqueueExp.pfnTimestampRecordingExp; + if (nullptr == pfnTimestampRecordingExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + // convert loader handle to platform handle + hQueue = reinterpret_cast(hQueue)->handle; + + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + + // forward to device-platform + result = pfnTimestampRecordingExp(hQueue, blocking, numEventsInWaitList, + phEventWaitListLocal.data(), phEvent); + + if (UR_RESULT_SUCCESS != result) { + return result; + } + + try { + // convert platform handle to loader handle + *phEvent = reinterpret_cast( + ur_event_factory.getInstance(*phEvent, dditable)); + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urProgramBuildExp __urdlllocal ur_result_t UR_APICALL urProgramBuildExp( @@ -8373,6 +8418,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( // return pointers to loader's DDIs pDdiTable->pfnCooperativeKernelLaunchExp = ur_loader::urEnqueueCooperativeKernelLaunchExp; + pDdiTable->pfnTimestampRecordingExp = + ur_loader::urEnqueueTimestampRecordingExp; } else { // return pointers directly to platform's DDIs *pDdiTable = diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 66f4835c56..62b502095c 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -842,7 +842,7 @@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - 
::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP < propName` +/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -4768,6 +4768,7 @@ ur_result_t UR_APICALL urEventSetCallback( /// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION /// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE /// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGS - "The kernel argument values have not been specified." /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urEnqueueKernelLaunch( @@ -6602,7 +6603,6 @@ ur_result_t UR_APICALL urBindlessImagesImageFreeExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pImageFormat` /// + `NULL == pImageDesc` -/// + `NULL == phMem` /// + `NULL == phImage` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE @@ -6618,7 +6618,6 @@ ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( const ur_image_format_t *pImageFormat, ///< [in] pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description - ur_mem_handle_t *phMem, ///< [out] pointer to handle of image object created ur_exp_image_handle_t *phImage ///< [out] pointer to handle of image object created ) try { @@ -6630,7 +6629,7 @@ ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( } return pfnUnsampledImageCreateExp(hContext, hDevice, hImageMem, - pImageFormat, pImageDesc, phMem, phImage); + pImageFormat, pImageDesc, phImage); } catch (...) 
{ return exceptionToResult(std::current_exception()); } @@ -6655,7 +6654,6 @@ ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pImageFormat` /// + `NULL == pImageDesc` -/// + `NULL == phMem` /// + `NULL == phImage` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE @@ -6673,7 +6671,6 @@ ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( *pImageFormat, ///< [in] pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description ur_sampler_handle_t hSampler, ///< [in] sampler to be used - ur_mem_handle_t *phMem, ///< [out] pointer to handle of image object created ur_exp_image_handle_t *phImage ///< [out] pointer to handle of image object created ) try { @@ -6684,7 +6681,7 @@ ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( } return pfnSampledImageCreateExp(hContext, hDevice, hImageMem, pImageFormat, - pImageDesc, hSampler, phMem, phImage); + pImageDesc, hSampler, phImage); } catch (...) 
{ return exceptionToResult(std::current_exception()); } @@ -8315,6 +8312,54 @@ ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enqueue a command for recording the device timestamp +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == phEvent` +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. 
+ ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. + ) try { + auto pfnTimestampRecordingExp = + ur_lib::context->urDdiTable.EnqueueExp.pfnTimestampRecordingExp; + if (nullptr == pfnTimestampRecordingExp) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + return pfnTimestampRecordingExp(hQueue, blocking, numEventsInWaitList, + phEventWaitList, phEvent); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Produces an executable program from one program, negates need for the /// linking step. diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp index 5721ebf3ef..3b144c87ad 100644 --- a/source/loader/ur_print.cpp +++ b/source/loader/ur_print.cpp @@ -1653,6 +1653,14 @@ ur_result_t urPrintEnqueueCooperativeKernelLaunchExpParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintEnqueueTimestampRecordingExpParams( + const struct ur_enqueue_timestamp_recording_exp_params_t *params, + char *buffer, const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintEventGetInfoParams(const struct ur_event_get_info_params_t *params, char *buffer, const size_t buff_size, diff --git a/source/ur/ur.cpp b/source/ur/ur.cpp index dad6312d57..cff431069a 100644 --- a/source/ur/ur.cpp +++ b/source/ur/ur.cpp @@ -1,24 +1,24 @@ - -//===--------- ur.cpp - Unified Runtime ----------------------------------===// -// -// Copyright (C) 2023 Intel Corporation -// -// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM -// Exceptions. 
See LICENSE.TXT -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "ur.hpp" -#include - -// Controls tracing UR calls from within the UR itself. -bool PrintTrace = [] { - const char *PiRet = std::getenv("SYCL_PI_TRACE"); - const char *Trace = PiRet ? PiRet : nullptr; - const int TraceValue = Trace ? std::stoi(Trace) : 0; - if (TraceValue == -1 || TraceValue == 2) { // Means print all traces - return true; - } - return false; -}(); + +//===--------- ur.cpp - Unified Runtime ----------------------------------===// +// +// Copyright (C) 2023 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ur.hpp" +#include + +// Controls tracing UR calls from within the UR itself. +bool PrintTrace = [] { + const char *PiRet = std::getenv("SYCL_PI_TRACE"); + const char *Trace = PiRet ? PiRet : nullptr; + const int TraceValue = Trace ? std::stoi(Trace) : 0; + if (TraceValue == -1 || TraceValue == 2) { // Means print all traces + return true; + } + return false; +}(); diff --git a/source/ur/ur.hpp b/source/ur/ur.hpp index 8e684f6832..ccca6a756b 100644 --- a/source/ur/ur.hpp +++ b/source/ur/ur.hpp @@ -1,377 +1,377 @@ -//===--------- ur.hpp - Unified Runtime ----------------------------------===// -// -// Copyright (C) 2023 Intel Corporation -// -// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM -// Exceptions. 
See LICENSE.TXT -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "ur_util.hpp" - -template To ur_cast(From Value) { - // TODO: see if more sanity checks are possible. - assert(sizeof(From) == sizeof(To)); - return (To)(Value); -} - -template <> uint32_t inline ur_cast(uint64_t Value) { - // Cast value and check that we don't lose any information. - uint32_t CastedValue = (uint32_t)(Value); - assert((uint64_t)CastedValue == Value); - return CastedValue; -} - -// TODO: promote all of the below extensions to the Unified Runtime -// and get rid of these ZER_EXT constants. -const ur_device_info_t UR_EXT_DEVICE_INFO_OPENCL_C_VERSION = - (ur_device_info_t)0x103D; - -const ur_command_t UR_EXT_COMMAND_TYPE_USER = - (ur_command_t)((uint32_t)UR_COMMAND_FORCE_UINT32 - 1); - -/// Program metadata tags recognized by the UR adapters. For kernels the tag -/// must appear after the kernel name. -#define __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE \ - "@reqd_work_group_size" -#define __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping" -#define __SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION "Requires finalization" - -// Terminates the process with a catastrophic error message. -[[noreturn]] inline void die(const char *Message) { - std::cerr << "die: " << Message << std::endl; - std::terminate(); -} - -// A single-threaded app has an opportunity to enable this mode to avoid -// overhead from mutex locking. Default value is 0 which means that single -// thread mode is disabled. -static const bool SingleThreadMode = [] { - auto UrRet = ur_getenv("UR_L0_SINGLE_THREAD_MODE"); - auto PiRet = ur_getenv("SYCL_PI_LEVEL_ZERO_SINGLE_THREAD_MODE"); - const bool RetVal = - UrRet ? std::stoi(*UrRet) : (PiRet ? 
std::stoi(*PiRet) : 0); - return RetVal; -}(); - -// Class which acts like shared_mutex if SingleThreadMode variable is not set. -// If SingleThreadMode variable is set then mutex operations are turned into -// nop. -class ur_shared_mutex { - std::shared_mutex Mutex; - -public: - void lock() { - if (!SingleThreadMode) { - Mutex.lock(); - } - } - bool try_lock() { return SingleThreadMode ? true : Mutex.try_lock(); } - void unlock() { - if (!SingleThreadMode) { - Mutex.unlock(); - } - } - - void lock_shared() { - if (!SingleThreadMode) { - Mutex.lock_shared(); - } - } - bool try_lock_shared() { - return SingleThreadMode ? true : Mutex.try_lock_shared(); - } - void unlock_shared() { - if (!SingleThreadMode) { - Mutex.unlock_shared(); - } - } -}; - -// Class which acts like std::mutex if SingleThreadMode variable is not set. -// If SingleThreadMode variable is set then mutex operations are turned into -// nop. -class ur_mutex { - std::mutex Mutex; - friend class ur_lock; - -public: - void lock() { - if (!SingleThreadMode) { - Mutex.lock(); - } - } - bool try_lock() { return SingleThreadMode ? true : Mutex.try_lock(); } - void unlock() { - if (!SingleThreadMode) { - Mutex.unlock(); - } - } -}; - -class ur_lock { - std::unique_lock Lock; - -public: - explicit ur_lock(ur_mutex &Mutex) { - if (!SingleThreadMode) { - Lock = std::unique_lock(Mutex.Mutex); - } - } -}; - -/// SpinLock is a synchronization primitive, that uses atomic variable and -/// causes thread trying acquire lock wait in loop while repeatedly check if -/// the lock is available. -/// -/// One important feature of this implementation is that std::atomic can -/// be zero-initialized. This allows SpinLock to have trivial constructor and -/// destructor, which makes it possible to use it in global context (unlike -/// std::mutex, that doesn't provide such guarantees). 
-class SpinLock { -public: - void lock() { - while (MLock.test_and_set(std::memory_order_acquire)) { - std::this_thread::yield(); - } - } - void unlock() { MLock.clear(std::memory_order_release); } - -private: - std::atomic_flag MLock = ATOMIC_FLAG_INIT; -}; - -// The wrapper for immutable data. -// The data is initialized only once at first access (via ->) with the -// initialization function provided in Init. All subsequent access to -// the data just returns the already stored data. -// -template struct ZeCache : private T { - // The initialization function takes a reference to the data - // it is going to initialize, since it is private here in - // order to disallow access other than through "->". - // - using InitFunctionType = std::function; - InitFunctionType Compute{nullptr}; - std::once_flag Computed; - - ZeCache() : T{} {} - - // Access to the fields of the original T data structure. - T *operator->() { - std::call_once(Computed, Compute, static_cast(*this)); - return this; - } -}; - -// Helper for one-liner validation -#define UR_ASSERT(condition, error) \ - if (!(condition)) \ - return error; - -// TODO: populate with target agnostic handling of UR platforms -struct _ur_platform {}; - -// Controls tracing UR calls from within the UR itself. -extern bool PrintTrace; - -// The getInfo*/ReturnHelper facilities provide shortcut way of -// writing return bytes for the various getInfo APIs. 
-namespace ur { -template -ur_result_t getInfoImpl(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, T value, - size_t value_size, Assign &&assign_func) { - if (!param_value && !param_value_size_ret) { - return UR_RESULT_ERROR_INVALID_NULL_POINTER; - } - - if (param_value != nullptr) { - - if (param_value_size < value_size) { - return UR_RESULT_ERROR_INVALID_SIZE; - } - - assign_func(param_value, value, value_size); - } - - if (param_value_size_ret != nullptr) { - *param_value_size_ret = value_size; - } - - return UR_RESULT_SUCCESS; -} - -template -ur_result_t getInfo(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, T value) { - - auto assignment = [](void *param_value, T value, size_t value_size) { - std::ignore = value_size; - *static_cast(param_value) = value; - }; - - return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, - sizeof(T), assignment); -} - -template -ur_result_t getInfoArray(size_t array_length, size_t param_value_size, - void *param_value, size_t *param_value_size_ret, - const T *value) { - return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, - array_length * sizeof(T), memcpy); -} - -template -ur_result_t getInfoArray(size_t array_length, size_t param_value_size, - void *param_value, size_t *param_value_size_ret, - const T *value) { - if (param_value) { - memset(param_value, 0, param_value_size); - for (uint32_t I = 0; I < array_length; I++) { - ((RetType *)param_value)[I] = (RetType)value[I]; - } - } - if (param_value_size_ret) { - *param_value_size_ret = array_length * sizeof(RetType); - } - return UR_RESULT_SUCCESS; -} - -template <> -inline ur_result_t -getInfo(size_t param_value_size, void *param_value, - size_t *param_value_size_ret, const char *value) { - return getInfoArray(strlen(value) + 1, param_value_size, param_value, - param_value_size_ret, value); -} -} // namespace ur - -class UrReturnHelper { -public: - UrReturnHelper(size_t 
param_value_size, void *param_value, - size_t *param_value_size_ret) - : param_value_size(param_value_size), param_value(param_value), - param_value_size_ret(param_value_size_ret) {} - - // A version where in/out info size is represented by a single pointer - // to a value which is updated on return - UrReturnHelper(size_t *param_value_size, void *param_value) - : param_value_size(*param_value_size), param_value(param_value), - param_value_size_ret(param_value_size) {} - - // Scalar return value - template ur_result_t operator()(const T &t) { - return ur::getInfo(param_value_size, param_value, param_value_size_ret, t); - } - - // Array return value - template ur_result_t operator()(const T *t, size_t s) { - return ur::getInfoArray(s, param_value_size, param_value, - param_value_size_ret, t); - } - - // Array return value where element type is differrent from T - template - ur_result_t operator()(const T *t, size_t s) { - return ur::getInfoArray(s, param_value_size, param_value, - param_value_size_ret, t); - } - -protected: - size_t param_value_size; - void *param_value; - size_t *param_value_size_ret; -}; - -template class Result { -public: - Result(ur_result_t err) : value_or_err(err) {} - Result(T value) : value_or_err(std::move(value)) {} - Result() : value_or_err(UR_RESULT_ERROR_UNINITIALIZED) {} - - bool is_err() { return std::holds_alternative(value_or_err); } - explicit operator bool() const { return !is_err(); } - - const T *get_value() { return std::get_if(&value_or_err); } - - ur_result_t get_error() { - auto *err = std::get_if(&value_or_err); - return err ? *err : UR_RESULT_SUCCESS; - } - -private: - std::variant value_or_err; -}; - -// Helper to make sure each x, y, z dim divide the global dimension. 
-// -// In/Out: ThreadsPerBlockInDim - The dimension of workgroup in some dimension -// In: GlobalWorkSizeInDim - The global size in some dimension -static inline void -roundToHighestFactorOfGlobalSize(size_t &ThreadsPerBlockInDim, - const size_t GlobalWorkSizeInDim) { - while (ThreadsPerBlockInDim > 1 && - GlobalWorkSizeInDim % ThreadsPerBlockInDim) { - --ThreadsPerBlockInDim; - } -} - -// Returns whether or not Value is a power of 2 -template >> -bool isPowerOf2(const T &Value) { - return Value && !(Value & (Value - 1)); -} - -// Helper to make sure each x, y, z dim divide the global dimension. -// Additionally it makes sure that the inner dimension always is a power of 2 -// -// In/Out: ThreadsPerBlock - The size of wg in 3d -// In: GlobalSize - The global size in 3d (if dim < 3 then outer -// dims == 1) -// In: MaxBlockDim - The max size of block in 3d -// In: MaxBlockSize - The max total size of block in all dimensions -// In: WorkDim - The workdim (1, 2 or 3) -static inline void roundToHighestFactorOfGlobalSizeIn3d( - size_t *ThreadsPerBlock, const size_t *GlobalSize, - const size_t *MaxBlockDim, const size_t MaxBlockSize) { - assert(GlobalSize[0] && "GlobalSize[0] cannot be zero"); - assert(GlobalSize[1] && "GlobalSize[1] cannot be zero"); - assert(GlobalSize[2] && "GlobalSize[2] cannot be zero"); - - ThreadsPerBlock[0] = - std::min(GlobalSize[0], std::min(MaxBlockSize, MaxBlockDim[0])); - do { - roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalSize[0]); - } while (!isPowerOf2(ThreadsPerBlock[0]) && ThreadsPerBlock[0] > 32 && - --ThreadsPerBlock[0]); - - ThreadsPerBlock[1] = - std::min(GlobalSize[1], - std::min(MaxBlockSize / ThreadsPerBlock[0], MaxBlockDim[1])); - roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalSize[1]); - - ThreadsPerBlock[2] = std::min( - GlobalSize[2], - std::min(MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[0]), - MaxBlockDim[2])); - roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSize[2]); -} 
+//===--------- ur.hpp - Unified Runtime ----------------------------------===// +// +// Copyright (C) 2023 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "ur_util.hpp" + +template To ur_cast(From Value) { + // TODO: see if more sanity checks are possible. + assert(sizeof(From) == sizeof(To)); + return (To)(Value); +} + +template <> uint32_t inline ur_cast(uint64_t Value) { + // Cast value and check that we don't lose any information. + uint32_t CastedValue = (uint32_t)(Value); + assert((uint64_t)CastedValue == Value); + return CastedValue; +} + +// TODO: promote all of the below extensions to the Unified Runtime +// and get rid of these ZER_EXT constants. +const ur_device_info_t UR_EXT_DEVICE_INFO_OPENCL_C_VERSION = + (ur_device_info_t)0x103D; + +const ur_command_t UR_EXT_COMMAND_TYPE_USER = + (ur_command_t)((uint32_t)UR_COMMAND_FORCE_UINT32 - 1); + +/// Program metadata tags recognized by the UR adapters. For kernels the tag +/// must appear after the kernel name. +#define __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE \ + "@reqd_work_group_size" +#define __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping" +#define __SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION "Requires finalization" + +// Terminates the process with a catastrophic error message. +[[noreturn]] inline void die(const char *Message) { + std::cerr << "die: " << Message << std::endl; + std::terminate(); +} + +// A single-threaded app has an opportunity to enable this mode to avoid +// overhead from mutex locking. Default value is 0 which means that single +// thread mode is disabled. 
+static const bool SingleThreadMode = [] { + auto UrRet = ur_getenv("UR_L0_SINGLE_THREAD_MODE"); + auto PiRet = ur_getenv("SYCL_PI_LEVEL_ZERO_SINGLE_THREAD_MODE"); + const bool RetVal = + UrRet ? std::stoi(*UrRet) : (PiRet ? std::stoi(*PiRet) : 0); + return RetVal; +}(); + +// Class which acts like shared_mutex if SingleThreadMode variable is not set. +// If SingleThreadMode variable is set then mutex operations are turned into +// nop. +class ur_shared_mutex { + std::shared_mutex Mutex; + +public: + void lock() { + if (!SingleThreadMode) { + Mutex.lock(); + } + } + bool try_lock() { return SingleThreadMode ? true : Mutex.try_lock(); } + void unlock() { + if (!SingleThreadMode) { + Mutex.unlock(); + } + } + + void lock_shared() { + if (!SingleThreadMode) { + Mutex.lock_shared(); + } + } + bool try_lock_shared() { + return SingleThreadMode ? true : Mutex.try_lock_shared(); + } + void unlock_shared() { + if (!SingleThreadMode) { + Mutex.unlock_shared(); + } + } +}; + +// Class which acts like std::mutex if SingleThreadMode variable is not set. +// If SingleThreadMode variable is set then mutex operations are turned into +// nop. +class ur_mutex { + std::mutex Mutex; + friend class ur_lock; + +public: + void lock() { + if (!SingleThreadMode) { + Mutex.lock(); + } + } + bool try_lock() { return SingleThreadMode ? true : Mutex.try_lock(); } + void unlock() { + if (!SingleThreadMode) { + Mutex.unlock(); + } + } +}; + +class ur_lock { + std::unique_lock Lock; + +public: + explicit ur_lock(ur_mutex &Mutex) { + if (!SingleThreadMode) { + Lock = std::unique_lock(Mutex.Mutex); + } + } +}; + +/// SpinLock is a synchronization primitive, that uses atomic variable and +/// causes thread trying acquire lock wait in loop while repeatedly check if +/// the lock is available. +/// +/// One important feature of this implementation is that std::atomic can +/// be zero-initialized. 
This allows SpinLock to have trivial constructor and +/// destructor, which makes it possible to use it in global context (unlike +/// std::mutex, that doesn't provide such guarantees). +class SpinLock { +public: + void lock() { + while (MLock.test_and_set(std::memory_order_acquire)) { + std::this_thread::yield(); + } + } + void unlock() { MLock.clear(std::memory_order_release); } + +private: + std::atomic_flag MLock = ATOMIC_FLAG_INIT; +}; + +// The wrapper for immutable data. +// The data is initialized only once at first access (via ->) with the +// initialization function provided in Init. All subsequent access to +// the data just returns the already stored data. +// +template struct ZeCache : private T { + // The initialization function takes a reference to the data + // it is going to initialize, since it is private here in + // order to disallow access other than through "->". + // + using InitFunctionType = std::function; + InitFunctionType Compute{nullptr}; + std::once_flag Computed; + + ZeCache() : T{} {} + + // Access to the fields of the original T data structure. + T *operator->() { + std::call_once(Computed, Compute, static_cast(*this)); + return this; + } +}; + +// Helper for one-liner validation +#define UR_ASSERT(condition, error) \ + if (!(condition)) \ + return error; + +// TODO: populate with target agnostic handling of UR platforms +struct _ur_platform {}; + +// Controls tracing UR calls from within the UR itself. +extern bool PrintTrace; + +// The getInfo*/ReturnHelper facilities provide shortcut way of +// writing return bytes for the various getInfo APIs. 
+namespace ur { +template +ur_result_t getInfoImpl(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, T value, + size_t value_size, Assign &&assign_func) { + if (!param_value && !param_value_size_ret) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (param_value != nullptr) { + + if (param_value_size < value_size) { + return UR_RESULT_ERROR_INVALID_SIZE; + } + + assign_func(param_value, value, value_size); + } + + if (param_value_size_ret != nullptr) { + *param_value_size_ret = value_size; + } + + return UR_RESULT_SUCCESS; +} + +template +ur_result_t getInfo(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, T value) { + + auto assignment = [](void *param_value, T value, size_t value_size) { + std::ignore = value_size; + *static_cast(param_value) = value; + }; + + return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, + sizeof(T), assignment); +} + +template +ur_result_t getInfoArray(size_t array_length, size_t param_value_size, + void *param_value, size_t *param_value_size_ret, + const T *value) { + return getInfoImpl(param_value_size, param_value, param_value_size_ret, value, + array_length * sizeof(T), memcpy); +} + +template +ur_result_t getInfoArray(size_t array_length, size_t param_value_size, + void *param_value, size_t *param_value_size_ret, + const T *value) { + if (param_value) { + memset(param_value, 0, param_value_size); + for (uint32_t I = 0; I < array_length; I++) { + ((RetType *)param_value)[I] = (RetType)value[I]; + } + } + if (param_value_size_ret) { + *param_value_size_ret = array_length * sizeof(RetType); + } + return UR_RESULT_SUCCESS; +} + +template <> +inline ur_result_t +getInfo(size_t param_value_size, void *param_value, + size_t *param_value_size_ret, const char *value) { + return getInfoArray(strlen(value) + 1, param_value_size, param_value, + param_value_size_ret, value); +} +} // namespace ur + +class UrReturnHelper { +public: + UrReturnHelper(size_t 
param_value_size, void *param_value, + size_t *param_value_size_ret) + : param_value_size(param_value_size), param_value(param_value), + param_value_size_ret(param_value_size_ret) {} + + // A version where in/out info size is represented by a single pointer + // to a value which is updated on return + UrReturnHelper(size_t *param_value_size, void *param_value) + : param_value_size(*param_value_size), param_value(param_value), + param_value_size_ret(param_value_size) {} + + // Scalar return value + template ur_result_t operator()(const T &t) { + return ur::getInfo(param_value_size, param_value, param_value_size_ret, t); + } + + // Array return value + template ur_result_t operator()(const T *t, size_t s) { + return ur::getInfoArray(s, param_value_size, param_value, + param_value_size_ret, t); + } + + // Array return value where element type is different from T + template + ur_result_t operator()(const T *t, size_t s) { + return ur::getInfoArray(s, param_value_size, param_value, + param_value_size_ret, t); + } + +protected: + size_t param_value_size; + void *param_value; + size_t *param_value_size_ret; +}; + +template class Result { +public: + Result(ur_result_t err) : value_or_err(err) {} + Result(T value) : value_or_err(std::move(value)) {} + Result() : value_or_err(UR_RESULT_ERROR_UNINITIALIZED) {} + + bool is_err() { return std::holds_alternative(value_or_err); } + explicit operator bool() const { return !is_err(); } + + const T *get_value() { return std::get_if(&value_or_err); } + + ur_result_t get_error() { + auto *err = std::get_if(&value_or_err); + return err ? *err : UR_RESULT_SUCCESS; + } + +private: + std::variant value_or_err; +}; + +// Helper to make sure each x, y, z dim divide the global dimension. 
+// +// In/Out: ThreadsPerBlockInDim - The dimension of workgroup in some dimension +// In: GlobalWorkSizeInDim - The global size in some dimension +static inline void +roundToHighestFactorOfGlobalSize(size_t &ThreadsPerBlockInDim, + const size_t GlobalWorkSizeInDim) { + while (ThreadsPerBlockInDim > 1 && + GlobalWorkSizeInDim % ThreadsPerBlockInDim) { + --ThreadsPerBlockInDim; + } +} + +// Returns whether or not Value is a power of 2 +template >> +bool isPowerOf2(const T &Value) { + return Value && !(Value & (Value - 1)); +} + +// Helper to make sure each x, y, z dim divide the global dimension. +// Additionally it makes sure that the inner dimension always is a power of 2 +// +// In/Out: ThreadsPerBlock - The size of wg in 3d +// In: GlobalSize - The global size in 3d (if dim < 3 then outer +// dims == 1) +// In: MaxBlockDim - The max size of block in 3d +// In: MaxBlockSize - The max total size of block in all dimensions +// In: WorkDim - The workdim (1, 2 or 3) +static inline void roundToHighestFactorOfGlobalSizeIn3d( + size_t *ThreadsPerBlock, const size_t *GlobalSize, + const size_t *MaxBlockDim, const size_t MaxBlockSize) { + assert(GlobalSize[0] && "GlobalSize[0] cannot be zero"); + assert(GlobalSize[1] && "GlobalSize[1] cannot be zero"); + assert(GlobalSize[2] && "GlobalSize[2] cannot be zero"); + + ThreadsPerBlock[0] = + std::min(GlobalSize[0], std::min(MaxBlockSize, MaxBlockDim[0])); + do { + roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalSize[0]); + } while (!isPowerOf2(ThreadsPerBlock[0]) && ThreadsPerBlock[0] > 32 && + --ThreadsPerBlock[0]); + + ThreadsPerBlock[1] = + std::min(GlobalSize[1], + std::min(MaxBlockSize / ThreadsPerBlock[0], MaxBlockDim[1])); + roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalSize[1]); + + ThreadsPerBlock[2] = std::min( + GlobalSize[2], + std::min(MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[0]), + MaxBlockDim[2])); + roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSize[2]); +} diff 
--git a/source/ur_api.cpp b/source/ur_api.cpp index e6410ee99b..1ed70e0494 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -7,7 +7,7 @@ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * * @file ur_api.cpp - * @version v0.9-r0 + * @version v0.10-r0 * */ #include "ur_api.h" @@ -736,7 +736,7 @@ ur_result_t UR_APICALL urDeviceGetSelected( /// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE /// + `NULL == hDevice` /// - ::UR_RESULT_ERROR_INVALID_ENUMERATION -/// + `::UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP < propName` +/// + `::UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP < propName` /// - ::UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION /// + If `propName` is not supported by the adapter. /// - ::UR_RESULT_ERROR_INVALID_SIZE @@ -4040,6 +4040,7 @@ ur_result_t UR_APICALL urEventSetCallback( /// - ::UR_RESULT_ERROR_INVALID_WORK_DIMENSION /// - ::UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE /// - ::UR_RESULT_ERROR_INVALID_VALUE +/// - ::UR_RESULT_ERROR_INVALID_KERNEL_ARGS - "The kernel argument values have not been specified." 
/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urEnqueueKernelLaunch( @@ -5634,7 +5635,6 @@ ur_result_t UR_APICALL urBindlessImagesImageFreeExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pImageFormat` /// + `NULL == pImageDesc` -/// + `NULL == phMem` /// + `NULL == phImage` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE @@ -5650,7 +5650,6 @@ ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( const ur_image_format_t *pImageFormat, ///< [in] pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description - ur_mem_handle_t *phMem, ///< [out] pointer to handle of image object created ur_exp_image_handle_t *phImage ///< [out] pointer to handle of image object created ) { @@ -5678,7 +5677,6 @@ ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == pImageFormat` /// + `NULL == pImageDesc` -/// + `NULL == phMem` /// + `NULL == phImage` /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE @@ -5696,7 +5694,6 @@ ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( *pImageFormat, ///< [in] pointer to image format specification const ur_image_desc_t *pImageDesc, ///< [in] pointer to image description ur_sampler_handle_t hSampler, ///< [in] sampler to be used - ur_mem_handle_t *phMem, ///< [out] pointer to handle of image object created ur_exp_image_handle_t *phImage ///< [out] pointer to handle of image object created ) { @@ -7034,6 +7031,46 @@ ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enqueue a command for recording the device timestamp +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - 
::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == phEvent` +/// - ::UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST +/// + `phEventWaitList == NULL && numEventsInWaitList > 0` +/// + `phEventWaitList != NULL && numEventsInWaitList == 0` +ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + bool + blocking, ///< [in] indicates whether the call to this function should block until + ///< until the device timestamp recording command has executed on the + ///< device. + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before the kernel execution. + ///< If nullptr, the numEventsInWaitList must be 0, indicating no wait + ///< events. + ur_event_handle_t * + phEvent ///< [in,out] return an event object that identifies this particular kernel + ///< execution instance. Profiling information can be queried + ///< from this event as if `hQueue` had profiling enabled. Querying + ///< `UR_PROFILING_INFO_COMMAND_QUEUED` or `UR_PROFILING_INFO_COMMAND_SUBMIT` + ///< reports the timestamp at the time of the call to this function. + ///< Querying `UR_PROFILING_INFO_COMMAND_START` or `UR_PROFILING_INFO_COMMAND_END` + ///< reports the timestamp recorded when the command is executed on the device. +) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Produces an executable program from one program, negates need for the /// linking step. 
diff --git a/test/adapters/CMakeLists.txt b/test/adapters/CMakeLists.txt index 5eff6e357a..969d8a4e86 100644 --- a/test/adapters/CMakeLists.txt +++ b/test/adapters/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2023 Intel Corporation +# Copyright (C) 2023-2024 Intel Corporation # Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception @@ -48,6 +48,35 @@ function(add_adapter_test name) ENVIRONMENT "${args_ENVIRONMENT}") endfunction() +find_program(VALGRIND valgrind) + +function(add_adapter_memcheck_test name) + cmake_parse_arguments(args + "" # options + "" # one value keywords + "ENVIRONMENT" # multi value keywords + ${ARGN}) + if(VALGRIND) + set(target test-adapter-${name}) + set(test_name ${target}-memcheck) + + add_test(NAME ${test_name} + COMMAND ${CMAKE_COMMAND} + -D TEST_FILE=valgrind + -D TEST_ARGS="--tool=memcheck --leak-check=full $ --devices_count=${UR_TEST_DEVICES_COUNT} --platforms_count=${UR_TEST_DEVICES_COUNT}" + -D MODE=stderr + -D MATCH_FILE=${CMAKE_CURRENT_SOURCE_DIR}/${name}_memcheck.match + -P ${PROJECT_SOURCE_DIR}/cmake/match.cmake + DEPENDS ${TEST_TARGET_NAME} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + + set_tests_properties(${test_name} PROPERTIES + LABELS "adapter-specific;${name}" + ENVIRONMENT "${args_ENVIRONMENT}") + endif() +endfunction() + if(UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_ALL) add_subdirectory(cuda) endif() diff --git a/test/adapters/cuda/context_tests.cpp b/test/adapters/cuda/context_tests.cpp index 37742a002c..b08e22c8f9 100644 --- a/test/adapters/cuda/context_tests.cpp +++ b/test/adapters/cuda/context_tests.cpp @@ -21,7 +21,9 @@ TEST_P(cudaUrContextCreateTest, CreateWithChildThread) { // Retrieve the CUDA context to check information is correct auto checkValue = [=] { - CUcontext cudaContext = context.handle->get(); + // Just testing the first device in context + CUcontext cudaContext = + 
context.handle->getDevices()[0]->getNativeContext(); unsigned int version = 0; EXPECT_SUCCESS_CUDA(cuCtxGetApiVersion(cudaContext, &version)); EXPECT_EQ(version, known_cuda_api_version); @@ -94,15 +96,6 @@ TEST_P(cudaUrContextCreateTest, ContextLifetimeExisting) { // ensure the queue has the correct context ASSERT_EQ(context, queue->getContext()); - - // create a buffer in the context to set the context as active - uur::raii::Mem buffer; - ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, 1024, - nullptr, buffer.ptr())); - - // check that context is now the active cuda context - ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(¤t)); - ASSERT_EQ(current, context->get()); } TEST_P(cudaUrContextCreateTest, ThreadedContext) { @@ -127,8 +120,6 @@ TEST_P(cudaUrContextCreateTest, ThreadedContext) { // the first context, and then create and release another queue with // the second context. auto test_thread = std::thread([&] { - CUcontext current = nullptr; - { // create a queue with the first context uur::raii::Queue queue; @@ -165,15 +156,6 @@ TEST_P(cudaUrContextCreateTest, ThreadedContext) { // ensure queue has correct context ASSERT_EQ(context2, queue->getContext()); - - // create a buffer to set the active context - uur::raii::Mem buffer = nullptr; - ASSERT_SUCCESS(urMemBufferCreate(context2, UR_MEM_FLAG_READ_WRITE, - 1024, nullptr, buffer.ptr())); - - // check that the 2nd context is now tha active cuda context - ASSERT_SUCCESS_CUDA(cuCtxGetCurrent(¤t)); - ASSERT_EQ(current, context2->get()); } }); diff --git a/test/adapters/level_zero/CMakeLists.txt b/test/adapters/level_zero/CMakeLists.txt index 45d68594ed..17be4306ce 100644 --- a/test/adapters/level_zero/CMakeLists.txt +++ b/test/adapters/level_zero/CMakeLists.txt @@ -13,15 +13,47 @@ else() FIXTURE KERNELS SOURCES urProgramLink.cpp + urKernelCreateWithNativeHandle.cpp ENVIRONMENT "UR_ADAPTERS_FORCE_LOAD=\"$\"" ) + # TODO: valgrind tests require very new environment. + # Enable once all L0 runners are updated. 
+ # add_adapter_memcheck_test(level_zero + # ENVIRONMENT + # "UR_ADAPTERS_FORCE_LOAD=\"$\"" + # ) + + target_link_libraries(test-adapter-level_zero PRIVATE + LevelZeroLoader + LevelZeroLoader-Headers + ) target_include_directories(test-adapter-level_zero PRIVATE ${PROJECT_SOURCE_DIR}/source ${PROJECT_SOURCE_DIR}/source/adapters/level_zero + LevelZeroLoader-Headers ) add_dependencies(test-adapter-level_zero generate_device_binaries kernel_names_header) endif() + +if(LINUX) + # Make L0 use CallMap from a separate shared lib so that we can access the map + # from the tests. This only seems to work on linux + add_library(zeCallMap SHARED zeCallMap.cpp) + target_compile_definitions(ur_adapter_level_zero PRIVATE UR_L0_CALL_COUNT_IN_TESTS) + target_link_libraries(ur_adapter_level_zero PRIVATE zeCallMap) + + add_adapter_test(level_zero + FIXTURE DEVICES + SOURCES + event_cache_tests.cpp + ENVIRONMENT + "UR_ADAPTERS_FORCE_LOAD=\"$\"" + "UR_L0_LEAKS_DEBUG=1" + ) + + target_link_libraries(test-adapter-level_zero PRIVATE zeCallMap) +endif() diff --git a/test/adapters/level_zero/event_cache_tests.cpp b/test/adapters/level_zero/event_cache_tests.cpp new file mode 100644 index 0000000000..53bc39ad96 --- /dev/null +++ b/test/adapters/level_zero/event_cache_tests.cpp @@ -0,0 +1,163 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "ur_print.hpp" +#include "uur/fixtures.h" +#include "uur/raii.h" + +#include +#include + +template auto combineFlags(std::tuple tuple) { + return std::apply([](auto... args) { return (... 
|= args); }, tuple); +} + +extern std::map *ZeCallCount; + +using FlagsTupleType = std::tuple; + +struct urEventCacheTest : uur::urContextTestWithParam { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::SetUp()); + + flags = combineFlags(getParam()); + + ur_queue_properties_t props; + props.flags = flags; + ASSERT_SUCCESS(urQueueCreate(context, device, &props, &queue)); + ASSERT_NE(queue, nullptr); + + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_WRITE_ONLY, size, + nullptr, &buffer)); + + (*ZeCallCount)["zeEventCreate"] = 0; + (*ZeCallCount)["zeEventDestroy"] = 0; + } + + void TearDown() override { + if (buffer) { + EXPECT_SUCCESS(urMemRelease(buffer)); + } + if (queue) { + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urQueueRelease(queue)); + } + UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::TearDown()); + } + + auto enqueueWork(ur_event_handle_t *hEvent, int data) { + input.assign(count, data); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urEnqueueMemBufferWrite( + queue, buffer, false, 0, size, input.data(), 0, nullptr, hEvent)); + } + + void verifyData() { + std::vector output(count, 1); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urEnqueueMemBufferRead( + queue, buffer, true, 0, size, output.data(), 0, nullptr, nullptr)); + + if (!(flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE)) { + ASSERT_EQ(input, output); + } + } + + const size_t count = 1024; + const size_t size = sizeof(uint32_t) * count; + ur_mem_handle_t buffer = nullptr; + ur_queue_handle_t queue = nullptr; + std::vector input; + ur_queue_flags_t flags; +}; + +TEST_P(urEventCacheTest, eventsReuseNoVisibleEvent) { + static constexpr int numIters = 16; + static constexpr int numEnqueues = 128; + + for (int i = 0; i < numIters; i++) { + for (int j = 0; j < numEnqueues; j++) { + enqueueWork(nullptr, i * numEnqueues + j); + } + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urQueueFinish(queue)); + verifyData(); + } + + // TODO: why events are not reused for 
UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE? + if ((flags & UR_QUEUE_FLAG_DISCARD_EVENTS) && + !(flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE)) { + ASSERT_EQ((*ZeCallCount)["zeEventCreate"], 2); + } else { + ASSERT_GE((*ZeCallCount)["zeEventCreate"], numIters * numEnqueues); + } +} + +TEST_P(urEventCacheTest, eventsReuseWithVisibleEvent) { + static constexpr int numIters = 16; + static constexpr int numEnqueues = 128; + + for (int i = 0; i < numIters; i++) { + std::vector events(numEnqueues); + for (int j = 0; j < numEnqueues; j++) { + enqueueWork(events[j].ptr(), i * numEnqueues + j); + } + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urQueueFinish(queue)); + verifyData(); + } + + ASSERT_LT((*ZeCallCount)["zeEventCreate"], numIters * numEnqueues); +} + +TEST_P(urEventCacheTest, eventsReuseWithVisibleEventAndWait) { + static constexpr int numIters = 16; + static constexpr int numEnqueues = 128; + static constexpr int waitEveryN = 16; + + for (int i = 0; i < numIters; i++) { + std::vector events; + for (int j = 0; j < numEnqueues; j++) { + events.emplace_back(); + enqueueWork(events.back().ptr(), i * numEnqueues + j); + + if (j > 0 && j % waitEveryN == 0) { + ASSERT_SUCCESS(urEventWait(waitEveryN, + (ur_event_handle_t *)events.data())); + verifyData(); + events.clear(); + } + } + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urQueueFinish(queue)); + } + + ASSERT_GE((*ZeCallCount)["zeEventCreate"], waitEveryN); + // TODO: why there are more events than this? 
+ // ASSERT_LE((*ZeCallCount)["zeEventCreate"], waitEveryN * 2 + 2); +} + +template +inline std::string +printFlags(const testing::TestParamInfo &info) { + const auto device_handle = std::get<0>(info.param); + const auto platform_device_name = + uur::GetPlatformAndDeviceName(device_handle); + auto flags = combineFlags(std::get<1>(info.param)); + + std::stringstream ss; + ur::details::printFlag(ss, flags); + + auto str = ss.str(); + std::replace(str.begin(), str.end(), ' ', '_'); + std::replace(str.begin(), str.end(), '|', '_'); + return platform_device_name + "__" + str; +} + +UUR_TEST_SUITE_P( + urEventCacheTest, + ::testing::Combine( + testing::Values(0, UR_QUEUE_FLAG_DISCARD_EVENTS), + testing::Values(0, UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE), + // TODO: why the test fails with UR_QUEUE_FLAG_SUBMISSION_BATCHED? + testing::Values( + UR_QUEUE_FLAG_SUBMISSION_IMMEDIATE /*, UR_QUEUE_FLAG_SUBMISSION_BATCHED */), + testing::Values(0, UR_QUEUE_FLAG_PROFILING_ENABLE)), + printFlags); diff --git a/test/adapters/level_zero/level_zero_memcheck.match b/test/adapters/level_zero/level_zero_memcheck.match new file mode 100644 index 0000000000..904185989c --- /dev/null +++ b/test/adapters/level_zero/level_zero_memcheck.match @@ -0,0 +1,2 @@ +{{IGNORE}} +{{.*}} ERROR SUMMARY: 0 errors from 0 contexts {{.*}} \ No newline at end of file diff --git a/test/adapters/level_zero/urKernelCreateWithNativeHandle.cpp b/test/adapters/level_zero/urKernelCreateWithNativeHandle.cpp new file mode 100644 index 0000000000..19e4de21ec --- /dev/null +++ b/test/adapters/level_zero/urKernelCreateWithNativeHandle.cpp @@ -0,0 +1,63 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "level_zero/ze_api.h" +#include "ur_api.h" +#include "uur/checks.h" +#include + +using urLevelZeroKernelNativeHandleTest = uur::urContextTest; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urLevelZeroKernelNativeHandleTest); + +TEST_P(urLevelZeroKernelNativeHandleTest, OwnedHandleRelease) { + ze_context_handle_t native_context; + urContextGetNativeHandle(context, (ur_native_handle_t *)&native_context); + + ze_device_handle_t native_device; + urDeviceGetNativeHandle(device, (ur_native_handle_t *)&native_device); + + std::shared_ptr> il_binary; + uur::KernelsEnvironment::instance->LoadSource("foo", il_binary); + + auto kernel_name = + uur::KernelsEnvironment::instance->GetEntryPointNames("foo")[0]; + + ze_module_desc_t moduleDesc = {ZE_STRUCTURE_TYPE_MODULE_DESC}; + moduleDesc.format = ZE_MODULE_FORMAT_IL_SPIRV; + moduleDesc.inputSize = il_binary->size(); + moduleDesc.pInputModule = + reinterpret_cast(il_binary->data()); + moduleDesc.pBuildFlags = ""; + ze_module_handle_t module; + + ASSERT_EQ(zeModuleCreate(native_context, native_device, &moduleDesc, + &module, NULL), + ZE_RESULT_SUCCESS); + + ze_kernel_desc_t kernelDesc = {ZE_STRUCTURE_TYPE_KERNEL_DESC}; + kernelDesc.pKernelName = kernel_name.c_str(); + + ze_kernel_handle_t native_kernel; + + ASSERT_EQ(zeKernelCreate(module, &kernelDesc, &native_kernel), + ZE_RESULT_SUCCESS); + + ur_program_native_properties_t pprops = { + UR_STRUCTURE_TYPE_PROGRAM_NATIVE_PROPERTIES, nullptr, 1}; + + ur_program_handle_t program; + ASSERT_SUCCESS(urProgramCreateWithNativeHandle((ur_native_handle_t)module, + context, &pprops, &program)); + + ur_kernel_native_properties_t kprops = { + UR_STRUCTURE_TYPE_KERNEL_NATIVE_PROPERTIES, nullptr, 1}; + + ur_kernel_handle_t kernel; + ASSERT_SUCCESS(urKernelCreateWithNativeHandle( + (ur_native_handle_t)native_kernel, context, program, &kprops, &kernel)); + + ASSERT_SUCCESS(urKernelRelease(kernel)); + 
ASSERT_SUCCESS(urProgramRelease(program)); +} diff --git a/test/adapters/level_zero/urProgramLink.cpp b/test/adapters/level_zero/urProgramLink.cpp index 77ce3b8f49..b84247ec47 100644 --- a/test/adapters/level_zero/urProgramLink.cpp +++ b/test/adapters/level_zero/urProgramLink.cpp @@ -3,6 +3,7 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include "ur_api.h" #include using urLevelZeroProgramLinkTest = uur::urProgramTest; @@ -28,4 +29,6 @@ TEST_P(urLevelZeroProgramLinkTest, InvalidLinkOptionsPrintedInLog) { log.data(), nullptr)); ASSERT_EQ(log[logSize - 1], '\0'); ASSERT_NE(std::string{log.data()}.find("-foo"), std::string::npos); + + ASSERT_SUCCESS(urProgramRelease(linked_program)); } diff --git a/test/adapters/level_zero/zeCallMap.cpp b/test/adapters/level_zero/zeCallMap.cpp new file mode 100644 index 0000000000..3c6487f36d --- /dev/null +++ b/test/adapters/level_zero/zeCallMap.cpp @@ -0,0 +1,12 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include + +// Map used by L0 adapter to count the number of calls to each L0 function +// Lifetime is managed by the adapter, this variable is defined here +// only so that we can read it from the tests. 
+std::map *ZeCallCount = nullptr; diff --git a/test/conformance/CMakeLists.txt b/test/conformance/CMakeLists.txt index 14875a117a..9a80c5345e 100644 --- a/test/conformance/CMakeLists.txt +++ b/test/conformance/CMakeLists.txt @@ -9,23 +9,36 @@ function(add_test_adapter name adapter) set(TEST_TARGET_NAME test-${name}) set(TEST_NAME ${name}-${adapter}) - add_test(NAME ${TEST_NAME} - COMMAND ${CMAKE_COMMAND} - -D TEST_FILE=${Python3_EXECUTABLE} - -D TEST_ARGS="${UR_CONFORMANCE_TEST_DIR}/cts_exe.py --test_command ${CMAKE_BINARY_DIR}/bin/${TEST_TARGET_NAME} --test_devices_count=${UR_TEST_DEVICES_COUNT} --test_platforms_count=${UR_TEST_PLATFORMS_COUNT}" - -D MODE=stdout - -D MATCH_FILE=${CMAKE_CURRENT_SOURCE_DIR}/${name}_${adapter}.match - -P ${PROJECT_SOURCE_DIR}/cmake/match.cmake - DEPENDS ${TEST_TARGET_NAME} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + set(TEST_COMMAND + "${PROJECT_BINARY_DIR}/bin/${TEST_TARGET_NAME} --test_devices_count=${UR_TEST_DEVICES_COUNT} --test_platforms_count=${UR_TEST_PLATFORMS_COUNT}" ) - set(testEnv - UR_ADAPTERS_FORCE_LOAD="$" - GTEST_COLOR=no - ) + if(UR_CONFORMANCE_ENABLE_MATCH_FILES) + add_test(NAME ${TEST_NAME} + COMMAND ${CMAKE_COMMAND} + -D TEST_FILE=${Python3_EXECUTABLE} + -D TEST_ARGS="${UR_CONFORMANCE_TEST_DIR}/cts_exe.py --test_command ${TEST_COMMAND}" + -D MODE=stdout + -D MATCH_FILE=${CMAKE_CURRENT_SOURCE_DIR}/${name}_${adapter}.match + -P ${PROJECT_SOURCE_DIR}/cmake/match.cmake + DEPENDS ${TEST_TARGET_NAME} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + else() + separate_arguments(TEST_COMMAND) + add_test(NAME ${TEST_NAME} + COMMAND ${TEST_COMMAND} + DEPENDS ${TEST_TARGET_NAME} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + endif() + + set(TEST_ENV UR_ADAPTERS_FORCE_LOAD="$") + if(UR_CONFORMANCE_ENABLE_MATCH_FILES) + list(APPEND TEST_ENV GTEST_COLOR=no) + endif() set_tests_properties(${TEST_NAME} PROPERTIES - ENVIRONMENT "${testEnv}" + ENVIRONMENT "${TEST_ENV}" LABELS "conformance;${adapter}") endfunction() 
@@ -123,6 +136,7 @@ if(UR_DPCXX) add_subdirectory(kernel) add_subdirectory(program) add_subdirectory(enqueue) + add_subdirectory(integration) add_subdirectory(exp_command_buffer) add_subdirectory(exp_usm_p2p) else() diff --git a/test/conformance/context/context_adapter_cuda.match b/test/conformance/context/context_adapter_cuda.match index b884aef583..e69de29bb2 100644 --- a/test/conformance/context/context_adapter_cuda.match +++ b/test/conformance/context/context_adapter_cuda.match @@ -1 +0,0 @@ -urContextCreateWithNativeHandleTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}_ diff --git a/test/conformance/context/context_adapter_hip.match b/test/conformance/context/context_adapter_hip.match index 82d8d71397..0c9d1530b7 100644 --- a/test/conformance/context/context_adapter_hip.match +++ b/test/conformance/context/context_adapter_hip.match @@ -1,2 +1,4 @@ urContextCreateWithNativeHandleTest.Success/AMD_HIP_BACKEND___{{.*}}_ +urContextCreateWithNativeHandleTest.SuccessWithOwnedNativeHandle/AMD_HIP_BACKEND___{{.*}}_ +urContextCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle/AMD_HIP_BACKEND___{{.*}}_ urContextGetInfoTestWithInfoParam.Success/AMD_HIP_BACKEND___{{.*}} diff --git a/test/conformance/context/context_adapter_level_zero.match b/test/conformance/context/context_adapter_level_zero.match index e43bf4d5b8..f25df872a3 100644 --- a/test/conformance/context/context_adapter_level_zero.match +++ b/test/conformance/context/context_adapter_level_zero.match @@ -1 +1,2 @@ +urContextCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urContextSetExtendedDeleterTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git a/test/conformance/context/context_adapter_native_cpu.match b/test/conformance/context/context_adapter_native_cpu.match index 64f39d35c5..4c4ca027cb 100644 --- a/test/conformance/context/context_adapter_native_cpu.match +++ 
b/test/conformance/context/context_adapter_native_cpu.match @@ -1 +1,3 @@ +urContextCreateWithNativeHandleTest.InvalidNullPointerDevices/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urContextCreateWithNativeHandleTest.InvalidNullPointerContext/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urContextSetExtendedDeleterTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ diff --git a/test/conformance/context/urContextCreateWithNativeHandle.cpp b/test/conformance/context/urContextCreateWithNativeHandle.cpp index 97cfa79b9a..64d4470df1 100644 --- a/test/conformance/context/urContextCreateWithNativeHandle.cpp +++ b/test/conformance/context/urContextCreateWithNativeHandle.cpp @@ -10,8 +10,9 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urContextCreateWithNativeHandleTest); TEST_P(urContextCreateWithNativeHandleTest, Success) { ur_native_handle_t native_context = nullptr; - if (urContextGetNativeHandle(context, &native_context)) { - GTEST_SKIP(); + { + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urContextGetNativeHandle(context, &native_context)); } // We cannot assume anything about a native_handle, not even if it's @@ -20,8 +21,8 @@ TEST_P(urContextCreateWithNativeHandleTest, Success) { // and perform some query on it to verify that it works. 
ur_context_handle_t ctx = nullptr; ur_context_native_properties_t props{}; - ASSERT_SUCCESS(urContextCreateWithNativeHandle(native_context, 1, &device, - &props, &ctx)); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urContextCreateWithNativeHandle( + native_context, 1, &device, &props, &ctx)); ASSERT_NE(ctx, nullptr); uint32_t n_devices = 0; @@ -30,3 +31,64 @@ TEST_P(urContextCreateWithNativeHandleTest, Success) { ASSERT_SUCCESS(urContextRelease(ctx)); } + +TEST_P(urContextCreateWithNativeHandleTest, SuccessWithOwnedNativeHandle) { + ur_native_handle_t native_context = nullptr; + { + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urContextGetNativeHandle(context, &native_context)); + } + + ur_context_handle_t ctx = nullptr; + ur_context_native_properties_t props{ + UR_STRUCTURE_TYPE_CONTEXT_NATIVE_PROPERTIES, nullptr, true}; + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urContextCreateWithNativeHandle( + native_context, 1, &device, &props, &ctx)); + ASSERT_NE(ctx, nullptr); + + uint32_t ref_count = 0; + ASSERT_SUCCESS(urContextGetInfo(ctx, UR_CONTEXT_INFO_REFERENCE_COUNT, + sizeof(uint32_t), &ref_count, nullptr)); + ASSERT_EQ(ref_count, 1); +} + +TEST_P(urContextCreateWithNativeHandleTest, SuccessWithUnOwnedNativeHandle) { + ur_native_handle_t native_context = nullptr; + { + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urContextGetNativeHandle(context, &native_context)); + } + + ur_context_handle_t ctx = nullptr; + ur_context_native_properties_t props{ + UR_STRUCTURE_TYPE_CONTEXT_NATIVE_PROPERTIES, nullptr, false}; + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urContextCreateWithNativeHandle( + native_context, 1, &device, &props, &ctx)); + ASSERT_NE(ctx, nullptr); + + uint32_t ref_count = 0; + ASSERT_SUCCESS(urContextGetInfo(ctx, UR_CONTEXT_INFO_REFERENCE_COUNT, + sizeof(uint32_t), &ref_count, nullptr)); + ASSERT_EQ(ref_count, 2); + + ASSERT_SUCCESS(urContextRelease(ctx)); +} + +TEST_P(urContextCreateWithNativeHandleTest, InvalidNullPointerDevices) { + ur_native_handle_t native_context = nullptr; + 
ASSERT_SUCCESS(urContextGetNativeHandle(context, &native_context)); + + ur_context_handle_t ctx = nullptr; + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, + urContextCreateWithNativeHandle(native_context, 1, nullptr, + nullptr, &ctx)); +} + +TEST_P(urContextCreateWithNativeHandleTest, InvalidNullPointerContext) { + ur_native_handle_t native_context = nullptr; + ASSERT_SUCCESS(urContextGetNativeHandle(context, &native_context)); + + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, + urContextCreateWithNativeHandle(native_context, 1, &device, + nullptr, nullptr)); +} diff --git a/test/conformance/device/device_adapter_cuda.match b/test/conformance/device/device_adapter_cuda.match index 48e00debe4..9989fbd774 100644 --- a/test/conformance/device/device_adapter_cuda.match +++ b/test/conformance/device/device_adapter_cuda.match @@ -1 +1,2 @@ +urDeviceCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle {{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime diff --git a/test/conformance/device/device_adapter_hip.match b/test/conformance/device/device_adapter_hip.match index 1e498301c8..715a8ba055 100644 --- a/test/conformance/device/device_adapter_hip.match +++ b/test/conformance/device/device_adapter_hip.match @@ -1,2 +1,4 @@ {{OPT}}urDeviceCreateWithNativeHandleTest.Success +{{OPT}}urDeviceCreateWithNativeHandleTest.SuccessWithOwnedNativeHandle +{{OPT}}urDeviceCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle {{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime diff --git a/test/conformance/device/device_adapter_level_zero.match b/test/conformance/device/device_adapter_level_zero.match index b1bff7376c..1f735d7a77 100644 --- a/test/conformance/device/device_adapter_level_zero.match +++ b/test/conformance/device/device_adapter_level_zero.match @@ -1,18 +1,10 @@ -{{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime +urDeviceCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle 
urDeviceGetInfoTest.Success/UR_DEVICE_INFO_GLOBAL_MEM_FREE urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT urDeviceGetInfoTest.Success/UR_DEVICE_INFO_HOST_UNIFIED_MEMORY -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ENDIAN_LITTLE urDeviceGetInfoTest.Success/UR_DEVICE_INFO_AVAILABLE -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_COMPILER_AVAILABLE -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_LINKER_AVAILABLE -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_IMAGE_SRGB urDeviceGetInfoTest.Success/UR_DEVICE_INFO_BUILD_ON_SUBDEVICE -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ATOMIC_64 urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ASYNC_BARRIER -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MEM_CHANNEL_SUPPORT urDeviceGetInfoTest.Success/UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP diff --git a/test/conformance/device/device_adapter_native_cpu.match b/test/conformance/device/device_adapter_native_cpu.match index 4c1a0f7f7f..147a9a3dfd 100644 --- a/test/conformance/device/device_adapter_native_cpu.match +++ b/test/conformance/device/device_adapter_native_cpu.match @@ -1,4 +1,7 @@ +urDeviceCreateWithNativeHandleTest.InvalidNullHandlePlatform +urDeviceCreateWithNativeHandleTest.InvalidNullPointerDevice {{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime +urDeviceGetInfoSingleTest.MaxWorkGroupSizeIsNonzero {{OPT}}urDeviceSelectBinaryTest.Success urDeviceGetInfoTest.Success/UR_DEVICE_INFO_DEVICE_ID urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MEMORY_CLOCK_RATE diff --git a/test/conformance/device/device_adapter_opencl.match b/test/conformance/device/device_adapter_opencl.match index 716ebd54fe..39854cbcd3 100644 --- a/test/conformance/device/device_adapter_opencl.match +++ 
b/test/conformance/device/device_adapter_opencl.match @@ -1 +1 @@ -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_HALF_FP_CONFIG +urDeviceCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle diff --git a/test/conformance/device/urDeviceCreateWithNativeHandle.cpp b/test/conformance/device/urDeviceCreateWithNativeHandle.cpp index fbcd5e2f4b..cf31ebe9be 100644 --- a/test/conformance/device/urDeviceCreateWithNativeHandle.cpp +++ b/test/conformance/device/urDeviceCreateWithNativeHandle.cpp @@ -9,8 +9,9 @@ using urDeviceCreateWithNativeHandleTest = uur::urAllDevicesTest; TEST_F(urDeviceCreateWithNativeHandleTest, Success) { for (auto device : devices) { ur_native_handle_t native_handle = nullptr; - if (urDeviceGetNativeHandle(device, &native_handle)) { - continue; + { + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urDeviceGetNativeHandle(device, &native_handle)); } // We cannot assume anything about a native_handle, not even if it's @@ -18,8 +19,8 @@ TEST_F(urDeviceCreateWithNativeHandleTest, Success) { // We can however convert the native_handle back into a unified-runtime handle // and perform some query on it to verify that it works. 
ur_device_handle_t dev = nullptr; - ASSERT_SUCCESS(urDeviceCreateWithNativeHandle(native_handle, platform, - nullptr, &dev)); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urDeviceCreateWithNativeHandle( + native_handle, platform, nullptr, &dev)); ASSERT_NE(dev, nullptr); uint32_t dev_id = 0; @@ -27,3 +28,72 @@ TEST_F(urDeviceCreateWithNativeHandleTest, Success) { sizeof(uint32_t), &dev_id, nullptr)); } } + +TEST_F(urDeviceCreateWithNativeHandleTest, SuccessWithOwnedNativeHandle) { + for (auto device : devices) { + ur_native_handle_t native_handle = nullptr; + { + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urDeviceGetNativeHandle(device, &native_handle)); + } + + ur_device_handle_t dev = nullptr; + ur_device_native_properties_t props{ + UR_STRUCTURE_TYPE_DEVICE_NATIVE_PROPERTIES, nullptr, true}; + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urDeviceCreateWithNativeHandle( + native_handle, platform, &props, &dev)); + ASSERT_NE(dev, nullptr); + + uint32_t ref_count = 0; + ASSERT_SUCCESS(urDeviceGetInfo(dev, UR_DEVICE_INFO_REFERENCE_COUNT, + sizeof(uint32_t), &ref_count, nullptr)); + + ASSERT_EQ(ref_count, 1); + } +} + +TEST_F(urDeviceCreateWithNativeHandleTest, SuccessWithUnOwnedNativeHandle) { + for (auto device : devices) { + ur_native_handle_t native_handle = nullptr; + { + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urDeviceGetNativeHandle(device, &native_handle)); + } + + ur_device_handle_t dev = nullptr; + ur_device_native_properties_t props{ + UR_STRUCTURE_TYPE_DEVICE_NATIVE_PROPERTIES, nullptr, false}; + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urDeviceCreateWithNativeHandle( + native_handle, platform, &props, &dev)); + ASSERT_NE(dev, nullptr); + + uint32_t ref_count = 0; + ASSERT_SUCCESS(urDeviceGetInfo(dev, UR_DEVICE_INFO_REFERENCE_COUNT, + sizeof(uint32_t), &ref_count, nullptr)); + + ASSERT_EQ(ref_count, 2); + } +} + +TEST_F(urDeviceCreateWithNativeHandleTest, InvalidNullHandlePlatform) { + for (auto device : devices) { + ur_native_handle_t native_handle = nullptr; + 
ASSERT_SUCCESS(urDeviceGetNativeHandle(device, &native_handle)); + + ur_device_handle_t dev = nullptr; + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, + urDeviceCreateWithNativeHandle(native_handle, nullptr, + nullptr, &dev)); + } +} + +TEST_F(urDeviceCreateWithNativeHandleTest, InvalidNullPointerDevice) { + for (auto device : devices) { + ur_native_handle_t native_handle = nullptr; + ASSERT_SUCCESS(urDeviceGetNativeHandle(device, &native_handle)); + + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, + urDeviceCreateWithNativeHandle(native_handle, platform, + nullptr, nullptr)); + } +} diff --git a/test/conformance/device/urDeviceGetInfo.cpp b/test/conformance/device/urDeviceGetInfo.cpp index a3fd9afe10..d1a04d8a6a 100644 --- a/test/conformance/device/urDeviceGetInfo.cpp +++ b/test/conformance/device/urDeviceGetInfo.cpp @@ -3,6 +3,7 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include #include #include @@ -242,6 +243,12 @@ INSTANTIATE_TEST_SUITE_P( return ss.str(); }); +struct urDeviceGetInfoSingleTest : uur::urAllDevicesTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(uur::urAllDevicesTest::SetUp()); + } +}; + bool doesReturnArray(ur_device_info_t info_type) { if (info_type == UR_DEVICE_INFO_SUPPORTED_PARTITIONS || info_type == UR_DEVICE_INFO_PARTITION_TYPE) { @@ -284,7 +291,7 @@ TEST_P(urDeviceGetInfoTest, Success) { } } -TEST_P(urDeviceGetInfoTest, InvalidNullHandleDevice) { +TEST_F(urDeviceGetInfoSingleTest, InvalidNullHandleDevice) { ur_device_type_t device_type; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, urDeviceGetInfo(nullptr, UR_DEVICE_INFO_TYPE, @@ -292,7 +299,7 @@ TEST_P(urDeviceGetInfoTest, InvalidNullHandleDevice) { nullptr)); } -TEST_P(urDeviceGetInfoTest, InvalidEnumerationInfoType) { +TEST_F(urDeviceGetInfoSingleTest, InvalidEnumerationInfoType) { for (auto device : devices) { ur_device_type_t device_type; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_ENUMERATION, @@ -302,7 
+309,7 @@ TEST_P(urDeviceGetInfoTest, InvalidEnumerationInfoType) { } } -TEST_P(urDeviceGetInfoTest, InvalidSizePropSize) { +TEST_F(urDeviceGetInfoSingleTest, InvalidSizePropSize) { for (auto device : devices) { ur_device_type_t device_type; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_SIZE, @@ -311,7 +318,7 @@ TEST_P(urDeviceGetInfoTest, InvalidSizePropSize) { } } -TEST_P(urDeviceGetInfoTest, InvalidSizePropSizeSmall) { +TEST_F(urDeviceGetInfoSingleTest, InvalidSizePropSizeSmall) { for (auto device : devices) { ur_device_type_t device_type; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_SIZE, @@ -321,7 +328,7 @@ TEST_P(urDeviceGetInfoTest, InvalidSizePropSizeSmall) { } } -TEST_P(urDeviceGetInfoTest, InvalidNullPointerPropValue) { +TEST_F(urDeviceGetInfoSingleTest, InvalidNullPointerPropValue) { for (auto device : devices) { ur_device_type_t device_type; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, @@ -331,10 +338,30 @@ TEST_P(urDeviceGetInfoTest, InvalidNullPointerPropValue) { } } -TEST_P(urDeviceGetInfoTest, InvalidNullPointerPropSizeRet) { +TEST_F(urDeviceGetInfoSingleTest, InvalidNullPointerPropSizeRet) { for (auto device : devices) { ASSERT_EQ_RESULT( UR_RESULT_ERROR_INVALID_NULL_POINTER, urDeviceGetInfo(device, UR_DEVICE_INFO_TYPE, 0, nullptr, nullptr)); } } + +TEST_F(urDeviceGetInfoSingleTest, MaxWorkGroupSizeIsNonzero) { + for (auto device : devices) { + size_t max_global_size; + + ASSERT_SUCCESS( + urDeviceGetInfo(device, UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE, + sizeof(size_t), &max_global_size, nullptr)); + ASSERT_NE(max_global_size, 0); + + std::array max_work_group_sizes; + ASSERT_SUCCESS(urDeviceGetInfo(device, + UR_DEVICE_INFO_MAX_WORK_GROUPS_3D, + sizeof(max_work_group_sizes), + max_work_group_sizes.data(), nullptr)); + for (size_t i = 0; i < 3; i++) { + ASSERT_NE(max_work_group_sizes[i], 0); + } + } +} diff --git a/test/conformance/device/urDevicePartition.cpp b/test/conformance/device/urDevicePartition.cpp index c4c992e06a..2b2939066d 100644 --- 
a/test/conformance/device/urDevicePartition.cpp +++ b/test/conformance/device/urDevicePartition.cpp @@ -244,6 +244,21 @@ TEST_F(urDevicePartitionTest, InvalidNullPointerProperties) { } } +TEST_F(urDevicePartitionTest, InvalidNullPointerPropertiesArray) { + ur_device_partition_properties_t properties{ + UR_STRUCTURE_TYPE_DEVICE_PARTITION_PROPERTIES, + nullptr, + nullptr, + 0, + }; + for (auto device : devices) { + ur_device_handle_t sub_device = nullptr; + ASSERT_EQ_RESULT( + UR_RESULT_ERROR_INVALID_NULL_POINTER, + urDevicePartition(device, &properties, 1, &sub_device, nullptr)); + } +} + TEST_F(urDevicePartitionTest, SuccessSubSet) { for (auto device : devices) { diff --git a/test/conformance/device_code/CMakeLists.txt b/test/conformance/device_code/CMakeLists.txt index ee16b0eb43..1419604b9d 100644 --- a/test/conformance/device_code/CMakeLists.txt +++ b/test/conformance/device_code/CMakeLists.txt @@ -10,6 +10,16 @@ else() set(AMD_ARCH "${UR_CONFORMANCE_AMD_ARCH}") endif() +if (WIN32) + set(NULDEV NUL) +else() + set(NULDEV /dev/null) +endif() + +cmake_path(GET UR_DPCXX EXTENSION EXE) +cmake_path(REPLACE_FILENAME UR_DPCXX "clang-offload-extract${EXE}" OUTPUT_VARIABLE DEFAULT_EXTRACTOR_NAME) +set(UR_DEVICE_CODE_EXTRACTOR "${DEFAULT_EXTRACTOR_NAME}" CACHE PATH "Path to clang-offload-extract") + if("${AMD_ARCH}" STREQUAL "" AND "${TARGET_TRIPLES}" MATCHES "amd") find_package(RocmAgentEnumerator) if(NOT ROCM_AGENT_ENUMERATOR_FOUND) @@ -52,8 +62,15 @@ macro(add_device_binary SOURCE_FILE) set(EXTRA_ENV DYLD_FALLBACK_LIBRARY_PATH=${UR_SYCL_LIBRARY_DIR}) endif() endif() + + # Convert build flags to a regular CMake list, splitting by unquoted white + # space as necessary. 
+ separate_arguments(DPCXX_BUILD_FLAGS_LIST NATIVE_COMMAND "${UR_DPCXX_BUILD_FLAGS}") + foreach(TRIPLE ${TARGET_TRIPLES}) set(EXE_PATH "${DEVICE_BINARY_DIR}/${KERNEL_NAME}_${TRIPLE}") + set(BIN_PATH "${DEVICE_BINARY_DIR}/${TRIPLE}.bin.0") + if(${TRIPLE} MATCHES "amd") set(AMD_TARGET_BACKEND -Xsycl-target-backend=${TRIPLE}) set(AMD_OFFLOAD_ARCH --offload-arch=${AMD_ARCH}) @@ -76,19 +93,37 @@ macro(add_device_binary SOURCE_FILE) continue() endif() - add_custom_command(OUTPUT ${EXE_PATH} + # HIP doesn't seem to provide the symbol + # `_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E11FixedSgSize` which + # causes a build failure here + if(${TRIPLE} MATCHES "amd" AND ${KERNEL_NAME} MATCHES "subgroup") + continue() + endif() + + add_custom_command(OUTPUT "${BIN_PATH}" COMMAND ${UR_DPCXX} -fsycl -fsycl-targets=${TRIPLE} -fsycl-device-code-split=off ${AMD_TARGET_BACKEND} ${AMD_OFFLOAD_ARCH} ${AMD_NOGPULIB} - ${SOURCE_FILE} -o ${EXE_PATH} + ${DPCXX_BUILD_FLAGS_LIST} ${SOURCE_FILE} -o ${EXE_PATH} + + COMMAND ${CMAKE_COMMAND} -E env ${EXTRA_ENV} ${UR_DEVICE_CODE_EXTRACTOR} --stem="${TRIPLE}.bin" ${EXE_PATH} - COMMAND ${CMAKE_COMMAND} -E env ${EXTRA_ENV} SYCL_DUMP_IMAGES=true - ${EXE_PATH} || exit 0 WORKING_DIRECTORY "${DEVICE_BINARY_DIR}" DEPENDS ${SOURCE_FILE} ) - add_custom_target(generate_${KERNEL_NAME}_${TRIPLE} DEPENDS ${EXE_PATH}) + add_custom_target(generate_${KERNEL_NAME}_${TRIPLE} DEPENDS ${BIN_PATH}) add_dependencies(generate_device_binaries generate_${KERNEL_NAME}_${TRIPLE}) endforeach() + + set(IH_PATH "${DEVICE_BINARY_DIR}/${KERNEL_NAME}.ih") + add_custom_command(OUTPUT "${IH_PATH}" + COMMAND ${UR_DPCXX} -fsycl -fsycl-device-code-split=off + -fsycl-device-only -c -Xclang -fsycl-int-header="${IH_PATH}" + ${DPCXX_BUILD_FLAGS_LIST} ${SOURCE_FILE} -o ${NULDEV} + + WORKING_DIRECTORY "${DEVICE_BINARY_DIR}" + DEPENDS ${SOURCE_FILE} + ) + list(APPEND DEVICE_IHS ${IH_PATH}) list(APPEND DEVICE_CODE_SOURCES ${SOURCE_FILE}) endmacro() @@ -101,6 +136,9 @@ 
add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill_usm.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/foo.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/image_copy.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/mean.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/cpy_and_mult.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/cpy_and_mult_usm.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/multiply.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/spec_constant.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/spec_constant_multiple.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/usm_ll.cpp) @@ -108,13 +146,16 @@ add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/saxpy.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/saxpy_usm.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/indexers_usm.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/build_failure.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fixed_wg_size.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/standard_types.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/subgroup.cpp) set(KERNEL_HEADER ${UR_CONFORMANCE_DEVICE_BINARIES_DIR}/kernel_entry_points.h) add_custom_command(OUTPUT ${KERNEL_HEADER} WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/scripts COMMAND ${Python3_EXECUTABLE} generate_kernel_header.py - --dpcxx_path ${UR_DPCXX} -o ${KERNEL_HEADER} ${DEVICE_CODE_SOURCES} + -o ${KERNEL_HEADER} ${DEVICE_CODE_SOURCES} DEPENDS ${PROJECT_SOURCE_DIR}/scripts/generate_kernel_header.py - ${DEVICE_CODE_SOURCES}) + ${DEVICE_CODE_SOURCES} ${DEVICE_IHS}) add_custom_target(kernel_names_header DEPENDS ${KERNEL_HEADER}) add_dependencies(generate_device_binaries kernel_names_header) diff --git a/test/conformance/device_code/bar.cpp b/test/conformance/device_code/bar.cpp index fecac40c75..58f2696bf8 100644 --- a/test/conformance/device_code/bar.cpp +++ b/test/conformance/device_code/bar.cpp @@ -3,14 +3,14 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 
-#include +#include int main() { - cl::sycl::queue deviceQueue; - cl::sycl::range<1> numOfItems{1}; + sycl::queue deviceQueue; + sycl::range<1> numOfItems{1}; - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto kern = [=](cl::sycl::id<1>) {}; + deviceQueue.submit([&](sycl::handler &cgh) { + auto kern = [=](sycl::id<1>) {}; cgh.parallel_for(numOfItems, kern); }); diff --git a/test/conformance/device_code/cpy_and_mult.cpp b/test/conformance/device_code/cpy_and_mult.cpp new file mode 100644 index 0000000000..a2bdaccf55 --- /dev/null +++ b/test/conformance/device_code/cpy_and_mult.cpp @@ -0,0 +1,29 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +int main() { + size_t array_size = 16; + cl::sycl::queue sycl_queue; + std::vector src(array_size, 1); + std::vector dst(array_size, 1); + auto src_buff = + cl::sycl::buffer(src.data(), cl::sycl::range<1>(array_size)); + auto dst_buff = + cl::sycl::buffer(dst.data(), cl::sycl::range<1>(array_size)); + + sycl_queue.submit([&](cl::sycl::handler &cgh) { + auto src_acc = src_buff.get_access(cgh); + auto dst_acc = dst_buff.get_access(cgh); + cgh.parallel_for( + cl::sycl::range<1>{array_size}, + [src_acc, dst_acc](cl::sycl::item<1> itemId) { + auto id = itemId.get_id(0); + dst_acc[id] = src_acc[id] * 2; + }); + }); + return 0; +} diff --git a/test/conformance/device_code/cpy_and_mult_usm.cpp b/test/conformance/device_code/cpy_and_mult_usm.cpp new file mode 100644 index 0000000000..e253dfe14b --- /dev/null +++ b/test/conformance/device_code/cpy_and_mult_usm.cpp @@ -0,0 +1,22 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +int main() { + size_t array_size = 16; + cl::sycl::queue sycl_queue; + uint32_t *src = cl::sycl::malloc_device(array_size, sycl_queue); + uint32_t *dst = cl::sycl::malloc_device(array_size, sycl_queue); + sycl_queue.submit([&](cl::sycl::handler &cgh) { + cgh.parallel_for( + cl::sycl::range<1>{array_size}, + [src, dst](cl::sycl::item<1> itemId) { + auto id = itemId.get_id(0); + dst[id] = src[id] * 2; + }); + }); + return 0; +} diff --git a/test/conformance/device_code/fill.cpp b/test/conformance/device_code/fill.cpp index 443373edf2..fabcbcf8ec 100644 --- a/test/conformance/device_code/fill.cpp +++ b/test/conformance/device_code/fill.cpp @@ -3,19 +3,18 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { size_t array_size = 16; std::vector A(array_size, 1); uint32_t val = 42; - cl::sycl::queue sycl_queue; - auto A_buff = - cl::sycl::buffer(A.data(), cl::sycl::range<1>(array_size)); - sycl_queue.submit([&](cl::sycl::handler &cgh) { - auto A_acc = A_buff.get_access(cgh); - cgh.parallel_for(cl::sycl::range<1>{array_size}, - [A_acc, val](cl::sycl::item<1> itemId) { + sycl::queue sycl_queue; + auto A_buff = sycl::buffer(A.data(), sycl::range<1>(array_size)); + sycl_queue.submit([&](sycl::handler &cgh) { + auto A_acc = A_buff.get_access(cgh); + cgh.parallel_for(sycl::range<1>{array_size}, + [A_acc, val](sycl::item<1> itemId) { auto id = itemId.get_id(0); A_acc[id] = val; }); diff --git a/test/conformance/device_code/fill_2d.cpp b/test/conformance/device_code/fill_2d.cpp index d4dd6c704f..5fed417ed8 100644 --- a/test/conformance/device_code/fill_2d.cpp +++ b/test/conformance/device_code/fill_2d.cpp @@ -3,24 +3,24 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { size_t nd_range_x = 8; size_t nd_range_y = 8; - auto nd_range = cl::sycl::range<2>(nd_range_x, 
nd_range_y); + auto nd_range = sycl::range<2>(nd_range_x, nd_range_y); std::vector A(nd_range_x * nd_range_y, 1); uint32_t val = 42; - cl::sycl::queue sycl_queue; + sycl::queue sycl_queue; - auto work_range = cl::sycl::nd_range<2>(nd_range, cl::sycl::range<2>(1, 1)); - auto A_buff = cl::sycl::buffer( - A.data(), cl::sycl::range<1>(nd_range_x * nd_range_y)); - sycl_queue.submit([&](cl::sycl::handler &cgh) { - auto A_acc = A_buff.get_access(cgh); + auto work_range = sycl::nd_range<2>(nd_range, sycl::range<2>(1, 1)); + auto A_buff = sycl::buffer( + A.data(), sycl::range<1>(nd_range_x * nd_range_y)); + sycl_queue.submit([&](sycl::handler &cgh) { + auto A_acc = A_buff.get_access(cgh); cgh.parallel_for( - work_range, [A_acc, val](cl::sycl::nd_item<2> item_id) { + work_range, [A_acc, val](sycl::nd_item<2> item_id) { auto id = item_id.get_global_linear_id(); A_acc[id] = val; }); diff --git a/test/conformance/device_code/fill_3d.cpp b/test/conformance/device_code/fill_3d.cpp index a1f172ba6a..fd835222a2 100644 --- a/test/conformance/device_code/fill_3d.cpp +++ b/test/conformance/device_code/fill_3d.cpp @@ -3,26 +3,25 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { size_t nd_range_x = 4; size_t nd_range_y = 4; size_t nd_range_z = 4; - auto nd_range = cl::sycl::range<3>(nd_range_x, nd_range_y, nd_range_z); + auto nd_range = sycl::range<3>(nd_range_x, nd_range_y, nd_range_z); std::vector A(nd_range_x * nd_range_y * nd_range_y, 1); uint32_t val = 42; - cl::sycl::queue sycl_queue; + sycl::queue sycl_queue; - auto work_range = - cl::sycl::nd_range<3>(nd_range, cl::sycl::range<3>(1, 1, 1)); - auto A_buff = cl::sycl::buffer( - A.data(), cl::sycl::range<1>(nd_range_x * nd_range_y)); - sycl_queue.submit([&](cl::sycl::handler &cgh) { - auto A_acc = A_buff.get_access(cgh); + auto work_range = sycl::nd_range<3>(nd_range, sycl::range<3>(1, 1, 1)); + auto A_buff = sycl::buffer( + A.data(), sycl::range<1>(nd_range_x * 
nd_range_y)); + sycl_queue.submit([&](sycl::handler &cgh) { + auto A_acc = A_buff.get_access(cgh); cgh.parallel_for( - work_range, [A_acc, val](cl::sycl::nd_item<3> item_id) { + work_range, [A_acc, val](sycl::nd_item<3> item_id) { auto id = item_id.get_global_linear_id(); A_acc[id] = val; }); diff --git a/test/conformance/device_code/fill_usm.cpp b/test/conformance/device_code/fill_usm.cpp index 92cd255399..d57309cb59 100644 --- a/test/conformance/device_code/fill_usm.cpp +++ b/test/conformance/device_code/fill_usm.cpp @@ -3,17 +3,17 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { size_t array_size = 16; std::vector A(array_size, 1); uint32_t val = 42; - cl::sycl::queue sycl_queue; - uint32_t *data = cl::sycl::malloc_shared(array_size, sycl_queue); - sycl_queue.submit([&](cl::sycl::handler &cgh) { - cgh.parallel_for(cl::sycl::range<1>{array_size}, - [data, val](cl::sycl::item<1> itemId) { + sycl::queue sycl_queue; + uint32_t *data = sycl::malloc_shared(array_size, sycl_queue); + sycl_queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::range<1>{array_size}, + [data, val](sycl::item<1> itemId) { auto id = itemId.get_id(0); data[id] = val; }); diff --git a/test/conformance/device_code/fixed_wg_size.cpp b/test/conformance/device_code/fixed_wg_size.cpp new file mode 100644 index 0000000000..db2e8a9250 --- /dev/null +++ b/test/conformance/device_code/fixed_wg_size.cpp @@ -0,0 +1,27 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +struct KernelFunctor { + void operator()(sycl::nd_item<3>) const {} + void operator()(sycl::item<3>) const {} + + auto get(sycl::ext::oneapi::experimental::properties_tag) { + return sycl::ext::oneapi::experimental::properties{ + sycl::ext::oneapi::experimental::work_group_size<4, 4, 4>}; + } +}; + +int main() { + sycl::queue myQueue; + myQueue.submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::range<3>(8, 8, 8), + KernelFunctor{}); + }); + + myQueue.wait(); + return 0; +} diff --git a/test/conformance/device_code/foo.cpp b/test/conformance/device_code/foo.cpp index dc108b9606..20ad92be4b 100644 --- a/test/conformance/device_code/foo.cpp +++ b/test/conformance/device_code/foo.cpp @@ -3,14 +3,14 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { - cl::sycl::queue deviceQueue; - cl::sycl::range<1> numOfItems{1}; + sycl::queue deviceQueue; + sycl::range<1> numOfItems{1}; - deviceQueue.submit([&](cl::sycl::handler &cgh) { - auto kern = [=](cl::sycl::id<1>) {}; + deviceQueue.submit([&](sycl::handler &cgh) { + auto kern = [=](sycl::id<1>) {}; cgh.parallel_for(numOfItems, kern); }); diff --git a/test/conformance/device_code/image_copy.cpp b/test/conformance/device_code/image_copy.cpp index a64b601213..d04398c2fe 100644 --- a/test/conformance/device_code/image_copy.cpp +++ b/test/conformance/device_code/image_copy.cpp @@ -3,44 +3,40 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { - cl::sycl::queue sycl_queue; + sycl::queue sycl_queue; const int height = 8; const int width = 8; - auto image_range = cl::sycl::range<2>(height, width); + auto image_range = sycl::range<2>(height, width); const int channels = 4; std::vector in_data(height * width * channels, 0.5f); std::vector out_data(height * width * channels, 0); - cl::sycl::image<2> image_in( - 
in_data.data(), cl::sycl::image_channel_order::rgba, - cl::sycl::image_channel_type::fp32, image_range); - cl::sycl::image<2> image_out( - out_data.data(), cl::sycl::image_channel_order::rgba, - cl::sycl::image_channel_type::fp32, image_range); + sycl::image<2> image_in(in_data.data(), sycl::image_channel_order::rgba, + sycl::image_channel_type::fp32, image_range); + sycl::image<2> image_out(out_data.data(), sycl::image_channel_order::rgba, + sycl::image_channel_type::fp32, image_range); - auto work_range = - cl::sycl::nd_range<2>(image_range, cl::sycl::range<2>(1, 1)); - sycl_queue.submit([&](cl::sycl::handler &cgh) { - cl::sycl::accessor + auto work_range = sycl::nd_range<2>(image_range, sycl::range<2>(1, 1)); + sycl_queue.submit([&](sycl::handler &cgh) { + sycl::accessor in_acc(image_in, cgh); - cl::sycl::accessor + sycl::accessor out_acc(image_out, cgh); - cl::sycl::sampler smpl( - cl::sycl::coordinate_normalization_mode::unnormalized, - cl::sycl::addressing_mode::clamp, - cl::sycl::filtering_mode::nearest); + sycl::sampler smpl(sycl::coordinate_normalization_mode::unnormalized, + sycl::addressing_mode::clamp, + sycl::filtering_mode::nearest); cgh.parallel_for( - work_range, [=](cl::sycl::nd_item<2> item_id) { - auto coords = cl::sycl::int2(item_id.get_global_id(0), - item_id.get_global_id(1)); + work_range, [=](sycl::nd_item<2> item_id) { + auto coords = sycl::int2(item_id.get_global_id(0), + item_id.get_global_id(1)); out_acc.write(coords, in_acc.read(coords, smpl)); }); }); diff --git a/test/conformance/device_code/indexers_usm.cpp b/test/conformance/device_code/indexers_usm.cpp index 76b0751730..e055fa47cc 100644 --- a/test/conformance/device_code/indexers_usm.cpp +++ b/test/conformance/device_code/indexers_usm.cpp @@ -3,25 +3,24 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { - const cl::sycl::range<3> global_range(8, 8, 8); - const cl::sycl::range<3> local_range(2, 2, 2); - const 
cl::sycl::id<3> global_offset(4, 4, 4); - const cl::sycl::nd_range<3> nd_range(global_range, local_range, - global_offset); + const sycl::range<3> global_range(8, 8, 8); + const sycl::range<3> local_range(2, 2, 2); + const sycl::id<3> global_offset(4, 4, 4); + const sycl::nd_range<3> nd_range(global_range, local_range, global_offset); - cl::sycl::queue sycl_queue; + sycl::queue sycl_queue; const size_t elements_per_work_item = 6; - int *ptr = cl::sycl::malloc_shared(global_range[0] * global_range[1] * - global_range[2] * - elements_per_work_item, - sycl_queue); + int *ptr = + sycl::malloc_shared(global_range[0] * global_range[1] * + global_range[2] * elements_per_work_item, + sycl_queue); - sycl_queue.submit([&](cl::sycl::handler &cgh) { + sycl_queue.submit([&](sycl::handler &cgh) { cgh.parallel_for( - nd_range, [ptr](cl::sycl::nd_item<3> index) { + nd_range, [ptr](sycl::nd_item<3> index) { int *wi_ptr = ptr + index.get_global_linear_id() * elements_per_work_item; diff --git a/test/conformance/device_code/mean.cpp b/test/conformance/device_code/mean.cpp index 61623e0914..6d5a571374 100644 --- a/test/conformance/device_code/mean.cpp +++ b/test/conformance/device_code/mean.cpp @@ -3,34 +3,34 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { const int array_size = 16; const int wg_size = 4; std::vector in(array_size * wg_size, 1); std::vector out(array_size, 0); - cl::sycl::queue sycl_queue; - auto in_buff = cl::sycl::buffer( - in.data(), cl::sycl::range<1>(array_size * wg_size)); + sycl::queue sycl_queue; + auto in_buff = + sycl::buffer(in.data(), sycl::range<1>(array_size * wg_size)); auto out_buff = - cl::sycl::buffer(out.data(), cl::sycl::range<1>(array_size)); - sycl_queue.submit([&](cl::sycl::handler &cgh) { + sycl::buffer(out.data(), sycl::range<1>(array_size)); + sycl_queue.submit([&](sycl::handler &cgh) { sycl::local_accessor local_mem(wg_size, cgh); - auto in_acc = in_buff.get_access(cgh); - 
auto out_acc = out_buff.get_access(cgh); + auto in_acc = in_buff.get_access(cgh); + auto out_acc = out_buff.get_access(cgh); - cl::sycl::range<1> num_groups{array_size}; - cl::sycl::range<1> group_size{wg_size}; + sycl::range<1> num_groups{array_size}; + sycl::range<1> group_size{wg_size}; cgh.parallel_for_work_group( - num_groups, group_size, [=](cl::sycl::group<1> group) { + num_groups, group_size, [=](sycl::group<1> group) { auto group_id = group.get_group_id(); group.parallel_for_work_item([&](sycl::h_item<1> item) { auto local_id = item.get_local_id(0); auto in_index = (group_id * wg_size) + local_id; local_mem[local_id] = in_acc[in_index]; }); - cl::sycl::group_barrier(group); + sycl::group_barrier(group); uint32_t total = 0; for (int i = 0; i < wg_size; i++) { total += local_mem[i]; diff --git a/test/conformance/device_code/multiply.cpp b/test/conformance/device_code/multiply.cpp new file mode 100644 index 0000000000..070cef18fd --- /dev/null +++ b/test/conformance/device_code/multiply.cpp @@ -0,0 +1,20 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +int main() { + + const size_t inputSize = 1; + sycl::queue sycl_queue; + uint32_t *inputArray = sycl::malloc_shared(inputSize, sycl_queue); + + sycl_queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::range<1>(inputSize), + [=](sycl::id<1> itemID) { inputArray[itemID] *= 2; }); + }); + return 0; +} diff --git a/test/conformance/device_code/saxpy.cpp b/test/conformance/device_code/saxpy.cpp index 593e8e2435..ac113884b9 100644 --- a/test/conformance/device_code/saxpy.cpp +++ b/test/conformance/device_code/saxpy.cpp @@ -3,7 +3,7 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { size_t array_size = 16; @@ -11,20 +11,17 @@ int main() { std::vector Y(array_size, 2); std::vector Z(array_size, 0); uint32_t A = 42; - auto x_buff = - cl::sycl::buffer(X.data(), cl::sycl::range<1>(array_size)); - auto y_buff = - cl::sycl::buffer(Y.data(), cl::sycl::range<1>(array_size)); - auto z_buff = - cl::sycl::buffer(Z.data(), cl::sycl::range<1>(array_size)); + auto x_buff = sycl::buffer(X.data(), sycl::range<1>(array_size)); + auto y_buff = sycl::buffer(Y.data(), sycl::range<1>(array_size)); + auto z_buff = sycl::buffer(Z.data(), sycl::range<1>(array_size)); - cl::sycl::queue sycl_queue; - sycl_queue.submit([&](cl::sycl::handler &cgh) { - auto x_acc = x_buff.get_access(cgh); - auto y_acc = y_buff.get_access(cgh); - auto z_acc = z_buff.get_access(cgh); - cgh.parallel_for(cl::sycl::range<1>{array_size}, - [=](cl::sycl::item<1> itemId) { + sycl::queue sycl_queue; + sycl_queue.submit([&](sycl::handler &cgh) { + auto x_acc = x_buff.get_access(cgh); + auto y_acc = y_buff.get_access(cgh); + auto z_acc = z_buff.get_access(cgh); + cgh.parallel_for(sycl::range<1>{array_size}, + [=](sycl::item<1> itemId) { auto i = itemId.get_id(0); z_acc[i] = A * x_acc[i] + y_acc[i]; }); diff --git a/test/conformance/device_code/saxpy_usm.cpp 
b/test/conformance/device_code/saxpy_usm.cpp index 8772a7e25d..774686ab21 100644 --- a/test/conformance/device_code/saxpy_usm.cpp +++ b/test/conformance/device_code/saxpy_usm.cpp @@ -3,20 +3,20 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { size_t array_size = 16; - cl::sycl::queue sycl_queue; - uint32_t *X = cl::sycl::malloc_shared(array_size, sycl_queue); - uint32_t *Y = cl::sycl::malloc_shared(array_size, sycl_queue); - uint32_t *Z = cl::sycl::malloc_shared(array_size, sycl_queue); + sycl::queue sycl_queue; + uint32_t *X = sycl::malloc_shared(array_size, sycl_queue); + uint32_t *Y = sycl::malloc_shared(array_size, sycl_queue); + uint32_t *Z = sycl::malloc_shared(array_size, sycl_queue); uint32_t A = 42; - sycl_queue.submit([&](cl::sycl::handler &cgh) { - cgh.parallel_for(cl::sycl::range<1>{array_size}, - [=](cl::sycl::item<1> itemId) { + sycl_queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::range<1>{array_size}, + [=](sycl::item<1> itemId) { auto i = itemId.get_id(0); Z[i] = A * X[i] + Y[i]; }); diff --git a/test/conformance/device_code/standard_types.cpp b/test/conformance/device_code/standard_types.cpp new file mode 100644 index 0000000000..d3cf89aa66 --- /dev/null +++ b/test/conformance/device_code/standard_types.cpp @@ -0,0 +1,42 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include +#include + +struct Struct { + uint32_t a; + uint32_t b; +}; + +int main() { + sycl::queue deviceQueue; + sycl::range<1> numOfItems{1}; + + uint32_t output = 0; + + volatile bool test_bool = true; + volatile uint8_t test_u8 = 2; + volatile uint32_t test_u32 = 3; + volatile uint64_t test_u64 = 5; + Struct test_struct{7, 5}; + volatile float test_float = 11; + + { + sycl::buffer output_buff(&output, sycl::range(1)); + deviceQueue.submit([&](sycl::handler &cgh) { + sycl::accessor acc{output_buff, cgh, sycl::read_write}; + auto kern = [=](sycl::id<1> id) { + acc[id] = 100 + (test_bool ? 1 : 0) * test_u8 * test_u32 * + test_u64 * test_struct.a * + static_cast(test_float); + }; + cgh.parallel_for(numOfItems, kern); + }); + deviceQueue.wait(); + } + + return output == 2410; +} diff --git a/test/conformance/device_code/subgroup.cpp b/test/conformance/device_code/subgroup.cpp new file mode 100644 index 0000000000..fa4228f846 --- /dev/null +++ b/test/conformance/device_code/subgroup.cpp @@ -0,0 +1,35 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +struct KernelFunctor { + sycl::accessor Acc; + + KernelFunctor(sycl::accessor Acc) + : Acc(Acc) {} + + void operator()(sycl::nd_item<1> NdItem) const { + auto SG = NdItem.get_sub_group(); + if (NdItem.get_global_linear_id() == 0) { + Acc[0] = SG.get_local_linear_range(); + } + } +}; + +int main() { + sycl::queue myQueue; + size_t output = 0; + sycl::buffer output_buff(&output, sycl::range(1)); + + myQueue.submit([&](sycl::handler &cgh) { + sycl::accessor acc{output_buff, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(sycl::nd_range<1>(8, 2), + KernelFunctor{acc}); + }); + + myQueue.wait(); + return 0; +} diff --git a/test/conformance/enqueue/CMakeLists.txt b/test/conformance/enqueue/CMakeLists.txt index 532cab1b85..7cc68203a0 100644 --- a/test/conformance/enqueue/CMakeLists.txt +++ b/test/conformance/enqueue/CMakeLists.txt @@ -29,4 +29,5 @@ add_conformance_test_with_kernels_environment(enqueue urEnqueueUSMPrefetch.cpp urEnqueueReadHostPipe.cpp urEnqueueWriteHostPipe.cpp + urEnqueueTimestampRecording.cpp ) diff --git a/test/conformance/enqueue/enqueue_adapter_cuda.match b/test/conformance/enqueue/enqueue_adapter_cuda.match index 9b57269f3d..8d015c3dc7 100644 --- a/test/conformance/enqueue/enqueue_adapter_cuda.match +++ b/test/conformance/enqueue/enqueue_adapter_cuda.match @@ -1,3 +1,6 @@ +urEnqueueKernelLaunchTest.InvalidKernelArgs/NVIDIA_CUDA_BACKEND___{{.*}}_ +urEnqueueKernelLaunchKernelWgSizeTest.NonMatchingLocalSize/NVIDIA_CUDA_BACKEND___{{.*}}_ +urEnqueueKernelLaunchKernelSubGroupTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}_ {{OPT}}urEnqueueKernelLaunchWithVirtualMemory.Success/NVIDIA_CUDA_BACKEND___{{.*}}_ {{OPT}}urEnqueueMemBufferCopyRectTest.InvalidSize/NVIDIA_CUDA_BACKEND___{{.*}}_ {{OPT}}urEnqueueMemBufferFillTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}___size__256__patternSize__256 diff --git a/test/conformance/enqueue/enqueue_adapter_hip.match 
b/test/conformance/enqueue/enqueue_adapter_hip.match index 8c30bb6a1f..f97188d742 100644 --- a/test/conformance/enqueue/enqueue_adapter_hip.match +++ b/test/conformance/enqueue/enqueue_adapter_hip.match @@ -1,10 +1,11 @@ -{{OPT}}urEnqueueDeviceGetGlobalVariableReadTest.Success/AMD_HIP_BACKEND___{{.*}}_ +{{OPT}}urEnqueueKernelLaunchTest.InvalidKernelArgs/AMD_HIP_BACKEND___{{.*}}_ +{{OPT}}urEnqueueKernelLaunchKernelWgSizeTest.NonMatchingLocalSize/AMD_HIP_BACKEND___{{.*}}_ +{{OPT}}urEnqueueKernelLaunchKernelSubGroupTest.Success/AMD_HIP_BACKEND___{{.*}}_ {{OPT}}urEnqueueKernelLaunchUSMLinkedList.Success/AMD_HIP_BACKEND___{{.*}}___UsePoolEnabled {{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___copy_row_2D {{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___copy_3d_2d {{OPT}}urEnqueueMemBufferFillTest.Success/AMD_HIP_BACKEND___{{.*}}___size__256__patternSize__256 {{OPT}}urEnqueueMemBufferFillTest.Success/AMD_HIP_BACKEND___{{.*}}___size__1024__patternSize__256 -{{OPT}}urEnqueueMemBufferMapTest.SuccessMultiMaps/AMD_HIP_BACKEND___{{.*}}_ {{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___write_row_2D {{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___write_3d_2d {{OPT}}urEnqueueMemImageCopyTest.Success/AMD_HIP_BACKEND___{{.*}}___1D @@ -33,58 +34,12 @@ {{OPT}}urEnqueueMemImageReadTest.Success3D/AMD_HIP_BACKEND___{{.*}}_ {{OPT}}urEnqueueMemImageWriteTest.Success1D/AMD_HIP_BACKEND___{{.*}}_ {{OPT}}urEnqueueMemImageWriteTest.Success3D/AMD_HIP_BACKEND___{{.*}}_ -{{OPT}}urEnqueueUSMFillTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___size__1__patternSize__1 -{{OPT}}urEnqueueUSMFillTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___size__256__patternSize__256 -{{OPT}}urEnqueueUSMFillTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___size__1024__patternSize__256 -{{OPT}}urEnqueueUSMFillTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___size__256__patternSize__4 
-{{OPT}}urEnqueueUSMFillTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___size__256__patternSize__8 -{{OPT}}urEnqueueUSMFillTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___size__256__patternSize__16 -{{OPT}}urEnqueueUSMFillTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___size__256__patternSize__32 {{OPT}}urEnqueueUSMFill2DNegativeTest.OutOfBounds/AMD_HIP_BACKEND___{{.*}}_ {{OPT}}urEnqueueUSMAdviseWithParamTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_USM_ADVICE_FLAG_DEFAULT {{OPT}}urEnqueueUSMAdviseTest.MultipleParamsSuccess/AMD_HIP_BACKEND___{{.*}}_ {{OPT}}urEnqueueUSMAdviseTest.NonCoherentDeviceMemorySuccessOrWarning/AMD_HIP_BACKEND___{{.*}}_ -{{OPT}}urEnqueueUSMMemcpyTest.Blocking/AMD_HIP_BACKEND___{{.*}}_ -{{OPT}}urEnqueueUSMMemcpyTest.BlockingWithEvent/AMD_HIP_BACKEND___{{.*}}_ -{{OPT}}urEnqueueUSMMemcpyTest.NonBlocking/AMD_HIP_BACKEND___{{.*}}_ -{{OPT}}urEnqueueUSMMemcpyTest.WaitForDependencies/AMD_HIP_BACKEND___{{.*}}_ -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1__width__1__height__1__src__Device__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1__width__1__height__1__src__Host__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1__width__1__height__1__src__Shared__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1024__width__256__height__1__src__Device__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1024__width__256__height__1__src__Host__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1024__width__256__height__1__src__Shared__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1024__width__1024__height__1__src__Device__dst__Device 
-{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1024__width__1024__height__1__src__Host__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1024__width__1024__height__1__src__Shared__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1024__width__256__height__256__src__Device__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1024__width__256__height__256__src__Host__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1024__width__256__height__256__src__Shared__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__234__width__233__height__23__src__Device__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__234__width__233__height__23__src__Host__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__234__width__233__height__23__src__Shared__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__234__width__233__height__1__src__Device__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__234__width__233__height__1__src__Host__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__234__width__233__height__1__src__Shared__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1__width__1__height__1__src__Device__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1__width__1__height__1__src__Host__dst__Device 
-{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1__width__1__height__1__src__Shared__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1024__width__256__height__1__src__Device__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1024__width__256__height__1__src__Host__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1024__width__256__height__1__src__Shared__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1024__width__1024__height__1__src__Device__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1024__width__1024__height__1__src__Host__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1024__width__1024__height__1__src__Shared__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1024__width__256__height__256__src__Device__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1024__width__256__height__256__src__Host__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__1024__width__256__height__256__src__Shared__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__234__width__233__height__23__src__Device__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__234__width__233__height__23__src__Host__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__234__width__233__height__23__src__Shared__dst__Device 
-{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__234__width__233__height__1__src__Device__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__234__width__233__height__1__src__Host__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/AMD_HIP_BACKEND___{{.*}}___pitch__234__width__233__height__1__src__Shared__dst__Device -{{OPT}}urEnqueueUSMMemcpy2DNegativeTest.InvalidSize/AMD_HIP_BACKEND___{{.*}}___pitch__1__width__1__height__1__src__Device__dst__Device +{{OPT}}urEnqueueUSMMemcpy2DNegativeTest.InvalidSize/AMD_HIP_BACKEND___{{.*}}___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE {{OPT}}urEnqueueUSMPrefetchWithParamTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_USM_MIGRATION_FLAG_DEFAULT {{OPT}}urEnqueueUSMPrefetchWithParamTest.CheckWaitEvent/AMD_HIP_BACKEND___{{.*}}___UR_USM_MIGRATION_FLAG_DEFAULT -{{OPT}}{{Segmentation fault|Aborted}} +{{OPT}}urEnqueueTimestampRecordingExpTest.Success/AMD_HIP_BACKEND___{{.*}} +{{OPT}}urEnqueueTimestampRecordingExpTest.SuccessBlocking/AMD_HIP_BACKEND___{{.*}} diff --git a/test/conformance/enqueue/enqueue_adapter_native_cpu.match b/test/conformance/enqueue/enqueue_adapter_native_cpu.match index 43a114bf21..155a400e89 100644 --- a/test/conformance/enqueue/enqueue_adapter_native_cpu.match +++ b/test/conformance/enqueue/enqueue_adapter_native_cpu.match @@ -22,6 +22,13 @@ {{OPT}}urEnqueueKernelLaunchTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU_ {{OPT}}urEnqueueKernelLaunchTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_ {{OPT}}urEnqueueKernelLaunchTest.InvalidWorkDimension/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueKernelLaunchTest.InvalidWorkGroupSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueKernelLaunchTest.InvalidKernelArgs/SYCL_NATIVE_CPU___SYCL_Native_CPU_ 
+{{OPT}}urEnqueueKernelLaunchKernelWgSizeTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueKernelLaunchKernelWgSizeTest.SuccessWithExplicitLocalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueKernelLaunchKernelWgSizeTest.NonMatchingLocalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueKernelLaunchKernelSubGroupTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueKernelLaunchKernelStandardTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ {{OPT}}urEnqueueKernelLaunchTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D_1 {{OPT}}urEnqueueKernelLaunchTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D_31 {{OPT}}urEnqueueKernelLaunchTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D_1027 diff --git a/test/conformance/enqueue/enqueue_adapter_opencl.match b/test/conformance/enqueue/enqueue_adapter_opencl.match index 4aa265c897..0e751b8a25 100644 --- a/test/conformance/enqueue/enqueue_adapter_opencl.match +++ b/test/conformance/enqueue/enqueue_adapter_opencl.match @@ -32,4 +32,6 @@ {{OPT}}urEnqueueUSMMemcpy2DNegativeTest.InvalidSize/Intel_R__OpenCL___{{.*}} {{OPT}}urEnqueueUSMMemcpy2DNegativeTest.InvalidEventWaitList/Intel_R__OpenCL___{{.*}} {{OPT}}urEnqueueUSMPrefetchTest.InvalidSizeTooLarge/Intel_R__OpenCL___{{.*}} +urEnqueueKernelLaunchKernelWgSizeTest.Success/Intel_R__OpenCL___{{.*}}_ +urEnqueueKernelLaunchKernelSubGroupTest.Success/Intel_R__OpenCL___{{.*}}_ {{OPT}}urEnqueueKernelLaunchUSMLinkedList.Success/Intel_R__OpenCL___{{.*}}_UsePoolEnabled diff --git a/test/conformance/enqueue/urEnqueueKernelLaunch.cpp b/test/conformance/enqueue/urEnqueueKernelLaunch.cpp index 9217457270..88b65348dc 100644 --- a/test/conformance/enqueue/urEnqueueKernelLaunch.cpp +++ b/test/conformance/enqueue/urEnqueueKernelLaunch.cpp @@ -18,6 +18,45 @@ struct urEnqueueKernelLaunchTest : uur::urKernelExecutionTest { }; UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunchTest); +struct urEnqueueKernelLaunchKernelWgSizeTest : 
uur::urKernelExecutionTest { + void SetUp() override { + program_name = "fixed_wg_size"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + } + + std::array global_size{32, 32, 32}; + std::array global_offset{0, 0, 0}; + // This must match the size in fixed_wg_size.cpp + std::array wg_size{4, 4, 4}; + size_t n_dimensions = 3; +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunchKernelWgSizeTest); + +// Note: Due to an issue with HIP, the subgroup test is not generated +struct urEnqueueKernelLaunchKernelSubGroupTest : uur::urKernelExecutionTest { + void SetUp() override { + program_name = "subgroup"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + } + + std::array global_size{32, 32, 32}; + std::array global_offset{0, 0, 0}; + size_t n_dimensions = 3; +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunchKernelSubGroupTest); + +struct urEnqueueKernelLaunchKernelStandardTest : uur::urKernelExecutionTest { + void SetUp() override { + program_name = "standard_types"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + } + + size_t n_dimensions = 1; + size_t global_size = 1; + size_t offset = 0; +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunchKernelStandardTest); + TEST_P(urEnqueueKernelLaunchTest, Success) { ur_mem_handle_t buffer = nullptr; AddBuffer1DArg(sizeof(val) * global_size, &buffer); @@ -77,6 +116,94 @@ TEST_P(urEnqueueKernelLaunchTest, InvalidWorkDimension) { UR_RESULT_ERROR_INVALID_WORK_DIMENSION); } +TEST_P(urEnqueueKernelLaunchTest, InvalidWorkGroupSize) { + // As far as I can tell, there's no way to check if a kernel or device + // requires uniform work group sizes or not, so this may succeed or report + // an error + size_t local_size = 31; + ur_mem_handle_t buffer = nullptr; + AddBuffer1DArg(sizeof(val) * global_size, &buffer); + AddPodArg(val); + auto result = + urEnqueueKernelLaunch(queue, kernel, n_dimensions, &global_offset, + &global_size, &local_size, 0, nullptr, nullptr); 
+ ASSERT_TRUE(result == UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE || + result == UR_RESULT_SUCCESS); +} + +TEST_P(urEnqueueKernelLaunchTest, InvalidKernelArgs) { + ur_platform_backend_t backend; + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(ur_platform_backend_t), &backend, + nullptr)); + + if (backend == UR_PLATFORM_BACKEND_CUDA || + backend == UR_PLATFORM_BACKEND_HIP) { + GTEST_FAIL() << "AMD and Nvidia can't check kernel arguments."; + } + + // Enqueue kernel without setting any args + ASSERT_EQ_RESULT(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + &global_offset, &global_size, + nullptr, 0, nullptr, nullptr), + UR_RESULT_ERROR_INVALID_KERNEL_ARGS); +} + +TEST_P(urEnqueueKernelLaunchKernelWgSizeTest, Success) { + ASSERT_SUCCESS(urEnqueueKernelLaunch( + queue, kernel, n_dimensions, global_offset.data(), global_size.data(), + nullptr, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); +} + +TEST_P(urEnqueueKernelLaunchKernelWgSizeTest, SuccessWithExplicitLocalSize) { + ASSERT_SUCCESS(urEnqueueKernelLaunch( + queue, kernel, n_dimensions, global_offset.data(), global_size.data(), + wg_size.data(), 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); +} + +TEST_P(urEnqueueKernelLaunchKernelWgSizeTest, NonMatchingLocalSize) { + std::array wrong_wg_size{8, 8, 8}; + ASSERT_EQ_RESULT( + urEnqueueKernelLaunch(queue, kernel, n_dimensions, global_offset.data(), + global_size.data(), wrong_wg_size.data(), 0, + nullptr, nullptr), + UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); +} + +TEST_P(urEnqueueKernelLaunchKernelSubGroupTest, Success) { + ur_mem_handle_t buffer = nullptr; + AddBuffer1DArg(sizeof(size_t), &buffer); + ASSERT_SUCCESS(urEnqueueKernelLaunch( + queue, kernel, n_dimensions, global_offset.data(), global_size.data(), + nullptr, 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + ValidateBuffer(buffer, sizeof(size_t), 8); +} + +struct Pair { + uint32_t a; + uint32_t b; +}; 
+TEST_P(urEnqueueKernelLaunchKernelStandardTest, Success) { + uint32_t expected_result = 2410; + ur_mem_handle_t output = nullptr; + AddBuffer1DArg(sizeof(uint32_t), &output); + AddPodArg(true); + AddPodArg(2); + AddPodArg(3); + AddPodArg(5); + AddPodArg({7, 5}); + AddPodArg(11.0); + + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, &offset, + &global_size, nullptr, 0, nullptr, + nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + ValidateBuffer(output, sizeof(uint32_t), expected_result); +} + struct testParametersEnqueueKernel { size_t X, Y, Z; size_t Dims; diff --git a/test/conformance/enqueue/urEnqueueTimestampRecording.cpp b/test/conformance/enqueue/urEnqueueTimestampRecording.cpp new file mode 100644 index 0000000000..5fc8ee5547 --- /dev/null +++ b/test/conformance/enqueue/urEnqueueTimestampRecording.cpp @@ -0,0 +1,94 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +struct urEnqueueTimestampRecordingExpTest : uur::urQueueTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::SetUp()); + bool timestamp_recording_support = false; + ASSERT_SUCCESS(uur::GetTimestampRecordingSupport( + device, timestamp_recording_support)); + if (!timestamp_recording_support) { + GTEST_SKIP() << "Timestamp recording is not supported"; + } + } + + void TearDown() override { urQueueTest::TearDown(); } +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueTimestampRecordingExpTest); + +void common_check(ur_event_handle_t event) { + // All successful runs should return a non-zero profiling results. 
+ uint64_t queuedTime = 0, submitTime = 0, startTime = 0, endTime = 0; + ASSERT_SUCCESS( + urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_QUEUED, + sizeof(uint64_t), &queuedTime, nullptr)); + ASSERT_SUCCESS( + urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_SUBMIT, + sizeof(uint64_t), &submitTime, nullptr)); + ASSERT_SUCCESS( + urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_START, + sizeof(uint64_t), &startTime, nullptr)); + ASSERT_SUCCESS(urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_END, + sizeof(uint64_t), &endTime, + nullptr)); + ASSERT_TRUE(queuedTime > 0); + ASSERT_TRUE(submitTime > 0); + ASSERT_TRUE(startTime > 0); + ASSERT_TRUE(endTime > 0); + ASSERT_TRUE(queuedTime == submitTime); + ASSERT_TRUE(startTime == endTime); + ASSERT_TRUE(endTime >= submitTime); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, Success) { + ur_event_handle_t event = nullptr; + ASSERT_SUCCESS( + urEnqueueTimestampRecordingExp(queue, false, 0, nullptr, &event)); + ASSERT_SUCCESS(urQueueFinish(queue)); + common_check(event); + ASSERT_SUCCESS(urEventRelease(event)); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, SuccessBlocking) { + ur_event_handle_t event = nullptr; + ASSERT_SUCCESS( + urEnqueueTimestampRecordingExp(queue, true, 0, nullptr, &event)); + common_check(event); + ASSERT_SUCCESS(urEventRelease(event)); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, InvalidNullHandleQueue) { + ur_event_handle_t event = nullptr; + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(nullptr, false, 0, nullptr, &event), + UR_RESULT_ERROR_INVALID_NULL_HANDLE); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, InvalidNullPointerEvent) { + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(queue, false, 0, nullptr, nullptr), + UR_RESULT_ERROR_INVALID_NULL_POINTER); +} + +TEST_P(urEnqueueTimestampRecordingExpTest, InvalidNullPtrEventWaitList) { + ur_event_handle_t event = nullptr; + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(queue, true, 1, nullptr, 
&event), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + + ur_event_handle_t validEvent; + ASSERT_SUCCESS(urEnqueueEventsWait(queue, 0, nullptr, &validEvent)); + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(queue, true, 0, &validEvent, &event), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); + ASSERT_SUCCESS(urEventRelease(validEvent)); + + ur_event_handle_t invalidEvent = nullptr; + ASSERT_EQ_RESULT( + urEnqueueTimestampRecordingExp(queue, true, 0, &invalidEvent, &event), + UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); +} diff --git a/test/conformance/event/event_adapter_native_cpu.match b/test/conformance/event/event_adapter_native_cpu.match index d3bc8ef822..f5769d271f 100644 --- a/test/conformance/event/event_adapter_native_cpu.match +++ b/test/conformance/event/event_adapter_native_cpu.match @@ -21,6 +21,7 @@ urEventRetainTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urEventReleaseTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urEventGetNativeHandleTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urEventGetNativeHandleTest.InvalidNullPointerNativeEvent/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urEventCreateWithNativeHandleTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urEventSetCallbackTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urEventSetCallbackTest.ValidateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urEventSetCallbackTest.AllStates/SYCL_NATIVE_CPU___SYCL_Native_CPU_ diff --git a/test/conformance/event/urEventCreateWithNativeHandle.cpp b/test/conformance/event/urEventCreateWithNativeHandle.cpp index 076f6b312f..47daa464ed 100644 --- a/test/conformance/event/urEventCreateWithNativeHandle.cpp +++ b/test/conformance/event/urEventCreateWithNativeHandle.cpp @@ -11,8 +11,9 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEventCreateWithNativeHandleTest); TEST_P(urEventCreateWithNativeHandleTest, Success) { ur_native_handle_t native_event = nullptr; - if (urEventGetNativeHandle(event, &native_event)) { - GTEST_SKIP(); + { + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + 
urEventGetNativeHandle(event, &native_event)); } // We cannot assume anything about a native_handle, not even if it's @@ -20,8 +21,8 @@ TEST_P(urEventCreateWithNativeHandleTest, Success) { // We can however convert the native_handle back into a unified-runtime handle // and perform some query on it to verify that it works. uur::raii::Event evt = nullptr; - ASSERT_SUCCESS(urEventCreateWithNativeHandle(native_event, context, nullptr, - evt.ptr())); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urEventCreateWithNativeHandle( + native_event, context, nullptr, evt.ptr())); ASSERT_NE(evt, nullptr); ur_execution_info_t exec_info; diff --git a/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp b/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp index 2663e6824a..80561cc9e1 100644 --- a/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp +++ b/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp @@ -19,18 +19,32 @@ struct BufferFillCommandTest &buffer)); // First argument is buffer to fill - ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, 0, nullptr, buffer)); - - // second arg is hidden accessor - struct { - size_t offsets[1] = {0}; - } accessor; - ASSERT_SUCCESS(urKernelSetArgValue(kernel, 1, sizeof(accessor), nullptr, - &accessor)); - - // Second argument is scalar to fill with. + unsigned current_arg_index = 0; ASSERT_SUCCESS( - urKernelSetArgValue(kernel, 2, sizeof(val), nullptr, &val)); + urKernelSetArgMemObj(kernel, current_arg_index++, nullptr, buffer)); + + // Add accessor arguments depending on backend. + // HIP has 3 offset parameters and other backends only have 1. 
+ if (backend == UR_PLATFORM_BACKEND_HIP) { + size_t val = 0; + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index++, + sizeof(size_t), nullptr, &val)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index++, + sizeof(size_t), nullptr, &val)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index++, + sizeof(size_t), nullptr, &val)); + } else { + struct { + size_t offsets[1] = {0}; + } accessor; + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index++, + sizeof(accessor), nullptr, + &accessor)); + } + + // Second user defined argument is scalar to fill with. + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index++, + sizeof(val), nullptr, &val)); // Append kernel command to command-buffer and close command-buffer ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match index 76e0b99dc0..1447b0b31b 100644 --- a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match @@ -1,8 +1,3 @@ -{{OPT}}BufferFillCommandTest.UpdateParameters/AMD_HIP_BACKEND{{.*}} -{{OPT}}BufferFillCommandTest.UpdateGlobalSize/AMD_HIP_BACKEND{{.*}} -{{OPT}}BufferFillCommandTest.SeparateUpdateCalls/AMD_HIP_BACKEND{{.*}} -{{OPT}}BufferFillCommandTest.OverrideUpdate/AMD_HIP_BACKEND{{.*}} -{{OPT}}BufferFillCommandTest.OverrideArgList/AMD_HIP_BACKEND{{.*}} {{OPT}}USMFillCommandTest.UpdateParameters/AMD_HIP_BACKEND{{.*}} {{OPT}}USMMultipleFillCommandTest.UpdateAllKernels/AMD_HIP_BACKEND{{.*}} {{OPT}}BufferSaxpyKernelTest.UpdateParameters/AMD_HIP_BACKEND{{.*}} @@ -14,4 +9,3 @@ {{OPT}}urCommandBufferReleaseCommandExpTest.InvalidNullHandle/AMD_HIP_BACKEND{{.*}} {{OPT}}urCommandBufferRetainCommandExpTest.Success/AMD_HIP_BACKEND{{.*}} {{OPT}}urCommandBufferRetainCommandExpTest.InvalidNullHandle/AMD_HIP_BACKEND{{.*}} 
-{{OPT}}{{Segmentation fault|Aborted}} diff --git a/test/conformance/integration/CMakeLists.txt b/test/conformance/integration/CMakeLists.txt new file mode 100644 index 0000000000..1689cf04f4 --- /dev/null +++ b/test/conformance/integration/CMakeLists.txt @@ -0,0 +1,10 @@ +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +add_conformance_test_with_kernels_environment(integration + QueueEmptyStatus.cpp + QueueUSM.cpp + QueueBuffer.cpp + ) diff --git a/test/conformance/integration/QueueBuffer.cpp b/test/conformance/integration/QueueBuffer.cpp new file mode 100644 index 0000000000..d801ebf684 --- /dev/null +++ b/test/conformance/integration/QueueBuffer.cpp @@ -0,0 +1,108 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.h" +#include +#include + +struct QueueBufferTestWithParam : uur::IntegrationQueueTestWithParam { + void SetUp() override { + program_name = "cpy_and_mult"; + UUR_RETURN_ON_FATAL_FAILURE( + uur::IntegrationQueueTestWithParam::SetUp()); + } + + void TearDown() override { uur::IntegrationQueueTestWithParam::TearDown(); } + + void verifyResults(ur_mem_handle_t Buffer, uint32_t ExpectedValue) { + uint32_t HostMem[ArraySize] = {}; + ASSERT_SUCCESS(urEnqueueMemBufferRead(Queue, Buffer, true, 0, + sizeof(uint32_t) * ArraySize, + HostMem, 0, nullptr, nullptr)); + + for (uint32_t i : HostMem) { + ASSERT_EQ(i, ExpectedValue); + } + } + + ur_mem_handle_t Buffer1 = nullptr; + ur_mem_handle_t Buffer2 = nullptr; +}; + +UUR_TEST_SUITE_P(QueueBufferTestWithParam, + testing::Values(0, /* In-Order */ + UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE), + uur::IntegrationQueueTestWithParam::paramPrinter); + +/* Submits multiple 
kernels that interact with each other by accessing and + * writing to the same buffers. + * Checks that when using an IN_ORDER queue, no synchronization is needed + * between calls to urEnqueueKernelLaunch. + * Checks that when using an OUT_OF_ORDER queue, synchronizing using only + * event barriers is enough. */ +TEST_P(QueueBufferTestWithParam, QueueBufferTest) { + + std::vector EventsFill; + ur_event_handle_t Event; + + size_t Buffer1Index; + size_t Buffer2Index; + ASSERT_NO_FATAL_FAILURE( + AddBuffer1DArg(ArraySize * sizeof(uint32_t), &Buffer1, &Buffer1Index)); + ASSERT_NO_FATAL_FAILURE( + AddBuffer1DArg(ArraySize * sizeof(uint32_t), &Buffer2, &Buffer2Index)); + + ASSERT_SUCCESS(urEnqueueMemBufferFill( + Queue, Buffer1, &InitialValue, sizeof(uint32_t), 0, + ArraySize * sizeof(uint32_t), 0, nullptr, &Event)); + EventsFill.push_back(Event); + + ASSERT_SUCCESS(urEnqueueMemBufferFill( + Queue, Buffer2, &InitialValue, sizeof(uint32_t), 0, + ArraySize * sizeof(uint32_t), 0, nullptr, &Event)); + EventsFill.push_back(Event); + + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(EventsFill)); + + constexpr size_t GlobalOffset = 0; + constexpr size_t NDimensions = 1; + constexpr uint32_t NumIterations = 5; + + uint32_t CurValueMem1 = InitialValue; + uint32_t CurValueMem2 = InitialValue; + for (uint32_t i = 0; i < NumIterations; ++i) { + + /* Copy from DeviceMem1 to DeviceMem2 and multiply by 2 */ + ASSERT_SUCCESS( + urKernelSetArgMemObj(kernel, Buffer2Index, nullptr, Buffer2)); + ASSERT_SUCCESS( + urKernelSetArgMemObj(kernel, Buffer1Index, nullptr, Buffer1)); + + ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, NDimensions, + &GlobalOffset, &ArraySize, nullptr, + 0, nullptr, &Event)); + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); + + CurValueMem2 = CurValueMem1 * 2; + + /* Copy from DeviceMem1 to DeviceMem2 and multiply by 2 */ + ASSERT_SUCCESS( + urKernelSetArgMemObj(kernel, Buffer1Index, nullptr, Buffer2)); + ASSERT_SUCCESS( + urKernelSetArgMemObj(kernel, 
Buffer2Index, nullptr, Buffer1)); + + ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, NDimensions, + &GlobalOffset, &ArraySize, nullptr, + 0, nullptr, &Event)); + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); + + CurValueMem1 = CurValueMem2 * 2; + } + + ASSERT_SUCCESS(urQueueFinish(Queue)); + + ASSERT_NO_FATAL_FAILURE(verifyResults(Buffer1, CurValueMem1)); + ASSERT_NO_FATAL_FAILURE(verifyResults(Buffer2, CurValueMem2)); +} diff --git a/test/conformance/integration/QueueEmptyStatus.cpp b/test/conformance/integration/QueueEmptyStatus.cpp new file mode 100644 index 0000000000..b8f1517b70 --- /dev/null +++ b/test/conformance/integration/QueueEmptyStatus.cpp @@ -0,0 +1,107 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.h" +#include +#include + +struct QueueEmptyStatusTestWithParam : uur::IntegrationQueueTestWithParam { + + void SetUp() override { + + program_name = "multiply"; + UUR_RETURN_ON_FATAL_FAILURE( + uur::IntegrationQueueTestWithParam::SetUp()); + + ur_device_usm_access_capability_flags_t shared_usm_flags = 0; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + ArraySize * sizeof(uint32_t), + &SharedMem)); + } + + void TearDown() override { + ASSERT_SUCCESS(urUSMFree(context, SharedMem)); + uur::IntegrationQueueTestWithParam::TearDown(); + } + + void submitWorkToQueue() { + ur_event_handle_t Event; + ASSERT_SUCCESS( + urEnqueueUSMFill(Queue, SharedMem, sizeof(uint32_t), &InitialValue, + ArraySize * sizeof(uint32_t), 0, nullptr, &Event)); + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); + + 
ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, &SharedMem)); + + constexpr size_t global_offset = 0; + constexpr size_t n_dimensions = 1; + constexpr uint32_t num_iterations = 5; + for (uint32_t i = 0; i < num_iterations; ++i) { + ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, n_dimensions, + &global_offset, &ArraySize, + nullptr, 0, nullptr, &Event)); + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); + } + + ASSERT_SUCCESS(urQueueFlush(Queue)); + } + + void waitUntilQueueEmpty() const { + + using namespace std::chrono_literals; + + constexpr auto step = 500ms; + constexpr auto maxWait = 5000ms; + + /* Wait a bit until work finishes running. We don't synchronize with + * urQueueFinish() because we want to check if the status is set without + * calling it explicitly. */ + for (auto currentWait = 0ms; currentWait < maxWait; + currentWait += step) { + std::this_thread::sleep_for(step); + + ur_bool_t is_queue_empty; + ASSERT_SUCCESS(urQueueGetInfo(Queue, UR_QUEUE_INFO_EMPTY, + sizeof(ur_bool_t), &is_queue_empty, + nullptr)); + if (is_queue_empty) { + return; + } + } + + /* If we are here, the test failed. Let's call queue finish to avoid + * issues when freeing memory */ + ASSERT_SUCCESS(urQueueFinish(Queue)); + GTEST_FAIL(); + } + + void *SharedMem = nullptr; +}; + +UUR_TEST_SUITE_P(QueueEmptyStatusTestWithParam, + testing::Values(0, /* In-Order */ + UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE), + uur::IntegrationQueueTestWithParam::paramPrinter); + +/* Submits kernels that have a dependency on each other and checks that the + * queue submits all the work in the correct order to the device. 
+ * Explicit synchronization (except for barriers) is avoided in these tests to + * check that the properties of In-Order and OutOfOrder queues are working as + * expected */ +TEST_P(QueueEmptyStatusTestWithParam, QueueEmptyStatusTest) { + ASSERT_NO_FATAL_FAILURE(submitWorkToQueue()); + ASSERT_NO_FATAL_FAILURE(waitUntilQueueEmpty()); + + constexpr size_t expected_value = 3200; + for (uint32_t i = 0; i < ArraySize; ++i) { + ASSERT_EQ(reinterpret_cast(SharedMem)[i], expected_value); + } +} diff --git a/test/conformance/integration/QueueUSM.cpp b/test/conformance/integration/QueueUSM.cpp new file mode 100644 index 0000000000..cc8201453a --- /dev/null +++ b/test/conformance/integration/QueueUSM.cpp @@ -0,0 +1,117 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "fixtures.h" +#include +#include + +struct QueueUSMTestWithParam : uur::IntegrationQueueTestWithParam { + void SetUp() override { + program_name = "cpy_and_mult_usm"; + UUR_RETURN_ON_FATAL_FAILURE( + uur::IntegrationQueueTestWithParam::SetUp()); + + ur_device_usm_access_capability_flags_t shared_usm_flags = 0; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + ArraySize * sizeof(uint32_t), + &DeviceMem1)); + + ASSERT_SUCCESS(urUSMDeviceAlloc(context, device, nullptr, nullptr, + ArraySize * sizeof(uint32_t), + &DeviceMem2)); + } + + void TearDown() override { + ASSERT_SUCCESS(urUSMFree(context, DeviceMem1)); + ASSERT_SUCCESS(urUSMFree(context, DeviceMem2)); + uur::IntegrationQueueTestWithParam::TearDown(); + } + + void verifyResults(void *DeviceMem, uint32_t ExpectedValue) { + uint32_t HostMem[ArraySize] 
= {}; + ASSERT_SUCCESS(urEnqueueUSMMemcpy(Queue, true, HostMem, DeviceMem, + sizeof(uint32_t) * ArraySize, 0, + nullptr, nullptr)); + + for (uint32_t i : HostMem) { + ASSERT_EQ(i, ExpectedValue); + } + } + + void *DeviceMem1 = nullptr; + void *DeviceMem2 = nullptr; +}; + +UUR_TEST_SUITE_P(QueueUSMTestWithParam, + testing::Values(0, /* In-Order */ + UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE), + uur::IntegrationQueueTestWithParam::paramPrinter); + +/* Submits multiple kernels that interact with each other by accessing and + * writing to the same USM memory locations. + * Checks that when using an IN_ORDER queue, no synchronization is needed + * between calls to urEnqueueKernelLaunch. + * Checks that when using an OUT_OF_ORDER queue, synchronizing using only + * event barriers is enough. */ +TEST_P(QueueUSMTestWithParam, QueueUSMTest) { + + std::vector EventsFill; + ur_event_handle_t Event; + ASSERT_SUCCESS(urEnqueueUSMFill(Queue, DeviceMem1, sizeof(uint32_t), + &InitialValue, ArraySize * sizeof(uint32_t), + 0, nullptr, &Event)); + EventsFill.push_back(Event); + + ASSERT_SUCCESS(urEnqueueUSMFill(Queue, DeviceMem2, sizeof(uint32_t), + &InitialValue, ArraySize * sizeof(uint32_t), + 0, nullptr, &Event)); + EventsFill.push_back(Event); + + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(EventsFill)); + + constexpr size_t GlobalOffset = 0; + constexpr size_t NDimensions = 1; + constexpr uint32_t NumIterations = 5; + + uint32_t CurValueMem1 = InitialValue; + uint32_t CurValueMem2 = InitialValue; + + std::vector EventsKernel; + + for (uint32_t i = 0; i < NumIterations; ++i) { + /* Copy from DeviceMem1 to DeviceMem2 and multiply by 2 */ + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, &DeviceMem1)); + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 1, nullptr, &DeviceMem2)); + + ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, NDimensions, + &GlobalOffset, &ArraySize, nullptr, + 0, nullptr, &Event)); + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); + 
CurValueMem2 = CurValueMem1 * 2; + + /* Copy from DeviceMem2 to DeviceMem1 and multiply by 2 */ + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, &DeviceMem2)); + ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 1, nullptr, &DeviceMem1)); + + ASSERT_SUCCESS(urEnqueueKernelLaunch(Queue, kernel, NDimensions, + &GlobalOffset, &ArraySize, nullptr, + 0, nullptr, &Event)); + ASSERT_NO_FATAL_FAILURE(submitBarrierIfNeeded(Event)); + + CurValueMem1 = CurValueMem2 * 2; + } + + ASSERT_SUCCESS(urQueueFinish(Queue)); + + ASSERT_NO_FATAL_FAILURE(verifyResults(DeviceMem1, CurValueMem1)); + ASSERT_NO_FATAL_FAILURE(verifyResults(DeviceMem2, CurValueMem2)); +} diff --git a/test/conformance/integration/fixtures.h b/test/conformance/integration/fixtures.h new file mode 100644 index 0000000000..aca70a5245 --- /dev/null +++ b/test/conformance/integration/fixtures.h @@ -0,0 +1,76 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef UR_CONFORMANCE_INTEGRATION_FIXTURES_H +#define UR_CONFORMANCE_INTEGRATION_FIXTURES_H + +#include + +namespace uur { + +struct IntegrationQueueTestWithParam + : uur::urKernelExecutionTestWithParam { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE( + uur::urKernelExecutionTestWithParam::SetUp()); + + QueueFlags = getParam(); + ur_queue_properties_t queue_properties = { + UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr, QueueFlags}; + ASSERT_SUCCESS( + urQueueCreate(context, device, &queue_properties, &Queue)); + } + + void TearDown() override { + for (ur_event_handle_t Event : AllEvents) { + ASSERT_SUCCESS(urEventRelease(Event)); + } + + UUR_RETURN_ON_FATAL_FAILURE( + uur::urKernelExecutionTestWithParam::TearDown()); + } + + void submitBarrierIfNeeded(std::vector &(Events)) { + if (QueueFlags == UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) { + ASSERT_SUCCESS(urEnqueueEventsWaitWithBarrier( + Queue, Events.size(), Events.data(), nullptr)); + AllEvents.insert(AllEvents.end(), Events.begin(), Events.end()); + } + } + + void submitBarrierIfNeeded(ur_event_handle_t Event) { + if (QueueFlags == UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) { + ASSERT_SUCCESS( + urEnqueueEventsWaitWithBarrier(Queue, 1, &Event, nullptr)); + AllEvents.push_back(Event); + } + } + + std::vector AllEvents; + ur_queue_flags_t QueueFlags{}; + ur_queue_handle_t Queue{}; + static constexpr size_t ArraySize = 100; + static constexpr uint32_t InitialValue = 100; + + static std::string + paramPrinter(const ::testing::TestParamInfo< + std::tuple> &info) { + auto device = std::get<0>(info.param); + auto param = std::get<1>(info.param); + + std::stringstream ss; + if (param == 0) { + ss << "IN_ORDER_QUEUE"; + } + if (param == UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) { + ss << "OUT_OF_ORDER_QUEUE"; + } + + return uur::GetPlatformAndDeviceName(device) + "__" + ss.str(); + } +}; +} // namespace uur + +#endif 
//UR_CONFORMANCE_INTEGRATION_FIXTURES_H diff --git a/test/conformance/integration/integration_adapter_cuda.match b/test/conformance/integration/integration_adapter_cuda.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/integration/integration_adapter_hip.match b/test/conformance/integration/integration_adapter_hip.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/integration/integration_adapter_level_zero.match b/test/conformance/integration/integration_adapter_level_zero.match new file mode 100644 index 0000000000..905fdea60f --- /dev/null +++ b/test/conformance/integration/integration_adapter_level_zero.match @@ -0,0 +1,6 @@ +{{OPT}}QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___IN_ORDER_QUEUE +{{OPT}}QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___OUT_OF_ORDER_QUEUE +{{OPT}}QueueUSMTestWithParam.QueueUSMTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___IN_ORDER_QUEUE +{{OPT}}QueueUSMTestWithParam.QueueUSMTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___OUT_OF_ORDER_QUEUE +{{OPT}}QueueBufferTestWithParam.QueueBufferTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___IN_ORDER_QUEUE +{{OPT}}QueueBufferTestWithParam.QueueBufferTest/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___OUT_OF_ORDER_QUEUE diff --git a/test/conformance/integration/integration_adapter_native_cpu.match b/test/conformance/integration/integration_adapter_native_cpu.match new file mode 100644 index 0000000000..07afde2cef --- /dev/null +++ b/test/conformance/integration/integration_adapter_native_cpu.match @@ -0,0 +1,7 @@ +QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/SYCL_NATIVE_CPU___SYCL_Native_CPU___IN_ORDER_QUEUE +QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/SYCL_NATIVE_CPU___SYCL_Native_CPU___OUT_OF_ORDER_QUEUE 
+QueueUSMTestWithParam.QueueUSMTest/SYCL_NATIVE_CPU___SYCL_Native_CPU___IN_ORDER_QUEUE +QueueUSMTestWithParam.QueueUSMTest/SYCL_NATIVE_CPU___SYCL_Native_CPU___OUT_OF_ORDER_QUEUE +QueueBufferTestWithParam.QueueBufferTest/SYCL_NATIVE_CPU___SYCL_Native_CPU___IN_ORDER_QUEUE +QueueBufferTestWithParam.QueueBufferTest/SYCL_NATIVE_CPU___SYCL_Native_CPU___OUT_OF_ORDER_QUEUE +{{OPT}}{{Segmentation fault|Aborted}} diff --git a/test/conformance/integration/integration_adapter_opencl.match b/test/conformance/integration/integration_adapter_opencl.match new file mode 100644 index 0000000000..57a5299327 --- /dev/null +++ b/test/conformance/integration/integration_adapter_opencl.match @@ -0,0 +1,6 @@ +QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/Intel_R__OpenCL___{{.*}}___IN_ORDER_QUEUE +QueueEmptyStatusTestWithParam.QueueEmptyStatusTest/Intel_R__OpenCL___{{.*}}___OUT_OF_ORDER_QUEUE +QueueUSMTestWithParam.QueueUSMTest/Intel_R__OpenCL___{{.*}}___IN_ORDER_QUEUE +QueueUSMTestWithParam.QueueUSMTest/Intel_R__OpenCL___{{.*}}___OUT_OF_ORDER_QUEUE +QueueBufferTestWithParam.QueueBufferTest/Intel_R__OpenCL___{{.*}}___IN_ORDER_QUEUE +QueueBufferTestWithParam.QueueBufferTest/Intel_R__OpenCL___{{.*}}___OUT_OF_ORDER_QUEUE diff --git a/test/conformance/kernel/kernel_adapter_cuda.match b/test/conformance/kernel/kernel_adapter_cuda.match index 992b184a29..fe44a34352 100644 --- a/test/conformance/kernel/kernel_adapter_cuda.match +++ b/test/conformance/kernel/kernel_adapter_cuda.match @@ -1,9 +1,6 @@ -{{OPT}}urKernelSetArgLocalTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}_ -{{OPT}}urKernelSetArgLocalTest.InvalidNullHandleKernel/NVIDIA_CUDA_BACKEND___{{.*}}_ +urKernelGetGroupInfoWgSizeTest.CompileWorkGroupSize/NVIDIA_CUDA_BACKEND___{{.*}}_ {{OPT}}urKernelSetArgLocalTest.InvalidKernelArgumentIndex/NVIDIA_CUDA_BACKEND___{{.*}}_ {{OPT}}urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/NVIDIA_CUDA_BACKEND___{{.*}}_ 
-{{OPT}}urKernelSetArgPointerNegativeTest.InvalidNullHandleKernel/NVIDIA_CUDA_BACKEND___{{.*}}_ {{OPT}}urKernelSetArgPointerNegativeTest.InvalidKernelArgumentIndex/NVIDIA_CUDA_BACKEND___{{.*}}_ {{OPT}}urKernelSetArgSamplerTest.InvalidKernelArgumentIndex/NVIDIA_CUDA_BACKEND___{{.*}}_ {{OPT}}urKernelSetArgValueTest.InvalidKernelArgumentIndex/NVIDIA_CUDA_BACKEND___{{.*}}_ -{{OPT}}Segmentation fault diff --git a/test/conformance/kernel/kernel_adapter_hip.match b/test/conformance/kernel/kernel_adapter_hip.match index c5ed864c90..894bd698bb 100644 --- a/test/conformance/kernel/kernel_adapter_hip.match +++ b/test/conformance/kernel/kernel_adapter_hip.match @@ -1,3 +1,8 @@ +{{OPT}}urKernelCreateWithNativeHandleTest.Success/AMD_HIP_BACKEND___{{.*}}_ +{{OPT}}urKernelCreateWithNativeHandleTest.InvalidNullHandleContext/AMD_HIP_BACKEND___{{.*}}_ +{{OPT}}urKernelCreateWithNativeHandleTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}_ +{{OPT}}urKernelCreateWithNativeHandleTest.InvalidNullPointerNativeKernel/AMD_HIP_BACKEND___{{.*}}_ +urKernelGetGroupInfoWgSizeTest.CompileWorkGroupSize/AMD_HIP_BACKEND___{{.*}}_ {{OPT}}urKernelGetInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_KERNEL_INFO_NUM_REGS {{OPT}}urKernelGetInfoTest.InvalidSizeSmall/AMD_HIP_BACKEND___{{.*}}___UR_KERNEL_INFO_FUNCTION_NAME {{OPT}}urKernelGetInfoTest.InvalidSizeSmall/AMD_HIP_BACKEND___{{.*}}___UR_KERNEL_INFO_NUM_ARGS @@ -22,4 +27,3 @@ {{OPT}}urKernelSetSpecializationConstantsTest.InvalidNullHandleKernel/AMD_HIP_BACKEND___{{.*}}_ {{OPT}}urKernelSetSpecializationConstantsTest.InvalidNullPointerSpecConstants/AMD_HIP_BACKEND___{{.*}}_ {{OPT}}urKernelSetSpecializationConstantsTest.InvalidSizeCount/AMD_HIP_BACKEND___{{.*}}_ -{{OPT}}{{Segmentation fault|Aborted}} diff --git a/test/conformance/kernel/kernel_adapter_native_cpu.match b/test/conformance/kernel/kernel_adapter_native_cpu.match index fae4d2f51b..93e3ddd67d 100644 --- a/test/conformance/kernel/kernel_adapter_native_cpu.match +++ 
b/test/conformance/kernel/kernel_adapter_native_cpu.match @@ -31,6 +31,8 @@ urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU___ urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE +urKernelGetGroupInfoSingleTest.CompileWorkGroupSizeEmpty/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urKernelGetGroupInfoWgSizeTest.CompileWorkGroupSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urKernelGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_KERNEL_INFO_FUNCTION_NAME urKernelGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_KERNEL_INFO_NUM_ARGS urKernelGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_KERNEL_INFO_REFERENCE_COUNT @@ -101,6 +103,7 @@ urKernelGetSubGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU urKernelGetSubGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_KERNEL_SUB_GROUP_INFO_MAX_NUM_SUB_GROUPS urKernelGetSubGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS urKernelGetSubGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL +urKernelGetSubGroupInfoSingleTest.CompileNumSubgroupsIsZero/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urKernelReleaseTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urKernelReleaseTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urKernelRetainTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ diff --git a/test/conformance/kernel/urKernelCreateWithNativeHandle.cpp b/test/conformance/kernel/urKernelCreateWithNativeHandle.cpp index ab01069c82..aac3d6e7f7 100644 --- 
a/test/conformance/kernel/urKernelCreateWithNativeHandle.cpp +++ b/test/conformance/kernel/urKernelCreateWithNativeHandle.cpp @@ -8,8 +8,9 @@ struct urKernelCreateWithNativeHandleTest : uur::urKernelTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urKernelTest::SetUp()); - if (urKernelGetNativeHandle(kernel, &native_kernel_handle)) { - GTEST_SKIP(); + { + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urKernelGetNativeHandle(kernel, &native_kernel_handle)); } } @@ -31,7 +32,7 @@ struct urKernelCreateWithNativeHandleTest : uur::urKernelTest { UUR_INSTANTIATE_KERNEL_TEST_SUITE_P(urKernelCreateWithNativeHandleTest); TEST_P(urKernelCreateWithNativeHandleTest, Success) { - ASSERT_SUCCESS(urKernelCreateWithNativeHandle( + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urKernelCreateWithNativeHandle( native_kernel_handle, context, program, &properties, &native_kernel)); uint32_t ref_count = 0; diff --git a/test/conformance/kernel/urKernelGetGroupInfo.cpp b/test/conformance/kernel/urKernelGetGroupInfo.cpp index 5ad6225676..b91001a07f 100644 --- a/test/conformance/kernel/urKernelGetGroupInfo.cpp +++ b/test/conformance/kernel/urKernelGetGroupInfo.cpp @@ -18,6 +18,24 @@ UUR_TEST_SUITE_P( UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE), uur::deviceTestWithParamPrinter); +struct urKernelGetGroupInfoSingleTest : uur::urKernelTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urKernelTest::SetUp()); + } +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urKernelGetGroupInfoSingleTest); + +struct urKernelGetGroupInfoWgSizeTest : uur::urKernelTest { + void SetUp() override { + program_name = "fixed_wg_size"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelTest::SetUp()); + } + + // This must match the size in fixed_wg_size.cpp + std::array wg_size{4, 4, 4}; +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urKernelGetGroupInfoWgSizeTest); + TEST_P(urKernelGetGroupInfoTest, Success) { auto property_name = getParam(); size_t property_size = 0; @@ -57,3 +75,21 @@ TEST_P(urKernelGetGroupInfoTest, InvalidEnumeration) { 
UR_KERNEL_GROUP_INFO_FORCE_UINT32, 0, nullptr, &bad_enum_length)); } + +TEST_P(urKernelGetGroupInfoWgSizeTest, CompileWorkGroupSize) { + std::array read_dims{1, 1, 1}; + ASSERT_SUCCESS(urKernelGetGroupInfo( + kernel, device, UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, + sizeof(read_dims), read_dims.data(), nullptr)); + ASSERT_EQ(read_dims, wg_size); +} + +TEST_P(urKernelGetGroupInfoSingleTest, CompileWorkGroupSizeEmpty) { + // Returns 0 by default when there is no specific information + std::array read_dims{1, 1, 1}; + std::array zero{0, 0, 0}; + ASSERT_SUCCESS(urKernelGetGroupInfo( + kernel, device, UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE, + sizeof(read_dims), read_dims.data(), nullptr)); + ASSERT_EQ(read_dims, zero); +} diff --git a/test/conformance/kernel/urKernelGetSubGroupInfo.cpp b/test/conformance/kernel/urKernelGetSubGroupInfo.cpp index 9a8e599510..fa4e045483 100644 --- a/test/conformance/kernel/urKernelGetSubGroupInfo.cpp +++ b/test/conformance/kernel/urKernelGetSubGroupInfo.cpp @@ -16,6 +16,13 @@ UUR_TEST_SUITE_P( UR_KERNEL_SUB_GROUP_INFO_SUB_GROUP_SIZE_INTEL), uur::deviceTestWithParamPrinter); +struct urKernelGetSubGroupInfoSingleTest : uur::urKernelTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urKernelTest::SetUp()); + } +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urKernelGetSubGroupInfoSingleTest); + TEST_P(urKernelGetSubGroupInfoTest, Success) { auto property_name = getParam(); size_t property_size = 0; @@ -53,3 +60,12 @@ TEST_P(urKernelGetSubGroupInfoTest, InvalidEnumeration) { kernel, device, UR_KERNEL_SUB_GROUP_INFO_FORCE_UINT32, 0, nullptr, &bad_enum_length)); } + +TEST_P(urKernelGetSubGroupInfoSingleTest, CompileNumSubgroupsIsZero) { + // Returns 0 by default when there is no specific information + size_t subgroups = 1; + ASSERT_SUCCESS(urKernelGetSubGroupInfo( + kernel, device, UR_KERNEL_SUB_GROUP_INFO_COMPILE_NUM_SUB_GROUPS, + sizeof(subgroups), &subgroups, nullptr)); + ASSERT_EQ(subgroups, 0); +} diff --git 
a/test/conformance/memory/memory_adapter_cuda.match b/test/conformance/memory/memory_adapter_cuda.match index cd65787020..b9353f20be 100644 --- a/test/conformance/memory/memory_adapter_cuda.match +++ b/test/conformance/memory/memory_adapter_cuda.match @@ -1 +1 @@ -urMemBufferCreateWithNativeHandleTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}_ +urMemImageCreateTest.InvalidSize/NVIDIA_CUDA_BACKEND___{{.*}}_ diff --git a/test/conformance/memory/memory_adapter_native_cpu.match b/test/conformance/memory/memory_adapter_native_cpu.match index 58651a745b..d0cb7644d9 100644 --- a/test/conformance/memory/memory_adapter_native_cpu.match +++ b/test/conformance/memory/memory_adapter_native_cpu.match @@ -11,5 +11,7 @@ urMemImageCreateTest.InvalidImageDescNumMipLevel/SYCL_NATIVE_CPU___SYCL_Native_C urMemImageCreateTest.InvalidImageDescNumSamples/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urMemImageCreateTest.InvalidImageDescRowPitch/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urMemImageCreateTest.InvalidImageDescSlicePitch/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urMemImageCreateWithHostPtrFlagsTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER +urMemImageCreateWithHostPtrFlagsTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_FLAG_USE_HOST_POINTER urMemReleaseTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urMemRetainTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ diff --git a/test/conformance/memory/urMemBufferCreateWithNativeHandle.cpp b/test/conformance/memory/urMemBufferCreateWithNativeHandle.cpp index 573c9c0036..96871d24b6 100644 --- a/test/conformance/memory/urMemBufferCreateWithNativeHandle.cpp +++ b/test/conformance/memory/urMemBufferCreateWithNativeHandle.cpp @@ -10,8 +10,9 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urMemBufferCreateWithNativeHandleTest); TEST_P(urMemBufferCreateWithNativeHandleTest, Success) { ur_native_handle_t hNativeMem = nullptr; - if (urMemGetNativeHandle(buffer, device, &hNativeMem)) { - GTEST_SKIP(); + { + 
UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urMemGetNativeHandle(buffer, device, &hNativeMem)); } // We cannot assume anything about a native_handle, not even if it's @@ -24,7 +25,7 @@ TEST_P(urMemBufferCreateWithNativeHandleTest, Success) { /*.pNext =*/nullptr, /*.isNativeHandleOwned =*/false, }; - ASSERT_SUCCESS( + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( urMemBufferCreateWithNativeHandle(hNativeMem, context, &props, &mem)); ASSERT_NE(mem, nullptr); diff --git a/test/conformance/memory/urMemImageCreate.cpp b/test/conformance/memory/urMemImageCreate.cpp index b64c09558b..0ba25bd1c9 100644 --- a/test/conformance/memory/urMemImageCreate.cpp +++ b/test/conformance/memory/urMemImageCreate.cpp @@ -3,6 +3,7 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include +#include using urMemImageCreateTest = uur::urContextTest; UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urMemImageCreateTest); @@ -64,6 +65,13 @@ TEST_P(urMemImageCreateTest, InvalidNullPointerImageDesc) { &image_handle)); } +TEST_P(urMemImageCreateTest, InvalidNullPointerImageFormat) { + ur_mem_handle_t image_handle = nullptr; + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, + urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, nullptr, + &image_desc, nullptr, &image_handle)); +} + TEST_P(urMemImageCreateTest, InvalidSize) { ur_mem_handle_t image_handle = nullptr; @@ -171,6 +179,20 @@ UUR_TEST_SUITE_P(urMemImageCreateWithHostPtrFlagsTest, UR_MEM_FLAG_USE_HOST_POINTER), uur::deviceTestWithParamPrinter); +TEST_P(urMemImageCreateWithHostPtrFlagsTest, Success) { + uur::raii::Mem host_ptr_buffer = nullptr; + ASSERT_SUCCESS(urMemImageCreate(context, UR_MEM_FLAG_ALLOC_HOST_POINTER, + &image_format, &image_desc, nullptr, + host_ptr_buffer.ptr())); + + ur_mem_handle_t image_handle = nullptr; + ASSERT_SUCCESS(urMemImageCreate(context, getParam(), &image_format, + &image_desc, host_ptr_buffer.ptr(), + &image_handle)); + ASSERT_NE(nullptr, image_handle); + ASSERT_SUCCESS(urMemRelease(image_handle)); +} + 
TEST_P(urMemImageCreateWithHostPtrFlagsTest, InvalidHostPtr) { ur_mem_handle_t image_handle = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_HOST_PTR, diff --git a/test/conformance/platform/platform_adapter_cuda.match b/test/conformance/platform/platform_adapter_cuda.match index e69de29bb2..b459b89bbe 100644 --- a/test/conformance/platform/platform_adapter_cuda.match +++ b/test/conformance/platform/platform_adapter_cuda.match @@ -0,0 +1 @@ +urPlatformCreateWithNativeHandleTest.InvalidNullPointerPlatform diff --git a/test/conformance/platform/platform_adapter_hip.match b/test/conformance/platform/platform_adapter_hip.match index e69de29bb2..b459b89bbe 100644 --- a/test/conformance/platform/platform_adapter_hip.match +++ b/test/conformance/platform/platform_adapter_hip.match @@ -0,0 +1 @@ +urPlatformCreateWithNativeHandleTest.InvalidNullPointerPlatform diff --git a/test/conformance/platform/platform_adapter_native_cpu.match b/test/conformance/platform/platform_adapter_native_cpu.match index 6e05372d57..257822d30b 100644 --- a/test/conformance/platform/platform_adapter_native_cpu.match +++ b/test/conformance/platform/platform_adapter_native_cpu.match @@ -1,3 +1,4 @@ +urPlatformCreateWithNativeHandleTest.InvalidNullPointerPlatform urPlatfromGetBackendOptionTest.InvalidValueFrontendOption urPlatfromGetBackendOptionTestWithParam.Success/_O0 urPlatfromGetBackendOptionTestWithParam.Success/_O1 diff --git a/test/conformance/platform/urPlatformCreateWithNativeHandle.cpp b/test/conformance/platform/urPlatformCreateWithNativeHandle.cpp index 6dd3310f6a..435ac23a00 100644 --- a/test/conformance/platform/urPlatformCreateWithNativeHandle.cpp +++ b/test/conformance/platform/urPlatformCreateWithNativeHandle.cpp @@ -10,32 +10,80 @@ using urPlatformCreateWithNativeHandleTest = uur::platform::urPlatformTest; TEST_F(urPlatformCreateWithNativeHandleTest, Success) { for (auto platform : platforms) { ur_native_handle_t native_handle = nullptr; - if (urPlatformGetNativeHandle(platform, 
&native_handle)) { - continue; - }; + { + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urPlatformGetNativeHandle(platform, &native_handle)); + } // We cannot assume anything about a native_handle, not even if it's // `nullptr` since this could be a valid representation within a backend. // We can however convert the native_handle back into a unified-runtime // handle and perform some query on it to verify that it works. ur_platform_handle_t plat = nullptr; - ASSERT_SUCCESS( + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( urPlatformCreateWithNativeHandle(native_handle, nullptr, &plat)); ASSERT_NE(plat, nullptr); - ur_platform_backend_t backend; - ASSERT_SUCCESS(urPlatformGetInfo(plat, UR_PLATFORM_INFO_BACKEND, - sizeof(ur_platform_backend_t), - &backend, nullptr)); + std::string input_platform_name = uur::GetPlatformName(platform); + std::string created_platform_name = uur::GetPlatformName(plat); + ASSERT_EQ(input_platform_name, created_platform_name); } } -TEST_F(urPlatformCreateWithNativeHandleTest, InvalidNullPointerPlatform) { +TEST_F(urPlatformCreateWithNativeHandleTest, SuccessWithOwnedNativeHandle) { + for (auto platform : platforms) { + ur_native_handle_t native_handle = nullptr; + { + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urPlatformGetNativeHandle(platform, &native_handle)); + } + + // We cannot assume anything about a native_handle, not even if it's + // `nullptr` since this could be a valid representation within a backend. + // We can however convert the native_handle back into a unified-runtime + // handle and perform some query on it to verify that it works. 
+ ur_platform_native_properties_t props = { + UR_STRUCTURE_TYPE_PLATFORM_NATIVE_PROPERTIES, nullptr, true}; + ur_platform_handle_t plat = nullptr; + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urPlatformCreateWithNativeHandle(native_handle, &props, &plat)); + ASSERT_NE(plat, nullptr); + + std::string input_platform_name = uur::GetPlatformName(platform); + std::string created_platform_name = uur::GetPlatformName(plat); + ASSERT_EQ(input_platform_name, created_platform_name); + } +} + +TEST_F(urPlatformCreateWithNativeHandleTest, SuccessWithUnOwnedNativeHandle) { for (auto platform : platforms) { ur_native_handle_t native_handle = nullptr; - if (urPlatformGetNativeHandle(platform, &native_handle)) { - continue; + { + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urPlatformGetNativeHandle(platform, &native_handle)); } + + // We cannot assume anything about a native_handle, not even if it's + // `nullptr` since this could be a valid representation within a backend. + // We can however convert the native_handle back into a unified-runtime + // handle and perform some query on it to verify that it works. 
+ ur_platform_native_properties_t props = { + UR_STRUCTURE_TYPE_PLATFORM_NATIVE_PROPERTIES, nullptr, false}; + ur_platform_handle_t plat = nullptr; + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urPlatformCreateWithNativeHandle(native_handle, &props, &plat)); + ASSERT_NE(plat, nullptr); + + std::string input_platform_name = uur::GetPlatformName(platform); + std::string created_platform_name = uur::GetPlatformName(plat); + ASSERT_EQ(input_platform_name, created_platform_name); + } +} + +TEST_F(urPlatformCreateWithNativeHandleTest, InvalidNullPointerPlatform) { + for (auto platform : platforms) { + ur_native_handle_t native_handle = nullptr; + ASSERT_SUCCESS(urPlatformGetNativeHandle(platform, &native_handle)); ASSERT_EQ_RESULT( UR_RESULT_ERROR_INVALID_NULL_POINTER, urPlatformCreateWithNativeHandle(native_handle, nullptr, nullptr)); diff --git a/test/conformance/platform/urPlatformGet.cpp b/test/conformance/platform/urPlatformGet.cpp index 2f3b28279d..f3ac6318e9 100644 --- a/test/conformance/platform/urPlatformGet.cpp +++ b/test/conformance/platform/urPlatformGet.cpp @@ -33,3 +33,11 @@ TEST_F(urPlatformGetTest, InvalidNumEntries) { static_cast(adapters.size()), 0, platforms.data(), nullptr)); } + +TEST_F(urPlatformGetTest, InvalidNullPointer) { + uint32_t count; + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, + urPlatformGet(nullptr, + static_cast(adapters.size()), 0, + nullptr, &count)); +} diff --git a/test/conformance/program/program_adapter_cuda.match b/test/conformance/program/program_adapter_cuda.match index 8b917219ec..5f6c2de212 100644 --- a/test/conformance/program/program_adapter_cuda.match +++ b/test/conformance/program/program_adapter_cuda.match @@ -1,29 +1,13 @@ urProgramBuildTest.BuildFailure/NVIDIA_CUDA_BACKEND___{{.*}}_ -{{OPT}}urProgramCreateWithNativeHandleTest.InvalidNullHandleContext/NVIDIA_CUDA_BACKEND___{{.*}}_ -{{OPT}}urProgramCreateWithNativeHandleTest.InvalidNullPointerProgram/NVIDIA_CUDA_BACKEND___{{.*}}_ 
-{{OPT}}urProgramGetBuildInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_BINARY_TYPE -{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_STATUS -{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_OPTIONS -{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_LOG -{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_BINARY_TYPE -{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleDevice/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_STATUS -{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleDevice/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_OPTIONS -{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleDevice/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_LOG -{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleDevice/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_BINARY_TYPE -{{OPT}}urProgramGetBuildInfoSingleTest.LogIsNullTerminated/NVIDIA_CUDA_BACKEND___{{.*}}_ -{{OPT}}urProgramGetInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_NUM_KERNELS -{{OPT}}urProgramGetInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_KERNEL_NAMES -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_REFERENCE_COUNT -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_CONTEXT -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_NUM_DEVICES -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_DEVICES -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_SOURCE 
-{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_BINARY_SIZES -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_BINARIES -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_NUM_KERNELS -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROGRAM_INFO_KERNEL_NAMES -{{OPT}}urProgramLinkTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}_ -{{OPT}}urProgramSetSpecializationConstantsTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}_ -{{OPT}}urProgramSetSpecializationConstantsTest.UseDefaultValue/NVIDIA_CUDA_BACKEND___{{.*}}_ -urProgramSetMultipleSpecializationConstantsTest.MultipleCalls/NVIDIA_CUDA_BACKEND___{{.*}}_ -urProgramSetMultipleSpecializationConstantsTest.SingleCall/NVIDIA_CUDA_BACKEND___{{.*}}_ +{{OPT}}urProgramCreateWithILTest.Success/NVIDIA_CUDA_BACKEND___{{.*}} +{{OPT}}urProgramCreateWithILTest.SuccessWithProperties/NVIDIA_CUDA_BACKEND___{{.*}} +{{OPT}}urProgramCreateWithILTest.BuildInvalidProgram/NVIDIA_CUDA_BACKEND___{{.*}} +{{OPT}}urProgramGetBuildInfoSingleTest.LogIsNullTerminated/NVIDIA_CUDA_BACKEND___{{.*}} +{{OPT}}urProgramGetInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}} +{{OPT}}urProgramGetInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}} +{{OPT}}urProgramGetInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}} +{{OPT}}urProgramGetInfoSingleTest.NumDevicesMatchesDeviceArray/NVIDIA_CUDA_BACKEND___{{.*}} +{{OPT}}urProgramSetSpecializationConstantsTest.Success/NVIDIA_CUDA_BACKEND___{{.*}} +{{OPT}}urProgramSetSpecializationConstantsTest.UseDefaultValue/NVIDIA_CUDA_BACKEND___{{.*}} +urProgramSetMultipleSpecializationConstantsTest.MultipleCalls/NVIDIA_CUDA_BACKEND___{{.*}} +urProgramSetMultipleSpecializationConstantsTest.SingleCall/NVIDIA_CUDA_BACKEND___{{.*}} diff --git a/test/conformance/program/program_adapter_hip.match b/test/conformance/program/program_adapter_hip.match index 
684e066ce3..27bd6831a5 100644 --- a/test/conformance/program/program_adapter_hip.match +++ b/test/conformance/program/program_adapter_hip.match @@ -1,4 +1,5 @@ urProgramBuildTest.BuildFailure/AMD_HIP_BACKEND___{{.*}}_ +{{OPT}}urProgramCreateWithNativeHandleTest.Success/AMD_HIP_BACKEND___{{.*}}_ {{OPT}}urProgramCreateWithNativeHandleTest.InvalidNullHandleContext/AMD_HIP_BACKEND___{{.*}}_ {{OPT}}urProgramCreateWithNativeHandleTest.InvalidNullPointerProgram/AMD_HIP_BACKEND___{{.*}}_ {{OPT}}urProgramGetBuildInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_BINARY_TYPE @@ -24,4 +25,6 @@ urProgramBuildTest.BuildFailure/AMD_HIP_BACKEND___{{.*}}_ {{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_KERNEL_NAMES {{OPT}}urProgramLinkTest.Success/AMD_HIP_BACKEND___{{.*}}_ {{OPT}}urProgramSetSpecializationConstantsTest.Success/AMD_HIP_BACKEND___{{.*}}_ -{{OPT}}{{Segmentation fault|Aborted}} +{{OPT}}urProgramSetSpecializationConstantsTest.UseDefaultValue/AMD_HIP_BACKEND___{{.*}}_ +{{OPT}}urProgramSetMultipleSpecializationConstantsTest.MultipleCalls/AMD_HIP_BACKEND___{{.*}}_ +{{OPT}}urProgramSetMultipleSpecializationConstantsTest.SingleCall/AMD_HIP_BACKEND___{{.*}}_ diff --git a/test/conformance/program/program_adapter_native_cpu.match b/test/conformance/program/program_adapter_native_cpu.match index c509c67d3e..9a5b0a9830 100644 --- a/test/conformance/program/program_adapter_native_cpu.match +++ b/test/conformance/program/program_adapter_native_cpu.match @@ -138,3 +138,4 @@ {{OPT}}urProgramSetSpecializationConstantsTest.InvalidSizeCount/SYCL_NATIVE_CPU___SYCL_Native_CPU_ {{OPT}}urProgramSetMultipleSpecializationConstantsTest.MultipleCalls/SYCL_NATIVE_CPU___SYCL_Native_CPU_ {{OPT}}urProgramSetMultipleSpecializationConstantsTest.SingleCall/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}{{Segmentation fault|Aborted}} diff --git a/test/conformance/program/urProgramBuild.cpp b/test/conformance/program/urProgramBuild.cpp index 
f99b94321e..a7e7e4a275 100644 --- a/test/conformance/program/urProgramBuild.cpp +++ b/test/conformance/program/urProgramBuild.cpp @@ -30,8 +30,7 @@ TEST_P(urProgramBuildTest, InvalidNullHandleProgram) { TEST_P(urProgramBuildTest, BuildFailure) { ur_program_handle_t program = nullptr; std::shared_ptr> il_binary; - uur::KernelsEnvironment::instance->LoadSource("build_failure", 0, - il_binary); + uur::KernelsEnvironment::instance->LoadSource("build_failure", il_binary); if (!il_binary) { // The build failure we are testing for happens at SYCL compile time on // AMD and Nvidia, so no binary exists to check for a build failure diff --git a/test/conformance/program/urProgramCreateWithIL.cpp b/test/conformance/program/urProgramCreateWithIL.cpp index 800a43cd5d..3d81d14104 100644 --- a/test/conformance/program/urProgramCreateWithIL.cpp +++ b/test/conformance/program/urProgramCreateWithIL.cpp @@ -17,7 +17,7 @@ struct urProgramCreateWithILTest : uur::urContextTest { if (backend == UR_PLATFORM_BACKEND_HIP) { GTEST_SKIP(); } - uur::KernelsEnvironment::instance->LoadSource("foo", 0, il_binary); + uur::KernelsEnvironment::instance->LoadSource("foo", il_binary); } void TearDown() override { diff --git a/test/conformance/program/urProgramCreateWithNativeHandle.cpp b/test/conformance/program/urProgramCreateWithNativeHandle.cpp index e121c61de8..000f94e0ad 100644 --- a/test/conformance/program/urProgramCreateWithNativeHandle.cpp +++ b/test/conformance/program/urProgramCreateWithNativeHandle.cpp @@ -8,8 +8,10 @@ struct urProgramCreateWithNativeHandleTest : uur::urProgramTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urProgramTest::SetUp()); - ASSERT_SUCCESS( - urProgramGetNativeHandle(program, &native_program_handle)); + { + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urProgramGetNativeHandle(program, &native_program_handle)); + } } void TearDown() override { @@ -25,10 +27,8 @@ struct urProgramCreateWithNativeHandleTest : uur::urProgramTest { 
UUR_INSTANTIATE_KERNEL_TEST_SUITE_P(urProgramCreateWithNativeHandleTest); TEST_P(urProgramCreateWithNativeHandleTest, Success) { - if (urProgramCreateWithNativeHandle(native_program_handle, context, nullptr, - &native_program)) { - GTEST_SKIP(); - } + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urProgramCreateWithNativeHandle( + native_program_handle, context, nullptr, &native_program)); uint32_t ref_count = 0; ASSERT_SUCCESS(urProgramGetInfo(native_program, diff --git a/test/conformance/program/urProgramSetSpecializationConstants.cpp b/test/conformance/program/urProgramSetSpecializationConstants.cpp index 74c602cbe3..6d5b70322f 100644 --- a/test/conformance/program/urProgramSetSpecializationConstants.cpp +++ b/test/conformance/program/urProgramSetSpecializationConstants.cpp @@ -48,9 +48,10 @@ TEST_P(urProgramSetSpecializationConstantsTest, UseDefaultValue) { ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, sizeof(ur_platform_backend_t), &backend, nullptr)); - if (backend == UR_PLATFORM_BACKEND_CUDA) { - GTEST_FAIL() - << "This test is known to cause crashes on Nvidia; not running."; + if (backend == UR_PLATFORM_BACKEND_CUDA || + backend == UR_PLATFORM_BACKEND_HIP) { + GTEST_FAIL() << "This test is known to cause crashes on Nvidia and " + "AMD; not running."; } ASSERT_SUCCESS(urProgramBuild(context, program, nullptr)); diff --git a/test/conformance/queue/queue_adapter_native_cpu.match b/test/conformance/queue/queue_adapter_native_cpu.match index c2887b1063..a4c2c502a4 100644 --- a/test/conformance/queue/queue_adapter_native_cpu.match +++ b/test/conformance/queue/queue_adapter_native_cpu.match @@ -1,3 +1,5 @@ +urQueueCreateTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urQueueCreateTest.CheckContext/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE 
urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_PROFILING_ENABLE urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_ON_DEVICE @@ -9,6 +11,17 @@ urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_C urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_SUBMISSION_IMMEDIATE urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_USE_DEFAULT_STREAM urQueueCreateWithParamTest.SuccessWithProperties/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_PROFILING_ENABLE +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_ON_DEVICE +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_ON_DEVICE_DEFAULT +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_DISCARD_EVENTS +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_PRIORITY_LOW +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_PRIORITY_HIGH +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_SUBMISSION_BATCHED +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_SUBMISSION_IMMEDIATE +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_USE_DEFAULT_STREAM +urQueueCreateWithParamTest.MatchingDeviceHandles/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM 
urQueueFinishTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urQueueFlushTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urQueueGetInfoTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_CONTEXT diff --git a/test/conformance/queue/urQueueCreate.cpp b/test/conformance/queue/urQueueCreate.cpp index 03cda76d50..ad0957d747 100644 --- a/test/conformance/queue/urQueueCreate.cpp +++ b/test/conformance/queue/urQueueCreate.cpp @@ -2,16 +2,24 @@ // Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include "uur/raii.h" #include using urQueueCreateTest = uur::urContextTest; UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urQueueCreateTest); TEST_P(urQueueCreateTest, Success) { - ur_queue_handle_t queue = nullptr; - ASSERT_SUCCESS(urQueueCreate(context, device, nullptr, &queue)); + uur::raii::Queue queue = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, device, nullptr, queue.ptr())); ASSERT_NE(nullptr, queue); - ASSERT_SUCCESS(urQueueRelease(queue)); + + ur_queue_info_t queue_flags; + ASSERT_SUCCESS(urQueueGetInfo(queue, UR_QUEUE_INFO_FLAGS, + sizeof(ur_queue_info_t), &queue_flags, + nullptr)); + + /* Check that the queue was created without any flag */ + ASSERT_EQ(queue_flags, 0); } using urQueueCreateWithParamTest = uur::urContextTestWithParam; @@ -54,9 +62,87 @@ TEST_P(urQueueCreateWithParamTest, SuccessWithProperties) { nullptr)); ASSERT_TRUE(queueFlags & queryFlag); + // Check that no other bit is set (i.e. is power of 2) + ASSERT_TRUE(queueFlags != 0 && (queueFlags & (queueFlags - 1)) == 0); + ASSERT_SUCCESS(urQueueRelease(queue)); } +/* Creates two queues with the same platform and device, and checks that the + * queried device and platform of both queues match. 
*/ +TEST_P(urQueueCreateWithParamTest, MatchingDeviceHandles) { + ur_queue_flags_t supportedFlags{}; + ASSERT_SUCCESS(uur::GetDeviceQueueOnHostProperties(device, supportedFlags)); + + ur_queue_flags_t queryFlag = getParam(); + if (!(supportedFlags & queryFlag)) { + GTEST_SKIP() << queryFlag << " : is not supported by the device."; + } + + ur_queue_properties_t props = { + /*.stype =*/UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, + /*.pNext =*/nullptr, + /*.flags =*/queryFlag, + }; + + uur::raii::Queue queue1 = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, device, &props, queue1.ptr())); + ASSERT_NE(queue1, nullptr); + + uur::raii::Queue queue2 = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, device, &props, queue2.ptr())); + ASSERT_NE(queue2, nullptr); + + ur_device_handle_t deviceQueue1; + ASSERT_SUCCESS(urQueueGetInfo(queue1, UR_QUEUE_INFO_DEVICE, + sizeof(ur_device_handle_t), &deviceQueue1, + nullptr)); + + ur_device_handle_t deviceQueue2; + ASSERT_SUCCESS(urQueueGetInfo(queue1, UR_QUEUE_INFO_DEVICE, + sizeof(ur_device_handle_t), &deviceQueue2, + nullptr)); + + ASSERT_EQ(deviceQueue1, deviceQueue2); +} + +/* Create a queue and check that it returns the right context*/ +TEST_P(urQueueCreateTest, CheckContext) { + + uur::raii::Queue queue = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, device, nullptr, queue.ptr())); + ASSERT_NE(queue.ptr(), nullptr); + + ur_context_handle_t returned_context = nullptr; + ASSERT_SUCCESS(urQueueGetInfo(queue, UR_QUEUE_INFO_CONTEXT, + sizeof(ur_context_handle_t), + &returned_context, nullptr)); + + ASSERT_EQ(this->context, returned_context); +} + +using urQueueCreateTestMultipleDevices = uur::urAllDevicesTest; + +/* Create a queue using a context from a different device */ +TEST_F(urQueueCreateTestMultipleDevices, ContextFromWrongDevice) { + + if (devices.size() < 2) { + GTEST_SKIP() << "Test requires at least 2 devices in the system"; + } + ur_device_handle_t device1 = devices[0]; + uur::raii::Context context1 = nullptr; + 
urContextCreate(1, &device1, nullptr, context1.ptr()); + + ur_device_handle_t device2 = devices[1]; + uur::raii::Context context2 = nullptr; + urContextCreate(1, &device2, nullptr, context2.ptr()); + + ur_queue_handle_t queue = nullptr; + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_DEVICE, + urQueueCreate(context2, device1, nullptr, &queue)); + ASSERT_EQ(queue, nullptr); +} + TEST_P(urQueueCreateTest, InvalidNullHandleContext) { ur_queue_handle_t queue = nullptr; ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, diff --git a/test/conformance/queue/urQueueCreateWithNativeHandle.cpp b/test/conformance/queue/urQueueCreateWithNativeHandle.cpp index 9f7588601f..2f910198f7 100644 --- a/test/conformance/queue/urQueueCreateWithNativeHandle.cpp +++ b/test/conformance/queue/urQueueCreateWithNativeHandle.cpp @@ -9,8 +9,9 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urQueueCreateWithNativeHandleTest); TEST_P(urQueueCreateWithNativeHandleTest, Success) { ur_native_handle_t native_handle = nullptr; - if (urQueueGetNativeHandle(queue, nullptr, &native_handle)) { - GTEST_SKIP(); + { + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urQueueGetNativeHandle(queue, nullptr, &native_handle)); } // We cannot assume anything about a native_handle, not even if it's diff --git a/test/conformance/sampler/urSamplerCreateWithNativeHandle.cpp b/test/conformance/sampler/urSamplerCreateWithNativeHandle.cpp index 342e8f074e..c5bf46bb13 100644 --- a/test/conformance/sampler/urSamplerCreateWithNativeHandle.cpp +++ b/test/conformance/sampler/urSamplerCreateWithNativeHandle.cpp @@ -11,8 +11,9 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urSamplerCreateWithNativeHandleTest); TEST_P(urSamplerCreateWithNativeHandleTest, Success) { ur_native_handle_t native_sampler = nullptr; - if (urSamplerGetNativeHandle(sampler, &native_sampler)) { - GTEST_SKIP(); + { + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( + urSamplerGetNativeHandle(sampler, &native_sampler)); } // We cannot assume anything about a native_handle, not even if it's @@ -21,8 
+22,8 @@ TEST_P(urSamplerCreateWithNativeHandleTest, Success) { // and perform some query on it to verify that it works. ur_sampler_handle_t hSampler = nullptr; ur_sampler_native_properties_t props{}; - ASSERT_SUCCESS(urSamplerCreateWithNativeHandle(native_sampler, context, - &props, &hSampler)); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urSamplerCreateWithNativeHandle( + native_sampler, context, &props, &hSampler)); ASSERT_NE(hSampler, nullptr); ur_sampler_addressing_mode_t addr_mode; diff --git a/test/conformance/source/environment.cpp b/test/conformance/source/environment.cpp index a58b3ecdbd..209bc8f9a2 100644 --- a/test/conformance/source/environment.cpp +++ b/test/conformance/source/environment.cpp @@ -367,7 +367,7 @@ KernelsEnvironment::parseKernelOptions(int argc, char **argv, return options; } -std::string KernelsEnvironment::getSupportedILPostfix(uint32_t device_index) { +std::string KernelsEnvironment::getTargetName() { std::stringstream IL; if (instance->GetDevices().size() == 0) { @@ -382,66 +382,44 @@ std::string KernelsEnvironment::getSupportedILPostfix(uint32_t device_index) { error = "failed to get backend from platform."; return {}; } - if (backend == UR_PLATFORM_BACKEND_HIP) { - return ".bin"; - } - auto device = instance->GetDevices()[device_index]; - std::string IL_version; - if (uur::GetDeviceILVersion(device, IL_version)) { - error = "failed to get device IL version"; + std::string target = ""; + switch (backend) { + case UR_PLATFORM_BACKEND_OPENCL: + case UR_PLATFORM_BACKEND_LEVEL_ZERO: + return "spir64"; + case UR_PLATFORM_BACKEND_CUDA: + return "nvptx64-nvidia-cuda"; + case UR_PLATFORM_BACKEND_HIP: + return "amdgcn-amd-amdhsa"; + case UR_PLATFORM_BACKEND_NATIVE_CPU: + error = "native_cpu doesn't support kernel tests yet"; return {}; - } - - // TODO: This potentially needs updating as more adapters are tested. 
- if (IL_version.find("SPIR-V") != std::string::npos) { - IL << ".spv"; - } else if (IL_version.find("nvptx") != std::string::npos) { - IL << ".bin"; - } else { - error = "Undefined IL version: " + IL_version; + default: + error = "unknown target."; return {}; } - - return IL.str(); } std::string -KernelsEnvironment::getKernelSourcePath(const std::string &kernel_name, - uint32_t device_index) { +KernelsEnvironment::getKernelSourcePath(const std::string &kernel_name) { std::stringstream path; path << kernel_options.kernel_directory << "/" << kernel_name; - std::string il_postfix = getSupportedILPostfix(device_index); - - if (il_postfix.empty()) { - return {}; - } - - std::string binary_name; - for (const auto &entry : filesystem::directory_iterator(path.str())) { - auto file_name = entry.path().filename().string(); - if (file_name.find(il_postfix) != std::string::npos) { - binary_name = file_name; - break; - } - } - if (binary_name.empty()) { - error = - "failed retrieving kernel source path for kernel: " + kernel_name; + std::string target_name = getTargetName(); + if (target_name.empty()) { return {}; } - path << "/" << binary_name; + path << "/" << target_name << ".bin.0"; return path.str(); } void KernelsEnvironment::LoadSource( - const std::string &kernel_name, uint32_t device_index, + const std::string &kernel_name, std::shared_ptr> &binary_out) { - std::string source_path = - instance->getKernelSourcePath(kernel_name, device_index); + std::string source_path = instance->getKernelSourcePath(kernel_name); if (source_path.empty()) { FAIL() << error; @@ -487,9 +465,10 @@ ur_result_t KernelsEnvironment::CreateProgram( nullptr)) { return error; } - if (backend == UR_PLATFORM_BACKEND_HIP) { - // The HIP adapter does not support urProgramCreateWithIL so we need to - // use urProgramCreateWithBinary instead. 
+ if (backend == UR_PLATFORM_BACKEND_HIP || + backend == UR_PLATFORM_BACKEND_CUDA) { + // The CUDA and HIP adapters do not support urProgramCreateWithIL so we + // need to use urProgramCreateWithBinary instead. if (auto error = urProgramCreateWithBinary( hContext, hDevice, binary.size(), reinterpret_cast(binary.data()), properties, diff --git a/test/conformance/testing/include/uur/environment.h b/test/conformance/testing/include/uur/environment.h index bcb7e46661..acd255a3c2 100644 --- a/test/conformance/testing/include/uur/environment.h +++ b/test/conformance/testing/include/uur/environment.h @@ -72,7 +72,7 @@ struct KernelsEnvironment : DevicesEnvironment { virtual void SetUp() override; virtual void TearDown() override; - void LoadSource(const std::string &kernel_name, uint32_t device_index, + void LoadSource(const std::string &kernel_name, std::shared_ptr> &binary_out); ur_result_t CreateProgram(ur_platform_handle_t hPlatform, @@ -89,9 +89,8 @@ struct KernelsEnvironment : DevicesEnvironment { private: KernelOptions parseKernelOptions(int argc, char **argv, const std::string &kernels_default_dir); - std::string getKernelSourcePath(const std::string &kernel_name, - uint32_t device_index); - std::string getSupportedILPostfix(uint32_t device_index); + std::string getKernelSourcePath(const std::string &kernel_name); + std::string getTargetName(); KernelOptions kernel_options; // mapping between kernels (full_path + kernel_name) and their saved source. 
diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h index 55f3f7c4f4..cf64aa13d3 100644 --- a/test/conformance/testing/include/uur/fixtures.h +++ b/test/conformance/testing/include/uur/fixtures.h @@ -20,6 +20,14 @@ } \ (void)0 +#define UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(ret) \ + auto status = ret; \ + if (status == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { \ + GTEST_SKIP(); \ + } else { \ + ASSERT_EQ(status, UR_RESULT_SUCCESS); \ + } + namespace uur { struct urPlatformTest : ::testing::Test { @@ -321,7 +329,8 @@ template struct urMemImageTestWithParam : urContextTestWithParam { struct urQueueTest : urContextTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urContextTest::SetUp()); - ASSERT_SUCCESS(urQueueCreate(context, device, 0, &queue)); + ASSERT_SUCCESS( + urQueueCreate(context, device, &queue_properties, &queue)); ASSERT_NE(queue, nullptr); } @@ -332,6 +341,8 @@ struct urQueueTest : urContextTest { UUR_RETURN_ON_FATAL_FAILURE(urContextTest::TearDown()); } + ur_queue_properties_t queue_properties = { + UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr, 0}; ur_queue_handle_t queue = nullptr; }; @@ -339,7 +350,7 @@ struct urHostPipeTest : urQueueTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::SetUp()); UUR_RETURN_ON_FATAL_FAILURE( - uur::KernelsEnvironment::instance->LoadSource("foo", 0, il_binary)); + uur::KernelsEnvironment::instance->LoadSource("foo", il_binary)); ASSERT_SUCCESS(uur::KernelsEnvironment::instance->CreateProgram( platform, context, device, *il_binary, nullptr, &program)); @@ -382,7 +393,9 @@ struct urHostPipeTest : urQueueTest { template struct urQueueTestWithParam : urContextTestWithParam { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::SetUp()); - ASSERT_SUCCESS(urQueueCreate(this->context, this->device, 0, &queue)); + ASSERT_SUCCESS(urQueueCreate(this->context, this->device, + &queue_properties, &queue)); + ASSERT_NE(queue, nullptr); 
} void TearDown() override { @@ -391,7 +404,8 @@ template struct urQueueTestWithParam : urContextTestWithParam { } UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::TearDown()); } - + ur_queue_properties_t queue_properties = { + UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr, 0}; ur_queue_handle_t queue; }; @@ -1121,7 +1135,7 @@ struct urProgramTest : urQueueTest { GTEST_SKIP(); } UUR_RETURN_ON_FATAL_FAILURE( - uur::KernelsEnvironment::instance->LoadSource(program_name, 0, + uur::KernelsEnvironment::instance->LoadSource(program_name, il_binary)); const ur_program_properties_t properties = { @@ -1145,9 +1159,9 @@ struct urProgramTest : urQueueTest { std::vector metadatas{}; }; -template struct urProgramTestWithParam : urContextTestWithParam { +template struct urProgramTestWithParam : urQueueTestWithParam { void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::SetUp()); + UUR_RETURN_ON_FATAL_FAILURE(urQueueTestWithParam::SetUp()); ur_platform_backend_t backend; ASSERT_SUCCESS(urPlatformGetInfo(this->platform, @@ -1160,7 +1174,7 @@ template struct urProgramTestWithParam : urContextTestWithParam { } UUR_RETURN_ON_FATAL_FAILURE( - uur::KernelsEnvironment::instance->LoadSource(program_name, 0, + uur::KernelsEnvironment::instance->LoadSource(program_name, il_binary)); ASSERT_SUCCESS(uur::KernelsEnvironment::instance->CreateProgram( this->platform, this->context, this->device, *il_binary, nullptr, @@ -1171,7 +1185,7 @@ template struct urProgramTestWithParam : urContextTestWithParam { if (program) { EXPECT_SUCCESS(urProgramRelease(program)); } - UUR_RETURN_ON_FATAL_FAILURE(urContextTestWithParam::TearDown()); + UUR_RETURN_ON_FATAL_FAILURE(urQueueTestWithParam::TearDown()); } std::shared_ptr> il_binary; @@ -1246,20 +1260,16 @@ template struct urKernelTestWithParam : urBaseKernelTestWithParam { } }; -struct urBaseKernelExecutionTest : urBaseKernelTest { - void SetUp() override { - UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTest::SetUp()); - } +struct 
KernelLaunchHelper { - void TearDown() override { - for (auto &buffer : buffer_args) { - ASSERT_SUCCESS(urMemRelease(buffer)); - } - UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTest::TearDown()); - } + KernelLaunchHelper(ur_platform_handle_t &platform, + ur_context_handle_t &context, ur_kernel_handle_t &kernel, + ur_queue_handle_t &queue) + : platform{platform}, context{context}, kernel{kernel}, queue{queue} {} // Adds a kernel arg representing a sycl buffer constructed with a 1D range. - void AddBuffer1DArg(size_t size, ur_mem_handle_t *out_buffer) { + void AddBuffer1DArg(size_t size, ur_mem_handle_t *out_buffer, + size_t *buffer_index) { ur_mem_handle_t mem_handle = nullptr; ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, size, nullptr, &mem_handle)); @@ -1270,6 +1280,9 @@ struct urBaseKernelExecutionTest : urBaseKernelTest { ASSERT_SUCCESS(urQueueFinish(queue)); ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, current_arg_index, nullptr, mem_handle)); + if (buffer_index) { + *buffer_index = current_arg_index; + } // SYCL device kernels have different interfaces depending on the // backend being used. 
Typically a kernel which takes a buffer argument @@ -1301,8 +1314,6 @@ struct urBaseKernelExecutionTest : urBaseKernelTest { &accessor)); current_arg_index += 2; } - - buffer_args.push_back(mem_handle); *out_buffer = mem_handle; } @@ -1340,7 +1351,11 @@ struct urBaseKernelExecutionTest : urBaseKernelTest { ValidateBuffer(buffer, size, validator); } - std::vector buffer_args; + ur_platform_handle_t &platform; + ur_context_handle_t &context; + ur_kernel_handle_t &kernel; + ur_queue_handle_t &queue; + uint32_t current_arg_index = 0; }; @@ -1349,10 +1364,6 @@ struct urBaseKernelExecutionTestWithParam : urBaseKernelTestWithParam { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam::SetUp()); UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam::Build()); - context = urBaseKernelTestWithParam::context; - kernel = urBaseKernelTestWithParam::kernel; - ASSERT_SUCCESS(urQueueCreate( - context, urBaseKernelTestWithParam::device, 0, &queue)); } void TearDown() override { @@ -1360,91 +1371,85 @@ struct urBaseKernelExecutionTestWithParam : urBaseKernelTestWithParam { ASSERT_SUCCESS(urMemRelease(buffer)); } UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam::TearDown()); - if (queue) { - EXPECT_SUCCESS(urQueueRelease(queue)); - } } - // Adds a kernel arg representing a sycl buffer constructed with a 1D range. 
- void AddBuffer1DArg(size_t size, ur_mem_handle_t *out_buffer) { - ur_mem_handle_t mem_handle = nullptr; - ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, size, - nullptr, &mem_handle)); - char zero = 0; - ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, mem_handle, &zero, - sizeof(zero), 0, size, 0, nullptr, - nullptr)); - ASSERT_SUCCESS(urQueueFinish(queue)); - ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, current_arg_index, nullptr, - mem_handle)); + void AddBuffer1DArg(size_t size, ur_mem_handle_t *out_buffer, + size_t *buffer_index = nullptr) { + helper.AddBuffer1DArg(size, out_buffer, buffer_index); + buffer_args.push_back(*out_buffer); + } - // SYCL device kernels have different interfaces depending on the - // backend being used. Typically a kernel which takes a buffer argument - // will take a pointer to the start of the buffer and a sycl::id param - // which is a struct that encodes the accessor to the buffer. However - // the AMD backend handles this differently and uses three separate - // arguments for each of the three dimensions of the accessor. + template void AddPodArg(K data) { helper.AddPodArg(data); } - ur_platform_backend_t backend; - ASSERT_SUCCESS(urPlatformGetInfo(urBaseKernelTestWithParam::platform, - UR_PLATFORM_INFO_BACKEND, - sizeof(backend), &backend, nullptr)); - if (backend == UR_PLATFORM_BACKEND_HIP) { - // this emulates the three offset params for buffer accessor on AMD. - size_t val = 0; - ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1, - sizeof(size_t), nullptr, &val)); - ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 2, - sizeof(size_t), nullptr, &val)); - ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 3, - sizeof(size_t), nullptr, &val)); - current_arg_index += 4; - } else { - // This emulates the offset struct sycl adds for a 1D buffer accessor. 
- struct { - size_t offsets[1] = {0}; - } accessor; - ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1, - sizeof(accessor), nullptr, - &accessor)); - current_arg_index += 2; + void Launch1DRange(size_t global_size, size_t local_size = 1) { + helper.Launch1DRange(global_size, local_size); + } + + template + void ValidateBuffer(ur_mem_handle_t buffer, size_t size, + std::function validator) { + helper.ValidateBuffer(buffer, size, validator); + } + + template + void ValidateBuffer(ur_mem_handle_t buffer, size_t size, K value) { + helper.ValidateBuffer(buffer, size, value); + } + + private: + KernelLaunchHelper helper = KernelLaunchHelper{ + this->platform, this->context, this->kernel, this->queue}; + std::vector buffer_args; +}; + +struct urBaseKernelExecutionTest : urBaseKernelTest { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTest::SetUp()); + } + + void TearDown() override { + for (auto &buffer : buffer_args) { + ASSERT_SUCCESS(urMemRelease(buffer)); } + UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTest::TearDown()); + } - buffer_args.push_back(mem_handle); - *out_buffer = mem_handle; + void AddBuffer1DArg(size_t size, ur_mem_handle_t *out_buffer, + size_t *buffer_index = nullptr) { + helper.AddBuffer1DArg(size, out_buffer, buffer_index); + buffer_args.push_back(*out_buffer); } - template void AddPodArg(U data) { - ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index, - sizeof(data), nullptr, &data)); - current_arg_index++; + template void AddPodArg(T data) { helper.AddPodArg(data); } + + void Launch1DRange(size_t global_size, size_t local_size = 1) { + helper.Launch1DRange(global_size, local_size); } - // Validate the contents of `buffer` according to the given validator. 
- template + template void ValidateBuffer(ur_mem_handle_t buffer, size_t size, - std::function validator) { - std::vector read_buffer(size / sizeof(U)); - ASSERT_SUCCESS(urEnqueueMemBufferRead(queue, buffer, true, 0, size, - read_buffer.data(), 0, nullptr, - nullptr)); - ASSERT_TRUE( - std::all_of(read_buffer.begin(), read_buffer.end(), validator)); + std::function validator) { + helper.ValidateBuffer(buffer, size, validator); } - // Helper that uses the generic validate function to check for a given value. - template - void ValidateBuffer(ur_mem_handle_t buffer, size_t size, U value) { - auto validator = [&value](U result) -> bool { return result == value; }; - - ValidateBuffer(buffer, size, validator); + template + void ValidateBuffer(ur_mem_handle_t buffer, size_t size, T value) { + helper.ValidateBuffer(buffer, size, value); } + private: + KernelLaunchHelper helper = + KernelLaunchHelper{platform, context, kernel, queue}; std::vector buffer_args; - uint32_t current_arg_index = 0; - ur_context_handle_t context; - ur_kernel_handle_t kernel; - ur_queue_handle_t queue; +}; + +template +struct urKernelExecutionTestWithParam : urBaseKernelExecutionTestWithParam { + void SetUp() { + UUR_RETURN_ON_FATAL_FAILURE( + urBaseKernelExecutionTestWithParam::SetUp()); + this->Build(); + } }; struct urKernelExecutionTest : urBaseKernelExecutionTest { diff --git a/test/conformance/testing/include/uur/utils.h b/test/conformance/testing/include/uur/utils.h index d2341cc31b..34e5f7768b 100644 --- a/test/conformance/testing/include/uur/utils.h +++ b/test/conformance/testing/include/uur/utils.h @@ -395,6 +395,8 @@ ur_result_t GetDeviceMaxComputeQueueIndices(ur_device_handle_t device, uint32_t &max_indices); ur_result_t GetDeviceHostPipeRWSupported(ur_device_handle_t device, bool &support); +ur_result_t GetTimestampRecordingSupport(ur_device_handle_t device, + bool &support); ur_device_partition_property_t makePartitionByCountsDesc(uint32_t count); diff --git 
a/test/conformance/testing/source/utils.cpp b/test/conformance/testing/source/utils.cpp index 6cb8d3a90c..ab78f64bbc 100644 --- a/test/conformance/testing/source/utils.cpp +++ b/test/conformance/testing/source/utils.cpp @@ -635,6 +635,12 @@ ur_result_t GetDeviceHostPipeRWSupported(ur_device_handle_t device, device, UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED, support); } +ur_result_t GetTimestampRecordingSupport(ur_device_handle_t device, + bool &support) { + return GetDeviceInfo( + device, UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP, support); +} + ur_device_partition_property_t makePartitionByCountsDesc(uint32_t count) { ur_device_partition_property_t desc; desc.type = UR_DEVICE_PARTITION_BY_COUNTS; diff --git a/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match b/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match index dbb7cdebd5..9cda954748 100644 --- a/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match +++ b/test/conformance/virtual_memory/virtual_memory_adapter_level_zero.match @@ -1,88 +1,10 @@ -urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 -urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 -urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 -urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 -urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 +{{OPT}}urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 +{{OPT}}urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 +{{OPT}}urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 urPhysicalMemCreateTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 
-urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 -urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 -urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 -urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 -urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 -urPhysicalMemCreateTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 -urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 -urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 -urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 -urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 -urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 -urPhysicalMemCreateTest.InvalidNullHandleDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 -urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 -urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 -urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 -urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 -urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 
-urPhysicalMemCreateTest.InvalidNullPointerPhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1 urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___3 urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___7 urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___12 urPhysicalMemCreateTest.InvalidSize/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___44 -urPhysicalMemReleaseTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urPhysicalMemReleaseTest.InvalidNullHandlePhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urPhysicalMemRetainTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urPhysicalMemRetainTest.InvalidNullHandlePhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemFreeTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemFreeTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemFreeTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGetInfoTestWithParam.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_VIRTUAL_MEM_INFO_ACCESS_MODE -urVirtualMemGetInfoTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGetInfoTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGetInfoTest.InvalidEnumerationInfo/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
-urVirtualMemGranularityGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_VIRTUAL_MEM_GRANULARITY_INFO_MINIMUM -urVirtualMemGranularityGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_VIRTUAL_MEM_GRANULARITY_INFO_RECOMMENDED -urVirtualMemGranularityGetInfoNegativeTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGranularityGetInfoNegativeTest.InvalidEnumeration/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGranularityGetInfoNegativeTest.InvalidNullPointerPropSizeRet/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGranularityGetInfoNegativeTest.InvalidNullPointerPropValue/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGranularityGetInfoNegativeTest.InvalidPropSizeZero/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemGranularityGetInfoNegativeTest.InvalidSizePropSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemMapTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemMapTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemMapTest.InvalidNullHandlePhysicalMem/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemMapTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemMapTest.InvalidEnumerationFlags/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___8 
-urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___16 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___32 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___64 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___128 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___256 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___512 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2048 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___5000 -urVirtualMemReserveTestWithParam.SuccessNoStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___100000 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___4 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___8 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___16 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___32 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___64 
-urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___128 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___256 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___512 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___1024 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___2048 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___5000 -urVirtualMemReserveTestWithParam.SuccessWithStartPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___100000 -urVirtualMemReserveTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemReserveTest.InvalidNullPointer/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemSetAccessTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemSetAccessTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemSetAccessTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemUnmapTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemUnmapTest.InvalidNullHandleContext/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urVirtualMemUnmapTest.InvalidNullPointerStart/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git a/test/layers/tracing/CMakeLists.txt b/test/layers/tracing/CMakeLists.txt index 2ccb4f69b0..27c94b5cfd 100644 --- a/test/layers/tracing/CMakeLists.txt +++ b/test/layers/tracing/CMakeLists.txt @@ -43,6 +43,20 @@ add_test(NAME example-collected-hello-world set_tracing_test_props(example-collected-hello-world 
collector) +add_test(NAME example-logged-hello-world + COMMAND ${CMAKE_COMMAND} + -D MODE=stdout + -D TEST_FILE=$ + -D MATCH_FILE=${CMAKE_CURRENT_SOURCE_DIR}/hello_world.out.logged.match + -P ${PROJECT_SOURCE_DIR}/cmake/match.cmake + DEPENDS hello_world +) +set_tests_properties(example-logged-hello-world PROPERTIES LABELS "tracing") +set_property(TEST example-logged-hello-world PROPERTY ENVIRONMENT + "UR_LOG_TRACING=level:info\;output:stdout" + "UR_ADAPTERS_FORCE_LOAD=\"$\"" + "UR_ENABLE_LAYERS=UR_LAYER_TRACING") + function(add_tracing_test name) set(TEST_TARGET_NAME tracing-test-${name}) add_ur_executable(${TEST_TARGET_NAME} diff --git a/test/layers/tracing/hello_world.out.logged.match b/test/layers/tracing/hello_world.out.logged.match new file mode 100644 index 0000000000..5bd7a33eae --- /dev/null +++ b/test/layers/tracing/hello_world.out.logged.match @@ -0,0 +1,13 @@ +Platform initialized. +---> urAdapterGet(.NumEntries = 0, .phAdapters = {{.*}}, .pNumAdapters = {{.*}} (1)) -> UR_RESULT_SUCCESS; +---> urAdapterGet(.NumEntries = 1, .phAdapters = {{.*}}, .pNumAdapters = nullptr) -> UR_RESULT_SUCCESS; +---> urPlatformGet(.phAdapters = {{.*}}, .NumAdapters = 1, .NumEntries = 1, .phPlatforms = {{.*}}, .pNumPlatforms = {{.*}} (1)) -> UR_RESULT_SUCCESS; +---> urPlatformGet(.phAdapters = {{.*}}, .NumAdapters = 1, .NumEntries = 1, .phPlatforms = {{.*}}, .pNumPlatforms = nullptr) -> UR_RESULT_SUCCESS; +---> urPlatformGetApiVersion(.hPlatform = {{.*}}, .pVersion = {{.*}} ({{0\.[0-9]+}})) -> UR_RESULT_SUCCESS; +API version: {{0\.[0-9]+}} +---> urDeviceGet(.hPlatform = {{.*}}, .DeviceType = UR_DEVICE_TYPE_GPU, .NumEntries = 0, .phDevices = {}, .pNumDevices = {{.*}} (1)) -> UR_RESULT_SUCCESS; +---> urDeviceGet(.hPlatform = {{.*}}, .DeviceType = UR_DEVICE_TYPE_GPU, .NumEntries = 1, .phDevices = {{.*}}, .pNumDevices = nullptr) -> UR_RESULT_SUCCESS; +---> urDeviceGetInfo(.hDevice = {{.*}}, .propName = UR_DEVICE_INFO_TYPE, .propSize = 4, .pPropValue = {{.*}} (UR_DEVICE_TYPE_GPU), 
.pPropSizeRet = nullptr) -> UR_RESULT_SUCCESS; +---> urDeviceGetInfo(.hDevice = {{.*}}, .propName = UR_DEVICE_INFO_NAME, .propSize = {{.*}}, .pPropValue = {{.*}} (Null Device), .pPropSizeRet = nullptr) -> UR_RESULT_SUCCESS; +Found a Null Device gpu. +---> urAdapterRelease(.hAdapter = {{.*}}) -> UR_RESULT_SUCCESS; diff --git a/third_party/requirements.txt b/third_party/requirements.txt index 0211d29d6b..45c2184890 100644 --- a/third_party/requirements.txt +++ b/third_party/requirements.txt @@ -10,9 +10,9 @@ clang-format==15.0.7 colorama==0.4.1 docutils==0.15.2 exhale==0.3.0 -idna==2.8 +idna==3.7 imagesize==1.1.0 -Jinja2==3.1.3 +Jinja2==3.1.4 lxml==4.9.3 Mako==1.3.0 MarkupSafe==2.1.5 diff --git a/tools/urinfo/urinfo.hpp b/tools/urinfo/urinfo.hpp index 111726f6cc..caf61e86f6 100644 --- a/tools/urinfo/urinfo.hpp +++ b/tools/urinfo/urinfo.hpp @@ -383,5 +383,26 @@ inline void printDeviceInfos(ur_device_handle_t hDevice, std::cout << prefix; printDeviceInfo( hDevice, UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP); + std::cout << prefix; + printDeviceInfo( + hDevice, UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_USM_EXP); + std::cout << prefix; + printDeviceInfo( + hDevice, UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_1D_EXP); + std::cout << prefix; + printDeviceInfo( + hDevice, UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_USM_EXP); + std::cout << prefix; + printDeviceInfo( + hDevice, UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D_EXP); + std::cout << prefix; + printDeviceInfo( + hDevice, UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM_EXP); + std::cout << prefix; + printDeviceInfo( + hDevice, UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_EXP); + std::cout << prefix; + printDeviceInfo(hDevice, + UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP); } } // namespace urinfo