diff --git a/.github/workflows/benchmarks_core.yml b/.github/workflows/benchmarks_compute.yml similarity index 69% rename from .github/workflows/benchmarks_core.yml rename to .github/workflows/benchmarks_compute.yml index 34ffb7134e..2264b315b9 100644 --- a/.github/workflows/benchmarks_core.yml +++ b/.github/workflows/benchmarks_compute.yml @@ -1,39 +1,39 @@ name: Compute Benchmarks on: - # this workflow can by only triggered by other workflows - # for example by: e2e_cuda.yml or e2e_opencl.yml - workflow_call: - # acceptable input from adapter-specific workflows + # Can be triggered via manual "dispatch" (from workflow view in GitHub Actions tab) + workflow_dispatch: + # acceptable input for adapter-specific runs inputs: - name: - description: Adapter name - type: string - required: true str_name: description: Formatted adapter name - type: string - required: true - config: - description: Params for sycl configuration - type: string + type: choice required: true + default: 'level_zero' + options: + - level_zero unit: description: Test unit (cpu/gpu) - type: string + type: choice required: true - runner_tag: - description: Tag defined for the runner - type: string + default: 'gpu' + options: + - cpu + - gpu + pr_no: + description: PR number (if 0, it'll run on the main) + type: number required: true - trigger: - description: Type of workflow trigger + bench_script_params: + description: Parameters passed to script executing benchmark type: string - required: true - comment: - description: Text if triggered by a comment + required: false + default: '' + sycl_config_params: + description: Extra params for SYCL configuration type: string required: false + default: '' permissions: contents: read @@ -41,20 +41,20 @@ permissions: jobs: e2e-build-hw: - if: github.repository == 'oneapi-src/unified-runtime' # run only on upstream; forks will not have the HW + # Run only on upstream; forks will not have the HW + if: github.repository == 'oneapi-src/unified-runtime' name: 
Build SYCL, UR, run Compute Benchmarks strategy: matrix: adapter: [ - {name: "${{inputs.name}}", - str_name: "${{inputs.str_name}}", - config: "${{inputs.config}}", + {str_name: "${{inputs.str_name}}", + sycl_config: "${{inputs.sycl_config_params}}", unit: "${{inputs.unit}}"} ] build_type: [Release] compiler: [{c: clang, cxx: clang++}] - runs-on: ${{inputs.runner_tag}} + runs-on: "${{inputs.str_name}}_PERF" steps: # Workspace on self-hosted runners is not cleaned automatically. @@ -67,15 +67,17 @@ jobs: - name: Add comment to PR uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 - if: ${{ always() && inputs.trigger != 'schedule' }} + if: ${{ always() && inputs.pr_no != 0 }} with: script: | - const adapter = '${{ matrix.adapter.name }}'; + const pr_no = '${{ inputs.pr_no }}'; + const adapter = '${{ matrix.adapter.str_name }}'; const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}'; - const body = `Compute Benchmarks ${adapter} run: \n${url}`; + const params = '${{ inputs.bench_script_params }}'; + const body = `Compute Benchmarks ${adapter} run (with params: ${params}):\n${url}`; github.rest.issues.createComment({ - issue_number: context.issue.number, + issue_number: pr_no, owner: context.repo.owner, repo: context.repo.repo, body: body @@ -86,13 +88,12 @@ jobs: with: path: ur-repo - # On issue_comment trigger (for PRs) we need to fetch special ref for - # proper PR's merge commit. Note, this ref may be absent if the PR is already merged. + # We need to fetch special ref for proper PR's merge commit. Note, this ref may be absent if the PR is already merged. 
- name: Fetch PR's merge commit - if: ${{ inputs.trigger != 'schedule' }} + if: ${{ inputs.pr_no != 0 }} working-directory: ${{github.workspace}}/ur-repo env: - PR_NO: ${{github.event.issue.number}} + PR_NO: ${{ inputs.pr_no }} run: | git fetch -- https://github.com/${{github.repository}} +refs/pull/${PR_NO}/*:refs/remotes/origin/pr/${PR_NO}/* git checkout origin/pr/${PR_NO}/merge @@ -108,7 +109,7 @@ jobs: fetch-tags: false - name: Set CUDA env vars - if: matrix.adapter.name == 'CUDA' + if: matrix.adapter.str_name == 'cuda' run: | echo "CUDA_LIB_PATH=/usr/local/cuda/lib64/stubs" >> $GITHUB_ENV echo "LD_LIBRARY_PATH=/usr/local/cuda/compat/:/usr/local/cuda/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV @@ -119,7 +120,7 @@ jobs: -t ${{matrix.build_type}} -o ${{github.workspace}}/sycl_build --cmake-gen "Ninja" - --ci-defaults ${{matrix.adapter.config}} + --ci-defaults ${{matrix.adapter.sycl_config}} --cmake-opt="-DLLVM_INSTALL_UTILS=ON" --cmake-opt="-DSYCL_PI_TESTS=OFF" --cmake-opt="-DSYCL_PI_UR_USE_FETCH_CONTENT=OFF" @@ -165,17 +166,13 @@ jobs: run: | echo "ONEAPI_DEVICE_SELECTOR=${{ matrix.adapter.str_name }}:${{ matrix.adapter.unit }}" >> $GITHUB_ENV - - name: Extract arguments from comment - id: args - run: echo "ARGS=$(echo '${{ inputs.comment }}' | sed -n 's/.*\/benchmarks-[^ ]* \(.*\)/\1/p')" >> $GITHUB_ENV - - name: Run SYCL API Overhead benchmark id: benchmarks - run: ${{github.workspace}}/ur-repo/.github/scripts/compute_benchmarks.py ${{github.workspace}}/compute-benchmarks-build/bin/ $ARGS + run: ${{ github.workspace }}/ur-repo/.github/scripts/compute_benchmarks.py ${{ github.workspace }}/compute-benchmarks-build/bin/ ${{ inputs.bench_script_params }} - name: Add comment to PR uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 - if: ${{ always() && inputs.trigger != 'schedule' }} + if: ${{ always() && inputs.pr_no != 0 }} with: script: | let markdown = "" @@ -185,14 +182,16 @@ jobs: } catch(err) { } - const adapter = '${{ 
matrix.adapter.name }}'; + const pr_no = '${{ inputs.pr_no }}'; + const adapter = '${{ matrix.adapter.str_name }}'; const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}'; const test_status = '${{ steps.benchmarks.outcome }}'; const job_status = '${{ job.status }}'; - const body = `Compute Benchmarks ${adapter} run:\n${url}\nJob status: ${job_status}. Test status: ${test_status}.\n ${markdown}`; + const params = '${{ inputs.bench_script_params }}'; + const body = `Compute Benchmarks ${adapter} run (with params: ${params}):\n${url}\nJob status: ${job_status}. Test status: ${test_status}.\n ${markdown}`; github.rest.issues.createComment({ - issue_number: context.issue.number, + issue_number: pr_no, owner: context.repo.owner, repo: context.repo.repo, body: body diff --git a/.github/workflows/benchmarks_level_zero.yml b/.github/workflows/benchmarks_level_zero.yml deleted file mode 100644 index 6b38aaeedd..0000000000 --- a/.github/workflows/benchmarks_level_zero.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: Compute Benchmarks Level Zero - -on: - issue_comment: - types: [created, edited] - -permissions: - contents: read - pull-requests: write - -jobs: - e2e-build-hw: - # trigger only if PR comment contains "benchmarks-level-zero" - if: ${{ (github.event.issue.pull_request && contains(github.event.comment.body, '/benchmarks-level-zero'))}} - name: Start benchmarks job - # use core flow, run it with L0 specific parameters - uses: ./.github/workflows/benchmarks_core.yml - # parameters that we pass to the core flow - with: - name: "L0" - runner_tag: "L0_PERF" - str_name: "level_zero" - config: "" - unit: "gpu" - trigger: "${{github.event_name}}" - comment: ${{github.event.comment.body}} diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 6f04308154..49e68c6b86 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -8,6 +8,7 @@ concurrency: permissions: contents: read + pull-requests: write 
jobs: ubuntu-build: @@ -161,14 +162,12 @@ jobs: run: ctest -C ${{matrix.build_type}} --output-on-failure -L "fuzz-short" --verbose level-zero: - if: github.repository == 'oneapi-src/unified-runtime' name: Level Zero uses: ./.github/workflows/build-hw-reusable.yml with: name: L0 opencl: - if: github.repository == 'oneapi-src/unified-runtime' name: OpenCL uses: ./.github/workflows/build-hw-reusable.yml with: @@ -176,26 +175,47 @@ jobs: platform: "Intel(R) OpenCL" cuda: - if: github.repository == 'oneapi-src/unified-runtime' name: CUDA uses: ./.github/workflows/build-hw-reusable.yml with: name: CUDA hip: - if: github.repository == 'oneapi-src/unified-runtime' name: HIP uses: ./.github/workflows/build-hw-reusable.yml with: name: HIP native-cpu: - if: github.repository == 'oneapi-src/unified-runtime' name: Native CPU uses: ./.github/workflows/build-hw-reusable.yml with: name: NATIVE_CPU + e2e-level-zero: + name: E2E L0 + permissions: + contents: read + pull-requests: write + needs: [ubuntu-build, level-zero] + uses: ./.github/workflows/e2e_level_zero.yml + + e2e-opencl: + name: E2E OpenCL + permissions: + contents: read + pull-requests: write + needs: [ubuntu-build, opencl] + uses: ./.github/workflows/e2e_opencl.yml + + e2e-cuda: + name: E2E CUDA + permissions: + contents: read + pull-requests: write + needs: [ubuntu-build, cuda] + uses: ./.github/workflows/e2e_cuda.yml + windows-build: name: Build - Windows strategy: @@ -211,6 +231,9 @@ jobs: adapter: {name: L0, var: '-DUR_BUILD_ADAPTER_L0=ON'} - adapter: {name: L0, var: '-DUR_BUILD_ADAPTER_L0=ON'} compiler: {c: clang-cl, cxx: clang-cl} + # TODO: testing is flaky on windows-2022 in Release mode + - os: 'windows-2022' + build_type: Release build_type: [Debug, Release] compiler: [{c: cl, cxx: cl}, {c: clang-cl, cxx: clang-cl}] @@ -262,7 +285,6 @@ jobs: working-directory: ${{github.workspace}}/build run: ctest -C ${{matrix.build_type}} --output-on-failure -L "umf|loader|validation|tracing|unit|urtrace" - macos-build: 
name: Build - MacOS strategy: diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index f1ae35587f..b0ed45d6b5 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -1,6 +1,6 @@ name: "CodeQL" -on: [push, pull_request] +on: [push] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -24,7 +24,7 @@ jobs: uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Initialize CodeQL - uses: github/codeql-action/init@b7bf0a3ed3ecfa44160715d7c442788f65f0f923 # v3.23.2 + uses: github/codeql-action/init@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7 with: languages: cpp, python @@ -38,7 +38,7 @@ jobs: run: cmake --build ${{github.workspace}}/build -j $(nproc) - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@b7bf0a3ed3ecfa44160715d7c442788f65f0f923 # v3.23.2 + uses: github/codeql-action/analyze@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7 analyze-windows: name: Analyze on Windows @@ -54,7 +54,7 @@ jobs: uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Initialize CodeQL - uses: github/codeql-action/init@b7bf0a3ed3ecfa44160715d7c442788f65f0f923 # v3.23.2 + uses: github/codeql-action/init@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7 with: languages: cpp, python @@ -68,4 +68,4 @@ jobs: run: cmake --build ${{github.workspace}}/build -j $(nproc) --config Release - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@b7bf0a3ed3ecfa44160715d7c442788f65f0f923 # v3.23.2 + uses: github/codeql-action/analyze@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7 diff --git a/.github/workflows/e2e_core.yml b/.github/workflows/e2e_core.yml index 7f2050d83d..c15f1df101 100644 --- a/.github/workflows/e2e_core.yml +++ b/.github/workflows/e2e_core.yml @@ -30,10 +30,6 @@ on: description: Tag defifned for the runner type: string required: true - trigger: - description: Type of workflow trigger - type: string - 
required: true xfail: description: Allow test failures type: string @@ -56,9 +52,31 @@ permissions: pull-requests: write jobs: + changed-files: + name: Check for changed files + runs-on: ubuntu-22.04 + outputs: + any_changed: ${{ steps.get-changed.outputs.any_changed }} + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + - name: Get changed files + id: get-changed + uses: tj-actions/changed-files@d6babd6899969df1a11d14c368283ea4436bca78 # v44.5.2 + with: + files: | + source/adapters/${{inputs.str_name}}/** + e2e-build-hw: - if: github.repository == 'oneapi-src/unified-runtime' # run only on upstream; forks will not have the HW + # We want to run the job only if there are changes in the specific adapter + if: needs.changed-files.outputs.any_changed == 'true' name: Build SYCL, UR, run E2E + needs: changed-files + permissions: + contents: read + pull-requests: write + + # Allow failures, since SYCL tests and API may be not stable + continue-on-error: true strategy: matrix: adapter: [ @@ -83,39 +101,11 @@ jobs: ls -la ./ rm -rf ./* || true - - name: Add comment to PR - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 - if: ${{ always() && inputs.trigger != 'schedule' }} - with: - script: | - const adapter = '${{ matrix.adapter.name }}'; - const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}'; - const body = `E2E ${adapter} build: \n${url}`; - - github.rest.issues.createComment({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - body: body - }) - - name: Checkout UR uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: path: ur-repo - # On issue_comment trigger (for PRs) we need to fetch special ref for - # proper PR's merge commit. Note, this ref may be absent if the PR is already merged. 
- - name: Fetch PR's merge commit - if: ${{ inputs.trigger != 'schedule' }} - working-directory: ${{github.workspace}}/ur-repo - env: - PR_NO: ${{github.event.issue.number}} - run: | - git fetch -- https://github.com/${{github.repository}} +refs/pull/${PR_NO}/*:refs/remotes/origin/pr/${PR_NO}/* - git checkout origin/pr/${PR_NO}/merge - git rev-parse origin/pr/${PR_NO}/merge - - name: Checkout SYCL uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: @@ -188,20 +178,23 @@ jobs: id: tests run: ninja -C build-e2e check-sycl-e2e - - name: Add comment to PR - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 - if: ${{ always() && inputs.trigger != 'schedule' }} - with: - script: | - const adapter = '${{ matrix.adapter.name }}'; - const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}'; - const test_status = '${{ steps.tests.outcome }}'; - const job_status = '${{ job.status }}'; - const body = `E2E ${adapter} build:\n${url}\nJob status: ${job_status}. Test status: ${test_status}`; - - github.rest.issues.createComment({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - body: body - }) + # FIXME: Requires pull-request: write permissions but this is only granted + # on pull requests from forks if using pull_request_target workflow + # trigger but not the pull_request trigger.. + # - name: Add comment to PR + # uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + # if: ${{ always() }} + # with: + # script: | + # const adapter = '${{ matrix.adapter.name }}'; + # const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}'; + # const test_status = '${{ steps.tests.outcome }}'; + # const job_status = '${{ job.status }}'; + # const body = `E2E ${adapter} build:\n${url}\nJob status: ${job_status}. 
Test status: ${test_status}`; + + # github.rest.issues.createComment({ + # issue_number: context.issue.number, + # owner: context.repo.owner, + # repo: context.repo.repo, + # body: body + # }) diff --git a/.github/workflows/e2e_cuda.yml b/.github/workflows/e2e_cuda.yml index 433e455eba..6bf181b0a4 100644 --- a/.github/workflows/e2e_cuda.yml +++ b/.github/workflows/e2e_cuda.yml @@ -1,11 +1,7 @@ name: E2E Cuda on: - schedule: - # Run every day at 23:00 UTC - - cron: '0 23 * * *' - issue_comment: - types: [created, edited] + workflow_call: permissions: contents: read @@ -13,12 +9,10 @@ permissions: jobs: e2e-build-hw: - # trigger only if PR comment contains "e2e-cuda" - if: ${{ (github.event.issue.pull_request && contains(github.event.comment.body, '/e2e-cuda')) || (github.event_name == 'schedule') }} + if: github.repository == 'oneapi-src/unified-runtime' # run only on upstream; forks will not have the HW name: Start e2e job # use core flow, run it with cuda specific parameters uses: ./.github/workflows/e2e_core.yml - # parameters that we pass to the core flow with: name: "CUDA" runner_tag: "CUDA_E2E" @@ -26,4 +20,3 @@ jobs: prefix: "ext_oneapi_" config: "--cuda" unit: "gpu" - trigger: "${{github.event_name}}" diff --git a/.github/workflows/e2e_level_zero.yml b/.github/workflows/e2e_level_zero.yml index 66c23715ea..5397dabb50 100644 --- a/.github/workflows/e2e_level_zero.yml +++ b/.github/workflows/e2e_level_zero.yml @@ -1,11 +1,7 @@ name: E2E Level Zero on: - schedule: - # Run every day at 23:00 UTC - - cron: '0 23 * * *' - issue_comment: - types: [created, edited] + workflow_call: permissions: contents: read @@ -13,12 +9,10 @@ permissions: jobs: e2e-build-hw: - # trigger only if PR comment contains "e2e-level-zero" - if: ${{ (github.event.issue.pull_request && contains(github.event.comment.body, '/e2e-level-zero')) || (github.event_name == 'schedule') }} + if: github.repository == 'oneapi-src/unified-runtime' # run only on upstream; forks will not have the HW name: 
Start e2e job # use core flow, run it with L0 specific parameters uses: ./.github/workflows/e2e_core.yml - # parameters that we pass to the core flow with: name: "L0" runner_tag: "L0_E2E" @@ -26,7 +20,6 @@ jobs: prefix: "ext_oneapi_" config: "" unit: "gpu" - trigger: "${{github.event_name}}" # Failing tests xfail: "ESIMD/preemption.cpp;syclcompat/atomic/atomic_class.cpp;ProgramManager/uneven_kernel_split.cpp;Plugin/level_zero_ext_intel_queue_index.cpp;Plugin/level_zero_ext_intel_cslice.cpp;Matrix/joint_matrix_rowmajorA_rowmajorB.cpp;Matrix/element_wise_ops.cpp;Matrix/element_wise_all_ops.cpp;Matrix/SG32/element_wise_all_ops.cpp" # Flaky tests diff --git a/.github/workflows/e2e_opencl.yml b/.github/workflows/e2e_opencl.yml index 302a9c995a..5264b668f2 100644 --- a/.github/workflows/e2e_opencl.yml +++ b/.github/workflows/e2e_opencl.yml @@ -1,11 +1,7 @@ name: E2E OpenCL on: - schedule: - # Run every day at 23:00 UTC - - cron: '0 23 * * *' - issue_comment: - types: [created, edited] + workflow_call: permissions: contents: read @@ -13,12 +9,10 @@ permissions: jobs: e2e-build-hw: - # trigger only if PR comment contains "e2e-opencl" - if: ${{ (github.event.issue.pull_request && contains(github.event.comment.body, '/e2e-opencl')) || (github.event_name == 'schedule') }} + if: github.repository == 'oneapi-src/unified-runtime' # run only on upstream; forks will not have the HW name: Start e2e job # use core flow, run it with OpenCL specific parameters uses: ./.github/workflows/e2e_core.yml - # parameters that we pass to the core flow with: name: "OPENCL" runner_tag: "OPENCL" @@ -26,4 +20,3 @@ jobs: prefix: "" config: "" unit: "cpu" - trigger: "${{github.event_name}}" diff --git a/CMakeLists.txt b/CMakeLists.txt index cfd8752657..40fbebcf65 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,7 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED YES) # Build Options +option(UR_BUILD_EXAMPLES "Build example applications." 
ON) option(UR_BUILD_TESTS "Build unit tests." ON) option(UR_BUILD_TOOLS "build ur tools" ON) option(UR_FORMAT_CPP_STYLE "format code style of C++ sources" OFF) @@ -258,7 +259,9 @@ install( EXPORT ${PROJECT_NAME}-targets) add_subdirectory(source) -add_subdirectory(examples) +if(UR_BUILD_EXAMPLES) + add_subdirectory(examples) +endif() if(UR_BUILD_TESTS) add_subdirectory(test) endif() diff --git a/README.md b/README.md index 3d53d2117e..ae61b76b09 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,7 @@ List of options provided by CMake: | Name | Description | Values | Default | | - | - | - | - | +| UR_BUILD_EXAMPLES | Build example applications | ON/OFF | ON | | UR_BUILD_TESTS | Build the tests | ON/OFF | ON | | UR_BUILD_TOOLS | Build tools | ON/OFF | ON | | UR_FORMAT_CPP_STYLE | Format code style | ON/OFF | OFF | diff --git a/cmake/FindCUDACupti.cmake b/cmake/FindCUDACupti.cmake new file mode 100644 index 0000000000..3e8447bcdd --- /dev/null +++ b/cmake/FindCUDACupti.cmake @@ -0,0 +1,26 @@ +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# This is lifted from intel-llvm's FindCUDACupti implementation +# https://github.com/intel/llvm/blob/0cd04144d9ca83371c212e8e4709a59c968291b9/sycl/cmake/modules/FindCUDACupti.cmake + +macro(find_cuda_cupti_library) + find_library(CUDA_cupti_LIBRARY + NAMES cupti + HINTS ${CUDA_TOOLKIT_ROOT_DIR} + ENV CUDA_PATH + PATH_SUFFIXES nvidia/current lib64 lib/x64 lib + ../extras/CUPTI/lib64/ + ../extras/CUPTI/lib/ + ) +endmacro() + +macro(find_cuda_cupti_include_dir) + find_path(CUDA_CUPTI_INCLUDE_DIR cupti.h PATHS + "${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include" + "${CUDA_INCLUDE_DIRS}/../extras/CUPTI/include" + "${CUDA_INCLUDE_DIRS}" + NO_DEFAULT_PATH) +endmacro() diff --git a/cmake/match.py b/cmake/match.py index e138577165..5b96d3008f 100755 --- a/cmake/match.py +++ b/cmake/match.py @@ -63,6 +63,7 @@ def check_status(input_lines, match_lines): class Tag(Enum): OPT = "{{OPT}}" # makes the line optional IGNORE = "{{IGNORE}}" # ignores all input until next match or end of input file + COMMENT = "#" # comment - line ignored ## @brief main function for the match file processing script @@ -76,7 +77,15 @@ def main(): with open(input_file, 'r') as input, open(match_file, 'r') as match: input_lines = input.readlines() - match_lines = match.readlines() + # Filter out empty lines and comments (lines beginning with the comment + # character, ignoring leading whitespace) + match_lines = list( + filter( + lambda line: line.strip() + and not line.lstrip().startswith(Tag.COMMENT.value), + match.readlines(), + ) + ) ignored_lines = [] diff --git a/include/ur_api.h b/include/ur_api.h index 9d88eecbc6..d2d4ca6394 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -224,6 +224,7 @@ typedef enum ur_function_t { UR_FUNCTION_COMMAND_BUFFER_COMMAND_GET_INFO_EXP = 222, ///< Enumerator for ::urCommandBufferCommandGetInfoExp UR_FUNCTION_ENQUEUE_TIMESTAMP_RECORDING_EXP = 223, ///< Enumerator for 
::urEnqueueTimestampRecordingExp UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP = 224, ///< Enumerator for ::urEnqueueKernelLaunchCustomExp + UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE = 225, ///< Enumerator for ::urKernelGetSuggestedLocalWorkSize /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -451,51 +452,50 @@ typedef enum ur_result_t { ///< device UR_RESULT_ERROR_INVALID_IMAGE_SIZE = 33, ///< Invalid image size UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR = 34, ///< Invalid image format descriptor - UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED = 35, ///< Image format not supported - UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE = 36, ///< Memory object allocation failure - UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE = 37, ///< Program object parameter is invalid. - UR_RESULT_ERROR_UNINITIALIZED = 38, ///< [Validation] adapter is not initialized or specific entry-point is not + UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE = 35, ///< Memory object allocation failure + UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE = 36, ///< Program object parameter is invalid. 
+ UR_RESULT_ERROR_UNINITIALIZED = 37, ///< [Validation] adapter is not initialized or specific entry-point is not ///< implemented - UR_RESULT_ERROR_OUT_OF_HOST_MEMORY = 39, ///< Insufficient host memory to satisfy call - UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY = 40, ///< Insufficient device memory to satisfy call - UR_RESULT_ERROR_OUT_OF_RESOURCES = 41, ///< Out of resources - UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE = 42, ///< Error occurred when building program, see build log for details - UR_RESULT_ERROR_PROGRAM_LINK_FAILURE = 43, ///< Error occurred when linking programs, see build log for details - UR_RESULT_ERROR_UNSUPPORTED_VERSION = 44, ///< [Validation] generic error code for unsupported versions - UR_RESULT_ERROR_UNSUPPORTED_FEATURE = 45, ///< [Validation] generic error code for unsupported features - UR_RESULT_ERROR_INVALID_ARGUMENT = 46, ///< [Validation] generic error code for invalid arguments - UR_RESULT_ERROR_INVALID_NULL_HANDLE = 47, ///< [Validation] handle argument is not valid - UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE = 48, ///< [Validation] object pointed to by handle still in-use by device - UR_RESULT_ERROR_INVALID_NULL_POINTER = 49, ///< [Validation] pointer argument may not be nullptr - UR_RESULT_ERROR_INVALID_SIZE = 50, ///< [Validation] invalid size or dimensions (e.g., must not be zero, or is + UR_RESULT_ERROR_OUT_OF_HOST_MEMORY = 38, ///< Insufficient host memory to satisfy call + UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY = 39, ///< Insufficient device memory to satisfy call + UR_RESULT_ERROR_OUT_OF_RESOURCES = 40, ///< Out of resources + UR_RESULT_ERROR_PROGRAM_BUILD_FAILURE = 41, ///< Error occurred when building program, see build log for details + UR_RESULT_ERROR_PROGRAM_LINK_FAILURE = 42, ///< Error occurred when linking programs, see build log for details + UR_RESULT_ERROR_UNSUPPORTED_VERSION = 43, ///< [Validation] generic error code for unsupported versions + UR_RESULT_ERROR_UNSUPPORTED_FEATURE = 44, ///< [Validation] generic error code for 
unsupported features + UR_RESULT_ERROR_INVALID_ARGUMENT = 45, ///< [Validation] generic error code for invalid arguments + UR_RESULT_ERROR_INVALID_NULL_HANDLE = 46, ///< [Validation] handle argument is not valid + UR_RESULT_ERROR_HANDLE_OBJECT_IN_USE = 47, ///< [Validation] object pointed to by handle still in-use by device + UR_RESULT_ERROR_INVALID_NULL_POINTER = 48, ///< [Validation] pointer argument may not be nullptr + UR_RESULT_ERROR_INVALID_SIZE = 49, ///< [Validation] invalid size or dimensions (e.g., must not be zero, or is ///< out of bounds) - UR_RESULT_ERROR_UNSUPPORTED_SIZE = 51, ///< [Validation] size argument is not supported by the device (e.g., too + UR_RESULT_ERROR_UNSUPPORTED_SIZE = 50, ///< [Validation] size argument is not supported by the device (e.g., too ///< large) - UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT = 52, ///< [Validation] alignment argument is not supported by the device (e.g., + UR_RESULT_ERROR_UNSUPPORTED_ALIGNMENT = 51, ///< [Validation] alignment argument is not supported by the device (e.g., ///< too small) - UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT = 53, ///< [Validation] synchronization object in invalid state - UR_RESULT_ERROR_INVALID_ENUMERATION = 54, ///< [Validation] enumerator argument is not valid - UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION = 55, ///< [Validation] enumerator argument is not supported by the device - UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT = 56, ///< [Validation] image format is not supported by the device - UR_RESULT_ERROR_INVALID_NATIVE_BINARY = 57, ///< [Validation] native binary is not supported by the device - UR_RESULT_ERROR_INVALID_GLOBAL_NAME = 58, ///< [Validation] global variable is not found in the program - UR_RESULT_ERROR_INVALID_FUNCTION_NAME = 59, ///< [Validation] function name is not found in the program - UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION = 60, ///< [Validation] group size dimension is not valid for the kernel or + UR_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT = 52, ///< 
[Validation] synchronization object in invalid state + UR_RESULT_ERROR_INVALID_ENUMERATION = 53, ///< [Validation] enumerator argument is not valid + UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION = 54, ///< [Validation] enumerator argument is not supported by the device + UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT = 55, ///< [Validation] image format is not supported by the device + UR_RESULT_ERROR_INVALID_NATIVE_BINARY = 56, ///< [Validation] native binary is not supported by the device + UR_RESULT_ERROR_INVALID_GLOBAL_NAME = 57, ///< [Validation] global variable is not found in the program + UR_RESULT_ERROR_INVALID_FUNCTION_NAME = 58, ///< [Validation] function name is not found in the program + UR_RESULT_ERROR_INVALID_GROUP_SIZE_DIMENSION = 59, ///< [Validation] group size dimension is not valid for the kernel or ///< device - UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION = 61, ///< [Validation] global width dimension is not valid for the kernel or + UR_RESULT_ERROR_INVALID_GLOBAL_WIDTH_DIMENSION = 60, ///< [Validation] global width dimension is not valid for the kernel or ///< device - UR_RESULT_ERROR_PROGRAM_UNLINKED = 62, ///< [Validation] compiled program or program with imports needs to be + UR_RESULT_ERROR_PROGRAM_UNLINKED = 61, ///< [Validation] compiled program or program with imports needs to be ///< linked before kernels can be created from it. 
- UR_RESULT_ERROR_OVERLAPPING_REGIONS = 63, ///< [Validation] copy operations do not support overlapping regions of + UR_RESULT_ERROR_OVERLAPPING_REGIONS = 62, ///< [Validation] copy operations do not support overlapping regions of ///< memory - UR_RESULT_ERROR_INVALID_HOST_PTR = 64, ///< Invalid host pointer - UR_RESULT_ERROR_INVALID_USM_SIZE = 65, ///< Invalid USM size - UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE = 66, ///< Objection allocation failure - UR_RESULT_ERROR_ADAPTER_SPECIFIC = 67, ///< An adapter specific warning/error has been reported and can be + UR_RESULT_ERROR_INVALID_HOST_PTR = 63, ///< Invalid host pointer + UR_RESULT_ERROR_INVALID_USM_SIZE = 64, ///< Invalid USM size + UR_RESULT_ERROR_OBJECT_ALLOCATION_FAILURE = 65, ///< Objection allocation failure + UR_RESULT_ERROR_ADAPTER_SPECIFIC = 66, ///< An adapter specific warning/error has been reported and can be ///< retrieved via the urPlatformGetLastError entry point. - UR_RESULT_ERROR_LAYER_NOT_PRESENT = 68, ///< A requested layer was not found by the loader. - UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS = 69, ///< An event in the provided wait list has ::UR_EVENT_STATUS_ERROR. - UR_RESULT_ERROR_DEVICE_NOT_AVAILABLE = 70, ///< Device in question has `::UR_DEVICE_INFO_AVAILABLE == false` + UR_RESULT_ERROR_LAYER_NOT_PRESENT = 67, ///< A requested layer was not found by the loader. + UR_RESULT_ERROR_IN_EVENT_LIST_EXEC_STATUS = 68, ///< An event in the provided wait list has ::UR_EVENT_STATUS_ERROR. 
+ UR_RESULT_ERROR_DEVICE_NOT_AVAILABLE = 69, ///< Device in question has `::UR_DEVICE_INFO_AVAILABLE == false` UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_EXP = 0x1000, ///< Invalid Command-Buffer UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_EXP = 0x1001, ///< Sync point is not valid for the command-buffer UR_RESULT_ERROR_INVALID_COMMAND_BUFFER_SYNC_POINT_WAIT_LIST_EXP = 0x1002, ///< Sync point wait list is invalid @@ -1188,6 +1188,8 @@ typedef struct ur_platform_native_properties_t { /// - ::UR_RESULT_ERROR_UNINITIALIZED /// - ::UR_RESULT_ERROR_DEVICE_LOST /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hAdapter` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phPlatform` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE @@ -1195,6 +1197,7 @@ typedef struct ur_platform_native_properties_t { UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( ur_native_handle_t hNativePlatform, ///< [in][nocheck] the native handle of the platform. + ur_adapter_handle_t hAdapter, ///< [in] handle of the adapter associated with the native backend. const ur_platform_native_properties_t *pProperties, ///< [in][optional] pointer to native platform properties struct. ur_platform_handle_t *phPlatform ///< [out] pointer to the handle of the platform object created. 
); @@ -2441,13 +2444,11 @@ typedef enum ur_mem_flag_t { /////////////////////////////////////////////////////////////////////////////// /// @brief Memory types typedef enum ur_mem_type_t { - UR_MEM_TYPE_BUFFER = 0, ///< Buffer object - UR_MEM_TYPE_IMAGE2D = 1, ///< 2D image object - UR_MEM_TYPE_IMAGE3D = 2, ///< 3D image object - UR_MEM_TYPE_IMAGE2D_ARRAY = 3, ///< 2D image array object - UR_MEM_TYPE_IMAGE1D = 4, ///< 1D image object - UR_MEM_TYPE_IMAGE1D_ARRAY = 5, ///< 1D image array object - UR_MEM_TYPE_IMAGE1D_BUFFER = 6, ///< 1D image buffer object + UR_MEM_TYPE_IMAGE2D = 0, ///< 2D image object + UR_MEM_TYPE_IMAGE3D = 1, ///< 3D image object + UR_MEM_TYPE_IMAGE2D_ARRAY = 2, ///< 2D image array object + UR_MEM_TYPE_IMAGE1D = 3, ///< 1D image object + UR_MEM_TYPE_IMAGE1D_ARRAY = 4, ///< 1D image array object UR_MEM_TYPE_IMAGE_CUBEMAP_EXP = 0x2000, ///< Experimental cubemap image object /// @cond UR_MEM_TYPE_FORCE_UINT32 = 0x7fffffff @@ -2558,6 +2559,22 @@ typedef struct ur_image_desc_t { /////////////////////////////////////////////////////////////////////////////// /// @brief Create an image object /// +/// @details +/// - The primary ::ur_image_format_t that must be supported by all the +/// adapters are {UR_IMAGE_CHANNEL_ORDER_RGBA, +/// UR_IMAGE_CHANNEL_TYPE_UNORM_INT8}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT16}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT8}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT16}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, 
UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_FLOAT}. +/// /// @remarks /// _Analogues_ /// - **clCreateImage** @@ -2578,12 +2595,13 @@ typedef struct ur_image_desc_t { /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR -/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type` +/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type` /// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE /// - ::UR_RESULT_ERROR_INVALID_OPERATION /// - ::UR_RESULT_ERROR_INVALID_HOST_PTR /// + `pHost == NULL && (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) != 0` /// + `pHost != NULL && (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) == 0` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES UR_APIEXPORT ur_result_t UR_APICALL @@ -5230,6 +5248,43 @@ urKernelCreateWithNativeHandle( ur_kernel_handle_t *phKernel ///< [out] pointer to the handle of the kernel object created. ); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Get the suggested local work size for a kernel. +/// +/// @details +/// - Query a suggested local work size for a kernel given a global size for +/// each dimension. +/// - The application may call this function from simultaneous threads for +/// the same context. 
+/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hKernel` +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkOffset` +/// + `NULL == pGlobalWorkSize` +/// + `NULL == pSuggestedLocalWorkSize` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +UR_APIEXPORT ur_result_t UR_APICALL +urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + uint32_t numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global + ///< and work-group work-items + const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the offset used to calculate the global ID of a work-item + const size_t *pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the number of global work-items in numWorkDim dimensions that will execute the + ///< kernel function + size_t *pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that specify + ///< suggested local work size that will contain the result of the query +); + #if !defined(__GNUC__) #pragma endregion #endif @@ -5250,7 +5305,9 @@ typedef enum ur_queue_info_t { ///< The reference count returned should be considered immediately stale. ///< It is unsuitable for general use in applications. This feature is ///< provided for identifying memory leaks. - UR_QUEUE_INFO_SIZE = 5, ///< [uint32_t] The size of the queue + UR_QUEUE_INFO_SIZE = 5, ///< [uint32_t] The size of the queue on the device. Only a valid query + ///< if the queue was created with the `ON_DEVICE` queue flag, otherwise + ///< `::urQueueGetInfo` will return `::UR_RESULT_ERROR_INVALID_QUEUE`. 
UR_QUEUE_INFO_EMPTY = 6, ///< [::ur_bool_t] return true if the queue was empty at the time of the ///< query /// @cond @@ -5265,7 +5322,8 @@ typedef uint32_t ur_queue_flags_t; typedef enum ur_queue_flag_t { UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE = UR_BIT(0), ///< Enable/disable out of order execution UR_QUEUE_FLAG_PROFILING_ENABLE = UR_BIT(1), ///< Enable/disable profiling - UR_QUEUE_FLAG_ON_DEVICE = UR_BIT(2), ///< Is a device queue + UR_QUEUE_FLAG_ON_DEVICE = UR_BIT(2), ///< Is a device queue. If this is enabled `OUT_OF_ORDER_EXEC_MODE_ENABLE` + ///< must also be enabled. UR_QUEUE_FLAG_ON_DEVICE_DEFAULT = UR_BIT(3), ///< Is the default queue for a device UR_QUEUE_FLAG_DISCARD_EVENTS = UR_BIT(4), ///< Events will be discarded UR_QUEUE_FLAG_PRIORITY_LOW = UR_BIT(5), ///< Low priority queue @@ -5310,7 +5368,7 @@ typedef enum ur_queue_flag_t { /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `propSize != 0 && pPropValue == NULL` /// + `pPropValue == NULL && pPropSizeRet == NULL` -/// - ::UR_RESULT_ERROR_INVALID_QUEUE +/// - ::UR_RESULT_ERROR_INVALID_QUEUE - "If `hQueue` isn't a valid queue handle or if `propName` isn't supported by `hQueue`." 
/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES UR_APIEXPORT ur_result_t UR_APICALL @@ -7499,7 +7557,7 @@ urBindlessImagesSampledImageHandleDestroyExp( /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR -/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type` +/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type` /// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE /// - ::UR_RESULT_ERROR_INVALID_OPERATION UR_APIEXPORT ur_result_t UR_APICALL @@ -7559,7 +7617,7 @@ urBindlessImagesImageFreeExp( /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR -/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type` +/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type` /// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE /// - ::UR_RESULT_ERROR_INVALID_OPERATION UR_APIEXPORT ur_result_t UR_APICALL @@ -7596,7 +7654,7 @@ urBindlessImagesUnsampledImageCreateExp( /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR -/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type` +/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type` /// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE /// - ::UR_RESULT_ERROR_INVALID_SAMPLER /// - ::UR_RESULT_ERROR_INVALID_OPERATION @@ -7638,7 +7696,7 @@ urBindlessImagesSampledImageCreateExp( /// - ::UR_RESULT_ERROR_INVALID_QUEUE /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR -/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type` +/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type` /// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE /// - ::UR_RESULT_ERROR_INVALID_OPERATION UR_APIEXPORT ur_result_t UR_APICALL @@ -7799,7 +7857,7 @@ urBindlessImagesImportOpaqueFDExp( /// - 
::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR -/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type` +/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type` /// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE /// - ::UR_RESULT_ERROR_INVALID_OPERATION /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES @@ -8704,7 +8762,9 @@ urCommandBufferReleaseCommandExp( ); /////////////////////////////////////////////////////////////////////////////// -/// @brief Update a kernel launch command in a finalized command-buffer. +/// @brief Update a kernel launch command in a finalized command-buffer. This +/// entry-point is synchronous and may block if the command-buffer is +/// executing when the entry-point is called. /// /// @returns /// - ::UR_RESULT_SUCCESS @@ -9498,6 +9558,7 @@ typedef struct ur_platform_get_native_handle_params_t { /// allowing the callback the ability to modify the parameter's value typedef struct ur_platform_create_with_native_handle_params_t { ur_native_handle_t *phNativePlatform; + ur_adapter_handle_t *phAdapter; const ur_platform_native_properties_t **ppProperties; ur_platform_handle_t **pphPlatform; } ur_platform_create_with_native_handle_params_t; @@ -9943,6 +10004,19 @@ typedef struct ur_kernel_create_with_native_handle_params_t { ur_kernel_handle_t **pphKernel; } ur_kernel_create_with_native_handle_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urKernelGetSuggestedLocalWorkSize +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_kernel_get_suggested_local_work_size_params_t { + ur_kernel_handle_t *phKernel; + ur_queue_handle_t *phQueue; + uint32_t *pnumWorkDim; + const size_t **ppGlobalWorkOffset; + const size_t **ppGlobalWorkSize; + size_t **ppSuggestedLocalWorkSize; 
+} ur_kernel_get_suggested_local_work_size_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urKernelSetArgValue /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index fb1f1823b3..deff91472c 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -49,6 +49,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnPlatformGetNativeHandle_t)( /// @brief Function-pointer for urPlatformCreateWithNativeHandle typedef ur_result_t(UR_APICALL *ur_pfnPlatformCreateWithNativeHandle_t)( ur_native_handle_t, + ur_adapter_handle_t, const ur_platform_native_properties_t *, ur_platform_handle_t *); @@ -535,6 +536,16 @@ typedef ur_result_t(UR_APICALL *ur_pfnKernelCreateWithNativeHandle_t)( const ur_kernel_native_properties_t *, ur_kernel_handle_t *); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urKernelGetSuggestedLocalWorkSize +typedef ur_result_t(UR_APICALL *ur_pfnKernelGetSuggestedLocalWorkSize_t)( + ur_kernel_handle_t, + ur_queue_handle_t, + uint32_t, + const size_t *, + const size_t *, + size_t *); + /////////////////////////////////////////////////////////////////////////////// /// @brief Function-pointer for urKernelSetArgValue typedef ur_result_t(UR_APICALL *ur_pfnKernelSetArgValue_t)( @@ -603,6 +614,7 @@ typedef struct ur_kernel_dditable_t { ur_pfnKernelRelease_t pfnRelease; ur_pfnKernelGetNativeHandle_t pfnGetNativeHandle; ur_pfnKernelCreateWithNativeHandle_t pfnCreateWithNativeHandle; + ur_pfnKernelGetSuggestedLocalWorkSize_t pfnGetSuggestedLocalWorkSize; ur_pfnKernelSetArgValue_t pfnSetArgValue; ur_pfnKernelSetArgLocal_t pfnSetArgLocal; ur_pfnKernelSetArgPointer_t pfnSetArgPointer; diff --git a/include/ur_print.h b/include/ur_print.h index 753875ace9..c8fb41753e 100644 --- a/include/ur_print.h +++ b/include/ur_print.h @@ -1442,6 +1442,14 @@ UR_APIEXPORT ur_result_t 
UR_APICALL urPrintKernelGetNativeHandleParams(const str /// - `buff_size < out_size` UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelCreateWithNativeHandleParams(const struct ur_kernel_create_with_native_handle_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print ur_kernel_get_suggested_local_work_size_params_t struct +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_SIZE +/// - `buff_size < out_size` +UR_APIEXPORT ur_result_t UR_APICALL urPrintKernelGetSuggestedLocalWorkSizeParams(const struct ur_kernel_get_suggested_local_work_size_params_t *params, char *buffer, const size_t buff_size, size_t *out_size); + /////////////////////////////////////////////////////////////////////////////// /// @brief Print ur_kernel_set_arg_value_params_t struct /// @returns diff --git a/include/ur_print.hpp b/include/ur_print.hpp index db230c91d7..7bd6cc75ab 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -929,6 +929,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_function_t value) { case UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP: os << "UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP"; break; + case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE: + os << "UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE"; + break; default: os << "unknown enumerator"; break; @@ -1451,9 +1454,6 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_result_t value) { case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR: os << "UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR"; break; - case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED: - os << "UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED"; - break; case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE: os << "UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE"; break; @@ -5463,9 +5463,6 @@ inline ur_result_t printFlag(std::ostream &os, uint32_t flag) { /// std::ostream & inline 
std::ostream &operator<<(std::ostream &os, enum ur_mem_type_t value) { switch (value) { - case UR_MEM_TYPE_BUFFER: - os << "UR_MEM_TYPE_BUFFER"; - break; case UR_MEM_TYPE_IMAGE2D: os << "UR_MEM_TYPE_IMAGE2D"; break; @@ -5481,9 +5478,6 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_mem_type_t value) { case UR_MEM_TYPE_IMAGE1D_ARRAY: os << "UR_MEM_TYPE_IMAGE1D_ARRAY"; break; - case UR_MEM_TYPE_IMAGE1D_BUFFER: - os << "UR_MEM_TYPE_IMAGE1D_BUFFER"; - break; case UR_MEM_TYPE_IMAGE_CUBEMAP_EXP: os << "UR_MEM_TYPE_IMAGE_CUBEMAP_EXP"; break; @@ -10220,6 +10214,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur::details::printPtr(os, *(params->phNativePlatform)); + os << ", "; + os << ".hAdapter = "; + + ur::details::printPtr(os, + *(params->phAdapter)); + os << ", "; os << ".pProperties = "; @@ -11462,6 +11462,49 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ur_kernel_get_suggested_local_work_size_params_t type +/// @returns +/// std::ostream & +inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct ur_kernel_get_suggested_local_work_size_params_t *params) { + + os << ".hKernel = "; + + ur::details::printPtr(os, + *(params->phKernel)); + + os << ", "; + os << ".hQueue = "; + + ur::details::printPtr(os, + *(params->phQueue)); + + os << ", "; + os << ".numWorkDim = "; + + os << *(params->pnumWorkDim); + + os << ", "; + os << ".pGlobalWorkOffset = "; + + ur::details::printPtr(os, + *(params->ppGlobalWorkOffset)); + + os << ", "; + os << ".pGlobalWorkSize = "; + + ur::details::printPtr(os, + *(params->ppGlobalWorkSize)); + + os << ", "; + os << ".pSuggestedLocalWorkSize = "; + + ur::details::printPtr(os, + *(params->ppSuggestedLocalWorkSize)); + + return os; +} + 
/////////////////////////////////////////////////////////////////////////////// /// @brief Print operator for the ur_kernel_set_arg_value_params_t type /// @returns @@ -17143,6 +17186,9 @@ inline ur_result_t UR_APICALL printFunctionParams(std::ostream &os, ur_function_ case UR_FUNCTION_KERNEL_CREATE_WITH_NATIVE_HANDLE: { os << (const struct ur_kernel_create_with_native_handle_params_t *)params; } break; + case UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE: { + os << (const struct ur_kernel_get_suggested_local_work_size_params_t *)params; + } break; case UR_FUNCTION_KERNEL_SET_ARG_VALUE: { os << (const struct ur_kernel_set_arg_value_params_t *)params; } break; diff --git a/scripts/core/EXP-BINDLESS-IMAGES.rst b/scripts/core/EXP-BINDLESS-IMAGES.rst index ee54c69291..d609c3b3d9 100644 --- a/scripts/core/EXP-BINDLESS-IMAGES.rst +++ b/scripts/core/EXP-BINDLESS-IMAGES.rst @@ -50,6 +50,7 @@ Runtime: * Sampled images * Unsampled images * Mipmaps + * Image arrays * Cubemaps * USM backed images @@ -207,6 +208,8 @@ Changelog +------------------------------------------------------------------------+ | 11.0 | Added device queries for sampled image fetch capabilities. | +----------+-------------------------------------------------------------+ +| 12.0 | Added image arrays to list of supported bindless images | ++----------+-------------------------------------------------------------+ Contributors -------------------------------------------------------------------------------- diff --git a/scripts/core/PROG.rst b/scripts/core/PROG.rst index e7bf24986b..4e71cce8c7 100644 --- a/scripts/core/PROG.rst +++ b/scripts/core/PROG.rst @@ -277,6 +277,39 @@ native handle to a driver handle. For example, OpenCL platform may expose an extension ${x}ProgramCreateWithNativeHandle to retrieve a cl_program. 
+Memory +====== + +UR Mem Handles +-------------- + +A ${x}_mem_handle_t can represent an untyped memory buffer object, created by +${x}MemBufferCreate, or a memory image object, created by ${x}MemImageCreate. +A ${x}_mem_handle_t manages the internal allocation and deallocation of native +memory objects across all devices in a ${x}_context_handle_t. A +${x}_mem_handle_t may only be used by queues that share the same +${x}_context_handle_t. + +If multiple queues in the same ${x}_context_handle_t use the same +${x}_mem_handle_t across dependent commands, a dependency must be defined by the +user using the enqueue entry point's phEventWaitList parameter. Provided that +dependencies are explicitly passed to UR entry points, a UR adapter will manage +memory migration of native memory objects across all devices in a context, if +memory migration is indeed necessary in the backend API. + +.. parsed-literal:: + + // Q1 and Q2 are both in hContext + ${x}_mem_handle_t hBuffer; + ${x}MemBufferCreate(hContext,,,,&hBuffer); + ${x}EnqueueMemBufferWrite(Q1, hBuffer,,,,,,, &outEv); + ${x}EnqueueMemBufferRead(Q2, hBuffer,,,,, 1, &outEv /*phEventWaitList*/, ); + +As such, the buffer written to in ${x}EnqueueMemBufferWrite can be +successfully read using ${x}EnqueueMemBufferRead from another queue in the same +context, since the event associated with the write operation has been passed as +a dependency to the read operation. 
+ Memory Pooling ---------------------------------- diff --git a/scripts/core/common.yml b/scripts/core/common.yml index 0dad27d028..8aeb8ea248 100644 --- a/scripts/core/common.yml +++ b/scripts/core/common.yml @@ -205,8 +205,6 @@ etors: desc: "Invalid image size" - name: ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR desc: "Invalid image format descriptor" - - name: ERROR_IMAGE_FORMAT_NOT_SUPPORTED - desc: "Image format not supported" - name: ERROR_MEM_OBJECT_ALLOCATION_FAILURE desc: "Memory object allocation failure" - name: ERROR_INVALID_PROGRAM_EXECUTABLE diff --git a/scripts/core/exp-bindless-images.yml b/scripts/core/exp-bindless-images.yml index c5a3d5d5e3..622e378f0b 100644 --- a/scripts/core/exp-bindless-images.yml +++ b/scripts/core/exp-bindless-images.yml @@ -387,7 +387,7 @@ returns: - $X_RESULT_ERROR_INVALID_CONTEXT - $X_RESULT_ERROR_INVALID_VALUE - $X_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR: - - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`" + - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`" - $X_RESULT_ERROR_INVALID_IMAGE_SIZE - $X_RESULT_ERROR_INVALID_OPERATION --- #-------------------------------------------------------------------------- @@ -442,7 +442,7 @@ returns: - $X_RESULT_ERROR_INVALID_CONTEXT - $X_RESULT_ERROR_INVALID_VALUE - $X_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR: - - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`" + - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`" - $X_RESULT_ERROR_INVALID_IMAGE_SIZE - $X_RESULT_ERROR_INVALID_OPERATION --- #-------------------------------------------------------------------------- @@ -479,7 +479,7 @@ returns: - $X_RESULT_ERROR_INVALID_CONTEXT - $X_RESULT_ERROR_INVALID_VALUE - $X_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR: - - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`" + - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`" - $X_RESULT_ERROR_INVALID_IMAGE_SIZE - $X_RESULT_ERROR_INVALID_SAMPLER - 
$X_RESULT_ERROR_INVALID_OPERATION @@ -542,7 +542,7 @@ returns: - $X_RESULT_ERROR_INVALID_QUEUE - $X_RESULT_ERROR_INVALID_VALUE - $X_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR: - - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`" + - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`" - $X_RESULT_ERROR_INVALID_IMAGE_SIZE - $X_RESULT_ERROR_INVALID_OPERATION --- #-------------------------------------------------------------------------- @@ -679,7 +679,7 @@ returns: - $X_RESULT_ERROR_INVALID_CONTEXT - $X_RESULT_ERROR_INVALID_VALUE - $X_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR: - - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`" + - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`" - $X_RESULT_ERROR_INVALID_IMAGE_SIZE - $X_RESULT_ERROR_INVALID_OPERATION - $X_RESULT_ERROR_OUT_OF_RESOURCES diff --git a/scripts/core/exp-command-buffer.yml b/scripts/core/exp-command-buffer.yml index 78a1b020ef..6e276eac88 100644 --- a/scripts/core/exp-command-buffer.yml +++ b/scripts/core/exp-command-buffer.yml @@ -900,7 +900,7 @@ returns: - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY --- #-------------------------------------------------------------------------- type: function -desc: "Update a kernel launch command in a finalized command-buffer." +desc: "Update a kernel launch command in a finalized command-buffer. This entry-point is synchronous and may block if the command-buffer is executing when the entry-point is called." class: $xCommandBuffer name: UpdateKernelLaunchExp params: diff --git a/scripts/core/kernel.yml b/scripts/core/kernel.yml index 4a0bf0bab1..5446f3bc1d 100644 --- a/scripts/core/kernel.yml +++ b/scripts/core/kernel.yml @@ -534,3 +534,44 @@ params: returns: - $X_RESULT_ERROR_UNSUPPORTED_FEATURE: - "If the adapter has no underlying equivalent handle." +--- #-------------------------------------------------------------------------- +type: function +desc: "Get the suggested local work size for a kernel." 
+class: $xKernel +name: GetSuggestedLocalWorkSize +ordinal: "0" +details: + - "Query a suggested local work size for a kernel given a global size for each dimension." + - "The application may call this function from simultaneous threads for the same context." +params: + - type: $x_kernel_handle_t + name: hKernel + desc: | + [in] handle of the kernel + - type: $x_queue_handle_t + name: hQueue + desc: | + [in] handle of the queue object + - type: uint32_t + name: numWorkDim + desc: | + [in] number of dimensions, from 1 to 3, to specify the global + and work-group work-items + - type: const size_t* + name: pGlobalWorkOffset + desc: | + [in] pointer to an array of numWorkDim unsigned values that specify + the offset used to calculate the global ID of a work-item + - type: const size_t* + name: pGlobalWorkSize + desc: | + [in] pointer to an array of numWorkDim unsigned values that specify + the number of global work-items in numWorkDim dimensions that will execute the + kernel function + - type: size_t* + name: pSuggestedLocalWorkSize + desc: | + [out] pointer to an array of numWorkDim unsigned values that specify + suggested local work size that will contain the result of the query +returns: + - $X_RESULT_ERROR_UNSUPPORTED_FEATURE diff --git a/scripts/core/memory.yml b/scripts/core/memory.yml index 6f88b10cdc..c4009bc56e 100644 --- a/scripts/core/memory.yml +++ b/scripts/core/memory.yml @@ -41,8 +41,6 @@ desc: "Memory types" class: $xMem name: $x_mem_type_t etors: - - name: BUFFER - desc: "Buffer object" - name: IMAGE2D desc: "2D image object" - name: IMAGE3D @@ -53,8 +51,6 @@ etors: desc: "1D image object" - name: IMAGE1D_ARRAY desc: "1D image array object" - - name: IMAGE1D_BUFFER - desc: "1D image buffer object" --- #-------------------------------------------------------------------------- type: enum desc: "Memory Information type" @@ -213,6 +209,15 @@ name: ImageCreate ordinal: "0" analogue: - "**clCreateImage**" +details: + - | + The primary $x_image_format_t that must be 
supported by all the adapters are {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT8}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT16}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT8}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT16}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32}, {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_FLOAT}. params: - type: $x_context_handle_t name: hContext @@ -236,12 +241,13 @@ returns: - $X_RESULT_ERROR_INVALID_CONTEXT - $X_RESULT_ERROR_INVALID_VALUE - $X_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR: - - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type`" + - "`pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type`" - $X_RESULT_ERROR_INVALID_IMAGE_SIZE - $X_RESULT_ERROR_INVALID_OPERATION - $X_RESULT_ERROR_INVALID_HOST_PTR: - "`pHost == NULL && (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) != 0`" - "`pHost != NULL && (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) == 0`" + - $X_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT - $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- diff --git a/scripts/core/platform.yml b/scripts/core/platform.yml index c35b71622d..f3069005ab 100644 --- a/scripts/core/platform.yml +++ b/scripts/core/platform.yml @@ -211,6 +211,9 @@ params: - type: $x_native_handle_t name: hNativePlatform desc: "[in][nocheck] the native handle of the platform." 
+ - type: $x_adapter_handle_t + name: hAdapter + desc: "[in] handle of the adapter associated with the native backend." - type: const $x_platform_native_properties_t* name: pProperties desc: "[in][optional] pointer to native platform properties struct." diff --git a/scripts/core/queue.yml b/scripts/core/queue.yml index 816da179ba..27e8077ba2 100644 --- a/scripts/core/queue.yml +++ b/scripts/core/queue.yml @@ -32,7 +32,10 @@ etors: The reference count returned should be considered immediately stale. It is unsuitable for general use in applications. This feature is provided for identifying memory leaks. - name: SIZE - desc: "[uint32_t] The size of the queue" + desc: | + [uint32_t] The size of the queue on the device. Only a valid query + if the queue was created with the `ON_DEVICE` queue flag, otherwise + `$xQueueGetInfo` will return `$X_RESULT_ERROR_INVALID_QUEUE`. - name: EMPTY desc: "[$x_bool_t] return true if the queue was empty at the time of the query" --- #-------------------------------------------------------------------------- @@ -49,7 +52,7 @@ etors: desc: "Enable/disable profiling" - name: ON_DEVICE value: "$X_BIT(2)" - desc: "Is a device queue" + desc: "Is a device queue. If this is enabled `OUT_OF_ORDER_EXEC_MODE_ENABLE` must also be enabled." - name: ON_DEVICE_DEFAULT value: "$X_BIT(3)" desc: "Is the default queue for a device" @@ -108,6 +111,7 @@ returns: - "`propSize != 0 && pPropValue == NULL`" - "`pPropValue == NULL && pPropSizeRet == NULL`" - $X_RESULT_ERROR_INVALID_QUEUE + - "If `hQueue` isn't a valid queue handle or if `propName` isn't supported by `hQueue`." 
- $X_RESULT_ERROR_OUT_OF_HOST_MEMORY - $X_RESULT_ERROR_OUT_OF_RESOURCES --- #-------------------------------------------------------------------------- diff --git a/scripts/core/registry.yml b/scripts/core/registry.yml index b0a61e7f88..52585ade3a 100644 --- a/scripts/core/registry.yml +++ b/scripts/core/registry.yml @@ -586,6 +586,9 @@ etors: - name: ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP desc: Enumerator for $xEnqueueKernelLaunchCustomExp value: '224' +- name: KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE + desc: Enumerator for $xKernelGetSuggestedLocalWorkSize + value: '225' --- type: enum desc: Defines structure types diff --git a/scripts/generate_code.py b/scripts/generate_code.py index b8bfa97ba5..bdaa475a3e 100644 --- a/scripts/generate_code.py +++ b/scripts/generate_code.py @@ -328,12 +328,12 @@ def _mako_info_hpp(path, namespace, tags, version, specs, meta): specs=specs, meta=meta) + """ Entry-point: generates linker version scripts """ -def _mako_linker_scripts(path, ext, namespace, tags, version, specs, meta): - name = "adapter" +def _mako_linker_scripts(path, name, ext, namespace, tags, version, specs, meta): filename = f"{name}.{ext}.in" fin = os.path.join(templates_dir, f"{filename}.mako") fout = os.path.join(path, filename) @@ -347,6 +347,7 @@ def _mako_linker_scripts(path, ext, namespace, tags, version, specs, meta): specs=specs, meta=meta) + """ Entry-point: generates lib code @@ -370,6 +371,12 @@ def generate_loader(path, section, namespace, tags, version, specs, meta): loc = 0 loc += _mako_loader_cpp(dstpath, namespace, tags, version, specs, meta) loc += _mako_print_cpp(dstpath, namespace, tags, version, specs, meta) + loc += _mako_linker_scripts( + dstpath, "loader", "map", namespace, tags, version, specs, meta + ) + loc += _mako_linker_scripts( + dstpath, "loader", "def", namespace, tags, version, specs, meta + ) print("Generated %s lines of code.\n"%loc) """ @@ -382,8 +389,12 @@ def generate_adapters(path, section, namespace, tags, version, specs, meta): loc 
= 0 loc += _mako_null_adapter_cpp(dstpath, namespace, tags, version, specs, meta) - loc += _mako_linker_scripts(dstpath, "map", namespace, tags, version, specs, meta) - loc += _mako_linker_scripts(dstpath, "def", namespace, tags, version, specs, meta) + loc += _mako_linker_scripts( + dstpath, "adapter", "map", namespace, tags, version, specs, meta + ) + loc += _mako_linker_scripts( + dstpath, "adapter", "def", namespace, tags, version, specs, meta + ) print("Generated %s lines of code.\n"%loc) """ diff --git a/scripts/templates/helper.py b/scripts/templates/helper.py index 1d539d70fe..0c90f4da8e 100644 --- a/scripts/templates/helper.py +++ b/scripts/templates/helper.py @@ -12,8 +12,10 @@ # allow imports from top-level scripts directory sys.path.append("..") +from .print_helper import get_api_types_funcs from version import Version + """ Extracts traits from a spec object """ @@ -651,11 +653,37 @@ def get_adapter_handles(specs): objs = [] for s in specs: for obj in s['objects']: - if obj_traits.is_handle(obj) and not obj_traits.is_loader_only(obj): + if obj_traits.is_handle(obj) and not (obj_traits.is_loader_only(obj) or 'native' in obj['name']): objs.append(obj) return objs +""" +Public: + returns a list of all loader API functions' names +""" +def get_loader_functions(specs, meta, n, tags): + func_names = [] + + # Main API functions + for s in specs: + for obj in s["objects"]: + if obj_traits.is_function(obj): + func_names.append(make_func_name(n, tags, obj)) + + # Process address tables functions + for tbl in get_pfntables(specs, meta, n, tags): + func_names.append(tbl['export']['name']) + + # Print functions + api_types_funcs = get_api_types_funcs(specs, meta, n, tags) + for func in api_types_funcs: + func_names.append(func.c_name) + func_names.append(f"{tags['$x']}PrintFunctionParams") + + return sorted(func_names) + + """ Private: removes 'const' from c++ type diff --git a/scripts/templates/ldrddi.cpp.mako b/scripts/templates/ldrddi.cpp.mako index 
bbc7c7c7d0..4da75d1f91 100644 --- a/scripts/templates/ldrddi.cpp.mako +++ b/scripts/templates/ldrddi.cpp.mako @@ -132,7 +132,7 @@ namespace ur_loader %else: <%param_replacements={}%> %for i, item in enumerate(th.get_loader_prologue(n, tags, obj, meta)): - %if not '_native_object_' in item['obj'] or th.make_func_name(n, tags, obj) == 'urPlatformCreateWithNativeHandle': + %if not '_native_object_' in item['obj']: // extract platform's function pointer table auto dditable = reinterpret_cast<${item['obj']}*>( ${item['pointer']}${item['name']} )->dditable; auto ${th.make_pfn_name(n, tags, obj)} = dditable->${n}.${th.get_table_name(n, tags, obj)}.${th.make_pfn_name(n, tags, obj)}; @@ -151,7 +151,7 @@ namespace ur_loader for( size_t i = ${item['range'][0]}; i < ${item['range'][1]}; ++i ) ${item['name']}Local[ i ] = reinterpret_cast<${item['obj']}*>( ${item['name']}[ i ] )->handle; %else: - %if not '_native_object_' in item['obj'] or th.make_func_name(n, tags, obj) == 'urPlatformCreateWithNativeHandle': + %if not '_native_object_' in item['obj']: // convert loader handle to platform handle %if item['optional']: ${item['name']} = ( ${item['name']} ) ? reinterpret_cast<${item['obj']}*>( ${item['name']} )->handle : nullptr; @@ -279,7 +279,7 @@ namespace ur_loader %if item['release']: // release loader handle ${item['factory']}.release( ${item['name']} ); - %elif not '_native_object_' in item['obj'] or th.make_func_name(n, tags, obj) == 'urPlatformCreateWithNativeHandle': + %elif not '_native_object_' in item['obj']: try { %if 'typename' in item: diff --git a/scripts/templates/loader.def.in.mako b/scripts/templates/loader.def.in.mako new file mode 100644 index 0000000000..1bdfbe9fa7 --- /dev/null +++ b/scripts/templates/loader.def.in.mako @@ -0,0 +1,11 @@ +<%! 
+import re +from templates import helper as th +%><% + n=namespace +%>\ +LIBRARY @TARGET_LIBNAME@ +EXPORTS +%for line in th.get_loader_functions(specs, meta, n, tags): + ${line} +%endfor diff --git a/scripts/templates/loader.map.in.mako b/scripts/templates/loader.map.in.mako new file mode 100644 index 0000000000..0df1250440 --- /dev/null +++ b/scripts/templates/loader.map.in.mako @@ -0,0 +1,14 @@ +<%! +import re +from templates import helper as th +%><% + n=namespace +%>\ +@TARGET_LIBNAME@ { + global: +%for line in th.get_loader_functions(specs, meta, n, tags): + ${line}; +%endfor + local: + *; +}; diff --git a/source/adapters/cuda/CMakeLists.txt b/source/adapters/cuda/CMakeLists.txt index cd2a003a32..baa67e5961 100644 --- a/source/adapters/cuda/CMakeLists.txt +++ b/source/adapters/cuda/CMakeLists.txt @@ -76,8 +76,38 @@ else() message(WARNING "CUDA adapter USM pools are disabled, set UMF_ENABLE_POOL_TRACKING to enable them") endif() +if (UR_ENABLE_TRACING) + include(FindCUDACupti) + # The following two ifs can be removed when FindCUDA -> FindCUDAToolkit. 
+ # CUDA_CUPTI_INCLUDE_DIR -> CUDAToolkit_CUPTI_INCLUDE_DIR + if(NOT CUDA_CUPTI_INCLUDE_DIR) + find_cuda_cupti_include_dir() + endif() + # CUDA_cupti_LIBRARY -> CUDAToolkit_cupti_LIBRARY + if(NOT CUDA_cupti_LIBRARY) + find_cuda_cupti_library() + endif() + + if (NOT XPTI_INCLUDES) + get_target_property(XPTI_INCLUDES xpti INCLUDE_DIRECTORIES) + endif() + if (NOT XPTI_PROXY_SRC) + get_target_property(XPTI_SRC_DIR xpti SOURCE_DIR) + set(XPTI_PROXY_SRC "${XPTI_SRC_DIR}/xpti_proxy.cpp") + endif() + target_compile_definitions(${TARGET_NAME} PRIVATE + XPTI_ENABLE_INSTRUMENTATION + ) + target_include_directories(${TARGET_NAME} PUBLIC + ${XPTI_INCLUDES} + ${CUDA_CUPTI_INCLUDE_DIR} + ) + target_sources(${TARGET_NAME} PRIVATE ${XPTI_PROXY_SRC}) +endif() + if (CUDA_cupti_LIBRARY) target_compile_definitions("ur_adapter_cuda" PRIVATE CUPTI_LIB_PATH="${CUDA_cupti_LIBRARY}") + list(APPEND EXTRA_LIBS ${CUDA_cupti_LIBRARY}) endif() target_link_libraries(${TARGET_NAME} PRIVATE @@ -85,6 +115,7 @@ target_link_libraries(${TARGET_NAME} PRIVATE ${PROJECT_NAME}::common Threads::Threads cudadrv + ${EXTRA_LIBS} ) target_include_directories(${TARGET_NAME} PRIVATE diff --git a/source/adapters/cuda/context.hpp b/source/adapters/cuda/context.hpp index f28e58afe7..a10e8e9ca7 100644 --- a/source/adapters/cuda/context.hpp +++ b/source/adapters/cuda/context.hpp @@ -116,6 +116,13 @@ struct ur_context_handle_t_ { return Devices; } + // Gets the index of the device relative to other devices in the context + size_t getDeviceIndex(ur_device_handle_t hDevice) { + auto It = std::find(Devices.begin(), Devices.end(), hDevice); + assert(It != Devices.end()); + return std::distance(Devices.begin(), It); + } + uint32_t incrementReferenceCount() noexcept { return ++RefCount; } uint32_t decrementReferenceCount() noexcept { return --RefCount; } diff --git a/source/adapters/cuda/device.cpp b/source/adapters/cuda/device.cpp index e6389c5ee2..375f6a98f4 100644 --- a/source/adapters/cuda/device.cpp +++ 
b/source/adapters/cuda/device.cpp @@ -215,7 +215,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, int Major = 0; UR_CHECK_ERROR(cuDeviceGetAttribute( &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice->get())); - uint64_t Capabilities = + ur_memory_scope_capability_flags_t Capabilities = (Major >= 7) ? UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | @@ -270,7 +270,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, int WarpSize = 0; UR_CHECK_ERROR(cuDeviceGetAttribute( &WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, hDevice->get())); - size_t Sizes[1] = {static_cast(WarpSize)}; + uint32_t Sizes[1] = {static_cast(WarpSize)}; return ReturnValue(Sizes, 1); } case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { @@ -418,7 +418,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: { - return ReturnValue(0lu); + return ReturnValue(size_t(0)); } case UR_DEVICE_INFO_MAX_SAMPLERS: { // This call is kind of meaningless for cuda, as samplers don't exist. @@ -429,7 +429,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // https://docs.nvidia.com/cuda/cuda-c-programming-guide/#function-parameters // __global__ function parameters are passed to the device via constant // memory and are limited to 4 KB. 
- return ReturnValue(4000lu); + return ReturnValue(size_t(4000)); } case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { int MemBaseAddrAlign = 0; @@ -542,7 +542,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: { // Hard coded to value returned by clinfo for OpenCL 1.2 CUDA | GeForce GTX // 1060 3GB - return ReturnValue(1000lu); + return ReturnValue(size_t(1000)); } case UR_DEVICE_INFO_ENDIAN_LITTLE: { return ReturnValue(true); @@ -569,10 +569,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, ur_queue_flag_t(UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE | UR_QUEUE_FLAG_PROFILING_ENABLE)); case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: { - // The mandated minimum capability: - ur_queue_flags_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE | - UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; - return ReturnValue(Capability); + return ReturnValue(0); } case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: { // The mandated minimum capability: @@ -647,7 +644,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE: { // The minimum value for the FULL profile is 1 MB. 
- return ReturnValue(1024lu); + return ReturnValue(size_t(1024)); } case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: { return ReturnValue(true); @@ -692,8 +689,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, // respect to other CPUs and GPUs in the system Value = UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS | UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_ACCESS | - UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS | - UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ATOMIC_CONCURRENT_ACCESS; + UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_CONCURRENT_ACCESS; } else { // on GPU architectures with compute capability lower than 6.x, atomic // operations from the GPU to CPU memory will not be atomic with respect diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 279426a41e..906fd49d1d 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -237,7 +237,7 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context, if (hasExceededMaxRegistersPerBlock(Device, Kernel, KernelLocalWorkGroupSize)) { - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + return UR_RESULT_ERROR_OUT_OF_RESOURCES; } } else { guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim, @@ -414,37 +414,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - std::vector DepEvents( - phEventWaitList, phEventWaitList + numEventsInWaitList); - std::vector> MemMigrationLocks; - - // phEventWaitList only contains events that are handed to UR by the SYCL - // runtime. 
However since UR handles memory dependencies within a context - // we may need to add more events to our dependent events list if the UR - // context contains multiple devices - if (hQueue->getContext()->Devices.size() > 1) { - MemMigrationLocks.reserve(hKernel->Args.MemObjArgs.size()); - for (auto &MemArg : hKernel->Args.MemObjArgs) { - bool PushBack = false; - if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj; - MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(), - MemDepEvent) == DepEvents.end()) { - DepEvents.push_back(MemDepEvent); - PushBack = true; - } - if ((MemArg.AccessFlags & - (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) || - PushBack) { - if (std::find_if(MemMigrationLocks.begin(), MemMigrationLocks.end(), - [MemArg](auto &Lock) { - return Lock.first == MemArg.Mem; - }) == MemMigrationLocks.end()) - MemMigrationLocks.emplace_back( - std::pair{MemArg.Mem, ur_lock{MemArg.Mem->MemoryMigrationMutex}}); - } - } - } - // Early exit for zero size kernel if (*pGlobalWorkSize == 0) { return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, @@ -477,15 +446,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( CUstream CuStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - if (DepEvents.size()) { - UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, DepEvents.size(), - DepEvents.data())); - } + UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)); // For memory migration across devices in the same context if (hQueue->getContext()->Devices.size() > 1) { for (auto &MemArg : hKernel->Args.MemObjArgs) { - migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice()); + enqueueMigrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice(), + CuStream); + if (MemArg.AccessFlags & + (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) { + MemArg.Mem->setLastQueueWritingToMemObj(hQueue); + } } } @@ -496,20 +468,6 @@ UR_APIEXPORT ur_result_t 
UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(RetImplEvent->start()); } - // Once event has been started we can unlock MemoryMigrationMutex - if (hQueue->getContext()->Devices.size() > 1) { - for (auto &MemArg : hKernel->Args.MemObjArgs) { - // Telling the ur_mem_handle_t that it will need to wait on this kernel - // if it has been written to - if (phEvent && (MemArg.AccessFlags & - (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))) { - MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get()); - } - } - // We can release the MemoryMigrationMutexes now - MemMigrationLocks.clear(); - } - auto &ArgIndices = hKernel->getArgIndices(); UR_CHECK_ERROR(cuLaunchKernel( CuFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], @@ -523,7 +481,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); } - } catch (ur_result_t Err) { return Err; } @@ -535,6 +492,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + if (pGlobalWorkOffset == nullptr || *pGlobalWorkOffset == 0) { + ur_exp_launch_property_t coop_prop; + coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE; + coop_prop.value.cooperative = 1; + return urEnqueueKernelLaunchCustomExp( + hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, 1, + &coop_prop, numEventsInWaitList, phEventWaitList, phEvent); + } return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, numEventsInWaitList, phEventWaitList, phEvent); @@ -553,7 +518,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( pLocalWorkSize, numEventsInWaitList, phEventWaitList, phEvent); } - +#if CUDA_VERSION >= 11080 // Preconditions UR_ASSERT(hQueue->getDevice() == 
hKernel->getProgram()->getDevice(), UR_RESULT_ERROR_INVALID_KERNEL); @@ -595,37 +560,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( } } - std::vector DepEvents( - phEventWaitList, phEventWaitList + numEventsInWaitList); - std::vector> MemMigrationLocks; - - // phEventWaitList only contains events that are handed to UR by the SYCL - // runtime. However since UR handles memory dependencies within a context - // we may need to add more events to our dependent events list if the UR - // context contains multiple devices - if (hQueue->getContext()->Devices.size() > 1) { - MemMigrationLocks.reserve(hKernel->Args.MemObjArgs.size()); - for (auto &MemArg : hKernel->Args.MemObjArgs) { - bool PushBack = false; - if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj; - MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(), - MemDepEvent) == DepEvents.end()) { - DepEvents.push_back(MemDepEvent); - PushBack = true; - } - if ((MemArg.AccessFlags & - (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) || - PushBack) { - if (std::find_if(MemMigrationLocks.begin(), MemMigrationLocks.end(), - [MemArg](auto &Lock) { - return Lock.first == MemArg.Mem; - }) == MemMigrationLocks.end()) - MemMigrationLocks.emplace_back( - std::pair{MemArg.Mem, ur_lock{MemArg.Mem->MemoryMigrationMutex}}); - } - } - } - // Early exit for zero size kernel if (*pGlobalWorkSize == 0) { return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, @@ -658,15 +592,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( CUstream CuStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - if (DepEvents.size()) { - UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, DepEvents.size(), - DepEvents.data())); - } + UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)); // For memory migration across devices in the same context if (hQueue->getContext()->Devices.size() > 1) { for 
(auto &MemArg : hKernel->Args.MemObjArgs) { - migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice()); + enqueueMigrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice(), + CuStream); + if (MemArg.AccessFlags & + (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) { + MemArg.Mem->setLastQueueWritingToMemObj(hQueue); + } } } @@ -677,20 +614,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( UR_CHECK_ERROR(RetImplEvent->start()); } - // Once event has been started we can unlock MemoryMigrationMutex - if (hQueue->getContext()->Devices.size() > 1) { - for (auto &MemArg : hKernel->Args.MemObjArgs) { - // Telling the ur_mem_handle_t that it will need to wait on this kernel - // if it has been written to - if (phEvent && (MemArg.AccessFlags & - (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))) { - MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get()); - } - } - // We can release the MemoryMigrationMutexes now - MemMigrationLocks.clear(); - } - auto &ArgIndices = hKernel->getArgIndices(); CUlaunchConfig launch_config; @@ -717,11 +640,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); } - } catch (ur_result_t Err) { return Err; } return UR_RESULT_SUCCESS; +#else + setErrorMessage("This feature requires cuda 11.8 or later.", + UR_RESULT_ERROR_ADAPTER_SPECIFIC); + return UR_RESULT_ERROR_ADAPTER_SPECIFIC; +#endif // CUDA_VERSION >= 11080 } /// Set parameters for general 3D memory copy. 
@@ -807,28 +734,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( ur_event_handle_t *phEvent) { std::unique_ptr RetImplEvent{nullptr}; - ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex}; - auto Device = hQueue->getDevice(); - ScopedContext Active(Device); - CUstream Stream = hQueue->getNextTransferStream(); - try { // Note that this entry point may be called on a queue that may not be the // last queue to write to the MemBuffer, meaning we must perform the copy // from a different device - if (hBuffer->LastEventWritingToMemObj && - hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() != - hQueue->getDevice()) { - hQueue = hBuffer->LastEventWritingToMemObj->getQueue(); - Device = hQueue->getDevice(); - ScopedContext Active(Device); - Stream = CUstream{0}; // Default stream for different device - // We may have to wait for an event on another queue if it is the last - // event writing to mem obj - UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1, - &hBuffer->LastEventWritingToMemObj)); + if (hBuffer->LastQueueWritingToMemObj && + hBuffer->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) { + hQueue = hBuffer->LastQueueWritingToMemObj; } + auto Device = hQueue->getDevice(); + ScopedContext Active(Device); + CUstream Stream = hQueue->getNextTransferStream(); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList)); @@ -873,6 +791,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( CUdeviceptr DevPtr = std::get(hBuffer->Mem).getPtr(hQueue->getDevice()); std::unique_ptr RetImplEvent{nullptr}; + hBuffer->setLastQueueWritingToMemObj(hQueue); try { ScopedContext Active(hQueue->getDevice()); @@ -903,7 +822,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( if (phEvent) { *phEvent = RetImplEvent.release(); } - } catch (ur_result_t Err) { return Err; } @@ -1041,20 +959,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( ur_event_handle_t 
*phEvent) { UR_ASSERT(size + offset <= std::get(hBuffer->Mem).getSize(), UR_RESULT_ERROR_INVALID_SIZE); - std::unique_ptr RetImplEvent{nullptr}; + hBuffer->setLastQueueWritingToMemObj(hQueue); try { ScopedContext Active(hQueue->getDevice()); auto Stream = hQueue->getNextTransferStream(); - ur_result_t Result = - enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_BUFFER_FILL, hQueue, Stream)); + UR_COMMAND_MEM_BUFFER_WRITE_RECT, hQueue, Stream)); UR_CHECK_ERROR(RetImplEvent->start()); } @@ -1080,8 +998,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( break; } default: { - Result = commonMemSetLargePattern(Stream, patternSize, size, pPattern, - DstDevice); + UR_CHECK_ERROR(commonMemSetLargePattern(Stream, patternSize, size, + pPattern, DstDevice)); break; } } @@ -1090,13 +1008,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); } - - return Result; } catch (ur_result_t Err) { return Err; } catch (...) 
{ return UR_RESULT_ERROR_UNKNOWN; } + return UR_RESULT_SUCCESS; } static size_t imageElementByteSize(CUDA_ARRAY_DESCRIPTOR ArrayDesc) { @@ -1197,28 +1114,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_lock MemoryMigrationLock{hImage->MemoryMigrationMutex}; - auto Device = hQueue->getDevice(); - CUstream Stream = hQueue->getNextTransferStream(); - try { // Note that this entry point may be called on a queue that may not be the // last queue to write to the Image, meaning we must perform the copy // from a different device - if (hImage->LastEventWritingToMemObj && - hImage->LastEventWritingToMemObj->getQueue()->getDevice() != - hQueue->getDevice()) { - hQueue = hImage->LastEventWritingToMemObj->getQueue(); - Device = hQueue->getDevice(); - ScopedContext Active(Device); - Stream = CUstream{0}; // Default stream for different device - // We may have to wait for an event on another queue if it is the last - // event writing to mem obj - UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1, - &hImage->LastEventWritingToMemObj)); + if (hImage->LastQueueWritingToMemObj && + hImage->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) { + hQueue = hImage->LastQueueWritingToMemObj; } + auto Device = hQueue->getDevice(); ScopedContext Active(Device); + CUstream Stream = hQueue->getNextTransferStream(); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList)); @@ -1821,28 +1729,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( UR_ASSERT(offset + size <= std::get(hBuffer->Mem).Size, UR_RESULT_ERROR_INVALID_SIZE); std::unique_ptr RetImplEvent{nullptr}; - ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex}; - auto Device = hQueue->getDevice(); - ScopedContext Active(Device); - CUstream Stream = hQueue->getNextTransferStream(); try { // Note that this entry point may be called on a queue that may not be the // last queue to write 
to the MemBuffer, meaning we must perform the copy // from a different device - if (hBuffer->LastEventWritingToMemObj && - hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() != - hQueue->getDevice()) { - hQueue = hBuffer->LastEventWritingToMemObj->getQueue(); - Device = hQueue->getDevice(); - ScopedContext Active(Device); - Stream = CUstream{0}; // Default stream for different device - // We may have to wait for an event on another queue if it is the last - // event writing to mem obj - UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, 1, - &hBuffer->LastEventWritingToMemObj)); + if (hBuffer->LastQueueWritingToMemObj && + hBuffer->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) { + hQueue = hBuffer->LastQueueWritingToMemObj; } + auto Device = hQueue->getDevice(); + ScopedContext Active(Device); + CUstream Stream = hQueue->getNextTransferStream(); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, phEventWaitList)); @@ -1884,17 +1784,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( UR_ASSERT(offset + size <= std::get(hBuffer->Mem).Size, UR_RESULT_ERROR_INVALID_SIZE); - ur_result_t Result = UR_RESULT_SUCCESS; CUdeviceptr DevPtr = std::get(hBuffer->Mem).getPtr(hQueue->getDevice()); std::unique_ptr RetImplEvent{nullptr}; + hBuffer->setLastQueueWritingToMemObj(hQueue); try { ScopedContext Active(hQueue->getDevice()); CUstream CuStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, - phEventWaitList); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, CuStream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = @@ -1917,9 +1817,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( *phEvent = RetImplEvent.release(); } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueDeviceGlobalVariableWrite( diff --git 
a/source/adapters/cuda/enqueue.hpp b/source/adapters/cuda/enqueue.hpp index c925a27295..be141f7b20 100644 --- a/source/adapters/cuda/enqueue.hpp +++ b/source/adapters/cuda/enqueue.hpp @@ -17,6 +17,10 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream, uint32_t NumEventsInWaitList, const ur_event_handle_t *EventWaitList); +void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock, + const size_t *GlobalWorkSize, const uint32_t WorkDim, + ur_kernel_handle_t Kernel); + bool hasExceededMaxRegistersPerBlock(ur_device_handle_t Device, ur_kernel_handle_t Kernel, size_t BlockSize); diff --git a/source/adapters/cuda/event.cpp b/source/adapters/cuda/event.cpp index f9889a3f46..99446a3b7b 100644 --- a/source/adapters/cuda/event.cpp +++ b/source/adapters/cuda/event.cpp @@ -55,8 +55,7 @@ ur_result_t ur_event_handle_t_::start() { try { if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) { - // NOTE: This relies on the default stream to be unused. 
- UR_CHECK_ERROR(cuEventRecord(EvQueued, 0)); + UR_CHECK_ERROR(cuEventRecord(EvQueued, Queue->getHostSubmitTimeStream())); UR_CHECK_ERROR(cuEventRecord(EvStart, Stream)); } } catch (ur_result_t Err) { @@ -176,7 +175,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetInfo(ur_event_handle_t hEvent, case UR_EVENT_INFO_CONTEXT: return ReturnValue(hEvent->getContext()); default: - detail::ur::die("Event info request not implemented"); + break; } return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -207,8 +206,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventGetProfilingInfo( default: break; } - detail::ur::die("Event Profiling info request not implemented"); - return {}; + return UR_RESULT_ERROR_INVALID_ENUMERATION; } UR_APIEXPORT ur_result_t UR_APICALL urEventSetCallback(ur_event_handle_t, @@ -280,8 +278,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( std::unique_ptr EventPtr{nullptr}; - *phEvent = ur_event_handle_t_::makeWithNative( - hContext, reinterpret_cast(hNativeEvent)); + try { + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeWithNative( + hContext, reinterpret_cast(hNativeEvent))); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) { + return UR_RESULT_ERROR_UNKNOWN; + } + + *phEvent = EventPtr.release(); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/cuda/event.hpp b/source/adapters/cuda/event.hpp index 5ed68f0f25..3cb11469ce 100644 --- a/source/adapters/cuda/event.hpp +++ b/source/adapters/cuda/event.hpp @@ -90,6 +90,9 @@ struct ur_event_handle_t_ { const bool RequiresTimings = Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || Type == UR_COMMAND_TIMESTAMP_RECORDING_EXP; + if (RequiresTimings) { + Queue->createHostSubmitTimeStream(); + } native_type EvEnd = nullptr, EvQueued = nullptr, EvStart = nullptr; UR_CHECK_ERROR(cuEventCreate( &EvEnd, RequiresTimings ? 
CU_EVENT_DEFAULT : CU_EVENT_DISABLE_TIMING)); diff --git a/source/adapters/cuda/image.cpp b/source/adapters/cuda/image.cpp index 95dc2e258e..b641685920 100644 --- a/source/adapters/cuda/image.cpp +++ b/source/adapters/cuda/image.cpp @@ -35,7 +35,7 @@ ur_result_t urCalculateNumChannels(ur_image_channel_order_t order, *NumChannels = 2; return UR_RESULT_SUCCESS; case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_RGB: - return UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED; + return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT; case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_RGBA: case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_ARGB: case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_BGRA: @@ -49,7 +49,7 @@ ur_result_t urCalculateNumChannels(ur_image_channel_order_t order, case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_INTENSITY: case ur_image_channel_order_t::UR_IMAGE_CHANNEL_ORDER_LUMINANCE: default: - return UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED; + return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT; } } @@ -147,7 +147,7 @@ urToCudaImageChannelFormat(ur_image_channel_type_t image_channel_type, cuda_format = cuda_format_and_size.first; pixel_size_bytes = cuda_format_and_size.second; } catch (const std::out_of_range &) { - return UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED; + return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT; } } @@ -228,7 +228,7 @@ cudaToUrImageChannelFormat(CUarray_format cuda_format, #endif #undef MAP default: - return UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED; + return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT; } } diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp index 675fdbe0a3..5e01845a56 100644 --- a/source/adapters/cuda/kernel.cpp +++ b/source/adapters/cuda/kernel.cpp @@ -9,7 +9,9 @@ //===----------------------------------------------------------------------===// #include "kernel.hpp" +#include "enqueue.hpp" #include "memory.hpp" +#include "queue.hpp" #include "sampler.hpp" UR_APIEXPORT ur_result_t 
UR_APICALL @@ -380,3 +382,30 @@ urKernelSetArgSampler(ur_kernel_handle_t hKernel, uint32_t argIndex, } return Result; } + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim, + [[maybe_unused]] const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize) { + // Preconditions + UR_ASSERT(hQueue->getContext() == hKernel->getContext(), + UR_RESULT_ERROR_INVALID_KERNEL); + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(pSuggestedLocalWorkSize != nullptr, + UR_RESULT_ERROR_INVALID_NULL_POINTER); + + ur_device_handle_t Device = hQueue->Device; + ur_result_t Result = UR_RESULT_SUCCESS; + size_t ThreadsPerBlock[3] = {}; + + // Set the active context here as guessLocalWorkSize needs an active context + ScopedContext Active(Device); + + guessLocalWorkSize(Device, ThreadsPerBlock, pGlobalWorkSize, workDim, + hKernel); + + std::copy(ThreadsPerBlock, ThreadsPerBlock + workDim, + pSuggestedLocalWorkSize); + return Result; +} diff --git a/source/adapters/cuda/memory.cpp b/source/adapters/cuda/memory.cpp index c28ce98748..9ea62a2c1b 100644 --- a/source/adapters/cuda/memory.cpp +++ b/source/adapters/cuda/memory.cpp @@ -12,6 +12,7 @@ #include "common.hpp" #include "context.hpp" +#include "enqueue.hpp" #include "memory.hpp" /// Creates a UR Memory object using a CUDA memory allocation. 
@@ -211,7 +212,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( UR_ASSERT(pImageDesc->stype == UR_STRUCTURE_TYPE_IMAGE_DESC, UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); - UR_ASSERT(pImageDesc->type <= UR_MEM_TYPE_IMAGE1D_BUFFER, + UR_ASSERT(pImageDesc->type <= UR_MEM_TYPE_IMAGE1D_ARRAY, UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); UR_ASSERT(pImageDesc->numMipLevel == 0, UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); @@ -227,15 +228,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( // We only support RBGA channel order // TODO: check SYCL CTS and spec. May also have to support BGRA UR_ASSERT(pImageFormat->channelOrder == UR_IMAGE_CHANNEL_ORDER_RGBA, - UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION); + UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT); auto URMemObj = std::unique_ptr( new ur_mem_handle_t_{hContext, flags, *pImageFormat, *pImageDesc, pHost}); + UR_ASSERT(std::get(URMemObj->Mem).PixelTypeSizeBytes, + UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT); + try { if (PerformInitialCopy) { for (const auto &Device : hContext->getDevices()) { - UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(URMemObj.get(), Device)); + // Synchronous behaviour is best in this case + ScopedContext Active(Device); + CUstream Stream{0}; // Use default stream + UR_CHECK_ERROR(enqueueMigrateMemoryToDeviceIfNeeded(URMemObj.get(), + Device, Stream)); + UR_CHECK_ERROR(cuStreamSynchronize(Stream)); } } @@ -429,11 +438,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, const ur_device_handle_t hDevice) { ScopedContext Active(hDevice); + auto DeviceIdx = Mem->getContext()->getDeviceIndex(hDevice); ur_lock LockGuard(Mem->MemoryAllocationMutex); if (Mem->isBuffer()) { auto &Buffer = std::get(Mem->Mem); - auto &DevPtr = Buffer.Ptrs[hDevice->getIndex() % Buffer.Ptrs.size()]; + auto &DevPtr = Buffer.Ptrs[DeviceIdx]; // Allocation has already been made if (DevPtr != BufferMem::native_type{0}) { @@ -456,11 
+466,11 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, try { auto &Image = std::get(Mem->Mem); // Allocation has already been made - if (Image.Arrays[hDevice->getIndex() % Image.Arrays.size()]) { + if (Image.Arrays[DeviceIdx]) { return UR_RESULT_SUCCESS; } UR_CHECK_ERROR(cuArray3DCreate(&ImageArray, &Image.ArrayDesc)); - Image.Arrays[hDevice->getIndex() % Image.Arrays.size()] = ImageArray; + Image.Arrays[DeviceIdx] = ImageArray; // CUDA_RESOURCE_DESC is a union of different structs, shown here // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TEXOBJECT.html @@ -475,7 +485,7 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, ImageResDesc.flags = 0; UR_CHECK_ERROR(cuSurfObjectCreate(&Surface, &ImageResDesc)); - Image.SurfObjs[hDevice->getIndex() % Image.SurfObjs.size()] = Surface; + Image.SurfObjs[DeviceIdx] = Surface; } catch (ur_result_t Err) { if (ImageArray) { UR_CHECK_ERROR(cuArrayDestroy(ImageArray)); @@ -492,27 +502,28 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, } namespace { -ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem, - ur_device_handle_t hDevice) { +ur_result_t enqueueMigrateBufferToDevice(ur_mem_handle_t Mem, + ur_device_handle_t hDevice, + CUstream Stream) { auto &Buffer = std::get(Mem->Mem); - if (Mem->LastEventWritingToMemObj == nullptr) { + if (Mem->LastQueueWritingToMemObj == nullptr) { // Device allocation being initialized from host for the first time if (Buffer.HostPtr) { - UR_CHECK_ERROR( - cuMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size)); + UR_CHECK_ERROR(cuMemcpyHtoDAsync(Buffer.getPtr(hDevice), Buffer.HostPtr, + Buffer.Size, Stream)); } - } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() != - hDevice) { - UR_CHECK_ERROR(cuMemcpyDtoD( + } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) { + UR_CHECK_ERROR(cuMemcpyDtoDAsync( Buffer.getPtr(hDevice), - 
Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()), - Buffer.Size)); + Buffer.getPtr(Mem->LastQueueWritingToMemObj->getDevice()), Buffer.Size, + Stream)); } return UR_RESULT_SUCCESS; } -ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, - ur_device_handle_t hDevice) { +ur_result_t enqueueMigrateImageToDevice(ur_mem_handle_t Mem, + ur_device_handle_t hDevice, + CUstream Stream) { auto &Image = std::get(Mem->Mem); // When a dimension isn't used image_desc has the size set to 1 size_t PixelSizeBytes = Image.PixelTypeSizeBytes * @@ -543,40 +554,42 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, CpyDesc3D.Depth = Image.ImageDesc.depth; } - if (Mem->LastEventWritingToMemObj == nullptr) { + if (Mem->LastQueueWritingToMemObj == nullptr) { if (Image.HostPtr) { if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { - UR_CHECK_ERROR( - cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); + UR_CHECK_ERROR(cuMemcpyHtoAAsync(ImageArray, 0, Image.HostPtr, + ImageSizeBytes, Stream)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; CpyDesc2D.srcHost = Image.HostPtr; - UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D)); + UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc2D, Stream)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST; CpyDesc3D.srcHost = Image.HostPtr; - UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D)); + UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc3D, Stream)); } } - } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() != - hDevice) { + } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) { if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { + // Blocking wait needed + UR_CHECK_ERROR(urQueueFinish(Mem->LastQueueWritingToMemObj)); // FIXME: 1D memcpy from DtoD going through the host. 
UR_CHECK_ERROR(cuMemcpyAtoH( Image.HostPtr, - Image.getArray( - Mem->LastEventWritingToMemObj->getQueue()->getDevice()), + Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()), 0 /*srcOffset*/, ImageSizeBytes)); UR_CHECK_ERROR( cuMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { - CpyDesc2D.srcArray = Image.getArray( - Mem->LastEventWritingToMemObj->getQueue()->getDevice()); - UR_CHECK_ERROR(cuMemcpy2D(&CpyDesc2D)); + CpyDesc2D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE; + CpyDesc2D.srcArray = + Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()); + UR_CHECK_ERROR(cuMemcpy2DAsync(&CpyDesc2D, Stream)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { - CpyDesc3D.srcArray = Image.getArray( - Mem->LastEventWritingToMemObj->getQueue()->getDevice()); - UR_CHECK_ERROR(cuMemcpy3D(&CpyDesc3D)); + CpyDesc3D.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_DEVICE; + CpyDesc3D.srcArray = + Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()); + UR_CHECK_ERROR(cuMemcpy3DAsync(&CpyDesc3D, Stream)); } } return UR_RESULT_SUCCESS; @@ -585,26 +598,52 @@ ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, // If calling this entry point it is necessary to lock the memoryMigrationMutex // beforehand -ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem, - const ur_device_handle_t hDevice) { +ur_result_t enqueueMigrateMemoryToDeviceIfNeeded( + ur_mem_handle_t Mem, const ur_device_handle_t hDevice, CUstream Stream) { UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); // Device allocation has already been initialized with most up to date // data in buffer - if (Mem->HaveMigratedToDeviceSinceLastWrite - [hDevice->getIndex() % - Mem->HaveMigratedToDeviceSinceLastWrite.size()]) { + if (Mem->HaveMigratedToDeviceSinceLastWrite[Mem->getContext()->getDeviceIndex( + hDevice)]) { return UR_RESULT_SUCCESS; } ScopedContext Active(hDevice); if (Mem->isBuffer()) { - 
UR_CHECK_ERROR(migrateBufferToDevice(Mem, hDevice)); + UR_CHECK_ERROR(enqueueMigrateBufferToDevice(Mem, hDevice, Stream)); } else { - UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice)); + UR_CHECK_ERROR(enqueueMigrateImageToDevice(Mem, hDevice, Stream)); } - Mem->HaveMigratedToDeviceSinceLastWrite - [hDevice->getIndex() % Mem->HaveMigratedToDeviceSinceLastWrite.size()] = - true; + Mem->HaveMigratedToDeviceSinceLastWrite[Mem->getContext()->getDeviceIndex( + hDevice)] = true; return UR_RESULT_SUCCESS; } + +BufferMem::native_type +BufferMem::getPtrWithOffset(const ur_device_handle_t Device, size_t Offset) { + if (ur_result_t Err = allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device); + Err != UR_RESULT_SUCCESS) { + throw Err; + } + return reinterpret_cast( + reinterpret_cast( + Ptrs[OuterMemStruct->getContext()->getDeviceIndex(Device)]) + + Offset); +} + +CUarray SurfaceMem::getArray(const ur_device_handle_t Device) { + if (ur_result_t Err = allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device); + Err != UR_RESULT_SUCCESS) { + throw Err; + } + return Arrays[OuterMemStruct->getContext()->getDeviceIndex(Device)]; +} + +CUsurfObject SurfaceMem::getSurface(const ur_device_handle_t Device) { + if (ur_result_t Err = allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device); + Err != UR_RESULT_SUCCESS) { + throw Err; + } + return SurfObjs[OuterMemStruct->getContext()->getDeviceIndex(Device)]; +} diff --git a/source/adapters/cuda/memory.hpp b/source/adapters/cuda/memory.hpp index 6b7e9d0156..a67e9295cc 100644 --- a/source/adapters/cuda/memory.hpp +++ b/source/adapters/cuda/memory.hpp @@ -22,8 +22,9 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t, const ur_device_handle_t); -ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t, - const ur_device_handle_t); +ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t, + CUstream); // Handler for plain, pointer-based CUDA allocations struct BufferMem { @@ -97,16 +98,7 @@ struct 
BufferMem { BufferMem(const BufferMem &Buffer) = default; - native_type getPtrWithOffset(const ur_device_handle_t Device, size_t Offset) { - if (ur_result_t Err = - allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device); - Err != UR_RESULT_SUCCESS) { - throw Err; - } - return reinterpret_cast( - reinterpret_cast(Ptrs[Device->getIndex() % Ptrs.size()]) + - Offset); - } + native_type getPtrWithOffset(const ur_device_handle_t Device, size_t Offset); native_type getPtr(const ur_device_handle_t Device) { return getPtrWithOffset(Device, 0); @@ -199,6 +191,7 @@ struct SurfaceMem { CUDA_ARRAY3D_DESCRIPTOR ArrayDesc; size_t PixelTypeSizeBytes; void *HostPtr; + ur_result_t error = UR_RESULT_SUCCESS; SurfaceMem(ur_context_handle_t Context, ur_mem_handle_t OuterMemStruct, ur_image_format_t ImageFormat, ur_image_desc_t ImageDesc, @@ -233,6 +226,7 @@ struct SurfaceMem { ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT8; PixelTypeSizeBytes = 1; break; + case UR_IMAGE_CHANNEL_TYPE_SNORM_INT8: case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8: ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT8; PixelTypeSizeBytes = 1; @@ -242,6 +236,7 @@ struct SurfaceMem { ArrayDesc.Format = CU_AD_FORMAT_UNSIGNED_INT16; PixelTypeSizeBytes = 2; break; + case UR_IMAGE_CHANNEL_TYPE_SNORM_INT16: case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16: ArrayDesc.Format = CU_AD_FORMAT_SIGNED_INT16; PixelTypeSizeBytes = 2; @@ -263,29 +258,15 @@ struct SurfaceMem { PixelTypeSizeBytes = 4; break; default: - detail::ur::die( - "urMemImageCreate given unsupported image_channel_data_type"); + break; } } // Will allocate a new array on device if not already allocated - CUarray getArray(const ur_device_handle_t Device) { - if (ur_result_t Err = - allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device); - Err != UR_RESULT_SUCCESS) { - throw Err; - } - return Arrays[Device->getIndex() % Arrays.size()]; - } + CUarray getArray(const ur_device_handle_t Device); + // Will allocate a new surface on device if not already allocated - CUsurfObject 
getSurface(const ur_device_handle_t Device) { - if (ur_result_t Err = - allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device); - Err != UR_RESULT_SUCCESS) { - throw Err; - } - return SurfObjs[Device->getIndex() % SurfObjs.size()]; - } + CUsurfObject getSurface(const ur_device_handle_t Device); ur_mem_type_t getType() { return ImageDesc.type; } @@ -313,7 +294,7 @@ struct SurfaceMem { /// /// The ur_mem_handle_t is responsible for memory allocation and migration /// across devices in the same ur_context_handle_t. If a kernel writes to a -/// ur_mem_handle_t then it will write to LastEventWritingToMemObj. Then all +/// ur_mem_handle_t then it will write to LastQueueWritingToMemObj. Then all /// subsequent operations that want to read from the ur_mem_handle_t must wait /// on the event referring to the last write. /// @@ -333,61 +314,7 @@ struct SurfaceMem { /// /// Migrations will occur in both cases if the most recent version of data /// is on a different device, marked by -/// LastEventWritingToMemObj->getQueue()->getDevice() -/// -/// Example trace: -/// ~~~~~~~~~~~~~~ -/// -/// =====> urContextCreate([device0, device1], ...) // associated with [q0, q1] -/// -> OUT: hContext -/// -/// =====> urMemBufferCreate(hContext,...); -/// -> No native allocations made -/// -> OUT: hBuffer -/// -/// =====> urEnqueueMemBufferWrite(q0, hBuffer,...); -/// -> Allocation made on q0 ie device0 -/// -> New allocation initialized with host data. -/// -/// =====> urKernelSetArgMemObj(hKernel0, hBuffer, ...); -/// -> ur_kernel_handle_t associated with a ur_program_handle_t, -/// which is in turn unique to a device. So we can set the kernel -/// arg with the ptr of the device specific allocation. -/// -> hKernel0->getProgram()->getDevice() == device0 -/// -> allocateMemObjOnDeviceIfNeeded(device0); -/// -> Native allocation already made on device0, continue. -/// -/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...); -/// -> Suppose that hKernel0 writes to hBuffer. 
-/// -> Call hBuffer->setLastEventWritingToMemObj with return event -/// from this operation -/// -> Enqueue native kernel launch -/// -/// =====> urKernelSetArgMemObj(hKernel1, hBuffer, ...); -/// -> hKernel1->getProgram()->getDevice() == device1 -/// -> New allocation will be made on device1 when calling -/// getPtr(device1) -/// -> No native allocation on device1 -/// -> Make native allocation on device1 -/// -/// =====> urEnqueueKernelLaunch(q1, hKernel1, ...); -/// -> Suppose hKernel1 wants to read from hBuffer and not write. -/// -> migrateMemoryToDeviceIfNeeded(device1); -/// -> hBuffer->LastEventWritingToMemObj is not nullptr -/// -> Check if memory has been migrated to device1 since the -/// last write -/// -> Hasn't been migrated -/// -> Wait on LastEventWritingToMemObj. -/// -> Migrate memory from device0's native allocation to -/// device1's native allocation. -/// -> Enqueue native kernel launch -/// -/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...); -/// -> migrateMemoryToDeviceIfNeeded(device0); -/// -> hBuffer->LastEventWritingToMemObj refers to an event -/// from q0 -/// -> Migration not necessary -/// -> Enqueue native kernel launch +/// LastQueueWritingToMemObj->getDevice() /// struct ur_mem_handle_t_ { // Context where the memory object is accessible @@ -406,15 +333,13 @@ struct ur_mem_handle_t_ { // Has the memory been migrated to a device since the last write? std::vector HaveMigratedToDeviceSinceLastWrite; - // We should wait on this event prior to migrating memory across allocations - // in this ur_mem_handle_t_ - ur_event_handle_t LastEventWritingToMemObj{nullptr}; + // Queue with most up to date data of ur_mem_handle_t_ + ur_queue_handle_t LastQueueWritingToMemObj{nullptr}; // Enumerates all possible types of accesses. 
enum access_mode_t { unknown, read_write, read_only, write_only }; ur_mutex MemoryAllocationMutex; // A mutex for allocations - ur_mutex MemoryMigrationMutex; // A mutex for memory transfers /// A UR Memory object represents either plain memory allocations ("Buffers" /// in OpenCL) or typed allocations ("Images" in OpenCL). @@ -503,21 +428,20 @@ struct ur_mem_handle_t_ { uint32_t getReferenceCount() const noexcept { return RefCount; } - void setLastEventWritingToMemObj(ur_event_handle_t NewEvent) { - assert(NewEvent && "Invalid event!"); - // This entry point should only ever be called when using multi device ctx - assert(Context->Devices.size() > 1); - urEventRetain(NewEvent); - if (LastEventWritingToMemObj != nullptr) { - urEventRelease(LastEventWritingToMemObj); + void setLastQueueWritingToMemObj(ur_queue_handle_t WritingQueue) { + urQueueRetain(WritingQueue); + if (LastQueueWritingToMemObj != nullptr) { + urQueueRelease(LastQueueWritingToMemObj); } - LastEventWritingToMemObj = NewEvent; + LastQueueWritingToMemObj = WritingQueue; for (const auto &Device : Context->getDevices()) { // This event is never an interop event so will always have an associated // queue - HaveMigratedToDeviceSinceLastWrite - [Device->getIndex() % HaveMigratedToDeviceSinceLastWrite.size()] = - Device == NewEvent->getQueue()->getDevice(); + HaveMigratedToDeviceSinceLastWrite[Context->getDeviceIndex(Device)] = + Device == WritingQueue->getDevice(); } } }; + +ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t); diff --git a/source/adapters/cuda/platform.cpp b/source/adapters/cuda/platform.cpp index 27b94f756f..d89a5b7a19 100644 --- a/source/adapters/cuda/platform.cpp +++ b/source/adapters/cuda/platform.cpp @@ -141,12 +141,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( - ur_native_handle_t hNativePlatform, - const ur_platform_native_properties_t *pProperties, - 
ur_platform_handle_t *phPlatform) { - std::ignore = hNativePlatform; - std::ignore = pProperties; - std::ignore = phPlatform; + ur_native_handle_t, ur_adapter_handle_t, + const ur_platform_native_properties_t *, ur_platform_handle_t *) { + // There is no CUDA equivalent to ur_platform_handle_t return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/source/adapters/cuda/queue.cpp b/source/adapters/cuda/queue.cpp index 773126f4d5..35a2272cbc 100644 --- a/source/adapters/cuda/queue.cpp +++ b/source/adapters/cuda/queue.cpp @@ -201,6 +201,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { UR_CHECK_ERROR(cuStreamDestroy(S)); }); + if (hQueue->getHostSubmitTimeStream() != CUstream{0}) { + UR_CHECK_ERROR(cuStreamSynchronize(hQueue->getHostSubmitTimeStream())); + UR_CHECK_ERROR(cuStreamDestroy(hQueue->getHostSubmitTimeStream())); + } + return UR_RESULT_SUCCESS; } catch (ur_result_t Err) { return Err; diff --git a/source/adapters/cuda/queue.hpp b/source/adapters/cuda/queue.hpp index 46e9968fa9..727df68573 100644 --- a/source/adapters/cuda/queue.hpp +++ b/source/adapters/cuda/queue.hpp @@ -9,10 +9,12 @@ //===----------------------------------------------------------------------===// #pragma once +#include "common.hpp" #include #include #include +#include #include using ur_stream_guard_ = std::unique_lock; @@ -27,6 +29,10 @@ struct ur_queue_handle_t_ { std::vector ComputeStreams; std::vector TransferStreams; + // Stream used for recording EvQueue, which holds information about when the + // command in question is enqueued on host, as opposed to started. It is + // created only if profiling is enabled - either for queue or per event. + native_type HostSubmitTimeStream{0}; // delay_compute_ keeps track of which streams have been recently reused and // their next use should be delayed. If a stream has been recently reused it // will be skipped the next time it would be selected round-robin style. 
When @@ -99,6 +105,18 @@ struct ur_queue_handle_t_ { native_type get() { return getNextComputeStream(); }; ur_device_handle_t getDevice() const noexcept { return Device; }; + // Function which creates the profiling stream. Called only from makeNative + // event when profiling is required. + void createHostSubmitTimeStream() { + static std::once_flag HostSubmitTimeStreamFlag; + std::call_once(HostSubmitTimeStreamFlag, [&]() { + UR_CHECK_ERROR(cuStreamCreateWithPriority(&HostSubmitTimeStream, + CU_STREAM_NON_BLOCKING, 0)); + }); + } + + native_type getHostSubmitTimeStream() { return HostSubmitTimeStream; } + bool hasBeenSynchronized(uint32_t StreamToken) { // stream token not associated with one of the compute streams if (StreamToken == std::numeric_limits::max()) { diff --git a/source/adapters/cuda/ur_interface_loader.cpp b/source/adapters/cuda/ur_interface_loader.cpp index b70198b227..fc8cad9d43 100644 --- a/source/adapters/cuda/ur_interface_loader.cpp +++ b/source/adapters/cuda/ur_interface_loader.cpp @@ -125,6 +125,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetArgValue = urKernelSetArgValue; pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = nullptr; + pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/cuda/usm.cpp b/source/adapters/cuda/usm.cpp index c0035052d8..89f4b9a6a2 100644 --- a/source/adapters/cuda/usm.cpp +++ b/source/adapters/cuda/usm.cpp @@ -261,16 +261,13 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL, (CUdeviceptr)pMem)); - // currently each device is in its own platform, so find the platform at - // the same index - std::vector Platforms; - Platforms.resize(DeviceIndex + 1); + // cuda backend has only one platform containing all devices + ur_platform_handle_t platform; ur_adapter_handle_t AdapterHandle = &adapter; - Result = 
urPlatformGet(&AdapterHandle, 1, DeviceIndex + 1, - Platforms.data(), nullptr); + Result = urPlatformGet(&AdapterHandle, 1, 1, &platform, nullptr); // get the device from the platform - ur_device_handle_t Device = Platforms[DeviceIndex]->Devices[0].get(); + ur_device_handle_t Device = platform->Devices[DeviceIndex].get(); return ReturnValue(Device); } case UR_USM_ALLOC_INFO_POOL: { diff --git a/source/adapters/hip/common.hpp b/source/adapters/hip/common.hpp index be332c280b..98799d58f5 100644 --- a/source/adapters/hip/common.hpp +++ b/source/adapters/hip/common.hpp @@ -204,3 +204,38 @@ template class ReleaseGuard { /// UR object. void dismiss() { Captive = nullptr; } }; + +// Helper method to return a (non-null) pointer's attributes, or std::nullopt in +// the case that the pointer is unknown to the HIP subsystem. +inline static std::optional +getPointerAttributes(const void *pMem) { + // do not throw if hipPointerGetAttributes returns hipErrorInvalidValue + hipPointerAttribute_t hipPointerAttributes; + hipError_t Ret = hipPointerGetAttributes(&hipPointerAttributes, pMem); + if (Ret == hipErrorInvalidValue && pMem) { + // pointer non-null but not known to the HIP subsystem + return std::nullopt; + } + // Direct usage of the function, instead of UR_CHECK_ERROR, so we can get + // the line offset. + checkErrorUR(Ret, __func__, __LINE__ - 7, __FILE__); + // ROCm 6.0.0 introduces hipMemoryTypeUnregistered in the hipMemoryType + // enum to mark unregistered allocations (i.e., via system allocators). +#if HIP_VERSION_MAJOR >= 6 + if (hipPointerAttributes.type == hipMemoryTypeUnregistered) { + // pointer not known to the HIP subsystem + return std::nullopt; + } +#endif + return hipPointerAttributes; +} + +// Helper method to abstract away the fact that retrieving a pointer's memory +// type differs depending on the version of HIP. 
+inline static unsigned getMemoryType(hipPointerAttribute_t hipPointerAttrs) { +#if HIP_VERSION >= 50600000 + return hipPointerAttrs.type; +#else + return hipPointerAttrs.memoryType; +#endif +} diff --git a/source/adapters/hip/context.cpp b/source/adapters/hip/context.cpp index 73ac777edb..c3fcb3a1a3 100644 --- a/source/adapters/hip/context.cpp +++ b/source/adapters/hip/context.cpp @@ -47,18 +47,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urContextCreate( // Create a scoped context. ContextPtr = std::unique_ptr( new ur_context_handle_t_{phDevices, DeviceCount}); - - static std::once_flag InitFlag; - std::call_once( - InitFlag, - [](ur_result_t &) { - // Use default stream to record base event counter - UR_CHECK_ERROR(hipEventCreateWithFlags(&ur_platform_handle_t_::EvBase, - hipEventDefault)); - UR_CHECK_ERROR(hipEventRecord(ur_platform_handle_t_::EvBase, 0)); - }, - RetErr); - *phContext = ContextPtr.release(); } catch (ur_result_t Err) { RetErr = Err; @@ -78,7 +66,8 @@ urContextGetInfo(ur_context_handle_t hContext, ur_context_info_t propName, case UR_CONTEXT_INFO_NUM_DEVICES: return ReturnValue(static_cast(hContext->Devices.size())); case UR_CONTEXT_INFO_DEVICES: - return ReturnValue(hContext->getDevices()); + return ReturnValue(hContext->getDevices().data(), + hContext->getDevices().size()); case UR_CONTEXT_INFO_REFERENCE_COUNT: return ReturnValue(hContext->getReferenceCount()); case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: diff --git a/source/adapters/hip/context.hpp b/source/adapters/hip/context.hpp index 69d4df9b6d..90366436e2 100644 --- a/source/adapters/hip/context.hpp +++ b/source/adapters/hip/context.hpp @@ -112,6 +112,13 @@ struct ur_context_handle_t_ { return Devices; } + // Gets the index of the device relative to other devices in the context + size_t getDeviceIndex(ur_device_handle_t hDevice) { + auto It = std::find(Devices.begin(), Devices.end(), hDevice); + assert(It != Devices.end()); + return std::distance(Devices.begin(), It); + } + uint32_t 
incrementReferenceCount() noexcept { return ++RefCount; } uint32_t decrementReferenceCount() noexcept { return --RefCount; } diff --git a/source/adapters/hip/device.cpp b/source/adapters/hip/device.cpp index dd20a4f50f..0e9b50f94e 100644 --- a/source/adapters/hip/device.cpp +++ b/source/adapters/hip/device.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "device.hpp" +#include "adapter.hpp" #include "context.hpp" #include "event.hpp" @@ -20,6 +21,18 @@ int getAttribute(ur_device_handle_t Device, hipDeviceAttribute_t Attribute) { return Value; } +uint64_t ur_device_handle_t_::getElapsedTime(hipEvent_t ev) const { + float Milliseconds = 0.0f; + + // hipEventSynchronize waits till the event is ready for call to + // hipEventElapsedTime. + UR_CHECK_ERROR(hipEventSynchronize(EvBase)); + UR_CHECK_ERROR(hipEventSynchronize(ev)); + UR_CHECK_ERROR(hipEventElapsedTime(&Milliseconds, EvBase, ev)); + + return static_cast(Milliseconds * 1.0e6); +} + UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, ur_device_info_t propName, size_t propSize, @@ -177,7 +190,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, int WarpSize = 0; UR_CHECK_ERROR(hipDeviceGetAttribute(&WarpSize, hipDeviceAttributeWarpSize, hDevice->get())); - size_t Sizes[1] = {static_cast(WarpSize)}; + uint32_t Sizes[1] = {static_cast(WarpSize)}; return ReturnValue(Sizes, 1); } case UR_DEVICE_INFO_MAX_CLOCK_FREQUENCY: { @@ -321,7 +334,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(static_cast(Min)); } case UR_DEVICE_INFO_IMAGE_MAX_ARRAY_SIZE: { - return ReturnValue(0lu); + return ReturnValue(size_t(0)); } case UR_DEVICE_INFO_MAX_SAMPLERS: { // This call is kind of meaningless for HIP, as samplers don't exist. 
@@ -331,7 +344,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_MAX_PARAMETER_SIZE: { // __global__ function parameters are passed to the device via constant // memory and are limited to 4 KB. - return ReturnValue(4000lu); + return ReturnValue(size_t(4000)); } case UR_DEVICE_INFO_MEM_BASE_ADDR_ALIGN: { int MemBaseAddrAlign = 0; @@ -442,7 +455,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: { // Hard coded to value returned by clinfo for OpenCL 1.2 HIP | GeForce GTX // 1060 3GB - return ReturnValue(1000lu); + return ReturnValue(size_t(1000)); } case UR_DEVICE_INFO_ENDIAN_LITTLE: { return ReturnValue(true); @@ -465,10 +478,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(Capability); } case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: { - // The mandated minimum capability: - ur_queue_flags_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE | - UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; - return ReturnValue(Capability); + return ReturnValue(0); } case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES: case UR_DEVICE_INFO_QUEUE_PROPERTIES: { @@ -569,7 +579,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, } case UR_DEVICE_INFO_PRINTF_BUFFER_SIZE: { // The minimum value for the FULL profile is 1 MB. 
- return ReturnValue(1024lu); + return ReturnValue(size_t(1024)); } case UR_DEVICE_INFO_PREFERRED_INTEROP_USER_SYNC: { return ReturnValue(true); @@ -761,6 +771,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return ReturnValue(int32_t{1}); } + case UR_DEVICE_INFO_KERNEL_SET_SPECIALIZATION_CONSTANTS: { + return ReturnValue(ur_bool_t{false}); + } + case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { ur_memory_order_capability_flags_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | @@ -768,16 +782,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE; return ReturnValue(Capabilities); } - case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: - case UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { + case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { // SYCL2020 4.6.4.2 minimum mandated capabilities for // atomic_fence/memory_scope_capabilities. // Because scopes are hierarchical, wider scopes support all narrower // scopes. At a minimum, each device must support WORK_ITEM, SUB_GROUP and // WORK_GROUP. 
(https://github.com/KhronosGroup/SYCL-Docs/pull/382) - uint64_t Capabilities = UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | - UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP; + ur_memory_scope_capability_flags_t Capabilities = + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP; + return ReturnValue(Capabilities); + } + case UR_DEVICE_INFO_ATOMIC_FENCE_SCOPE_CAPABILITIES: { + constexpr ur_memory_scope_capability_flags_t Capabilities = + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_ITEM | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SUB_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_WORK_GROUP | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_DEVICE | + UR_MEMORY_SCOPE_CAPABILITY_FLAG_SYSTEM; return ReturnValue(Capabilities); } case UR_DEVICE_INFO_ATOMIC_FENCE_ORDER_CAPABILITIES: { @@ -788,6 +811,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; +#ifdef __HIP_PLATFORM_NVIDIA__ + // Nvidia introduced fence.sc for seq_cst only since SM 7.0. 
+ int Major = 0; + UR_CHECK_ERROR(hipDeviceGetAttribute( + &Major, hipDeviceAttributeComputeCapabilityMajor, hDevice->get())); + if (Major >= 7) + Capabilities |= UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; +#else + Capabilities |= UR_MEMORY_ORDER_CAPABILITY_FLAG_SEQ_CST; +#endif return ReturnValue(Capabilities); } case UR_DEVICE_INFO_DEVICE_ID: { @@ -950,8 +983,57 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( - ur_native_handle_t, ur_platform_handle_t, - const ur_device_native_properties_t *, ur_device_handle_t *) { + ur_native_handle_t hNativeDevice, ur_platform_handle_t hPlatform, + [[maybe_unused]] const ur_device_native_properties_t *pProperties, + ur_device_handle_t *phDevice) { + // We can't cast between ur_native_handle_t and hipDevice_t, so memcpy the + // bits instead + hipDevice_t HIPDevice = 0; + memcpy(&HIPDevice, &hNativeDevice, sizeof(hipDevice_t)); + + auto IsDevice = [=](std::unique_ptr &Dev) { + return Dev->get() == HIPDevice; + }; + + // If a platform is provided just check if the device is in it + if (hPlatform) { + auto SearchRes = std::find_if(begin(hPlatform->Devices), + end(hPlatform->Devices), IsDevice); + if (SearchRes != end(hPlatform->Devices)) { + *phDevice = SearchRes->get(); + return UR_RESULT_SUCCESS; + } + } + + // Get list of platforms + uint32_t NumPlatforms = 0; + ur_adapter_handle_t AdapterHandle = &adapter; + ur_result_t Result = + urPlatformGet(&AdapterHandle, 1, 0, nullptr, &NumPlatforms); + if (Result != UR_RESULT_SUCCESS) + return Result; + + // We can only have a maximum of one platform. 
+ if (NumPlatforms != 1) + return UR_RESULT_ERROR_INVALID_OPERATION; + + ur_platform_handle_t Platform = nullptr; + + Result = urPlatformGet(&AdapterHandle, 1, NumPlatforms, &Platform, nullptr); + if (Result != UR_RESULT_SUCCESS) + return Result; + + // Iterate through the platform's devices to find the device that matches + // nativeHandle + auto SearchRes = std::find_if(std::begin(Platform->Devices), + std::end(Platform->Devices), IsDevice); + if (SearchRes != end(Platform->Devices)) { + *phDevice = static_cast((*SearchRes).get()); + return UR_RESULT_SUCCESS; + } + + // If the provided nativeHandle cannot be matched to an + // existing device return error return UR_RESULT_ERROR_INVALID_OPERATION; } @@ -995,11 +1077,7 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice, if (pDeviceTimestamp) { UR_CHECK_ERROR(hipEventCreateWithFlags(&Event, hipEventDefault)); UR_CHECK_ERROR(hipEventRecord(Event)); - UR_CHECK_ERROR(hipEventSynchronize(Event)); - float ElapsedTime = 0.0f; - UR_CHECK_ERROR(hipEventElapsedTime(&ElapsedTime, - ur_platform_handle_t_::EvBase, Event)); - *pDeviceTimestamp = (uint64_t)(ElapsedTime * (double)1e6); + *pDeviceTimestamp = hDevice->getElapsedTime(Event); } if (pHostTimestamp) { diff --git a/source/adapters/hip/device.hpp b/source/adapters/hip/device.hpp index 269848f3cd..5fd11bfc2f 100644 --- a/source/adapters/hip/device.hpp +++ b/source/adapters/hip/device.hpp @@ -25,7 +25,9 @@ struct ur_device_handle_t_ { std::atomic_uint32_t RefCount; ur_platform_handle_t Platform; hipCtx_t HIPContext; + hipEvent_t EvBase; // HIP event used as base counter uint32_t DeviceIndex; + int MaxWorkGroupSize{0}; int MaxBlockDimX{0}; int MaxBlockDimY{0}; @@ -36,9 +38,10 @@ struct ur_device_handle_t_ { public: ur_device_handle_t_(native_type HipDevice, hipCtx_t Context, - ur_platform_handle_t Platform, uint32_t DeviceIndex) + hipEvent_t EvBase, ur_platform_handle_t Platform, + uint32_t DeviceIndex) : HIPDevice(HipDevice), RefCount{1}, 
Platform(Platform), - HIPContext(Context), DeviceIndex(DeviceIndex) { + HIPContext(Context), EvBase(EvBase), DeviceIndex(DeviceIndex) { UR_CHECK_ERROR(hipDeviceGetAttribute( &MaxWorkGroupSize, hipDeviceAttributeMaxThreadsPerBlock, HIPDevice)); @@ -68,6 +71,8 @@ struct ur_device_handle_t_ { ur_platform_handle_t getPlatform() const noexcept { return Platform; }; + uint64_t getElapsedTime(hipEvent_t) const; + hipCtx_t getNativeContext() const noexcept { return HIPContext; }; // Returns the index of the device relative to the other devices in the same diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp index 4b98a9c491..4fc4f95f75 100644 --- a/source/adapters/hip/enqueue.cpp +++ b/source/adapters/hip/enqueue.cpp @@ -160,8 +160,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); UR_ASSERT(hBuffer->isBuffer(), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); - ur_result_t Result = UR_RESULT_SUCCESS; std::unique_ptr RetImplEvent{nullptr}; + hBuffer->setLastQueueWritingToMemObj(hQueue); try { ScopedContext Active(hQueue->getDevice()); @@ -193,9 +193,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( *phEvent = RetImplEvent.release(); } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( @@ -210,29 +210,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( std::unique_ptr RetImplEvent{nullptr}; - ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex}; - auto Device = hQueue->getDevice(); - hipStream_t HIPStream = hQueue->getNextTransferStream(); - try { // Note that this entry point may be called on a queue that may not be the // last queue to write to the MemBuffer, meaning we must perform the copy // from a different device - if (hBuffer->LastEventWritingToMemObj && - hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() != - hQueue->getDevice()) { 
- // This event is never created with interop so getQueue is never null - hQueue = hBuffer->LastEventWritingToMemObj->getQueue(); - Device = hQueue->getDevice(); - ScopedContext Active(Device); - HIPStream = hipStream_t{0}; // Default stream for different device - // We may have to wait for an event on another queue if it is the last - // event writing to mem obj - UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 1, - &hBuffer->LastEventWritingToMemObj)); + if (hBuffer->LastQueueWritingToMemObj && + hBuffer->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) { + hQueue = hBuffer->LastQueueWritingToMemObj; } + auto Device = hQueue->getDevice(); ScopedContext Active(Device); + hipStream_t HIPStream = hQueue->getNextTransferStream(); // Use the default stream if copying from another device UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, @@ -280,44 +269,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - std::vector DepEvents( - phEventWaitList, phEventWaitList + numEventsInWaitList); - std::vector> MemMigrationLocks; - - // phEventWaitList only contains events that are handed to UR by the SYCL - // runtime. 
However since UR handles memory dependencies within a context - // we may need to add more events to our dependent events list if the UR - // context contains multiple devices - if (hQueue->getContext()->Devices.size() > 1) { - MemMigrationLocks.reserve(hKernel->Args.MemObjArgs.size()); - for (auto &MemArg : hKernel->Args.MemObjArgs) { - bool PushBack = false; - if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj; - MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(), - MemDepEvent) == DepEvents.end()) { - DepEvents.push_back(MemDepEvent); - PushBack = true; - } - if ((MemArg.AccessFlags & - (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) || - PushBack) { - if (std::find_if(MemMigrationLocks.begin(), MemMigrationLocks.end(), - [MemArg](auto &Lock) { - return Lock.first == MemArg.Mem; - }) == MemMigrationLocks.end()) - MemMigrationLocks.emplace_back( - std::pair{MemArg.Mem, ur_lock{MemArg.Mem->MemoryMigrationMutex}}); - } - } - } - // Early exit for zero size range kernel if (*pGlobalWorkSize == 0) { - if (DepEvents.size()) { - return urEnqueueEventsWaitWithBarrier(hQueue, DepEvents.size(), - phEventWaitList, phEvent); - } - return UR_RESULT_SUCCESS; + return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList, + phEventWaitList, phEvent); } // Set the number of threads per block to the number of threads per warp @@ -325,7 +280,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; size_t BlocksPerGrid[3] = {1u, 1u, 1u}; - ur_result_t Result = UR_RESULT_SUCCESS; std::unique_ptr RetImplEvent{nullptr}; try { @@ -343,20 +297,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( hipStream_t HIPStream = hQueue->getNextComputeStream( numEventsInWaitList, phEventWaitList, Guard, &StreamToken); - if (DepEvents.size()) { - UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, DepEvents.size(), - DepEvents.data())); - } + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 
numEventsInWaitList, + phEventWaitList)); // For memory migration across devices in the same context if (hQueue->getContext()->Devices.size() > 1) { for (auto &MemArg : hKernel->Args.MemObjArgs) { - migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice()); + enqueueMigrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice(), + HIPStream); + if (MemArg.AccessFlags & + (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) { + MemArg.Mem->setLastQueueWritingToMemObj(hQueue); + } } } auto ArgIndices = hKernel->getArgIndices(); + // If migration of mem across buffer is needed, an event must be associated + // with this command, implicitly if phEvent is nullptr if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( @@ -364,20 +323,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( UR_CHECK_ERROR(RetImplEvent->start()); } - // Once event has been started we can unlock MemoryMigrationMutex - if (hQueue->getContext()->Devices.size() > 1) { - for (auto &MemArg : hKernel->Args.MemObjArgs) { - // Telling the ur_mem_handle_t that it will need to wait on this kernel - // if it has been written to - if (phEvent && (MemArg.AccessFlags & - (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))) { - MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get()); - } - } - // We can release the MemoryMigrationMutexes now - MemMigrationLocks.clear(); - } - UR_CHECK_ERROR(hipModuleLaunchKernel( HIPFunc, BlocksPerGrid[0], BlocksPerGrid[1], BlocksPerGrid[2], ThreadsPerBlock[0], ThreadsPerBlock[1], ThreadsPerBlock[2], @@ -390,9 +335,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( *phEvent = RetImplEvent.release(); } } catch (ur_result_t err) { - Result = err; + return err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( @@ -578,30 +523,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( std::unique_ptr RetImplEvent{nullptr}; - ur_result_t Result = 
UR_RESULT_SUCCESS; - ur_lock MemoryMigrationLock(hBuffer->MemoryMigrationMutex); - auto Device = hQueue->getDevice(); - hipStream_t HIPStream = hQueue->getNextTransferStream(); - try { // Note that this entry point may be called on a queue that may not be the // last queue to write to the MemBuffer, meaning we must perform the copy // from a different device - if (hBuffer->LastEventWritingToMemObj && - hBuffer->LastEventWritingToMemObj->getQueue()->getDevice() != - hQueue->getDevice()) { - // This event is never created with interop so getQueue is never null - hQueue = hBuffer->LastEventWritingToMemObj->getQueue(); - Device = hQueue->getDevice(); - ScopedContext Active(Device); - HIPStream = hipStream_t{0}; // Default stream for different device - // We may have to wait for an event on another queue if it is the last - // event writing to mem obj - UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 1, - &hBuffer->LastEventWritingToMemObj)); + if (hBuffer->LastQueueWritingToMemObj && + hBuffer->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) { + hQueue = hBuffer->LastQueueWritingToMemObj; } + auto Device = hQueue->getDevice(); ScopedContext Active(Device); + hipStream_t HIPStream = hQueue->getNextTransferStream(); UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, phEventWaitList)); @@ -632,9 +565,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect( } } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( @@ -644,27 +577,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_result_t Result = UR_RESULT_SUCCESS; void *DevPtr = std::get(hBuffer->Mem).getVoid(hQueue->getDevice()); std::unique_ptr RetImplEvent{nullptr}; + 
hBuffer->setLastQueueWritingToMemObj(hQueue); try { ScopedContext Active(hQueue->getDevice()); hipStream_t HIPStream = hQueue->getNextTransferStream(); - Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, - phEventWaitList); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, + phEventWaitList)); if (phEvent) { RetImplEvent = std::unique_ptr(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_BUFFER_WRITE_RECT, hQueue, HIPStream)); + UR_COMMAND_MEM_BUFFER_WRITE, hQueue, HIPStream)); UR_CHECK_ERROR(RetImplEvent->start()); } - Result = commonEnqueueMemBufferCopyRect( + UR_CHECK_ERROR(commonEnqueueMemBufferCopyRect( HIPStream, region, pSrc, hipMemoryTypeHost, hostOrigin, hostRowPitch, hostSlicePitch, &DevPtr, hipMemoryTypeDevice, bufferOrigin, - bufferRowPitch, bufferSlicePitch); + bufferRowPitch, bufferSlicePitch)); if (phEvent) { UR_CHECK_ERROR(RetImplEvent->record()); @@ -677,11 +610,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( if (phEvent) { *phEvent = RetImplEvent.release(); } - } catch (ur_result_t Err) { - Result = Err; + return Err; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy( @@ -803,8 +735,8 @@ static inline void memsetRemainPattern(hipStream_t Stream, uint32_t PatternSize, // HIP has no memset functions that allow setting values more than 4 bytes. UR // API lets you pass an arbitrary "pattern" to the buffer fill, which can be // more than 4 bytes. We must break up the pattern into 1 byte values, and set -// the buffer using multiple strided calls. The first 4 patterns are set using -// hipMemsetD32Async then all subsequent 1 byte patterns are set using +// the buffer using multiple strided calls. The first 4 patterns are set +// using hipMemsetD32Async then all subsequent 1 byte patterns are set using // hipMemset2DAsync which is called for each pattern. 
ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize, size_t Size, const void *pPattern, @@ -823,8 +755,8 @@ ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize, UR_CHECK_ERROR(hipPointerGetAttributes(&ptrAttribs, (const void *)Ptr)); // The hostPointer attribute is non-null also for shared memory allocations. - // To make sure that this workaround only executes for host pinned memory, we - // need to check that isManaged attribute is false. + // To make sure that this workaround only executes for host pinned memory, + // we need to check that isManaged attribute is false. if (ptrAttribs.hostPointer && !ptrAttribs.isManaged) { const auto NumOfCopySteps = Size / PatternSize; const auto Offset = sizeof(uint32_t); @@ -857,38 +789,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( ur_event_handle_t *phEvent) { UR_ASSERT(size + offset <= std::get(hBuffer->Mem).getSize(), UR_RESULT_ERROR_INVALID_SIZE); - auto ArgsAreMultiplesOfPatternSize = - (offset % patternSize == 0) || (size % patternSize == 0); - - auto PatternIsValid = (pPattern != nullptr); - - auto PatternSizeIsValid = - ((patternSize & (patternSize - 1)) == 0) && // is power of two - (patternSize > 0) && (patternSize <= 128); // falls within valid range - - UR_ASSERT(ArgsAreMultiplesOfPatternSize && PatternIsValid && - PatternSizeIsValid, - UR_RESULT_ERROR_INVALID_VALUE); - std::ignore = ArgsAreMultiplesOfPatternSize; - std::ignore = PatternIsValid; - std::ignore = PatternSizeIsValid; std::unique_ptr RetImplEvent{nullptr}; + hBuffer->setLastQueueWritingToMemObj(hQueue); try { ScopedContext Active(hQueue->getDevice()); auto Stream = hQueue->getNextTransferStream(); - ur_result_t Result = UR_RESULT_SUCCESS; if (phEventWaitList) { - Result = enqueueEventsWait(hQueue, Stream, numEventsInWaitList, - phEventWaitList); + UR_CHECK_ERROR(enqueueEventsWait(hQueue, Stream, numEventsInWaitList, + phEventWaitList)); } if (phEvent) { RetImplEvent = 
std::unique_ptr(ur_event_handle_t_::makeNative( - UR_COMMAND_MEM_BUFFER_FILL, hQueue, Stream)); + UR_COMMAND_MEM_BUFFER_WRITE, hQueue, Stream)); UR_CHECK_ERROR(RetImplEvent->start()); } @@ -915,8 +832,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( } default: { - Result = commonMemSetLargePattern(Stream, patternSize, size, pPattern, - DstDevice); + UR_CHECK_ERROR(commonMemSetLargePattern(Stream, patternSize, size, + pPattern, DstDevice)); break; } } @@ -925,13 +842,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill( UR_CHECK_ERROR(RetImplEvent->record()); *phEvent = RetImplEvent.release(); } - - return Result; } catch (ur_result_t Err) { return Err; } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } + return UR_RESULT_SUCCESS; } /// General ND memory copy operation for images (where N > 1). @@ -948,7 +864,7 @@ static ur_result_t commonEnqueueMemImageNDCopy( UR_ASSERT(DstType == hipMemoryTypeArray || DstType == hipMemoryTypeHost, UR_RESULT_ERROR_INVALID_VALUE); - if (ImgType == UR_MEM_TYPE_IMAGE2D) { + if (ImgType == UR_MEM_TYPE_IMAGE1D || ImgType == UR_MEM_TYPE_IMAGE2D) { hip_Memcpy2D CpyDesc; memset(&CpyDesc, 0, sizeof(CpyDesc)); CpyDesc.srcMemoryType = SrcType; @@ -956,7 +872,7 @@ static ur_result_t commonEnqueueMemImageNDCopy( CpyDesc.srcArray = reinterpret_cast(const_cast(SrcPtr)); CpyDesc.srcXInBytes = SrcOffset[0]; - CpyDesc.srcY = SrcOffset[1]; + CpyDesc.srcY = (ImgType == UR_MEM_TYPE_IMAGE1D) ? 0 : SrcOffset[1]; } else { CpyDesc.srcHost = SrcPtr; } @@ -965,12 +881,12 @@ static ur_result_t commonEnqueueMemImageNDCopy( CpyDesc.dstArray = reinterpret_cast(const_cast(DstPtr)); CpyDesc.dstXInBytes = DstOffset[0]; - CpyDesc.dstY = DstOffset[1]; + CpyDesc.dstY = (ImgType == UR_MEM_TYPE_IMAGE1D) ? 0 : DstOffset[1]; } else { CpyDesc.dstHost = DstPtr; } CpyDesc.WidthInBytes = Region[0]; - CpyDesc.Height = Region[1]; + CpyDesc.Height = (ImgType == UR_MEM_TYPE_IMAGE1D) ? 
1 : Region[1]; UR_CHECK_ERROR(hipMemcpyParam2DAsync(&CpyDesc, HipStream)); return UR_RESULT_SUCCESS; } @@ -1015,28 +931,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ur_lock MemoryMigrationLock{hImage->MemoryMigrationMutex}; - auto Device = hQueue->getDevice(); - hipStream_t HIPStream = hQueue->getNextTransferStream(); - try { // Note that this entry point may be called on a queue that may not be the - // last queue to write to the MemBuffer, meaning we must perform the copy + // last queue to write to the MemImage, meaning we must perform the copy // from a different device - if (hImage->LastEventWritingToMemObj && - hImage->LastEventWritingToMemObj->getQueue()->getDevice() != - hQueue->getDevice()) { - hQueue = hImage->LastEventWritingToMemObj->getQueue(); - Device = hQueue->getDevice(); - ScopedContext Active(Device); - HIPStream = hipStream_t{0}; // Default stream for different device - // We may have to wait for an event on another queue if it is the last - // event writing to mem obj - UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 1, - &hImage->LastEventWritingToMemObj)); + if (hImage->LastQueueWritingToMemObj && + hImage->LastQueueWritingToMemObj->getDevice() != hQueue->getDevice()) { + hQueue = hImage->LastQueueWritingToMemObj; } + auto Device = hQueue->getDevice(); ScopedContext Active(Device); + hipStream_t HIPStream = hQueue->getNextTransferStream(); if (phEventWaitList) { UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList, @@ -1052,11 +958,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead( int ElementByteSize = imageElementByteSize(Format); size_t ByteOffsetX = origin.x * ElementByteSize * NumChannels; - size_t BytesToCopy = ElementByteSize * NumChannels * region.depth; + size_t BytesToCopy = ElementByteSize * NumChannels * region.width; auto ImgType = 
std::get(hImage->Mem).getImageType(); - size_t AdjustedRegion[3] = {BytesToCopy, region.height, region.height}; + size_t AdjustedRegion[3] = {BytesToCopy, region.height, region.depth}; size_t SrcOffset[3] = {ByteOffsetX, origin.y, origin.z}; std::unique_ptr RetImplEvent{nullptr}; @@ -1113,11 +1019,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite( int ElementByteSize = imageElementByteSize(Format); size_t ByteOffsetX = origin.x * ElementByteSize * NumChannels; - size_t BytesToCopy = ElementByteSize * NumChannels * region.depth; + size_t BytesToCopy = ElementByteSize * NumChannels * region.width; auto ImgType = std::get(hImage->Mem).getImageType(); - size_t AdjustedRegion[3] = {BytesToCopy, region.height, region.height}; + size_t AdjustedRegion[3] = {BytesToCopy, region.height, region.depth}; size_t DstOffset[3] = {ByteOffsetX, origin.y, origin.z}; std::unique_ptr RetImplEvent{nullptr}; @@ -1186,13 +1092,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy( int ElementByteSize = imageElementByteSize(SrcFormat); - size_t DstByteOffsetX = dstOrigin.x * ElementByteSize * SrcNumChannels; - size_t SrcByteOffsetX = srcOrigin.x * ElementByteSize * DstNumChannels; - size_t BytesToCopy = ElementByteSize * SrcNumChannels * region.depth; + size_t DstByteOffsetX = dstOrigin.x * ElementByteSize * DstNumChannels; + size_t SrcByteOffsetX = srcOrigin.x * ElementByteSize * SrcNumChannels; + size_t BytesToCopy = ElementByteSize * SrcNumChannels * region.width; auto ImgType = std::get(hImageSrc->Mem).getImageType(); - size_t AdjustedRegion[3] = {BytesToCopy, region.height, region.width}; + size_t AdjustedRegion[3] = {BytesToCopy, region.height, region.depth}; size_t SrcOffset[3] = {SrcByteOffsetX, srcOrigin.y, srcOrigin.z}; size_t DstOffset[3] = {DstByteOffsetX, dstOrigin.y, dstOrigin.z}; @@ -1299,7 +1205,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( if (!IsPinned && (Map->getMapFlags() & (UR_MAP_FLAG_WRITE | 
UR_MAP_FLAG_WRITE_INVALIDATE_REGION))) { - // Pinned host memory is only on host so it doesn't need to be written to. + // Pinned host memory is only on host so it doesn't need to be written + // to. UR_CHECK_ERROR(urEnqueueMemBufferWrite( hQueue, hMem, true, Map->getMapOffset(), Map->getMapSize(), pMappedPtr, numEventsInWaitList, phEventWaitList, phEvent)); @@ -1475,10 +1382,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( hipPointerAttribute_t attribs; // TODO: hipPointerGetAttributes will fail if pMem is non-HIP allocated - // memory, as it is neither registered as host memory, nor into the address - // space for the current device, meaning the pMem ptr points to a - // system-allocated memory. This means we may need to check system-alloacted - // memory and handle the failure more gracefully. + // memory, as it is neither registered as host memory, nor into the + // address space for the current device, meaning the pMem ptr points to a + // system-allocated memory. This means we may need to check + // system-alloacted memory and handle the failure more gracefully. UR_CHECK_ERROR(hipPointerGetAttributes(&attribs, pMem)); // async prefetch requires USM pointer (or hip SVM) to work. if (!attribs.isManaged) { @@ -1507,8 +1414,9 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, ur_device_handle_t Device = hQueue->getDevice(); #if HIP_VERSION_MAJOR >= 5 - // NOTE: The hipPointerGetAttribute API is marked as beta, meaning, while this - // is feature complete, it is still open to changes and outstanding issues. + // NOTE: The hipPointerGetAttribute API is marked as beta, meaning, while + // this is feature complete, it is still open to changes and outstanding + // issues. 
size_t PointerRangeSize = 0; UR_CHECK_ERROR(hipPointerGetAttribute( &PointerRangeSize, HIP_POINTER_ATTRIBUTE_RANGE_SIZE, @@ -1548,9 +1456,10 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, } // Passing MEM_ADVICE_SET/MEM_ADVICE_CLEAR_PREFERRED_LOCATION to - // hipMemAdvise on a GPU device requires the GPU device to report a non-zero - // value for hipDeviceAttributeConcurrentManagedAccess. Therefore, ignore - // the mem advice if concurrent managed memory access is not available. + // hipMemAdvise on a GPU device requires the GPU device to report a + // non-zero value for hipDeviceAttributeConcurrentManagedAccess. + // Therefore, ignore the mem advice if concurrent managed memory access is + // not available. if (advice & (UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION | UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION | UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE | @@ -1585,9 +1494,10 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size, #endif } else { Result = setHipMemAdvise(HIPDevicePtr, size, advice, DeviceID); - // UR_RESULT_ERROR_INVALID_ENUMERATION is returned when using a valid but - // currently unmapped advice arguments as not supported by this platform. - // Therefore, warn the user instead of throwing and aborting the runtime. + // UR_RESULT_ERROR_INVALID_ENUMERATION is returned when using a valid + // but currently unmapped advice arguments as not supported by this + // platform. Therefore, warn the user instead of throwing and aborting + // the runtime. if (Result == UR_RESULT_ERROR_INVALID_ENUMERATION) { releaseEvent(); setErrorMessage("mem_advise is ignored as the advice argument is not " @@ -1648,15 +1558,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( UR_CHECK_ERROR(RetImplEvent->start()); } - // There is an issue with hipMemcpy2D* when hipMemcpyDefault is used, which - // makes the HIP runtime not correctly derive the copy kind (direction) for - // the copies since ROCm 5.6.0+. 
See: https://github.com/ROCm/clr/issues/40 + // There is an issue with hipMemcpy2D* when hipMemcpyDefault is used, + // which makes the HIP runtime not correctly derive the copy kind + // (direction) for the copies since ROCm 5.6.0+. See: + // https://github.com/ROCm/clr/issues/40 // TODO: Add maximum HIP_VERSION when bug has been fixed. #if HIP_VERSION >= 50600000 hipPointerAttribute_t srcAttribs{}; hipPointerAttribute_t dstAttribs{}; - // Determine if pSrc and/or pDst are system allocated pageable host memory. + // Determine if pSrc and/or pDst are system allocated pageable host + // memory. bool srcIsSystemAlloc{false}; bool dstIsSystemAlloc{false}; @@ -1851,9 +1763,9 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim, UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); UR_ASSERT(LocalWorkSize[dim] <= MaxThreadsPerBlock[dim], UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); - // Checks that local work sizes are a divisor of the global work sizes - // which includes that the local work sizes are neither larger than - // the global work sizes and not 0. + // Checks that local work sizes are a divisor of the global work + // sizes which includes that the local work sizes are neither larger + // than the global work sizes and not 0. 
UR_ASSERT(LocalWorkSize != 0, UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE); UR_ASSERT((GlobalWorkSize[dim] % LocalWorkSize[dim]) == 0, diff --git a/source/adapters/hip/enqueue.hpp b/source/adapters/hip/enqueue.hpp index a1f86b3678..eacac72a82 100644 --- a/source/adapters/hip/enqueue.hpp +++ b/source/adapters/hip/enqueue.hpp @@ -30,3 +30,7 @@ void setCopyRectParams(ur_rect_region_t Region, const void *SrcPtr, const hipMemoryType DstType, ur_rect_offset_t DstOffset, size_t DstRowPitch, size_t DstSlicePitch, hipMemcpy3DParms &Params); + +void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock, + const size_t *GlobalWorkSize, const uint32_t WorkDim, + const size_t MaxThreadsPerBlock[3]); diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp index 5327c43a3b..4b99dd97d7 100644 --- a/source/adapters/hip/event.cpp +++ b/source/adapters/hip/event.cpp @@ -16,23 +16,13 @@ ur_event_handle_t_::ur_event_handle_t_(ur_command_t Type, ur_context_handle_t Context, ur_queue_handle_t Queue, - hipStream_t Stream, uint32_t StreamToken) + hipEvent_t EvEnd, hipEvent_t EvQueued, + hipEvent_t EvStart, hipStream_t Stream, + uint32_t StreamToken) : CommandType{Type}, RefCount{1}, HasOwnership{true}, HasBeenWaitedOn{false}, IsRecorded{false}, IsStarted{false}, - StreamToken{StreamToken}, EventId{0}, EvEnd{nullptr}, EvStart{nullptr}, - EvQueued{nullptr}, Queue{Queue}, Stream{Stream}, Context{Context} { - - bool ProfilingEnabled = - Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent(); - - UR_CHECK_ERROR(hipEventCreateWithFlags( - &EvEnd, ProfilingEnabled ? 
hipEventDefault : hipEventDisableTiming)); - - if (ProfilingEnabled) { - UR_CHECK_ERROR(hipEventCreateWithFlags(&EvQueued, hipEventDefault)); - UR_CHECK_ERROR(hipEventCreateWithFlags(&EvStart, hipEventDefault)); - } - + StreamToken{StreamToken}, EventId{0}, EvEnd{EvEnd}, EvStart{EvStart}, + EvQueued{EvQueued}, Queue{Queue}, Stream{Stream}, Context{Context} { urQueueRetain(Queue); urContextRetain(Context); } @@ -60,9 +50,9 @@ ur_result_t ur_event_handle_t_::start() { try { if (Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || isTimestampEvent()) { - // NOTE: This relies on the default stream to be unused. - UR_CHECK_ERROR(hipEventRecord(EvQueued, 0)); - UR_CHECK_ERROR(hipEventRecord(EvStart, Queue->get())); + UR_CHECK_ERROR( + hipEventRecord(EvQueued, Queue->getHostSubmitTimeStream())); + UR_CHECK_ERROR(hipEventRecord(EvStart, Stream)); } } catch (ur_result_t Error) { Result = Error; @@ -90,44 +80,18 @@ bool ur_event_handle_t_::isCompleted() const { } uint64_t ur_event_handle_t_::getQueuedTime() const { - float MilliSeconds = 0.0f; assert(isStarted()); - - // hipEventSynchronize waits till the event is ready for call to - // hipEventElapsedTime. - UR_CHECK_ERROR(hipEventSynchronize(EvStart)); - UR_CHECK_ERROR(hipEventSynchronize(EvEnd)); - - UR_CHECK_ERROR(hipEventElapsedTime(&MilliSeconds, EvStart, EvEnd)); - return static_cast(MilliSeconds * 1.0e6); + return Queue->getDevice()->getElapsedTime(EvQueued); } uint64_t ur_event_handle_t_::getStartTime() const { - float MiliSeconds = 0.0f; assert(isStarted()); - - // hipEventSynchronize waits till the event is ready for call to - // hipEventElapsedTime. 
- UR_CHECK_ERROR(hipEventSynchronize(ur_platform_handle_t_::EvBase)); - UR_CHECK_ERROR(hipEventSynchronize(EvStart)); - - UR_CHECK_ERROR(hipEventElapsedTime(&MiliSeconds, - ur_platform_handle_t_::EvBase, EvStart)); - return static_cast(MiliSeconds * 1.0e6); + return Queue->getDevice()->getElapsedTime(EvStart); } uint64_t ur_event_handle_t_::getEndTime() const { - float MiliSeconds = 0.0f; assert(isStarted() && isRecorded()); - - // hipEventSynchronize waits till the event is ready for call to - // hipEventElapsedTime. - UR_CHECK_ERROR(hipEventSynchronize(ur_platform_handle_t_::EvBase)); - UR_CHECK_ERROR(hipEventSynchronize(EvEnd)); - - UR_CHECK_ERROR( - hipEventElapsedTime(&MiliSeconds, ur_platform_handle_t_::EvBase, EvEnd)); - return static_cast(MiliSeconds * 1.0e6); + return Queue->getDevice()->getElapsedTime(EvEnd); } ur_result_t ur_event_handle_t_::record() { @@ -327,8 +291,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( ur_event_handle_t *phEvent) { std::ignore = pProperties; - *phEvent = ur_event_handle_t_::makeWithNative( - hContext, reinterpret_cast(hNativeEvent)); + std::unique_ptr EventPtr{nullptr}; + + try { + EventPtr = + std::unique_ptr(ur_event_handle_t_::makeWithNative( + hContext, reinterpret_cast(hNativeEvent))); + } catch (const std::bad_alloc &) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } catch (...) 
{ + return UR_RESULT_ERROR_UNKNOWN; + } + + *phEvent = EventPtr.release(); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/hip/event.hpp b/source/adapters/hip/event.hpp index 5e7c1d7e7d..af333c8613 100644 --- a/source/adapters/hip/event.hpp +++ b/source/adapters/hip/event.hpp @@ -80,8 +80,23 @@ struct ur_event_handle_t_ { static ur_event_handle_t makeNative(ur_command_t Type, ur_queue_handle_t Queue, hipStream_t Stream, uint32_t StreamToken = std::numeric_limits::max()) { - return new ur_event_handle_t_(Type, Queue->getContext(), Queue, Stream, - StreamToken); + const bool RequiresTimings = + Queue->URFlags & UR_QUEUE_FLAG_PROFILING_ENABLE || + Type == UR_COMMAND_TIMESTAMP_RECORDING_EXP; + if (RequiresTimings) { + Queue->createHostSubmitTimeStream(); + } + native_type EvEnd{nullptr}, EvQueued{nullptr}, EvStart{nullptr}; + UR_CHECK_ERROR(hipEventCreateWithFlags( + &EvEnd, RequiresTimings ? hipEventDefault : hipEventDisableTiming)); + + if (RequiresTimings) { + UR_CHECK_ERROR(hipEventCreateWithFlags(&EvQueued, hipEventDefault)); + UR_CHECK_ERROR(hipEventCreateWithFlags(&EvStart, hipEventDefault)); + } + + return new ur_event_handle_t_(Type, Queue->getContext(), Queue, EvEnd, + EvQueued, EvStart, Stream, StreamToken); } static ur_event_handle_t makeWithNative(ur_context_handle_t context, @@ -97,8 +112,9 @@ struct ur_event_handle_t_ { // This constructor is private to force programmers to use the makeNative / // make_user static members in order to create a ur_event_handle_t for HIP. 
ur_event_handle_t_(ur_command_t Type, ur_context_handle_t Context, - ur_queue_handle_t Queue, hipStream_t Stream, - uint32_t StreamToken); + ur_queue_handle_t Queue, native_type EvEnd, + native_type EvQueued, native_type EvStart, + hipStream_t Stream, uint32_t StreamToken); // This constructor is private to force programmers to use the // makeWithNative for event interop diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp index f35d3957bc..b433c06852 100644 --- a/source/adapters/hip/kernel.cpp +++ b/source/adapters/hip/kernel.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "kernel.hpp" +#include "enqueue.hpp" #include "memory.hpp" #include "sampler.hpp" @@ -349,3 +350,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants( [[maybe_unused]] const ur_specialization_constant_info_t *pSpecConstants) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + [[maybe_unused]] ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, + uint32_t workDim, [[maybe_unused]] const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize) { + UR_ASSERT(hQueue->getContext() == hKernel->getContext(), + UR_RESULT_ERROR_INVALID_QUEUE); + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(pSuggestedLocalWorkSize != nullptr, + UR_RESULT_ERROR_INVALID_NULL_POINTER); + + size_t MaxThreadsPerBlock[3]; + size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; + + MaxThreadsPerBlock[0] = hQueue->Device->getMaxBlockDimX(); + MaxThreadsPerBlock[1] = hQueue->Device->getMaxBlockDimY(); + MaxThreadsPerBlock[2] = hQueue->Device->getMaxBlockDimZ(); + + ur_device_handle_t Device = hQueue->getDevice(); + ScopedContext Active(Device); + + guessLocalWorkSize(Device, ThreadsPerBlock, pGlobalWorkSize, workDim, + 
MaxThreadsPerBlock); + std::copy(ThreadsPerBlock, ThreadsPerBlock + workDim, + pSuggestedLocalWorkSize); + return UR_RESULT_SUCCESS; +} diff --git a/source/adapters/hip/memory.cpp b/source/adapters/hip/memory.cpp index ff209884ce..eb91f1620a 100644 --- a/source/adapters/hip/memory.cpp +++ b/source/adapters/hip/memory.cpp @@ -10,6 +10,7 @@ #include "memory.hpp" #include "context.hpp" +#include "enqueue.hpp" #include #include @@ -32,6 +33,28 @@ size_t imageElementByteSize(hipArray_Format ArrayFormat) { return 0; } +ur_result_t +checkSupportedImageChannelType(ur_image_channel_type_t ImageChannelType) { + switch (ImageChannelType) { + case UR_IMAGE_CHANNEL_TYPE_SNORM_INT8: + case UR_IMAGE_CHANNEL_TYPE_SNORM_INT16: + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8: + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8: + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16: + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16: + case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT: + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32: + case UR_IMAGE_CHANNEL_TYPE_FLOAT: + return UR_RESULT_SUCCESS; + default: + return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT; + } + return UR_RESULT_SUCCESS; +} + /// Decreases the reference count of the Mem object. 
/// If this is zero, calls the relevant HIP Free function /// \return UR_RESULT_SUCCESS unless deallocation error @@ -339,7 +362,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( UR_ASSERT(pImageDesc->stype == UR_STRUCTURE_TYPE_IMAGE_DESC, UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); - UR_ASSERT(pImageDesc->type <= UR_MEM_TYPE_IMAGE1D_BUFFER, + UR_ASSERT(pImageDesc->type <= UR_MEM_TYPE_IMAGE1D_ARRAY, UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); UR_ASSERT(pImageDesc->numMipLevel == 0, UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); @@ -355,7 +378,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( // We only support RBGA channel order // TODO: check SYCL CTS and spec. May also have to support BGRA UR_ASSERT(pImageFormat->channelOrder == UR_IMAGE_CHANNEL_ORDER_RGBA, - UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION); + UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT); + + UR_CHECK_ERROR(checkSupportedImageChannelType(pImageFormat->channelType)); auto URMemObj = std::unique_ptr( new ur_mem_handle_t_{hContext, flags, *pImageFormat, *pImageDesc, pHost}); @@ -366,7 +391,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( if (PerformInitialCopy) { for (const auto &Dev : hContext->getDevices()) { - UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(URMemObj.get(), Dev)); + ScopedContext Active(Dev); + hipStream_t Stream{0}; // Use default stream + UR_CHECK_ERROR( + enqueueMigrateMemoryToDeviceIfNeeded(URMemObj.get(), Dev, Stream)); + UR_CHECK_ERROR(hipStreamSynchronize(Stream)); } } *phMem = URMemObj.release(); @@ -455,11 +484,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, const ur_device_handle_t hDevice) { ScopedContext Active(hDevice); + auto DeviceIdx = Mem->getContext()->getDeviceIndex(hDevice); ur_lock LockGuard(Mem->MemoryAllocationMutex); if (Mem->isBuffer()) { auto &Buffer = std::get(Mem->Mem); - hipDeviceptr_t &DevPtr = Buffer.Ptrs[hDevice->getIndex()]; + 
hipDeviceptr_t &DevPtr = Buffer.Ptrs[DeviceIdx]; // Allocation has already been made if (DevPtr != BufferMem::native_type{0}) { @@ -482,12 +512,12 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, try { auto &Image = std::get(Mem->Mem); // Allocation has already been made - if (Image.Arrays[hDevice->getIndex()]) { + if (Image.Arrays[DeviceIdx]) { return UR_RESULT_SUCCESS; } UR_CHECK_ERROR(hipArray3DCreate( reinterpret_cast(&ImageArray), &Image.ArrayDesc)); - Image.Arrays[hDevice->getIndex()] = ImageArray; + Image.Arrays[DeviceIdx] = ImageArray; // HIP_RESOURCE_DESC is a union of different structs, shown here // We need to fill it as described here to use it for a surface or texture // HIP_RESOURCE_DESC::resType must be HIP_RESOURCE_TYPE_ARRAY and @@ -499,7 +529,7 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, ImageResDesc.resType = hipResourceTypeArray; UR_CHECK_ERROR(hipCreateSurfaceObject(&Surface, &ImageResDesc)); - Image.SurfObjs[hDevice->getIndex()] = Surface; + Image.SurfObjs[DeviceIdx] = Surface; } catch (ur_result_t Err) { if (ImageArray) { UR_CHECK_ERROR(hipFreeArray(ImageArray)); @@ -516,27 +546,28 @@ ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, } namespace { -inline ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem, - ur_device_handle_t hDevice) { +inline ur_result_t enqueueMigrateBufferToDevice(ur_mem_handle_t Mem, + ur_device_handle_t hDevice, + hipStream_t Stream) { auto &Buffer = std::get(Mem->Mem); - if (Mem->LastEventWritingToMemObj == nullptr) { + if (Mem->LastQueueWritingToMemObj == nullptr) { // Device allocation being initialized from host for the first time if (Buffer.HostPtr) { - UR_CHECK_ERROR( - hipMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size)); + UR_CHECK_ERROR(hipMemcpyHtoDAsync(Buffer.getPtr(hDevice), Buffer.HostPtr, + Buffer.Size, Stream)); } - } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() != - hDevice) { - UR_CHECK_ERROR(hipMemcpyDtoD( + } 
else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) { + UR_CHECK_ERROR(hipMemcpyDtoDAsync( Buffer.getPtr(hDevice), - Buffer.getPtr(Mem->LastEventWritingToMemObj->getQueue()->getDevice()), - Buffer.Size)); + Buffer.getPtr(Mem->LastQueueWritingToMemObj->getDevice()), Buffer.Size, + Stream)); } return UR_RESULT_SUCCESS; } -inline ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, - ur_device_handle_t hDevice) { +inline ur_result_t enqueueMigrateImageToDevice(ur_mem_handle_t Mem, + ur_device_handle_t hDevice, + hipStream_t Stream) { auto &Image = std::get(Mem->Mem); // When a dimension isn't used image_desc has the size set to 1 size_t PixelSizeBytes = Image.PixelTypeSizeBytes * @@ -567,36 +598,40 @@ inline ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, CpyDesc3D.Depth = Image.ImageDesc.depth; } - if (Mem->LastEventWritingToMemObj == nullptr) { + if (Mem->LastQueueWritingToMemObj == nullptr) { if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { + UR_CHECK_ERROR(hipStreamSynchronize(Stream)); UR_CHECK_ERROR( hipMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { CpyDesc2D.srcHost = Image.HostPtr; - UR_CHECK_ERROR(hipMemcpyParam2D(&CpyDesc2D)); + UR_CHECK_ERROR(hipMemcpyParam2DAsync(&CpyDesc2D, Stream)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { CpyDesc3D.srcHost = Image.HostPtr; + CpyDesc3D.srcMemoryType = hipMemoryTypeHost; UR_CHECK_ERROR(hipDrvMemcpy3D(&CpyDesc3D)); } - } else if (Mem->LastEventWritingToMemObj->getQueue()->getDevice() != - hDevice) { + } else if (Mem->LastQueueWritingToMemObj->getDevice() != hDevice) { if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { + // Blocking wait needed + UR_CHECK_ERROR(urQueueFinish(Mem->LastQueueWritingToMemObj)); // FIXME: 1D memcpy from DtoD going through the host. 
UR_CHECK_ERROR(hipMemcpyAtoH( Image.HostPtr, - Image.getArray( - Mem->LastEventWritingToMemObj->getQueue()->getDevice()), + Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()), 0 /*srcOffset*/, ImageSizeBytes)); UR_CHECK_ERROR( hipMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { - CpyDesc2D.srcArray = Image.getArray( - Mem->LastEventWritingToMemObj->getQueue()->getDevice()); - UR_CHECK_ERROR(hipMemcpyParam2D(&CpyDesc2D)); + CpyDesc2D.srcMemoryType = hipMemoryTypeDevice; + CpyDesc2D.srcArray = + Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()); + UR_CHECK_ERROR(hipMemcpyParam2DAsync(&CpyDesc2D, Stream)); } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { - CpyDesc3D.srcArray = Image.getArray( - Mem->LastEventWritingToMemObj->getQueue()->getDevice()); - UR_CHECK_ERROR(hipDrvMemcpy3D(&CpyDesc3D)); + CpyDesc3D.srcMemoryType = hipMemoryTypeDevice; + CpyDesc3D.srcArray = + Image.getArray(Mem->LastQueueWritingToMemObj->getDevice()); + UR_CHECK_ERROR(hipDrvMemcpy3DAsync(&CpyDesc3D, Stream)); } } return UR_RESULT_SUCCESS; @@ -605,22 +640,50 @@ inline ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, // If calling this entry point it is necessary to lock the memoryMigrationMutex // beforehand -ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem, - const ur_device_handle_t hDevice) { +ur_result_t enqueueMigrateMemoryToDeviceIfNeeded( + ur_mem_handle_t Mem, const ur_device_handle_t hDevice, hipStream_t Stream) { UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + auto DeviceIdx = Mem->getContext()->getDeviceIndex(hDevice); // Device allocation has already been initialized with most up to date // data in buffer - if (Mem->HaveMigratedToDeviceSinceLastWrite[hDevice->getIndex()]) { + if (Mem->HaveMigratedToDeviceSinceLastWrite[DeviceIdx]) return UR_RESULT_SUCCESS; - } ScopedContext Active(hDevice); if (Mem->isBuffer()) { - UR_CHECK_ERROR(migrateBufferToDevice(Mem, 
hDevice)); + UR_CHECK_ERROR(enqueueMigrateBufferToDevice(Mem, hDevice, Stream)); } else { - UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice)); + UR_CHECK_ERROR(enqueueMigrateImageToDevice(Mem, hDevice, Stream)); } - Mem->HaveMigratedToDeviceSinceLastWrite[hDevice->getIndex()] = true; + Mem->HaveMigratedToDeviceSinceLastWrite[DeviceIdx] = true; return UR_RESULT_SUCCESS; } + +BufferMem::native_type +BufferMem::getPtrWithOffset(const ur_device_handle_t Device, size_t Offset) { + if (ur_result_t Err = allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device); + Err != UR_RESULT_SUCCESS) { + throw Err; + } + return reinterpret_cast( + reinterpret_cast( + Ptrs[OuterMemStruct->getContext()->getDeviceIndex(Device)]) + + Offset); +} + +hipArray *SurfaceMem::getArray(const ur_device_handle_t Device) { + if (ur_result_t Err = allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device); + Err != UR_RESULT_SUCCESS) { + throw Err; + } + return Arrays[OuterMemStruct->getContext()->getDeviceIndex(Device)]; +} + +hipSurfaceObject_t SurfaceMem::getSurface(const ur_device_handle_t Device) { + if (ur_result_t Err = allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device); + Err != UR_RESULT_SUCCESS) { + throw Err; + } + return SurfObjs[OuterMemStruct->getContext()->getDeviceIndex(Device)]; +} diff --git a/source/adapters/hip/memory.hpp b/source/adapters/hip/memory.hpp index 5d2aa6f9a5..425c2e7f53 100644 --- a/source/adapters/hip/memory.hpp +++ b/source/adapters/hip/memory.hpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #pragma once +#include "common.hpp" #include "context.hpp" #include "event.hpp" #include @@ -16,12 +17,11 @@ #include #include -#include "common.hpp" - ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t, const ur_device_handle_t); -ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t, - const ur_device_handle_t); +ur_result_t enqueueMigrateMemoryToDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t, + 
hipStream_t); // Handler for plain, pointer-based HIP allocations struct BufferMem { @@ -95,15 +95,7 @@ struct BufferMem { // This will allocate memory on device with index Index if there isn't already // an active allocation on the device - native_type getPtrWithOffset(const ur_device_handle_t Device, size_t Offset) { - if (ur_result_t Err = - allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device); - Err != UR_RESULT_SUCCESS) { - throw Err; - } - return reinterpret_cast( - reinterpret_cast(Ptrs[Device->getIndex()]) + Offset); - } + native_type getPtrWithOffset(const ur_device_handle_t Device, size_t Offset); // This will allocate memory on device if there isn't already an active // allocation on the device @@ -224,6 +216,7 @@ struct SurfaceMem { ArrayDesc.Format = HIP_AD_FORMAT_UNSIGNED_INT8; PixelTypeSizeBytes = 1; break; + case UR_IMAGE_CHANNEL_TYPE_SNORM_INT8: case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8: ArrayDesc.Format = HIP_AD_FORMAT_SIGNED_INT8; PixelTypeSizeBytes = 1; @@ -233,6 +226,7 @@ struct SurfaceMem { ArrayDesc.Format = HIP_AD_FORMAT_UNSIGNED_INT16; PixelTypeSizeBytes = 2; break; + case UR_IMAGE_CHANNEL_TYPE_SNORM_INT16: case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16: ArrayDesc.Format = HIP_AD_FORMAT_SIGNED_INT16; PixelTypeSizeBytes = 2; @@ -260,24 +254,10 @@ struct SurfaceMem { } // Will allocate a new array on device if not already allocated - hipArray *getArray(const ur_device_handle_t Device) { - if (ur_result_t Err = - allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device); - Err != UR_RESULT_SUCCESS) { - throw Err; - } - return Arrays[Device->getIndex()]; - } + hipArray *getArray(const ur_device_handle_t Device); // Will allocate a new surface on device if not already allocated - hipSurfaceObject_t getSurface(const ur_device_handle_t Device) { - if (ur_result_t Err = - allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device); - Err != UR_RESULT_SUCCESS) { - throw Err; - } - return SurfObjs[Device->getIndex()]; - } + hipSurfaceObject_t getSurface(const 
ur_device_handle_t Device); ur_mem_type_t getImageType() const noexcept { return ImageDesc.type; } @@ -306,7 +286,7 @@ struct SurfaceMem { /// /// The ur_mem_handle_t is responsible for memory allocation and migration /// across devices in the same ur_context_handle_t. If a kernel writes to a -/// ur_mem_handle_t then it will write to LastEventWritingToMemObj. Then all +/// ur_mem_handle_t then it will write to LastQueueWritingToMemObj. Then all /// subsequent operations that want to read from the ur_mem_handle_t must wait /// on the event referring to the last write. /// @@ -325,61 +305,7 @@ struct SurfaceMem { /// 2. urEnqueueMem(Buffer|Image)Read(Rect) /// /// Migrations will occur in both cases if the most recent version of data -/// is on a different device, marked by LastEventWritingToMemObj->getDevice(). -/// -/// Example trace: -/// ~~~~~~~~~~~~~~ -/// -/// =====> urContextCreate([device0, device1], ...) // associated with [q0, q1] -/// -> OUT: hContext -/// -/// =====> urMemBufferCreate(hContext,...); -/// -> No native allocations made -/// -> OUT: hBuffer -/// -/// =====> urEnqueueMemBufferWrite(q0, hBuffer,...); -/// -> Allocation made on q0 ie device0 -/// -> New allocation initialized with host data. -/// -/// =====> urKernelSetArgMemObj(hKernel0, hBuffer, ...); -/// -> ur_kernel_handle_t associated with a ur_program_handle_t, -/// which is in turn unique to a device. So we can set the kernel -/// arg with the ptr of the device specific allocation. -/// -> hKernel0->getProgram()->getDevice() == device0 -/// -> allocateMemObjOnDeviceIfNeeded(device0); -/// -> Native allocation already made on device0, continue. -/// -/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...); -/// -> Suppose that hKernel0 writes to hBuffer. 
-/// -> Call hBuffer->setLastEventWritingToMemObj with return event -/// from this operation -/// -> Enqueue native kernel launch -/// -/// =====> urKernelSetArgMemObj(hKernel1, hBuffer, ...); -/// -> hKernel1->getProgram()->getDevice() == device1 -/// -> New allocation will be made on device1 when calling -/// getPtr(device1) -/// -> No native allocation on device1 -/// -> Make native allocation on device1 -/// -/// =====> urEnqueueKernelLaunch(q1, hKernel1, ...); -/// -> Suppose hKernel1 wants to read from hBuffer and not write. -/// -> migrateMemoryToDeviceIfNeeded(device1); -/// -> hBuffer->LastEventWritingToMemObj is not nullptr -/// -> Check if memory has been migrated to device1 since the -/// last write -/// -> Hasn't been migrated -/// -> Wait on LastEventWritingToMemObj. -/// -> Migrate memory from device0's native allocation to -/// device1's native allocation. -/// -> Enqueue native kernel launch -/// -/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...); -/// -> migrateMemoryToDeviceIfNeeded(device0); -/// -> hBuffer->LastEventWritingToMemObj refers to an event -/// from q0 -/// -> Migration not necessary -/// -> Enqueue native kernel launch +/// is on a different device, marked by LastQueueWritingToMemObj->getDevice(). /// struct ur_mem_handle_t_ { @@ -403,15 +329,13 @@ struct ur_mem_handle_t_ { // Has the memory been migrated to a device since the last write? std::vector HaveMigratedToDeviceSinceLastWrite; - // We should wait on this event prior to migrating memory across allocations - // in this ur_mem_handle_t_ - ur_event_handle_t LastEventWritingToMemObj{nullptr}; + // Queue with most up to date data of ur_mem_handle_t_ + ur_queue_handle_t LastQueueWritingToMemObj{nullptr}; // Enumerates all possible types of accesses. 
enum access_mode_t { unknown, read_write, read_only, write_only }; ur_mutex MemoryAllocationMutex; // A mutex for allocations - ur_mutex MemoryMigrationMutex; // A mutex for memory transfers /// A UR Memory object represents either plain memory allocations ("Buffers" /// in OpenCL) or typed allocations ("Images" in OpenCL). @@ -500,18 +424,18 @@ struct ur_mem_handle_t_ { uint32_t getReferenceCount() const noexcept { return RefCount; } - void setLastEventWritingToMemObj(ur_event_handle_t NewEvent) { - assert(NewEvent && "Invalid event!"); - // This entry point should only ever be called when using multi device ctx - assert(Context->Devices.size() > 1); - if (LastEventWritingToMemObj != nullptr) { - urEventRelease(LastEventWritingToMemObj); + void setLastQueueWritingToMemObj(ur_queue_handle_t WritingQueue) { + if (LastQueueWritingToMemObj != nullptr) { + urQueueRelease(LastQueueWritingToMemObj); } - urEventRetain(NewEvent); - LastEventWritingToMemObj = NewEvent; + urQueueRetain(WritingQueue); + LastQueueWritingToMemObj = WritingQueue; for (const auto &Device : Context->getDevices()) { - HaveMigratedToDeviceSinceLastWrite[Device->getIndex()] = - Device == NewEvent->getQueue()->getDevice(); + HaveMigratedToDeviceSinceLastWrite[Context->getDeviceIndex(Device)] = + Device == WritingQueue->getDevice(); } } }; + +ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t); diff --git a/source/adapters/hip/platform.cpp b/source/adapters/hip/platform.cpp index 287f941c30..8671d70a57 100644 --- a/source/adapters/hip/platform.cpp +++ b/source/adapters/hip/platform.cpp @@ -11,8 +11,6 @@ #include "platform.hpp" #include "context.hpp" -hipEvent_t ur_platform_handle_t_::EvBase{nullptr}; - UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetInfo(ur_platform_handle_t, ur_platform_info_t propName, size_t propSize, void *pPropValue, size_t *pSizeRet) { @@ -81,18 +79,15 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries, 
UR_CHECK_ERROR(hipDeviceGet(&Device, i)); hipCtx_t Context; UR_CHECK_ERROR(hipDevicePrimaryCtxRetain(&Context, Device)); - Platform.Devices.emplace_back( - new ur_device_handle_t_{Device, Context, &Platform, i}); - } - - // Setup EvBase - { - ScopedContext Active(Platform.Devices.front().get()); hipEvent_t EvBase; UR_CHECK_ERROR(hipEventCreate(&EvBase)); + + // Use the default stream to record base event counter UR_CHECK_ERROR(hipEventRecord(EvBase, 0)); + Platform.Devices.emplace_back(new ur_device_handle_t_{ + Device, Context, EvBase, &Platform, i}); - ur_platform_handle_t_::EvBase = EvBase; + ScopedContext Active(Platform.Devices.front().get()); } } catch (const std::bad_alloc &) { // Signal out-of-memory situation @@ -140,12 +135,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( - ur_native_handle_t hNativePlatform, - const ur_platform_native_properties_t *pProperties, - ur_platform_handle_t *phPlatform) { - std::ignore = hNativePlatform; - std::ignore = pProperties; - std::ignore = phPlatform; + ur_native_handle_t, ur_adapter_handle_t, + const ur_platform_native_properties_t *, ur_platform_handle_t *) { + // There is no HIP equivalent to ur_platform_handle_t return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/source/adapters/hip/platform.hpp b/source/adapters/hip/platform.hpp index f925692ec8..7b96de6473 100644 --- a/source/adapters/hip/platform.hpp +++ b/source/adapters/hip/platform.hpp @@ -20,6 +20,5 @@ /// when devices are used. 
/// struct ur_platform_handle_t_ { - static hipEvent_t EvBase; // HIP event used as base counter std::vector> Devices; }; diff --git a/source/adapters/hip/program.cpp b/source/adapters/hip/program.cpp index 8e3653ee02..f7cd6eebf1 100644 --- a/source/adapters/hip/program.cpp +++ b/source/adapters/hip/program.cpp @@ -283,7 +283,11 @@ urProgramCreateWithIL(ur_context_handle_t, const void *, size_t, UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram, const char *pOptions) { - return urProgramBuild(hContext, hProgram, pOptions); + UR_CHECK_ERROR(urProgramBuild(hContext, hProgram, pOptions)); + // urProgramBuild sets the BinaryType to UR_PROGRAM_BINARY_TYPE_EXECUTABLE, so + // set it to the correct value for urProgramCompile post-hoc. + hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urProgramCompileExp(ur_program_handle_t, @@ -312,6 +316,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t, ScopedContext Active(hProgram->getDevice()); hProgram->buildProgram(pOptions); + hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE; } catch (ur_result_t Err) { Result = Err; @@ -355,13 +360,14 @@ urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t, UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); switch (propName) { - case UR_PROGRAM_BUILD_INFO_STATUS: { + case UR_PROGRAM_BUILD_INFO_STATUS: return ReturnValue(hProgram->BuildStatus); - } case UR_PROGRAM_BUILD_INFO_OPTIONS: return ReturnValue(hProgram->BuildOptions.c_str()); case UR_PROGRAM_BUILD_INFO_LOG: return ReturnValue(hProgram->InfoLog, hProgram->MAX_LOG_SIZE); + case UR_PROGRAM_BUILD_INFO_BINARY_TYPE: + return ReturnValue(hProgram->BinaryType); default: break; } @@ -494,6 +500,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( UR_ASSERT(Result == UR_RESULT_SUCCESS, Result); *phProgram = RetProgram.release(); 
+ (*phProgram)->BinaryType = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT; return Result; } diff --git a/source/adapters/hip/program.hpp b/source/adapters/hip/program.hpp index cc5da40af5..25a70e6a7a 100644 --- a/source/adapters/hip/program.hpp +++ b/source/adapters/hip/program.hpp @@ -27,6 +27,12 @@ struct ur_program_handle_t_ { ur_device_handle_t Device; std::string ExecutableCache; + // The ur_program_binary_type_t property is defined individually for every + // device in a program. However, since the HIP adapter only has 1 device per + // program, there is no need to keep track of its value for each + // device. + ur_program_binary_type_t BinaryType = UR_PROGRAM_BINARY_TYPE_NONE; + // Metadata bool IsRelocatable = false; diff --git a/source/adapters/hip/queue.cpp b/source/adapters/hip/queue.cpp index 6e6496fec1..4f7b4060cb 100644 --- a/source/adapters/hip/queue.cpp +++ b/source/adapters/hip/queue.cpp @@ -117,12 +117,17 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, try { std::unique_ptr QueueImpl{nullptr}; - unsigned int Flags = 0; + unsigned int Flags = hipStreamNonBlocking; ur_queue_flags_t URFlags = 0; int Priority = 0; // Not guaranteed, but, in ROCm 5.0-6.0, 0 is the default - if (pProps && pProps->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) { URFlags = pProps->flags; + if (URFlags == UR_QUEUE_FLAG_USE_DEFAULT_STREAM) { + Flags = hipStreamDefault; + } else if (URFlags == UR_QUEUE_FLAG_SYNC_WITH_DEFAULT_STREAM) { + Flags = 0; + } + if (URFlags & UR_QUEUE_FLAG_PRIORITY_HIGH) { ScopedContext Active(hDevice); UR_CHECK_ERROR(hipDeviceGetStreamPriorityRange(nullptr, &Priority)); @@ -143,7 +148,7 @@ urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice, QueueImpl = std::unique_ptr(new ur_queue_handle_t_{ std::move(ComputeHipStreams), std::move(TransferHipStreams), hContext, - hDevice, Flags, pProps ? 
pProps->flags : 0, Priority}); + hDevice, Flags, URFlags, Priority}); *phQueue = QueueImpl.release(); @@ -186,10 +191,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, }); return ReturnValue(IsReady); } + case UR_QUEUE_INFO_DEVICE_DEFAULT: + case UR_QUEUE_INFO_SIZE: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; default: - break; + return UR_RESULT_ERROR_INVALID_ENUMERATION; } - return {}; } UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { @@ -217,6 +224,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { UR_CHECK_ERROR(hipStreamDestroy(S)); }); + if (hQueue->getHostSubmitTimeStream() != hipStream_t{0}) { + UR_CHECK_ERROR(hipStreamSynchronize(hQueue->getHostSubmitTimeStream())); + UR_CHECK_ERROR(hipStreamDestroy(hQueue->getHostSubmitTimeStream())); + } + return UR_RESULT_SUCCESS; } catch (ur_result_t Err) { return Err; diff --git a/source/adapters/hip/queue.hpp b/source/adapters/hip/queue.hpp index ad2f0f016e..cfabd29bf7 100644 --- a/source/adapters/hip/queue.hpp +++ b/source/adapters/hip/queue.hpp @@ -10,6 +10,9 @@ #pragma once #include "common.hpp" +#include +#include +#include using ur_stream_quard = std::unique_lock; @@ -22,6 +25,10 @@ struct ur_queue_handle_t_ { std::vector ComputeStreams; std::vector TransferStreams; + // Stream used for recording EvQueue, which holds information about when the + // command in question is enqueued on host, as opposed to started. It is + // created only if profiling is enabled - either for queue or per event. + native_type HostSubmitTimeStream{0}; // DelayCompute keeps track of which streams have been recently reused and // their next use should be delayed. If a stream has been recently reused it // will be skipped the next time it would be selected round-robin style. 
When @@ -95,6 +102,17 @@ struct ur_queue_handle_t_ { native_type getNextTransferStream(); native_type get() { return getNextComputeStream(); }; + // Function which creates the profiling stream. Called only from makeNative + // event when profiling is required. + void createHostSubmitTimeStream() { + static std::once_flag HostSubmitTimeStreamFlag; + std::call_once(HostSubmitTimeStreamFlag, [&]() { + UR_CHECK_ERROR(hipStreamCreateWithFlags(&HostSubmitTimeStream, + hipStreamNonBlocking)); + }); + } + native_type getHostSubmitTimeStream() { return HostSubmitTimeStream; } + bool hasBeenSynchronized(uint32_t StreamToken) { // stream token not associated with one of the compute streams if (StreamToken == std::numeric_limits::max()) { diff --git a/source/adapters/hip/ur_interface_loader.cpp b/source/adapters/hip/ur_interface_loader.cpp index 71979b75b1..7a28623e0b 100644 --- a/source/adapters/hip/ur_interface_loader.cpp +++ b/source/adapters/hip/ur_interface_loader.cpp @@ -125,6 +125,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetArgValue = urKernelSetArgValue; pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; + pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/hip/usm.cpp b/source/adapters/hip/usm.cpp index 7c4f43c4ac..275125b2ac 100644 --- a/source/adapters/hip/usm.cpp +++ b/source/adapters/hip/usm.cpp @@ -161,41 +161,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, ur_usm_alloc_info_t propName, size_t propValueSize, void *pPropValue, size_t *pPropValueSizeRet) { - ur_result_t Result = UR_RESULT_SUCCESS; - hipPointerAttribute_t hipPointerAttributeType; - UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet); try { switch (propName) { case UR_USM_ALLOC_INFO_TYPE: { - // do not throw if 
hipPointerGetAttribute returns hipErrorInvalidValue - hipError_t Ret = hipPointerGetAttributes(&hipPointerAttributeType, pMem); - if (Ret == hipErrorInvalidValue) { - // pointer not known to the HIP subsystem - return ReturnValue(UR_USM_TYPE_UNKNOWN); - } - // Direct usage of the function, instead of UR_CHECK_ERROR, so we can - // get the line offset. - checkErrorUR(Ret, __func__, __LINE__ - 5, __FILE__); - // ROCm 6.0.0 introduces hipMemoryTypeUnregistered in the hipMemoryType - // enum to mark unregistered allocations (i.e., via system allocators). -#if HIP_VERSION_MAJOR >= 6 - if (hipPointerAttributeType.type == hipMemoryTypeUnregistered) { + auto MaybePointerAttrs = getPointerAttributes(pMem); + if (!MaybePointerAttrs.has_value()) { // pointer not known to the HIP subsystem return ReturnValue(UR_USM_TYPE_UNKNOWN); } -#endif - unsigned int Value; -#if HIP_VERSION >= 50600000 - Value = hipPointerAttributeType.type; -#else - Value = hipPointerAttributeType.memoryType; -#endif + auto Value = getMemoryType(*MaybePointerAttrs); UR_ASSERT(Value == hipMemoryTypeDevice || Value == hipMemoryTypeHost || Value == hipMemoryTypeManaged, UR_RESULT_ERROR_INVALID_MEM_OBJECT); - if (hipPointerAttributeType.isManaged || Value == hipMemoryTypeManaged) { + if (MaybePointerAttrs->isManaged || Value == hipMemoryTypeManaged) { // pointer to managed memory return ReturnValue(UR_USM_TYPE_SHARED); } @@ -211,21 +191,21 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, ur::unreachable(); } case UR_USM_ALLOC_INFO_DEVICE: { - // get device index associated with this pointer - UR_CHECK_ERROR(hipPointerGetAttributes(&hipPointerAttributeType, pMem)); + auto MaybePointerAttrs = getPointerAttributes(pMem); + if (!MaybePointerAttrs.has_value()) { + // pointer not known to the HIP subsystem + return ReturnValue(UR_USM_TYPE_UNKNOWN); + } - int DeviceIdx = hipPointerAttributeType.device; + int DeviceIdx = MaybePointerAttrs->device; - // currently each device is in its own 
platform, so find the platform at - // the same index - std::vector Platforms; - Platforms.resize(DeviceIdx + 1); + // hip backend has only one platform containing all devices + ur_platform_handle_t platform; ur_adapter_handle_t AdapterHandle = &adapter; - Result = urPlatformGet(&AdapterHandle, 1, DeviceIdx + 1, Platforms.data(), - nullptr); + UR_CHECK_ERROR(urPlatformGet(&AdapterHandle, 1, 1, &platform, nullptr)); // get the device from the platform - ur_device_handle_t Device = Platforms[DeviceIdx]->Devices[0].get(); + ur_device_handle_t Device = platform->Devices[DeviceIdx].get(); return ReturnValue(Device); } case UR_USM_ALLOC_INFO_POOL: { @@ -240,15 +220,31 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem, return ReturnValue(Pool); } case UR_USM_ALLOC_INFO_BASE_PTR: - case UR_USM_ALLOC_INFO_SIZE: - return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + // HIP gives us the ability to query the base pointer for a device + // pointer, so check whether we've got one of those. + if (auto MaybePointerAttrs = getPointerAttributes(pMem)) { + if (getMemoryType(*MaybePointerAttrs) == hipMemoryTypeDevice) { + void *Base = nullptr; + UR_CHECK_ERROR(hipPointerGetAttribute( + &Base, HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR, + (hipDeviceptr_t)pMem)); + return ReturnValue(Base); + } + } + // If not, we can't be sure. 
+ return UR_RESULT_ERROR_INVALID_VALUE; + case UR_USM_ALLOC_INFO_SIZE: { + size_t RangeSize = 0; + UR_CHECK_ERROR(hipMemPtrGetInfo(const_cast(pMem), &RangeSize)); + return ReturnValue(RangeSize); + } default: return UR_RESULT_ERROR_INVALID_ENUMERATION; } } catch (ur_result_t Error) { - Result = Error; + return Error; } - return Result; + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urUSMImportExp(ur_context_handle_t Context, diff --git a/source/adapters/level_zero/CMakeLists.txt b/source/adapters/level_zero/CMakeLists.txt index d26d0aeb26..41f8ce8d12 100644 --- a/source/adapters/level_zero/CMakeLists.txt +++ b/source/adapters/level_zero/CMakeLists.txt @@ -5,22 +5,27 @@ set(TARGET_NAME ur_adapter_level_zero) +set(UR_LEVEL_ZERO_LOADER_LIBRARY "" CACHE FILEPATH "Path of the Level Zero Loader library") +set(UR_LEVEL_ZERO_INCLUDE_DIR "" CACHE FILEPATH "Directory containing the Level Zero Headers") +set(UR_LEVEL_ZERO_LOADER_REPO "" CACHE STRING "Github repo to get the Level Zero loader sources from") +set(UR_LEVEL_ZERO_LOADER_TAG "" CACHE STRING " GIT tag of the Level Loader taken from github repo") -# Copy L0 loader/headers locally to the build to avoid leaking their path. +# Copy Level Zero loader/headers locally to the build to avoid leaking their path. 
set(LEVEL_ZERO_COPY_DIR ${CMAKE_CURRENT_BINARY_DIR}/level_zero_loader) -if (DEFINED L0_LIBRARY) - get_filename_component(LEVEL_ZERO_LIB_NAME "${L0_LIBRARY}" NAME) +if (NOT UR_LEVEL_ZERO_LOADER_LIBRARY STREQUAL "") + get_filename_component(LEVEL_ZERO_LIB_NAME "${UR_LEVEL_ZERO_LOADER_LIBRARY}" NAME) set(LEVEL_ZERO_LIBRARY ${LEVEL_ZERO_COPY_DIR}/${LEVEL_ZERO_LIB_NAME}) - message(STATUS "Copying Level Zero loader and headers to local build tree") - file(COPY ${L0_LIBRARY} DESTINATION ${LEVEL_ZERO_COPY_DIR} FOLLOW_SYMLINK_CHAIN) + message(STATUS "Level Zero Adapter: Copying Level Zero loader to local build tree") + file(COPY ${UR_LEVEL_ZERO_LOADER_LIBRARY} DESTINATION ${LEVEL_ZERO_COPY_DIR} FOLLOW_SYMLINK_CHAIN) endif() -if (DEFINED L0_INCLUDE_DIR) +if (NOT UR_LEVEL_ZERO_INCLUDE_DIR STREQUAL "") set(LEVEL_ZERO_INCLUDE_DIR ${LEVEL_ZERO_COPY_DIR}/level_zero) - file(COPY ${L0_INCLUDE_DIR}/level_zero DESTINATION ${LEVEL_ZERO_COPY_DIR}) + message(STATUS "Level Zero Adapter: Copying Level Zero headers to local build tree") + file(COPY ${UR_LEVEL_ZERO_INCLUDE_DIR}/level_zero DESTINATION ${LEVEL_ZERO_COPY_DIR}) endif() if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR) - message(STATUS "Download Level Zero loader and headers from github.com") + message(STATUS "Level Zero Adapter: Download Level Zero loader and headers from github.com") # Workaround warnings/errors for Level Zero build set(CMAKE_CXX_FLAGS_BAK "${CMAKE_CXX_FLAGS}") @@ -33,19 +38,23 @@ if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-warning-option") endif() - set(LEVEL_ZERO_LOADER_REPO "https://github.com/oneapi-src/level-zero.git") - set(LEVEL_ZERO_LOADER_TAG v1.16.1) + if (UR_LEVEL_ZERO_LOADER_REPO STREQUAL "") + set(UR_LEVEL_ZERO_LOADER_REPO "https://github.com/oneapi-src/level-zero.git") + endif() + if (UR_LEVEL_ZERO_LOADER_TAG STREQUAL "") + set(UR_LEVEL_ZERO_LOADER_TAG v1.16.1) + endif() # Disable due to a 
bug https://github.com/oneapi-src/level-zero/issues/104 set(CMAKE_INCLUDE_CURRENT_DIR OFF) # Prevent L0 loader from exporting extra symbols set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS OFF) - message(STATUS "Will fetch Level Zero Loader from ${LEVEL_ZERO_LOADER_REPO}") + message(STATUS "Level Zero Adapter: Will fetch Level Zero Loader from ${UR_LEVEL_ZERO_LOADER_REPO}") include(FetchContent) FetchContent_Declare(level-zero-loader - GIT_REPOSITORY ${LEVEL_ZERO_LOADER_REPO} - GIT_TAG ${LEVEL_ZERO_LOADER_TAG} + GIT_REPOSITORY ${UR_LEVEL_ZERO_LOADER_REPO} + GIT_TAG ${UR_LEVEL_ZERO_LOADER_TAG} ) if(MSVC) set(USE_Z7 ON) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 67415a0de0..6d0d2aea02 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -15,6 +15,9 @@ https://github.com/intel/llvm/blob/sycl/sycl/doc/design/CommandGraph.md#level-zero */ +// Print the name of a variable and its value in the L0 debug log +#define DEBUG_LOG(VAR) logger::debug(#VAR " {}", VAR); + namespace { /// Checks the version of the level-zero driver. /// @param Context Execution context @@ -38,18 +41,31 @@ bool IsDriverVersionNewerOrSimilar(ur_context_handle_t Context, (DriverVersionMinor >= VersionMinor) && (DriverVersionBuild >= VersionBuild)); } + +// Default to using compute engine for fill operation, but allow to +// override this with an environment variable. +bool PreferCopyEngineForFill = [] { + const char *UrRet = std::getenv("UR_L0_USE_COPY_ENGINE_FOR_FILL"); + const char *PiRet = + std::getenv("SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_FILL"); + return (UrRet ? std::stoi(UrRet) : (PiRet ? 
std::stoi(PiRet) : 0)); +}(); + }; // namespace ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( ur_context_handle_t Context, ur_device_handle_t Device, ze_command_list_handle_t CommandList, ze_command_list_handle_t CommandListResetEvents, + ze_command_list_handle_t CopyCommandList, ZeStruct ZeDesc, + ZeStruct ZeCopyDesc, const ur_exp_command_buffer_desc_t *Desc, const bool IsInOrderCmdList) - : Context(Context), Device(Device), ZeCommandList(CommandList), + : Context(Context), Device(Device), ZeComputeCommandList(CommandList), ZeCommandListResetEvents(CommandListResetEvents), - ZeCommandListDesc(ZeDesc), ZeFencesList(), QueueProperties(), - SyncPoints(), NextSyncPoint(0), + ZeCommandListDesc(ZeDesc), ZeCopyCommandList(CopyCommandList), + ZeCopyCommandListDesc(ZeCopyDesc), ZeFencesMap(), ZeActiveFence(nullptr), + QueueProperties(), SyncPoints(), NextSyncPoint(0), IsUpdatable(Desc ? Desc->isUpdatable : false), IsProfilingEnabled(Desc ? Desc->enableProfiling : false), IsInOrderCmdList(IsInOrderCmdList) { @@ -68,8 +84,11 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { // Release the memory allocated to the CommandList stored in the // command_buffer - if (ZeCommandList) { - ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList)); + if (ZeComputeCommandList) { + ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeComputeCommandList)); + } + if (UseCopyEngine() && ZeCopyCommandList) { + ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCopyCommandList)); } // Release the memory allocated to the CommandListResetEvents stored in the @@ -99,8 +118,9 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { urEventReleaseInternal(Event); } - // Release Fences allocated to command_buffer - for (auto &ZeFence : ZeFencesList) { + // Release fences allocated to command-buffer + for (auto &ZeFencePair : ZeFencesMap) { + auto &ZeFence = ZeFencePair.second; ZE_CALL_NOCHECK(zeFenceDestroy, (ZeFence)); } @@ -300,13 +320,14 @@ static ur_result_t 
getEventsFromSyncPoints( // buffer. static ur_result_t enqueueCommandBufferMemCopyHelper( ur_command_t CommandType, ur_exp_command_buffer_handle_t CommandBuffer, - void *Dst, const void *Src, size_t Size, uint32_t NumSyncPointsInWaitList, + void *Dst, const void *Src, size_t Size, bool PreferCopyEngine, + uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { + ur_exp_command_buffer_sync_point_t *RetSyncPoint) { if (CommandBuffer->IsInOrderCmdList) { - ZE2UR_CALL( - zeCommandListAppendMemoryCopy, - (CommandBuffer->ZeCommandList, Dst, Src, Size, nullptr, 0, nullptr)); + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (CommandBuffer->ZeComputeCommandList, Dst, Src, Size, nullptr, 0, + nullptr)); logger::debug("calling zeCommandListAppendMemoryCopy()"); } else { @@ -320,12 +341,26 @@ static ur_result_t enqueueCommandBufferMemCopyHelper( LaunchEvent->CommandType = CommandType; // Get sync point and register the event with it. - *SyncPoint = CommandBuffer->GetNextSyncPoint(); - CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); + ur_exp_command_buffer_sync_point_t SyncPoint = + CommandBuffer->GetNextSyncPoint(); + CommandBuffer->RegisterSyncPoint(SyncPoint, LaunchEvent); + if (RetSyncPoint) { + *RetSyncPoint = SyncPoint; + } + ze_command_list_handle_t ZeCommandList = + CommandBuffer->ZeComputeCommandList; + // If the copy engine available, the command is enqueued in the + // ZeCopyCommandList. + if (PreferCopyEngine && CommandBuffer->UseCopyEngine()) { + ZeCommandList = CommandBuffer->ZeCopyCommandList; + // We indicate that the ZeCopyCommandList contains commands to be + // submitted. 
+ CommandBuffer->MCopyCommandListEmpty = false; + } ZE2UR_CALL(zeCommandListAppendMemoryCopy, - (CommandBuffer->ZeCommandList, Dst, Src, Size, - LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data())); + (ZeCommandList, Dst, Src, Size, LaunchEvent->ZeEvent, + ZeEventList.size(), ZeEventList.data())); logger::debug("calling zeCommandListAppendMemoryCopy() with" " ZeEvent {}", @@ -341,9 +376,9 @@ static ur_result_t enqueueCommandBufferMemCopyRectHelper( void *Dst, const void *Src, ur_rect_offset_t SrcOrigin, ur_rect_offset_t DstOrigin, ur_rect_region_t Region, size_t SrcRowPitch, size_t DstRowPitch, size_t SrcSlicePitch, size_t DstSlicePitch, - uint32_t NumSyncPointsInWaitList, + bool PreferCopyEngine, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { + ur_exp_command_buffer_sync_point_t *RetSyncPoint) { uint32_t SrcOriginX = ur_cast(SrcOrigin.x); uint32_t SrcOriginY = ur_cast(SrcOrigin.y); @@ -378,9 +413,9 @@ static ur_result_t enqueueCommandBufferMemCopyRectHelper( if (CommandBuffer->IsInOrderCmdList) { ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion, - (CommandBuffer->ZeCommandList, Dst, &ZeDstRegion, DstPitch, - DstSlicePitch, Src, &ZeSrcRegion, SrcPitch, SrcSlicePitch, - nullptr, 0, nullptr)); + (CommandBuffer->ZeComputeCommandList, Dst, &ZeDstRegion, + DstPitch, DstSlicePitch, Src, &ZeSrcRegion, SrcPitch, + SrcSlicePitch, nullptr, 0, nullptr)); logger::debug("calling zeCommandListAppendMemoryCopyRegion()"); } else { @@ -395,13 +430,28 @@ static ur_result_t enqueueCommandBufferMemCopyRectHelper( LaunchEvent->CommandType = CommandType; // Get sync point and register the event with it. 
- *SyncPoint = CommandBuffer->GetNextSyncPoint(); - CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); + ur_exp_command_buffer_sync_point_t SyncPoint = + CommandBuffer->GetNextSyncPoint(); + CommandBuffer->RegisterSyncPoint(SyncPoint, LaunchEvent); + if (RetSyncPoint) { + *RetSyncPoint = SyncPoint; + } + + ze_command_list_handle_t ZeCommandList = + CommandBuffer->ZeComputeCommandList; + // If the copy engine available, the command is enqueued in the + // ZeCopyCommandList. + if (PreferCopyEngine && CommandBuffer->UseCopyEngine()) { + ZeCommandList = CommandBuffer->ZeCopyCommandList; + // We indicate that the ZeCopyCommandList contains commands to be + // submitted. + CommandBuffer->MCopyCommandListEmpty = false; + } ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion, - (CommandBuffer->ZeCommandList, Dst, &ZeDstRegion, DstPitch, - DstSlicePitch, Src, &ZeSrcRegion, SrcPitch, SrcSlicePitch, - LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data())); + (ZeCommandList, Dst, &ZeDstRegion, DstPitch, DstSlicePitch, Src, + &ZeSrcRegion, SrcPitch, SrcSlicePitch, LaunchEvent->ZeEvent, + ZeEventList.size(), ZeEventList.data())); logger::debug("calling zeCommandListAppendMemoryCopyRegion() with" " ZeEvent {}", @@ -415,25 +465,43 @@ static ur_result_t enqueueCommandBufferMemCopyRectHelper( static ur_result_t enqueueCommandBufferFillHelper( ur_command_t CommandType, ur_exp_command_buffer_handle_t CommandBuffer, void *Ptr, const void *Pattern, size_t PatternSize, size_t Size, - uint32_t NumSyncPointsInWaitList, + bool PreferCopyEngine, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { + ur_exp_command_buffer_sync_point_t *RetSyncPoint) { // Pattern size must be a power of two. UR_ASSERT((PatternSize > 0) && ((PatternSize & (PatternSize - 1)) == 0), UR_RESULT_ERROR_INVALID_VALUE); - // Pattern size must fit the compute queue capabilities. 
- UR_ASSERT( + ze_command_list_handle_t ZeCommandList; + // If the copy engine available and patternsize is valid, the command is + // enqueued in the ZeCopyCommandList, otherwise enqueue it in the compute + // command list. + + if (PreferCopyEngine && CommandBuffer->UseCopyEngine() && PatternSize <= CommandBuffer->Device - ->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] - .ZeProperties.maxMemoryFillPatternSize, - UR_RESULT_ERROR_INVALID_VALUE); + ->QueueGroup[ur_device_handle_t_::queue_group_info_t::MainCopy] + .ZeProperties.maxMemoryFillPatternSize) { + + ZeCommandList = CommandBuffer->ZeCopyCommandList; + // We indicate that the ZeCopyCommandList contains commands to be + // submitted. + CommandBuffer->MCopyCommandListEmpty = false; + } else { + // Pattern size must fit the compute queue capabilities. + UR_ASSERT( + PatternSize <= + CommandBuffer->Device + ->QueueGroup[ur_device_handle_t_::queue_group_info_t::Compute] + .ZeProperties.maxMemoryFillPatternSize, + UR_RESULT_ERROR_INVALID_VALUE); + ZeCommandList = CommandBuffer->ZeComputeCommandList; + } if (CommandBuffer->IsInOrderCmdList) { ZE2UR_CALL(zeCommandListAppendMemoryFill, - (CommandBuffer->ZeCommandList, Ptr, Pattern, PatternSize, Size, - nullptr, 0, nullptr)); + (CommandBuffer->ZeComputeCommandList, Ptr, Pattern, PatternSize, + Size, nullptr, 0, nullptr)); logger::debug("calling zeCommandListAppendMemoryFill()"); } else { @@ -448,11 +516,15 @@ static ur_result_t enqueueCommandBufferFillHelper( LaunchEvent->CommandType = CommandType; // Get sync point and register the event with it. 
- *SyncPoint = CommandBuffer->GetNextSyncPoint(); - CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); + ur_exp_command_buffer_sync_point_t SyncPoint = + CommandBuffer->GetNextSyncPoint(); + CommandBuffer->RegisterSyncPoint(SyncPoint, LaunchEvent); + if (RetSyncPoint) { + *RetSyncPoint = SyncPoint; + } ZE2UR_CALL(zeCommandListAppendMemoryFill, - (CommandBuffer->ZeCommandList, Ptr, Pattern, PatternSize, Size, + (ZeCommandList, Ptr, Pattern, PatternSize, Size, LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data())); logger::debug("calling zeCommandListAppendMemoryFill() with" @@ -474,8 +546,6 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, ? (CommandBufferDesc ? CommandBufferDesc->isInOrder : false) : false; - // Force compute queue type for now. Copy engine types may be better suited - // for host to device copies. uint32_t QueueGroupOrdinal = Device->QueueGroup[ur_device_handle_t_::queue_group_info_t::type::Compute] .ZeOrdinal; @@ -495,21 +565,51 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, ZeCommandListDesc.flags = IsInOrder ? ZE_COMMAND_LIST_FLAG_IN_ORDER : ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING; + DEBUG_LOG(ZeCommandListDesc.flags); + ZeStruct ZeMutableCommandListDesc; if (CommandBufferDesc && CommandBufferDesc->isUpdatable) { ZeMutableCommandListDesc.flags = 0; ZeCommandListDesc.pNext = &ZeMutableCommandListDesc; } - ze_command_list_handle_t ZeCommandList; + ze_command_list_handle_t ZeComputeCommandList; // TODO We could optimize this by pooling both Level Zero command-lists and UR // command-buffers, then reusing them. ZE2UR_CALL(zeCommandListCreate, (Context->ZeContext, Device->ZeDevice, - &ZeCommandListDesc, &ZeCommandList)); + &ZeCommandListDesc, &ZeComputeCommandList)); + + // Create a list for copy commands. 
+ // Note that to simplify the implementation, the current implementation only + // uses the main copy engine and does not use the link engine even if + // available. + ze_command_list_handle_t ZeCopyCommandList = nullptr; + ZeStruct ZeCopyCommandListDesc; + if (Device->hasMainCopyEngine()) { + uint32_t QueueGroupOrdinalCopy = + Device + ->QueueGroup + [ur_device_handle_t_::queue_group_info_t::type::MainCopy] + .ZeOrdinal; + + ZeCopyCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinalCopy; + // Dependencies between commands are explicitly enforced by sync points when + // enqueuing. Consequently, relax the command ordering in the command list + // can enable the backend to further optimize the workload + ZeCopyCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING; + + // TODO We could optimize this by pooling both Level Zero command-lists and + // UR command-buffers, then reusing them. + ZE2UR_CALL(zeCommandListCreate, + (Context->ZeContext, Device->ZeDevice, &ZeCopyCommandListDesc, + &ZeCopyCommandList)); + } + try { *CommandBuffer = new ur_exp_command_buffer_handle_t_( - Context, Device, ZeCommandList, ZeCommandListResetEvents, - ZeCommandListDesc, CommandBufferDesc, IsInOrder); + Context, Device, ZeComputeCommandList, ZeCommandListResetEvents, + ZeCopyCommandList, ZeCommandListDesc, ZeCopyCommandListDesc, + CommandBufferDesc, IsInOrder); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) 
{ @@ -536,9 +636,18 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, std::vector PrecondEvents = { RetCommandBuffer->WaitEvent->ZeEvent, RetCommandBuffer->AllResetEvent->ZeEvent}; - ZE2UR_CALL( - zeCommandListAppendBarrier, - (ZeCommandList, nullptr, PrecondEvents.size(), PrecondEvents.data())); + ZE2UR_CALL(zeCommandListAppendBarrier, + (ZeComputeCommandList, nullptr, PrecondEvents.size(), + PrecondEvents.data())); + + if (Device->hasMainCopyEngine()) { + // The copy command-list must be executed once the preconditions have been + // met. We therefore begin this command-list with a barrier on the + // preconditions. + ZE2UR_CALL(zeCommandListAppendBarrier, + (ZeCopyCommandList, nullptr, PrecondEvents.size(), + PrecondEvents.data())); + } return UR_RESULT_SUCCESS; } @@ -579,9 +688,9 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) { CommandBuffer->AllResetEvent->ZeEvent)); if (CommandBuffer->IsInOrderCmdList) { - ZE2UR_CALL( - zeCommandListAppendSignalEvent, - (CommandBuffer->ZeCommandList, CommandBuffer->SignalEvent->ZeEvent)); + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CommandBuffer->ZeComputeCommandList, + CommandBuffer->SignalEvent->ZeEvent)); } else { // Create a list of events for our signal event to wait on const size_t NumEvents = CommandBuffer->SyncPoints.size(); @@ -592,15 +701,21 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) { // Wait for all the user added commands to complete, and signal the // command-buffer signal-event when they are done. - ZE2UR_CALL(zeCommandListAppendBarrier, (CommandBuffer->ZeCommandList, + ZE2UR_CALL(zeCommandListAppendBarrier, (CommandBuffer->ZeComputeCommandList, CommandBuffer->SignalEvent->ZeEvent, NumEvents, WaitEventList.data())); } // Close the command lists and have them ready for dispatch. 
- ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandList)); + ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeComputeCommandList)); ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandListResetEvents)); + + if (CommandBuffer->UseCopyEngine()) { + ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCopyCommandList)); + } + CommandBuffer->IsFinalized = true; + return UR_RESULT_SUCCESS; } @@ -610,7 +725,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( const size_t *GlobalWorkSize, const size_t *LocalWorkSize, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint, + ur_exp_command_buffer_sync_point_t *RetSyncPoint, ur_exp_command_buffer_command_handle_t *Command) { UR_ASSERT(CommandBuffer && Kernel && Kernel->Program, UR_RESULT_ERROR_INVALID_NULL_POINTER); @@ -675,9 +790,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( auto Plt = CommandBuffer->Context->getPlatform(); UR_ASSERT(Plt->ZeMutableCmdListExt.Supported, UR_RESULT_ERROR_UNSUPPORTED_FEATURE); - ZE2UR_CALL( - Plt->ZeMutableCmdListExt.zexCommandListGetNextCommandIdExp, - (CommandBuffer->ZeCommandList, &ZeMutableCommandDesc, &CommandId)); + ZE2UR_CALL(Plt->ZeMutableCmdListExt.zexCommandListGetNextCommandIdExp, + (CommandBuffer->ZeComputeCommandList, &ZeMutableCommandDesc, + &CommandId)); + DEBUG_LOG(CommandId); } try { if (Command) @@ -691,7 +807,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( if (CommandBuffer->IsInOrderCmdList) { ZE2UR_CALL(zeCommandListAppendLaunchKernel, - (CommandBuffer->ZeCommandList, Kernel->ZeKernel, + (CommandBuffer->ZeComputeCommandList, Kernel->ZeKernel, &ZeThreadGroupDimensions, nullptr, 0, nullptr)); logger::debug("calling zeCommandListAppendLaunchKernel()"); @@ -705,14 +821,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( !CommandBuffer->IsProfilingEnabled)); 
LaunchEvent->CommandType = UR_COMMAND_KERNEL_LAUNCH; - if (SyncPoint) { - // Get sync point and register the event with it. - *SyncPoint = CommandBuffer->GetNextSyncPoint(); - CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); + // Get sync point and register the event with it. + ur_exp_command_buffer_sync_point_t SyncPoint = + CommandBuffer->GetNextSyncPoint(); + CommandBuffer->RegisterSyncPoint(SyncPoint, LaunchEvent); + if (RetSyncPoint) { + *RetSyncPoint = SyncPoint; } ZE2UR_CALL(zeCommandListAppendLaunchKernel, - (CommandBuffer->ZeCommandList, Kernel->ZeKernel, + (CommandBuffer->ZeComputeCommandList, Kernel->ZeKernel, &ZeThreadGroupDimensions, LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data())); @@ -729,8 +847,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMMemcpyExp( size_t Size, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, ur_exp_command_buffer_sync_point_t *SyncPoint) { + + bool PreferCopyEngine = !IsDevicePointer(CommandBuffer->Context, Src) || + !IsDevicePointer(CommandBuffer->Context, Dst); + + PreferCopyEngine |= UseCopyEngineForD2DCopy; + return enqueueCommandBufferMemCopyHelper( - UR_COMMAND_USM_MEMCPY, CommandBuffer, Dst, Src, Size, + UR_COMMAND_USM_MEMCPY, CommandBuffer, Dst, Src, Size, PreferCopyEngine, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } @@ -740,8 +864,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, ur_exp_command_buffer_sync_point_t *SyncPoint) { - auto SrcBuffer = ur_cast(SrcMem); - auto DstBuffer = ur_cast(DstMem); + auto SrcBuffer = ur_cast<_ur_buffer *>(SrcMem); + auto DstBuffer = ur_cast<_ur_buffer *>(DstMem); std::shared_lock SrcLock(SrcBuffer->Mutex, std::defer_lock); std::scoped_lock, ur_shared_mutex> LockAll( @@ -754,10 +878,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyExp( 
UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, CommandBuffer->Device)); + bool PreferCopyEngine = (SrcBuffer->OnHost || SrcBuffer->OnHost); + + PreferCopyEngine |= UseCopyEngineForD2DCopy; + return enqueueCommandBufferMemCopyHelper( UR_COMMAND_MEM_BUFFER_COPY, CommandBuffer, ZeHandleDst + DstOffset, - ZeHandleSrc + SrcOffset, Size, NumSyncPointsInWaitList, SyncPointWaitList, - SyncPoint); + ZeHandleSrc + SrcOffset, Size, PreferCopyEngine, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( @@ -768,8 +896,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, ur_exp_command_buffer_sync_point_t *SyncPoint) { - auto SrcBuffer = ur_cast(SrcMem); - auto DstBuffer = ur_cast(DstMem); + auto SrcBuffer = ur_cast<_ur_buffer *>(SrcMem); + auto DstBuffer = ur_cast<_ur_buffer *>(DstMem); std::shared_lock SrcLock(SrcBuffer->Mutex, std::defer_lock); std::scoped_lock, ur_shared_mutex> LockAll( @@ -782,10 +910,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferCopyRectExp( UR_CALL(DstBuffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, CommandBuffer->Device)); + bool PreferCopyEngine = (SrcBuffer->OnHost || SrcBuffer->OnHost); + + PreferCopyEngine |= UseCopyEngineForD2DCopy; + return enqueueCommandBufferMemCopyRectHelper( UR_COMMAND_MEM_BUFFER_COPY_RECT, CommandBuffer, ZeHandleDst, ZeHandleSrc, SrcOrigin, DstOrigin, Region, SrcRowPitch, DstRowPitch, SrcSlicePitch, - DstSlicePitch, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + DstSlicePitch, PreferCopyEngine, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( @@ -799,12 +932,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteExp( char *ZeHandleDst 
= nullptr; UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, CommandBuffer->Device)); + // Always prefer copy engine for writes + bool PreferCopyEngine = true; return enqueueCommandBufferMemCopyHelper( UR_COMMAND_MEM_BUFFER_WRITE, CommandBuffer, ZeHandleDst + Offset, // dst Src, // src - Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + Size, PreferCopyEngine, NumSyncPointsInWaitList, SyncPointWaitList, + SyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( @@ -820,11 +956,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferWriteRectExp( char *ZeHandleDst = nullptr; UR_CALL(Buffer->getZeHandle(ZeHandleDst, ur_mem_handle_t_::write_only, CommandBuffer->Device)); + + // Always prefer copy engine for writes + bool PreferCopyEngine = true; + return enqueueCommandBufferMemCopyRectHelper( UR_COMMAND_MEM_BUFFER_WRITE_RECT, CommandBuffer, ZeHandleDst, const_cast(static_cast(Src)), HostOffset, BufferOffset, Region, HostRowPitch, BufferRowPitch, HostSlicePitch, - BufferSlicePitch, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + BufferSlicePitch, PreferCopyEngine, NumSyncPointsInWaitList, + SyncPointWaitList, SyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( @@ -837,9 +978,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadExp( char *ZeHandleSrc = nullptr; UR_CALL(Buffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, CommandBuffer->Device)); + + // Always prefer copy engine for reads + bool PreferCopyEngine = true; + return enqueueCommandBufferMemCopyHelper( UR_COMMAND_MEM_BUFFER_READ, CommandBuffer, Dst, ZeHandleSrc + Offset, - Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + Size, PreferCopyEngine, NumSyncPointsInWaitList, SyncPointWaitList, + SyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferReadRectExp( @@ -855,25 +1001,29 @@ UR_APIEXPORT ur_result_t 
UR_APICALL urCommandBufferAppendMemBufferReadRectExp( char *ZeHandleSrc; UR_CALL(Buffer->getZeHandle(ZeHandleSrc, ur_mem_handle_t_::read_only, CommandBuffer->Device)); + + // Always prefer copy engine for reads + bool PreferCopyEngine = true; + return enqueueCommandBufferMemCopyRectHelper( UR_COMMAND_MEM_BUFFER_READ_RECT, CommandBuffer, Dst, ZeHandleSrc, BufferOffset, HostOffset, Region, BufferRowPitch, HostRowPitch, - BufferSlicePitch, HostSlicePitch, NumSyncPointsInWaitList, - SyncPointWaitList, SyncPoint); + BufferSlicePitch, HostSlicePitch, PreferCopyEngine, + NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( ur_exp_command_buffer_handle_t CommandBuffer, const void *Mem, size_t Size, ur_usm_migration_flags_t Flags, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { + ur_exp_command_buffer_sync_point_t *RetSyncPoint) { std::ignore = Flags; if (CommandBuffer->IsInOrderCmdList) { // Add the prefetch command to the command buffer. // Note that L0 does not handle migration flags. ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, - (CommandBuffer->ZeCommandList, Mem, Size)); + (CommandBuffer->ZeComputeCommandList, Mem, Size)); } else { std::vector ZeEventList; UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, @@ -881,7 +1031,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( if (NumSyncPointsInWaitList) { ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (CommandBuffer->ZeCommandList, NumSyncPointsInWaitList, + (CommandBuffer->ZeComputeCommandList, NumSyncPointsInWaitList, ZeEventList.data())); } @@ -892,18 +1042,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( LaunchEvent->CommandType = UR_COMMAND_USM_PREFETCH; // Get sync point and register the event with it. 
- *SyncPoint = CommandBuffer->GetNextSyncPoint(); - CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); + ur_exp_command_buffer_sync_point_t SyncPoint = + CommandBuffer->GetNextSyncPoint(); + CommandBuffer->RegisterSyncPoint(SyncPoint, LaunchEvent); + if (RetSyncPoint) { + *RetSyncPoint = SyncPoint; + } // Add the prefetch command to the command buffer. // Note that L0 does not handle migration flags. ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, - (CommandBuffer->ZeCommandList, Mem, Size)); + (CommandBuffer->ZeComputeCommandList, Mem, Size)); // Level Zero does not have a completion "event" with the prefetch API, // so manually add command to signal our event. ZE2UR_CALL(zeCommandListAppendSignalEvent, - (CommandBuffer->ZeCommandList, LaunchEvent->ZeEvent)); + (CommandBuffer->ZeComputeCommandList, LaunchEvent->ZeEvent)); } return UR_RESULT_SUCCESS; @@ -913,7 +1067,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( ur_exp_command_buffer_handle_t CommandBuffer, const void *Mem, size_t Size, ur_usm_advice_flags_t Advice, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, - ur_exp_command_buffer_sync_point_t *SyncPoint) { + ur_exp_command_buffer_sync_point_t *RetSyncPoint) { // A memory chunk can be advised with muliple memory advices // We therefore prefer if statements to switch cases to combine all potential // flags @@ -943,8 +1097,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( if (CommandBuffer->IsInOrderCmdList) { ZE2UR_CALL(zeCommandListAppendMemAdvise, - (CommandBuffer->ZeCommandList, CommandBuffer->Device->ZeDevice, - Mem, Size, ZeAdvice)); + (CommandBuffer->ZeComputeCommandList, + CommandBuffer->Device->ZeDevice, Mem, Size, ZeAdvice)); } else { std::vector ZeEventList; UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, @@ -952,7 +1106,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( if 
(NumSyncPointsInWaitList) { ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (CommandBuffer->ZeCommandList, NumSyncPointsInWaitList, + (CommandBuffer->ZeComputeCommandList, NumSyncPointsInWaitList, ZeEventList.data())); } @@ -963,17 +1117,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( LaunchEvent->CommandType = UR_COMMAND_USM_ADVISE; // Get sync point and register the event with it. - *SyncPoint = CommandBuffer->GetNextSyncPoint(); - CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); + ur_exp_command_buffer_sync_point_t SyncPoint = + CommandBuffer->GetNextSyncPoint(); + CommandBuffer->RegisterSyncPoint(SyncPoint, LaunchEvent); + if (RetSyncPoint) { + *RetSyncPoint = SyncPoint; + } ZE2UR_CALL(zeCommandListAppendMemAdvise, - (CommandBuffer->ZeCommandList, CommandBuffer->Device->ZeDevice, - Mem, Size, ZeAdvice)); + (CommandBuffer->ZeComputeCommandList, + CommandBuffer->Device->ZeDevice, Mem, Size, ZeAdvice)); // Level Zero does not have a completion "event" with the advise API, // so manually add command to signal our event. 
ZE2UR_CALL(zeCommandListAppendSignalEvent, - (CommandBuffer->ZeCommandList, LaunchEvent->ZeEvent)); + (CommandBuffer->ZeComputeCommandList, LaunchEvent->ZeEvent)); } return UR_RESULT_SUCCESS; @@ -997,7 +1155,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendMemBufferFillExp( UR_COMMAND_MEM_BUFFER_FILL, CommandBuffer, ZeHandleDst + Offset, Pattern, // It will be interpreted as an 8-bit value, PatternSize, // which is indicated with this pattern_size==1 - Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + Size, PreferCopyEngineForFill, NumSyncPointsInWaitList, SyncPointWaitList, + SyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( @@ -1011,7 +1170,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMFillExp( UR_COMMAND_MEM_BUFFER_FILL, CommandBuffer, Ptr, Pattern, // It will be interpreted as an 8-bit value, PatternSize, // which is indicated with this pattern_size==1 - Size, NumSyncPointsInWaitList, SyncPointWaitList, SyncPoint); + Size, PreferCopyEngineForFill, NumSyncPointsInWaitList, SyncPointWaitList, + SyncPoint); } UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( @@ -1025,11 +1185,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( uint32_t QueueGroupOrdinal; auto &ZeCommandQueue = QGroup.getZeQueue(&QueueGroupOrdinal); - ze_fence_handle_t ZeFence; - ZeStruct ZeFenceDesc; - - ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); - CommandBuffer->ZeFencesList.push_back(ZeFence); + // If we already have created a fence for this queue, first reset then reuse + // it, otherwise create a new fence. 
+ ze_fence_handle_t &ZeFence = CommandBuffer->ZeActiveFence; + auto ZeWorkloadFenceForQueue = + CommandBuffer->ZeFencesMap.find(ZeCommandQueue); + if (ZeWorkloadFenceForQueue == CommandBuffer->ZeFencesMap.end()) { + ZeStruct ZeFenceDesc; + ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); + CommandBuffer->ZeFencesMap.insert({{ZeCommandQueue, ZeFence}}); + } else { + ZeFence = ZeWorkloadFenceForQueue->second; + ZE2UR_CALL(zeFenceReset, (ZeFence)); + } bool MustSignalWaitEvent = true; if (NumEventsInWaitList) { @@ -1078,8 +1246,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( // type, regardless of the UR Queue type. We therefore need to submit the list // directly using the Level-Zero API to avoid type mismatches if using UR // functions. - ZE2UR_CALL(zeCommandQueueExecuteCommandLists, - (ZeCommandQueue, 1, &CommandBuffer->ZeCommandList, ZeFence)); + ZE2UR_CALL( + zeCommandQueueExecuteCommandLists, + (ZeCommandQueue, 1, &CommandBuffer->ZeComputeCommandList, ZeFence)); + + // The Copy command-list is submitted to the main copy queue if it is not + // empty. + if (!CommandBuffer->MCopyCommandListEmpty) { + auto &QGroupCopy = Queue->getQueueGroup(true); + uint32_t QueueGroupOrdinal; + auto &ZeCopyCommandQueue = QGroupCopy.getZeQueue(&QueueGroupOrdinal); + ZE2UR_CALL( + zeCommandQueueExecuteCommandLists, + (ZeCopyCommandQueue, 1, &CommandBuffer->ZeCopyCommandList, nullptr)); + } // Execution event for this enqueue of the UR command-buffer ur_event_handle_t RetEvent{}; @@ -1089,6 +1269,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList, false, NumEventsInWaitList, EventWaitList, false)); + // Reset the wait-event for the UR command-buffer that is signaled when its // submission dependencies have been satisfied. 
ZE2UR_CALL(zeCommandListAppendEventReset, @@ -1201,6 +1382,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( auto SupportedFeatures = Command->CommandBuffer->Device->ZeDeviceMutableCmdListsProperties ->mutableCommandFlags; + logger::debug("Mutable features supported by device {}", SupportedFeatures); // We need the created descriptors to live till the point when // zexCommandListUpdateMutableCommandsExp is called at the end of the @@ -1228,10 +1410,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( auto MutableGroupOffestDesc = std::make_unique>(); MutableGroupOffestDesc->commandId = Command->CommandId; + DEBUG_LOG(MutableGroupOffestDesc->commandId); MutableGroupOffestDesc->pNext = NextDesc; + DEBUG_LOG(MutableGroupOffestDesc->pNext); MutableGroupOffestDesc->offsetX = NewGlobalWorkOffset[0]; + DEBUG_LOG(MutableGroupOffestDesc->offsetX); MutableGroupOffestDesc->offsetY = Dim >= 2 ? NewGlobalWorkOffset[1] : 0; + DEBUG_LOG(MutableGroupOffestDesc->offsetY); MutableGroupOffestDesc->offsetZ = Dim == 3 ? NewGlobalWorkOffset[2] : 0; + DEBUG_LOG(MutableGroupOffestDesc->offsetZ); NextDesc = MutableGroupOffestDesc.get(); OffsetDescs.push_back(std::move(MutableGroupOffestDesc)); } @@ -1245,10 +1432,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( auto MutableGroupSizeDesc = std::make_unique>(); MutableGroupSizeDesc->commandId = Command->CommandId; + DEBUG_LOG(MutableGroupSizeDesc->commandId); MutableGroupSizeDesc->pNext = NextDesc; + DEBUG_LOG(MutableGroupSizeDesc->pNext); MutableGroupSizeDesc->groupSizeX = NewLocalWorkSize[0]; + DEBUG_LOG(MutableGroupSizeDesc->groupSizeX); MutableGroupSizeDesc->groupSizeY = Dim >= 2 ? NewLocalWorkSize[1] : 1; + DEBUG_LOG(MutableGroupSizeDesc->groupSizeY); MutableGroupSizeDesc->groupSizeZ = Dim == 3 ? 
NewLocalWorkSize[2] : 1; + DEBUG_LOG(MutableGroupSizeDesc->groupSizeZ); NextDesc = MutableGroupSizeDesc.get(); GroupSizeDescs.push_back(std::move(MutableGroupSizeDesc)); } @@ -1261,8 +1453,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( UR_ASSERT(!(NewGlobalWorkSize && !NewLocalWorkSize) || (SupportedFeatures & ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE), UR_RESULT_ERROR_UNSUPPORTED_FEATURE); + + ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; if (NewGlobalWorkSize && Dim > 0) { - ze_group_count_t ZeThreadGroupDimensions{1, 1, 1}; uint32_t WG[3]; // If new global work size is provided but new local work size is not // provided then we still need to update local work size based on size @@ -1273,9 +1466,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( Dim, NewGlobalWorkSize, NewLocalWorkSize)); auto MutableGroupCountDesc = std::make_unique>(); - MutableGroupCountDesc->pNext = NextDesc; MutableGroupCountDesc->commandId = Command->CommandId; + DEBUG_LOG(MutableGroupCountDesc->commandId); + MutableGroupCountDesc->pNext = NextDesc; + DEBUG_LOG(MutableGroupCountDesc->pNext); MutableGroupCountDesc->pGroupCount = &ZeThreadGroupDimensions; + DEBUG_LOG(MutableGroupCountDesc->pGroupCount->groupCountX); + DEBUG_LOG(MutableGroupCountDesc->pGroupCount->groupCountY); + DEBUG_LOG(MutableGroupCountDesc->pGroupCount->groupCountZ); NextDesc = MutableGroupCountDesc.get(); GroupCountDescs.push_back(std::move(MutableGroupCountDesc)); @@ -1283,10 +1481,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( auto MutableGroupSizeDesc = std::make_unique>(); MutableGroupSizeDesc->commandId = Command->CommandId; + DEBUG_LOG(MutableGroupSizeDesc->commandId); MutableGroupSizeDesc->pNext = NextDesc; + DEBUG_LOG(MutableGroupSizeDesc->pNext); MutableGroupSizeDesc->groupSizeX = WG[0]; + DEBUG_LOG(MutableGroupSizeDesc->groupSizeX); MutableGroupSizeDesc->groupSizeY = WG[1]; + 
DEBUG_LOG(MutableGroupSizeDesc->groupSizeY); MutableGroupSizeDesc->groupSizeZ = WG[2]; + DEBUG_LOG(MutableGroupSizeDesc->groupSizeZ); + NextDesc = MutableGroupSizeDesc.get(); GroupSizeDescs.push_back(std::move(MutableGroupSizeDesc)); } @@ -1333,10 +1537,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( auto ZeMutableArgDesc = std::make_unique>(); ZeMutableArgDesc->commandId = Command->CommandId; + DEBUG_LOG(ZeMutableArgDesc->commandId); ZeMutableArgDesc->pNext = NextDesc; + DEBUG_LOG(ZeMutableArgDesc->pNext); ZeMutableArgDesc->argIndex = NewMemObjArgDesc.argIndex; + DEBUG_LOG(ZeMutableArgDesc->argIndex); ZeMutableArgDesc->argSize = sizeof(void *); + DEBUG_LOG(ZeMutableArgDesc->argSize); ZeMutableArgDesc->pArgValue = ZeHandlePtr; + DEBUG_LOG(ZeMutableArgDesc->pArgValue); NextDesc = ZeMutableArgDesc.get(); ArgDescs.push_back(std::move(ZeMutableArgDesc)); @@ -1350,10 +1559,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( auto ZeMutableArgDesc = std::make_unique>(); ZeMutableArgDesc->commandId = Command->CommandId; + DEBUG_LOG(ZeMutableArgDesc->commandId); ZeMutableArgDesc->pNext = NextDesc; + DEBUG_LOG(ZeMutableArgDesc->pNext); ZeMutableArgDesc->argIndex = NewPointerArgDesc.argIndex; + DEBUG_LOG(ZeMutableArgDesc->argIndex); ZeMutableArgDesc->argSize = sizeof(void *); + DEBUG_LOG(ZeMutableArgDesc->argSize); ZeMutableArgDesc->pArgValue = NewPointerArgDesc.pNewPointerArg; + DEBUG_LOG(ZeMutableArgDesc->pArgValue); NextDesc = ZeMutableArgDesc.get(); ArgDescs.push_back(std::move(ZeMutableArgDesc)); @@ -1367,9 +1581,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( auto ZeMutableArgDesc = std::make_unique>(); ZeMutableArgDesc->commandId = Command->CommandId; + DEBUG_LOG(ZeMutableArgDesc->commandId); ZeMutableArgDesc->pNext = NextDesc; + DEBUG_LOG(ZeMutableArgDesc->pNext); ZeMutableArgDesc->argIndex = NewValueArgDesc.argIndex; + DEBUG_LOG(ZeMutableArgDesc->argIndex); 
ZeMutableArgDesc->argSize = NewValueArgDesc.argSize; + DEBUG_LOG(ZeMutableArgDesc->argSize); // OpenCL: "the arg_value pointer can be NULL or point to a NULL value // in which case a NULL value will be used as the value for the argument // declared as a pointer to global or constant memory in the kernel" @@ -1383,6 +1601,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( ArgValuePtr = nullptr; } ZeMutableArgDesc->pArgValue = ArgValuePtr; + DEBUG_LOG(ZeMutableArgDesc->pArgValue); NextDesc = ZeMutableArgDesc.get(); ArgDescs.push_back(std::move(ZeMutableArgDesc)); } @@ -1392,15 +1611,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp( MutableCommandDesc.flags = 0; // We must synchronize mutable command list execution before mutating. - ZE2UR_CALL(zeEventHostSynchronize, - (CommandBuffer->SignalEvent->ZeEvent, UINT64_MAX)); + if (ze_fence_handle_t &ZeFence = CommandBuffer->ZeActiveFence) { + ZE2UR_CALL(zeFenceHostSynchronize, (ZeFence, UINT64_MAX)); + } - auto Plt = Command->CommandBuffer->Context->getPlatform(); + auto Plt = CommandBuffer->Context->getPlatform(); UR_ASSERT(Plt->ZeMutableCmdListExt.Supported, UR_RESULT_ERROR_UNSUPPORTED_FEATURE); ZE2UR_CALL(Plt->ZeMutableCmdListExt.zexCommandListUpdateMutableCommandsExp, - (CommandBuffer->ZeCommandList, &MutableCommandDesc)); - ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandList)); + (CommandBuffer->ZeComputeCommandList, &MutableCommandDesc)); + ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeComputeCommandList)); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/command_buffer.hpp b/source/adapters/level_zero/command_buffer.hpp index 04d6a7d269..48f1c68330 100644 --- a/source/adapters/level_zero/command_buffer.hpp +++ b/source/adapters/level_zero/command_buffer.hpp @@ -29,7 +29,9 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_context_handle_t Context, ur_device_handle_t Device, ze_command_list_handle_t CommandList, 
ze_command_list_handle_t CommandListResetEvents, + ze_command_list_handle_t CopyCommandList, ZeStruct ZeDesc, + ZeStruct ZeCopyDesc, const ur_exp_command_buffer_desc_t *Desc, const bool IsInOrderCmdList); ~ur_exp_command_buffer_handle_t_(); @@ -44,20 +46,33 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { return NextSyncPoint; } + // Indicates if a copy engine is available for use + bool UseCopyEngine() const { return ZeCopyCommandList != nullptr; } + // UR context associated with this command-buffer ur_context_handle_t Context; // Device associated with this command buffer ur_device_handle_t Device; // Level Zero command list handle - ze_command_list_handle_t ZeCommandList; + ze_command_list_handle_t ZeComputeCommandList; // Level Zero command list handle ze_command_list_handle_t ZeCommandListResetEvents; // Level Zero command list descriptor ZeStruct ZeCommandListDesc; - // List of Level Zero fences created when submitting a graph. - // This list is needed to release all fences retained by the - // command_buffer. - std::vector ZeFencesList; + // Level Zero Copy command list handle + ze_command_list_handle_t ZeCopyCommandList; + // Level Zero Copy command list descriptor + ZeStruct ZeCopyCommandListDesc; + // This flag must be set to false if at least one copy command has been + // added to `ZeCopyCommandList` + bool MCopyCommandListEmpty = true; + // Level Zero fences for each queue the command-buffer has been enqueued to. + // These should be destroyed when the command-buffer is released. + std::unordered_map ZeFencesMap; + // The Level Zero fence from the most recent enqueue of the command-buffer. + // Must be an element in ZeFencesMap, so is not required to be destroyed + // itself. + ze_fence_handle_t ZeActiveFence; // Queue properties from command-buffer descriptor // TODO: Do we need these?
ur_queue_properties_t QueueProperties; diff --git a/source/adapters/level_zero/common.hpp b/source/adapters/level_zero/common.hpp index e16d767b71..55cf1af5ca 100644 --- a/source/adapters/level_zero/common.hpp +++ b/source/adapters/level_zero/common.hpp @@ -99,8 +99,8 @@ static auto getUrResultString = [](ur_result_t Result) { return "UR_RESULT_ERROR_INVALID_IMAGE_SIZE"; case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR"; - case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED: - return "UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED"; + case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT: + return "UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT"; case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE: return "UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE"; case UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE: @@ -141,8 +141,6 @@ static auto getUrResultString = [](ur_result_t Result) { return "UR_RESULT_ERROR_INVALID_ENUMERATION"; case UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION: return "UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION"; - case UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT: - return "UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT"; case UR_RESULT_ERROR_INVALID_NATIVE_BINARY: return "UR_RESULT_ERROR_INVALID_NATIVE_BINARY"; case UR_RESULT_ERROR_INVALID_GLOBAL_NAME: diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index 7f832f30f2..087d459506 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -256,7 +256,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( // > The application must only use the module for the device, or its // > sub-devices, which was provided during creation. 
case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE: - return ReturnValue(uint32_t{0}); + return ReturnValue(ur_bool_t{0}); case UR_DEVICE_INFO_COMPILER_AVAILABLE: return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_LINKER_AVAILABLE: @@ -326,10 +326,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(Device->ZeDeviceImageProperties->maxImageDims1D > 0); case UR_DEVICE_INFO_HOST_UNIFIED_MEMORY: return ReturnValue( - static_cast((Device->ZeDeviceProperties->flags & - ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0)); + static_cast((Device->ZeDeviceProperties->flags & + ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) != 0)); case UR_DEVICE_INFO_AVAILABLE: - return ReturnValue(static_cast(ZeDevice ? true : false)); + return ReturnValue(static_cast(ZeDevice ? true : false)); case UR_DEVICE_INFO_VENDOR: // TODO: Level-Zero does not return vendor's name at the moment // only the ID. @@ -448,8 +448,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_DEVICE_INFO_ENDIAN_LITTLE: return ReturnValue(static_cast(true)); case UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT: - return ReturnValue(static_cast(Device->ZeDeviceProperties->flags & - ZE_DEVICE_PROPERTY_FLAG_ECC)); + return ReturnValue(static_cast( + Device->ZeDeviceProperties->flags & ZE_DEVICE_PROPERTY_FLAG_ECC)); case UR_DEVICE_INFO_PROFILING_TIMER_RESOLUTION: return ReturnValue( static_cast(Device->ZeDeviceProperties->timerResolution)); @@ -626,11 +626,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(static_cast(false)); } case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { - // ze_device_compute_properties.subGroupSizes is in uint32_t whereas the - // expected return is size_t datatype. size_t can be 8 bytes of data. 
- return ReturnValue.template operator()( - Device->ZeDeviceComputeProperties->subGroupSizes, - Device->ZeDeviceComputeProperties->numSubGroupSizes); + return ReturnValue(Device->ZeDeviceComputeProperties->subGroupSizes, + Device->ZeDeviceComputeProperties->numSubGroupSizes); } case UR_DEVICE_INFO_IL_VERSION: { // Set to a space separated list of IL version strings of the form @@ -875,13 +872,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP: { - return ReturnValue(static_cast(true)); + return ReturnValue(static_cast(true)); } case UR_DEVICE_INFO_ESIMD_SUPPORT: { // ESIMD is only supported by Intel GPUs. - uint32_t result = Device->ZeDeviceProperties->type == ZE_DEVICE_TYPE_GPU && - Device->ZeDeviceProperties->vendorId == 0x8086; + ur_bool_t result = Device->ZeDeviceProperties->type == ZE_DEVICE_TYPE_GPU && + Device->ZeDeviceProperties->vendorId == 0x8086; return ReturnValue(result); } @@ -947,18 +944,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: return ReturnValue(true); case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: { - // TODO: Level Zero API allows to check support for all sub-features: - // ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS, - // ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT, - // ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE, - // ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET, - // ZE_MUTABLE_COMMAND_EXP_FLAG_SIGNAL_EVENT, - // ZE_MUTABLE_COMMAND_EXP_FLAG_WAIT_EVENTS - // but UR has only one property to check the mutable command lists feature - // support. For now return true if kernel arguments can be updated. - auto KernelArgUpdateSupport = - Device->ZeDeviceMutableCmdListsProperties->mutableCommandFlags & - ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS; + // Update support requires being able to update kernel arguments and all + // aspects of the kernel NDRange. 
+ const ze_mutable_command_exp_flags_t UpdateMask = + ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS | + ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT | + ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE | + ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET; + + const bool KernelArgUpdateSupport = + (Device->ZeDeviceMutableCmdListsProperties->mutableCommandFlags & + UpdateMask) == UpdateMask; return ReturnValue(KernelArgUpdateSupport && Device->Platform->ZeMutableCmdListExt.Supported); } diff --git a/source/adapters/level_zero/image.cpp b/source/adapters/level_zero/image.cpp index b953e69f06..e2026b9dc7 100644 --- a/source/adapters/level_zero/image.cpp +++ b/source/adapters/level_zero/image.cpp @@ -766,7 +766,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp( UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(!(UR_EXP_IMAGE_COPY_FLAGS_MASK & imageCopyFlags), UR_RESULT_ERROR_INVALID_ENUMERATION); - UR_ASSERT(!(pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type), + UR_ASSERT(!(pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type), UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); ZeStruct ZeImageDesc; diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index 40d6260ac9..3f8e8c6986 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -13,6 +13,93 @@ #include "ur_api.h" #include "ur_level_zero.hpp" +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim, + [[maybe_unused]] const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, size_t *pSuggestedLocalWorkSize) { + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(pSuggestedLocalWorkSize != nullptr, + UR_RESULT_ERROR_INVALID_NULL_POINTER); + + uint32_t LocalWorkSize[3]; + size_t GlobalWorkSize3D[3]{1, 1, 1}; + std::copy(pGlobalWorkSize, 
pGlobalWorkSize + workDim, GlobalWorkSize3D); + + ze_kernel_handle_t ZeKernel{}; + UR_CALL(getZeKernel(hQueue, hKernel, &ZeKernel)); + + UR_CALL(getSuggestedLocalWorkSize(hQueue, ZeKernel, GlobalWorkSize3D, + LocalWorkSize)); + + std::copy(LocalWorkSize, LocalWorkSize + workDim, pSuggestedLocalWorkSize); + return UR_RESULT_SUCCESS; +} + +ur_result_t getZeKernel(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, + ze_kernel_handle_t *phZeKernel) { + auto ZeDevice = hQueue->Device->ZeDevice; + + if (hKernel->ZeKernelMap.empty()) { + *phZeKernel = hKernel->ZeKernel; + } else { + auto It = hKernel->ZeKernelMap.find(ZeDevice); + if (It == hKernel->ZeKernelMap.end()) { + /* kernel and queue don't match */ + return UR_RESULT_ERROR_INVALID_QUEUE; + } + *phZeKernel = It->second; + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t getSuggestedLocalWorkSize(ur_queue_handle_t hQueue, + ze_kernel_handle_t hZeKernel, + size_t GlobalWorkSize3D[3], + uint32_t SuggestedLocalWorkSize3D[3]) { + uint32_t *WG = SuggestedLocalWorkSize3D; + + // We can't call zeKernelSuggestGroupSize if the 64-bit GlobalWorkSize + // values do not fit into the 32 bits that the API currently supports. + bool SuggestGroupSize = true; + for (int I : {0, 1, 2}) { + if (GlobalWorkSize3D[I] > UINT32_MAX) { + SuggestGroupSize = false; + } + } + if (SuggestGroupSize) { + ZE2UR_CALL(zeKernelSuggestGroupSize, + (hZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1], + GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2])); + } else { + for (int I : {0, 1, 2}) { + // Try to find an I-dimension WG size that GlobalWorkSize[I] is + // fully divisible by. Start with the max possible size in + // each dimension.
+ uint32_t GroupSize[] = { + hQueue->Device->ZeDeviceComputeProperties->maxGroupSizeX, + hQueue->Device->ZeDeviceComputeProperties->maxGroupSizeY, + hQueue->Device->ZeDeviceComputeProperties->maxGroupSizeZ}; + GroupSize[I] = (std::min)(size_t(GroupSize[I]), GlobalWorkSize3D[I]); + while (GlobalWorkSize3D[I] % GroupSize[I]) { + --GroupSize[I]; + } + if (GlobalWorkSize3D[I] / GroupSize[I] > UINT32_MAX) { + logger::error("getSuggestedLocalWorkSize: can't find a WG size " + "suitable for global work size > UINT32_MAX"); + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; + } + WG[I] = GroupSize[I]; + } + logger::debug( + "getSuggestedLocalWorkSize: using computed WG size = {{{}, {}, {}}}", + WG[0], WG[1], WG[2]); + } + + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_queue_handle_t Queue, ///< [in] handle of the queue object ur_kernel_handle_t Kernel, ///< [in] handle of the kernel object @@ -43,19 +130,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( *OutEvent ///< [in,out][optional] return an event object that identifies ///< this particular kernel execution instance. ) { - auto ZeDevice = Queue->Device->ZeDevice; - ze_kernel_handle_t ZeKernel{}; - if (Kernel->ZeKernelMap.empty()) { - ZeKernel = Kernel->ZeKernel; - } else { - auto It = Kernel->ZeKernelMap.find(ZeDevice); - if (It == Kernel->ZeKernelMap.end()) { - /* kernel and queue don't match */ - return UR_RESULT_ERROR_INVALID_QUEUE; - } - ZeKernel = It->second; - } + UR_CALL(getZeKernel(Queue, Kernel, &ZeKernel)); + // Lock automatically releases when this goes out of scope. 
std::scoped_lock Lock( Queue->Mutex, Kernel->Mutex, Kernel->Program->Mutex); @@ -92,54 +169,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( std::copy(GlobalWorkSize, GlobalWorkSize + WorkDim, GlobalWorkSize3D); if (LocalWorkSize) { - // L0 - UR_ASSERT(LocalWorkSize[0] < (std::numeric_limits::max)(), - UR_RESULT_ERROR_INVALID_VALUE); - UR_ASSERT(LocalWorkSize[1] < (std::numeric_limits::max)(), - UR_RESULT_ERROR_INVALID_VALUE); - UR_ASSERT(LocalWorkSize[2] < (std::numeric_limits::max)(), - UR_RESULT_ERROR_INVALID_VALUE); - WG[0] = static_cast(LocalWorkSize[0]); - WG[1] = static_cast(LocalWorkSize[1]); - WG[2] = static_cast(LocalWorkSize[2]); - } else { - // We can't call to zeKernelSuggestGroupSize if 64-bit GlobalWorkSize - // values do not fit to 32-bit that the API only supports currently. - bool SuggestGroupSize = true; - for (int I : {0, 1, 2}) { - if (GlobalWorkSize3D[I] > UINT32_MAX) { - SuggestGroupSize = false; - } - } - if (SuggestGroupSize) { - ZE2UR_CALL(zeKernelSuggestGroupSize, - (ZeKernel, GlobalWorkSize3D[0], GlobalWorkSize3D[1], - GlobalWorkSize3D[2], &WG[0], &WG[1], &WG[2])); - } else { - for (int I : {0, 1, 2}) { - // Try to find a I-dimension WG size that the GlobalWorkSize[I] is - // fully divisable with. Start with the max possible size in - // each dimension. 
- uint32_t GroupSize[] = { - Queue->Device->ZeDeviceComputeProperties->maxGroupSizeX, - Queue->Device->ZeDeviceComputeProperties->maxGroupSizeY, - Queue->Device->ZeDeviceComputeProperties->maxGroupSizeZ}; - GroupSize[I] = (std::min)(size_t(GroupSize[I]), GlobalWorkSize3D[I]); - while (GlobalWorkSize3D[I] % GroupSize[I]) { - --GroupSize[I]; - } - - if (GlobalWorkSize3D[I] / GroupSize[I] > UINT32_MAX) { - logger::error("urEnqueueKernelLaunch: can't find a WG size " - "suitable for global work size > UINT32_MAX"); - return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; - } - WG[I] = GroupSize[I]; - } - logger::debug( - "urEnqueueKernelLaunch: using computed WG size = {{{}, {}, {}}}", - WG[0], WG[1], WG[2]); + for (uint32_t I = 0; I < WorkDim; ++I) { + UR_ASSERT(LocalWorkSize[I] < (std::numeric_limits::max)(), + UR_RESULT_ERROR_INVALID_VALUE); + WG[I] = static_cast(LocalWorkSize[I]); } + } else { + UR_CALL(getSuggestedLocalWorkSize(Queue, ZeKernel, GlobalWorkSize3D, WG)); } // TODO: assert if sizes do not fit into 32-bit? 
diff --git a/source/adapters/level_zero/kernel.hpp b/source/adapters/level_zero/kernel.hpp index 1cc146d262..2db3af0514 100644 --- a/source/adapters/level_zero/kernel.hpp +++ b/source/adapters/level_zero/kernel.hpp @@ -107,3 +107,10 @@ struct ur_kernel_handle_t_ : _ur_object { ZeCache> ZeKernelProperties; ZeCache ZeKernelName; }; + +ur_result_t getSuggestedLocalWorkSize(ur_queue_handle_t hQueue, + ze_kernel_handle_t hZeKernel, + size_t GlobalWorkSize3D[3], + uint32_t SuggestedLocalWorkSize3D[3]); +ur_result_t getZeKernel(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, + ze_kernel_handle_t *phZeKernel); diff --git a/source/adapters/level_zero/memory.cpp b/source/adapters/level_zero/memory.cpp index 77cb6abb38..42a5d22d47 100644 --- a/source/adapters/level_zero/memory.cpp +++ b/source/adapters/level_zero/memory.cpp @@ -1483,14 +1483,14 @@ static ur_result_t ur2zeImageDesc(const ur_image_format_t *ImageFormat, ZeImageFormatLayout = ZE_IMAGE_FORMAT_LAYOUT_32_32_32_32; break; default: - logger::error("urMemImageCreate: unexpected data type Size"); - return UR_RESULT_ERROR_INVALID_VALUE; + logger::error("urMemImageCreate: unexpected data type Size\n"); + return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT; } break; } default: logger::error("format layout = {}", ImageFormat->channelOrder); - die("urMemImageCreate: unsupported image format layout\n"); + return UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT; break; } @@ -1519,7 +1519,7 @@ static ur_result_t ur2zeImageDesc(const ur_image_format_t *ImageFormat, break; default: logger::error("urMemImageCreate: unsupported image type"); - return UR_RESULT_ERROR_INVALID_VALUE; + return UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR; } ZeImageDesc.arraylevels = ZeImageDesc.flags = 0; diff --git a/source/adapters/level_zero/platform.cpp b/source/adapters/level_zero/platform.cpp index d508d85c82..ca162bbad2 100644 --- a/source/adapters/level_zero/platform.cpp +++ b/source/adapters/level_zero/platform.cpp @@ -123,6 +123,7 @@ 
UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetNativeHandle( UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( ur_native_handle_t NativePlatform, ///< [in] the native handle of the platform. + ur_adapter_handle_t, const ur_platform_native_properties_t *Properties, ///< [in][optional] pointer to native platform properties ///< struct. diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index f47f960cc0..641c11071b 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -1696,15 +1696,9 @@ ur_result_t ur_queue_handle_t_::synchronize() { // If event is discarded then it can be in reset state or underlying level // zero handle can have device scope, so we can't synchronize the last // event. + auto savedLastCommandEvent = LastCommandEvent; if (isInOrderQueue() && !LastCommandEvent->IsDiscarded) { - if (UrL0QueueSyncNonBlocking) { - auto SyncZeEvent = LastCommandEvent->ZeEvent; - this->Mutex.unlock(); - ZE2UR_CALL(zeHostSynchronize, (SyncZeEvent)); - this->Mutex.lock(); - } else { - ZE2UR_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent)); - } + ZE2UR_CALL(zeHostSynchronize, (LastCommandEvent->ZeEvent)); // clean up all events known to have been completed as well, // so they can be reused later @@ -1744,7 +1738,12 @@ ur_result_t ur_queue_handle_t_::synchronize() { } } } - LastCommandEvent = nullptr; + // If the current version of the LastCommandEvent == savedLastCommandEvent, + // then LastCommandEvent = nullptr; Otherwise, if LastCommandEvent != + // savedLastCommandEvent, then LastCommandEvent is unchanged. 
+ if (LastCommandEvent == savedLastCommandEvent) { + LastCommandEvent = nullptr; + } } // Since all timestamp recordings should have finished with the diff --git a/source/adapters/level_zero/ur_interface_loader.cpp b/source/adapters/level_zero/ur_interface_loader.cpp index d6d6060ea6..45568a7885 100644 --- a/source/adapters/level_zero/ur_interface_loader.cpp +++ b/source/adapters/level_zero/ur_interface_loader.cpp @@ -151,6 +151,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetArgSampler = urKernelSetArgSampler; pDdiTable->pfnSetArgMemObj = urKernelSetArgMemObj; pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; + pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; return retVal; } diff --git a/source/adapters/native_cpu/device.cpp b/source/adapters/native_cpu/device.cpp index aabe0a3f13..067d149289 100644 --- a/source/adapters/native_cpu/device.cpp +++ b/source/adapters/native_cpu/device.cpp @@ -294,10 +294,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return UR_RESULT_ERROR_INVALID_VALUE; case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: { ur_memory_order_capability_flags_t Capabilities = - UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED | - UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE | - UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE | - UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQ_REL; + UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED; return ReturnValue(Capabilities); } case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES: { diff --git a/source/adapters/native_cpu/kernel.cpp b/source/adapters/native_cpu/kernel.cpp index 7ef17b0c28..29b54503eb 100644 --- a/source/adapters/native_cpu/kernel.cpp +++ b/source/adapters/native_cpu/kernel.cpp @@ -297,3 +297,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle( DIE_NO_IMPLEMENTATION } + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + [[maybe_unused]] ur_kernel_handle_t hKernel, + 
[[maybe_unused]] ur_queue_handle_t hQueue, + [[maybe_unused]] uint32_t workDim, + [[maybe_unused]] const size_t *pGlobalWorkOffset, + [[maybe_unused]] const size_t *pGlobalWorkSize, + [[maybe_unused]] size_t *pSuggestedLocalWorkSize) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} diff --git a/source/adapters/native_cpu/platform.cpp b/source/adapters/native_cpu/platform.cpp index 31f35d03b2..8d650764c1 100644 --- a/source/adapters/native_cpu/platform.cpp +++ b/source/adapters/native_cpu/platform.cpp @@ -96,13 +96,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetBackendOption( } UR_APIEXPORT ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( - ur_native_handle_t hNativePlatform, - const ur_platform_native_properties_t *pProperties, - ur_platform_handle_t *phPlatform) { - std::ignore = hNativePlatform; - std::ignore = pProperties; - std::ignore = phPlatform; - + ur_native_handle_t, ur_adapter_handle_t, + const ur_platform_native_properties_t *, ur_platform_handle_t *) { DIE_NO_IMPLEMENTATION; } diff --git a/source/adapters/native_cpu/program.cpp b/source/adapters/native_cpu/program.cpp index 678bdb8074..ee21a707b7 100644 --- a/source/adapters/native_cpu/program.cpp +++ b/source/adapters/native_cpu/program.cpp @@ -14,6 +14,7 @@ #include "common/ur_util.hpp" #include "program.hpp" #include +#include UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL, @@ -63,11 +64,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( UR_ASSERT(phProgram, UR_RESULT_ERROR_INVALID_NULL_POINTER); UR_ASSERT(pBinary != nullptr, UR_RESULT_ERROR_INVALID_NULL_POINTER); - auto hProgram = new ur_program_handle_t_( + auto hProgram = std::make_unique( hContext, reinterpret_cast(pBinary)); if (pProperties != nullptr) { for (uint32_t i = 0; i < pProperties->count; i++) { - auto mdNode = pProperties->pMetadatas[i]; + const auto &mdNode = pProperties->pMetadatas[i]; std::string mdName(mdNode.pName); auto [Prefix, Tag] = 
splitMetadataName(mdName); if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) { @@ -89,7 +90,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( nativecpu_it++; } - *phProgram = hProgram; + *phProgram = hProgram.release(); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/native_cpu/program.hpp b/source/adapters/native_cpu/program.hpp index 0673b4122c..e85749a7b2 100644 --- a/source/adapters/native_cpu/program.hpp +++ b/source/adapters/native_cpu/program.hpp @@ -13,6 +13,8 @@ #include #include "context.hpp" + +#include #include namespace native_cpu { diff --git a/source/adapters/native_cpu/ur_interface_loader.cpp b/source/adapters/native_cpu/ur_interface_loader.cpp index 065012613e..053fc32d9f 100644 --- a/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/source/adapters/native_cpu/ur_interface_loader.cpp @@ -123,6 +123,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetArgValue = urKernelSetArgValue; pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = urKernelSetSpecializationConstants; + pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/null/ur_nullddi.cpp b/source/adapters/null/ur_nullddi.cpp index 2278d5907e..19a9c9003b 100644 --- a/source/adapters/null/ur_nullddi.cpp +++ b/source/adapters/null/ur_nullddi.cpp @@ -263,6 +263,8 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetNativeHandle( __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( ur_native_handle_t hNativePlatform, ///< [in][nocheck] the native handle of the platform. + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter associated with the native backend. const ur_platform_native_properties_t * pProperties, ///< [in][optional] pointer to native platform properties struct. 
ur_platform_handle_t * @@ -274,8 +276,8 @@ __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( auto pfnCreateWithNativeHandle = d_context.urDdiTable.Platform.pfnCreateWithNativeHandle; if (nullptr != pfnCreateWithNativeHandle) { - result = - pfnCreateWithNativeHandle(hNativePlatform, pProperties, phPlatform); + result = pfnCreateWithNativeHandle(hNativePlatform, hAdapter, + pProperties, phPlatform); } else { // generic implementation *phPlatform = reinterpret_cast(d_context.get()); @@ -2616,6 +2618,43 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreateWithNativeHandle( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelGetSuggestedLocalWorkSize +__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + uint32_t + numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global + ///< and work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the offset used to calculate the global ID of a work-item + const size_t * + pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the number of global work-items in workDim that will execute the + ///< kernel function + size_t * + pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that specify + ///< suggested local work size that will contain the result of the query + ) try { + ur_result_t result = UR_RESULT_SUCCESS; + + // if the driver has created a custom function, then call it instead of using the generic path + auto pfnGetSuggestedLocalWorkSize = + d_context.urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize; + if (nullptr != pfnGetSuggestedLocalWorkSize) { + result = 
pfnGetSuggestedLocalWorkSize( + hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize, + pSuggestedLocalWorkSize); + } else { + // generic implementation + } + + return result; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urQueueGetInfo __urdlllocal ur_result_t UR_APICALL urQueueGetInfo( @@ -6248,6 +6287,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = driver::urKernelCreateWithNativeHandle; + pDdiTable->pfnGetSuggestedLocalWorkSize = + driver::urKernelGetSuggestedLocalWorkSize; + pDdiTable->pfnSetArgValue = driver::urKernelSetArgValue; pDdiTable->pfnSetArgLocal = driver::urKernelSetArgLocal; diff --git a/source/adapters/opencl/common.cpp b/source/adapters/opencl/common.cpp index 750616235d..d557d3a202 100644 --- a/source/adapters/opencl/common.cpp +++ b/source/adapters/opencl/common.cpp @@ -87,6 +87,8 @@ ur_result_t mapCLErrorToUR(cl_int Result) { return UR_RESULT_ERROR_DEVICE_NOT_AVAILABLE; case CL_INVALID_KERNEL_ARGS: return UR_RESULT_ERROR_INVALID_KERNEL_ARGS; + case CL_INVALID_COMMAND_QUEUE: + return UR_RESULT_ERROR_INVALID_QUEUE; default: return UR_RESULT_ERROR_UNKNOWN; } diff --git a/source/adapters/opencl/device.cpp b/source/adapters/opencl/device.cpp index ac79b71876..0c03c727f3 100644 --- a/source/adapters/opencl/device.cpp +++ b/source/adapters/opencl/device.cpp @@ -910,7 +910,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, case UR_EXT_DEVICE_INFO_OPENCL_C_VERSION: case UR_DEVICE_INFO_BUILT_IN_KERNELS: case UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES: - case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: case UR_DEVICE_INFO_IP_VERSION: { /* We can just use the OpenCL outputs because the sizes of OpenCL types * are the same as UR. 
@@ -929,6 +928,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, return UR_RESULT_SUCCESS; } + case UR_DEVICE_INFO_SUB_GROUP_SIZES_INTEL: { + // Have to convert size_t to uint32_t + size_t SubGroupSizesSize = 0; + CL_RETURN_ON_FAILURE( + clGetDeviceInfo(cl_adapter::cast(hDevice), CLPropName, 0, + nullptr, &SubGroupSizesSize)); + std::vector SubGroupSizes(SubGroupSizesSize / sizeof(size_t)); + CL_RETURN_ON_FAILURE( + clGetDeviceInfo(cl_adapter::cast(hDevice), CLPropName, + SubGroupSizesSize, SubGroupSizes.data(), nullptr)); + return ReturnValue.template operator()(SubGroupSizes.data(), + SubGroupSizes.size()); + } case UR_DEVICE_INFO_EXTENSIONS: { cl_device_id Dev = cl_adapter::cast(hDevice); size_t ExtSize = 0; diff --git a/source/adapters/opencl/event.cpp b/source/adapters/opencl/event.cpp index 5141ce8ff0..45550a68e8 100644 --- a/source/adapters/opencl/event.cpp +++ b/source/adapters/opencl/event.cpp @@ -46,7 +46,8 @@ convertURProfilingInfoToCL(const ur_profiling_info_t PropName) { return CL_PROFILING_COMMAND_SUBMIT; case UR_PROFILING_INFO_COMMAND_START: return CL_PROFILING_COMMAND_START; - // TODO(ur) add UR_PROFILING_INFO_COMMAND_COMPLETE once spec has been updated + case UR_PROFILING_INFO_COMMAND_COMPLETE: + return CL_PROFILING_COMMAND_COMPLETE; case UR_PROFILING_INFO_COMMAND_END: return CL_PROFILING_COMMAND_END; default: diff --git a/source/adapters/opencl/kernel.cpp b/source/adapters/opencl/kernel.cpp index 4fcbdeefa5..3accd84778 100644 --- a/source/adapters/opencl/kernel.cpp +++ b/source/adapters/opencl/kernel.cpp @@ -419,3 +419,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgSampler( CL_RETURN_ON_FAILURE(RetErr); return UR_RESULT_SUCCESS; } + +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ur_queue_handle_t hQueue, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + size_t *pSuggestedLocalWorkSize) { + cl_device_id Device; + 
cl_platform_id Platform; + + CL_RETURN_ON_FAILURE(clGetCommandQueueInfo( + cl_adapter::cast(hQueue), CL_QUEUE_DEVICE, + sizeof(cl_device_id), &Device, nullptr)); + + CL_RETURN_ON_FAILURE(clGetDeviceInfo( + Device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &Platform, nullptr)); + + auto GetKernelSuggestedLocalWorkSizeFuncPtr = + (clGetKernelSuggestedLocalWorkSizeKHR_fn) + clGetExtensionFunctionAddressForPlatform( + Platform, "clGetKernelSuggestedLocalWorkSizeKHR"); + if (!GetKernelSuggestedLocalWorkSizeFuncPtr) + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + + CL_RETURN_ON_FAILURE(GetKernelSuggestedLocalWorkSizeFuncPtr( + cl_adapter::cast(hQueue), + cl_adapter::cast(hKernel), workDim, pGlobalWorkOffset, + pGlobalWorkSize, pSuggestedLocalWorkSize)); + return UR_RESULT_SUCCESS; +} diff --git a/source/adapters/opencl/memory.cpp b/source/adapters/opencl/memory.cpp index 2397e2b5f9..b2476fc420 100644 --- a/source/adapters/opencl/memory.cpp +++ b/source/adapters/opencl/memory.cpp @@ -123,9 +123,6 @@ cl_image_desc mapURImageDescToCL(const ur_image_desc_t *PImageDesc) { cl_adapter::cast(PImageDesc->type); switch (PImageDesc->type) { - case UR_MEM_TYPE_BUFFER: - CLImageDesc.image_type = CL_MEM_OBJECT_BUFFER; - break; case UR_MEM_TYPE_IMAGE2D: CLImageDesc.image_type = CL_MEM_OBJECT_IMAGE2D; break; @@ -141,9 +138,6 @@ cl_image_desc mapURImageDescToCL(const ur_image_desc_t *PImageDesc) { case UR_MEM_TYPE_IMAGE1D_ARRAY: CLImageDesc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY; break; - case UR_MEM_TYPE_IMAGE1D_BUFFER: - CLImageDesc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; - break; default: CLImageDesc.image_type = -1; break; diff --git a/source/adapters/opencl/platform.cpp b/source/adapters/opencl/platform.cpp index 7188a3e8f0..9fa5025196 100644 --- a/source/adapters/opencl/platform.cpp +++ b/source/adapters/opencl/platform.cpp @@ -106,8 +106,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGetNativeHandle( } UR_APIEXPORT ur_result_t UR_APICALL 
urPlatformCreateWithNativeHandle( - ur_native_handle_t hNativePlatform, const ur_platform_native_properties_t *, - ur_platform_handle_t *phPlatform) { + ur_native_handle_t hNativePlatform, ur_adapter_handle_t, + const ur_platform_native_properties_t *, ur_platform_handle_t *phPlatform) { *phPlatform = reinterpret_cast(hNativePlatform); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/opencl/ur_interface_loader.cpp b/source/adapters/opencl/ur_interface_loader.cpp index effb2128c3..687b541911 100644 --- a/source/adapters/opencl/ur_interface_loader.cpp +++ b/source/adapters/opencl/ur_interface_loader.cpp @@ -125,6 +125,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnSetArgValue = urKernelSetArgValue; pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; pDdiTable->pfnSetSpecializationConstants = nullptr; + pDdiTable->pfnGetSuggestedLocalWorkSize = urKernelGetSuggestedLocalWorkSize; return UR_RESULT_SUCCESS; } diff --git a/source/adapters/opencl/usm.cpp b/source/adapters/opencl/usm.cpp index 8ab868e679..4df99f3a9c 100644 --- a/source/adapters/opencl/usm.cpp +++ b/source/adapters/opencl/usm.cpp @@ -261,9 +261,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( return UR_RESULT_SUCCESS; } - // OpenCL only supports pattern sizes as large as the largest CL type - // (double16/long16 - 128 bytes), anything larger we need to do on the host - // side and copy it into the target allocation. + // OpenCL only supports pattern sizes which are powers of 2 and are as large + // as the largest CL type (double16/long16 - 128 bytes), anything larger or + // not a power of 2, we need to do on the host side and copy it into the + // target allocation. 
clHostMemAllocINTEL_fn HostMemAlloc = nullptr; UR_RETURN_ON_FAILURE(cl_ext::getExtFuncFromContext( CLContext, cl_ext::ExtFuncPtrCache->clHostMemAllocINTELCache, @@ -280,14 +281,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( cl_ext::MemBlockingFreeName, &USMFree)); cl_int ClErr = CL_SUCCESS; - auto HostBuffer = static_cast( - HostMemAlloc(CLContext, nullptr, size, 0, &ClErr)); + auto HostBuffer = + static_cast(HostMemAlloc(CLContext, nullptr, size, 0, &ClErr)); CL_RETURN_ON_FAILURE(ClErr); - auto NumValues = size / sizeof(uint64_t); - auto NumChunks = patternSize / sizeof(uint64_t); - for (size_t i = 0; i < NumValues; i++) { - HostBuffer[i] = static_cast(pPattern)[i % NumChunks]; + auto *End = HostBuffer + size; + for (auto *Iter = HostBuffer; Iter < End; Iter += patternSize) { + std::memcpy(Iter, pPattern, patternSize); } cl_event CopyEvent = nullptr; diff --git a/source/loader/CMakeLists.txt b/source/loader/CMakeLists.txt index c884607500..075d9909b0 100644 --- a/source/loader/CMakeLists.txt +++ b/source/loader/CMakeLists.txt @@ -15,6 +15,24 @@ add_ur_library(ur_loader ${CMAKE_CURRENT_BINARY_DIR}/UrLoaderVersion.rc ) +if (MSVC) + set(TARGET_LIBNAME ur_loader) + string(TOUPPER ${TARGET_LIBNAME} TARGET_LIBNAME) + + set(LOADER_VERSION_SCRIPT ${CMAKE_CURRENT_BINARY_DIR}/ur_loader.def) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/loader.def.in ${LOADER_VERSION_SCRIPT} @ONLY) + set_target_properties(ur_loader PROPERTIES + LINK_FLAGS "/DEF:${LOADER_VERSION_SCRIPT}" + ) +elseif(CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(TARGET_LIBNAME libur_loader_${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}) + string(TOUPPER ${TARGET_LIBNAME} TARGET_LIBNAME) + + set(LOADER_VERSION_SCRIPT ${CMAKE_CURRENT_BINARY_DIR}/ur_loader.map) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/loader.map.in ${LOADER_VERSION_SCRIPT} @ONLY) + target_link_options(ur_loader PRIVATE "-Wl,--version-script=${LOADER_VERSION_SCRIPT}") +endif() + set_target_properties(ur_loader PROPERTIES 
LIBRARY_OUTPUT_NAME ur_loader RUNTIME_OUTPUT_NAME ur_loader @@ -108,6 +126,8 @@ if(UR_ENABLE_SANITIZER) ${CMAKE_CURRENT_SOURCE_DIR}/../ur/ur.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_allocator.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_allocator.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_buffer.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_buffer.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_interceptor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_interceptor.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan_libdevice.hpp diff --git a/source/loader/layers/sanitizer/asan_buffer.cpp b/source/loader/layers/sanitizer/asan_buffer.cpp new file mode 100644 index 0000000000..bb50b53c06 --- /dev/null +++ b/source/loader/layers/sanitizer/asan_buffer.cpp @@ -0,0 +1,137 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file asan_buffer.cpp + * + */ + +#include "asan_buffer.hpp" +#include "asan_interceptor.hpp" +#include "ur_sanitizer_layer.hpp" +#include "ur_sanitizer_utils.hpp" + +namespace ur_sanitizer_layer { + +ur_result_t EnqueueMemCopyRectHelper( + ur_queue_handle_t Queue, char *pSrc, char *pDst, ur_rect_offset_t SrcOffset, + ur_rect_offset_t DstOffset, ur_rect_region_t Region, size_t SrcRowPitch, + size_t SrcSlicePitch, size_t DstRowPitch, size_t DstSlicePitch, + bool Blocking, uint32_t NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, ur_event_handle_t *Event) { + // If user doesn't determine src/dst row pitch and slice pitch, just use + // region for it. 
+ if (SrcRowPitch == 0) { + SrcRowPitch = Region.width; + } + + if (SrcSlicePitch == 0) { + SrcSlicePitch = SrcRowPitch * Region.height; + } + + if (DstRowPitch == 0) { + DstRowPitch = Region.width; + } + + if (DstSlicePitch == 0) { + DstSlicePitch = DstRowPitch * Region.height; + } + + // Calculate the src and dst addresses that actually will be copied. + char *SrcOrigin = pSrc + SrcOffset.x + SrcRowPitch * SrcOffset.y + + SrcSlicePitch * SrcOffset.z; + char *DstOrigin = pDst + DstOffset.x + DstRowPitch * DstOffset.y + + DstSlicePitch * DstOffset.z; + + std::vector Events; + Events.reserve(Region.depth); + // For now, USM doesn't support 3D memory copy operation, so we can only + // loop call 2D memory copy function to implement it. + for (size_t i = 0; i < Region.depth; i++) { + ur_event_handle_t NewEvent{}; + UR_CALL(context.urDdiTable.Enqueue.pfnUSMMemcpy2D( + Queue, Blocking, DstOrigin + (i * DstSlicePitch), DstRowPitch, + SrcOrigin + (i * SrcSlicePitch), SrcRowPitch, Region.width, + Region.height, NumEventsInWaitList, EventWaitList, &NewEvent)); + + Events.push_back(NewEvent); + } + + UR_CALL(context.urDdiTable.Enqueue.pfnEventsWait(Queue, Events.size(), + Events.data(), Event)); + + return UR_RESULT_SUCCESS; +} + +ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) { + // Sub-buffers don't maintain own allocations but rely on parent buffer. 
+ if (SubBuffer) { + UR_CALL(SubBuffer->Parent->getHandle(Device, Handle)); + Handle += SubBuffer->Origin; + return UR_RESULT_SUCCESS; + } + + auto &Allocation = Allocations[Device]; + if (!Allocation) { + ur_usm_desc_t USMDesc{}; + USMDesc.align = getAlignment(); + ur_usm_pool_handle_t Pool{}; + ur_result_t URes = context.interceptor->allocateMemory( + Context, Device, &USMDesc, Pool, Size, AllocType::MEM_BUFFER, + ur_cast(&Allocation)); + if (URes != UR_RESULT_SUCCESS) { + context.logger.error( + "Failed to allocate {} bytes memory for buffer {}", Size, this); + return URes; + } + + if (HostPtr) { + ManagedQueue Queue(Context, Device); + URes = context.urDdiTable.Enqueue.pfnUSMMemcpy( + Queue, true, Allocation, HostPtr, Size, 0, nullptr, nullptr); + if (URes != UR_RESULT_SUCCESS) { + context.logger.error("Failed to copy {} bytes data from host " + "pointer {} to buffer {}", + Size, HostPtr, this); + return URes; + } + } + } + + Handle = Allocation; + + return UR_RESULT_SUCCESS; +} + +ur_result_t MemBuffer::free() { + for (const auto &[_, Ptr] : Allocations) { + ur_result_t URes = context.interceptor->releaseMemory(Context, Ptr); + if (URes != UR_RESULT_SUCCESS) { + context.logger.error("Failed to free buffer handle {}", Ptr); + return URes; + } + } + Allocations.clear(); + return UR_RESULT_SUCCESS; +} + +size_t MemBuffer::getAlignment() { + // Choose an alignment that is at most 128 and is the next power of 2 + // for sizes less than 128. + // TODO: If we don't set the alignment size explicitly, the device will + // usually choose a very large size (more than 1k). Then sanitizer will + // allocate extra unnessary memory. Not sure if this will impact + // performance. 
+ size_t MsbIdx = 63 - __builtin_clz(Size); + size_t Alignment = (1 << (MsbIdx + 1)); + if (Alignment > 128) { + Alignment = 128; + } + return Alignment; +} + +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_buffer.hpp b/source/loader/layers/sanitizer/asan_buffer.hpp new file mode 100644 index 0000000000..b4eba4e4ba --- /dev/null +++ b/source/loader/layers/sanitizer/asan_buffer.hpp @@ -0,0 +1,74 @@ +/* + * + * Copyright (C) 2024 Intel Corporation + * + * Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. + * See LICENSE.TXT + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + * @file asan_buffer.hpp + * + */ + +#pragma once + +#include +#include +#include + +#include "common.hpp" + +namespace ur_sanitizer_layer { + +struct MemBuffer { + // Buffer constructor + MemBuffer(ur_context_handle_t Context, size_t Size, char *HostPtr) + : Context(Context), Size(Size), HostPtr(HostPtr) {} + + // Sub-buffer constructor + MemBuffer(std::shared_ptr Parent, size_t Origin, size_t Size) + : Context(Parent->Context), Size(Size), SubBuffer{{Parent, Origin}} {} + + ur_result_t getHandle(ur_device_handle_t Device, char *&Handle); + + ur_result_t free(); + + size_t getAlignment(); + + std::unordered_map Allocations; + + enum AccessMode { UNKNOWN, READ_WRITE, READ_ONLY, WRITE_ONLY }; + + struct Mapping { + size_t Offset; + size_t Size; + }; + + std::unordered_map Mappings; + + ur_context_handle_t Context; + + size_t Size; + + char *HostPtr{}; + + struct SubBuffer_t { + std::shared_ptr Parent; + size_t Origin; + }; + + std::optional SubBuffer; + + std::atomic RefCount = 1; + + ur_shared_mutex Mutex; +}; + +ur_result_t EnqueueMemCopyRectHelper( + ur_queue_handle_t Queue, char *pSrc, char *pDst, ur_rect_offset_t SrcOffset, + ur_rect_offset_t DstOffset, ur_rect_region_t Region, size_t SrcRowPitch, + size_t SrcSlicePitch, size_t DstRowPitch, size_t DstSlicePitch, + bool Blocking, uint32_t 
NumEventsInWaitList, + const ur_event_handle_t *EventWaitList, ur_event_handle_t *Event); + +} // namespace ur_sanitizer_layer diff --git a/source/loader/layers/sanitizer/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan_interceptor.cpp index 37630e5c29..535247e863 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan_interceptor.cpp @@ -259,6 +259,9 @@ ur_result_t SanitizerInterceptor::allocateMemory( } else if (Type == AllocType::SHARED_USM) { UR_CALL(context.urDdiTable.USM.pfnSharedAlloc( Context, Device, Properties, Pool, NeededSize, &Allocated)); + } else if (Type == AllocType::MEM_BUFFER) { + UR_CALL(context.urDdiTable.USM.pfnDeviceAlloc( + Context, Device, Properties, Pool, NeededSize, &Allocated)); } else { context.logger.error("Unsupport memory type"); return UR_RESULT_ERROR_INVALID_ARGUMENT; @@ -662,6 +665,32 @@ ur_result_t SanitizerInterceptor::eraseKernel(ur_kernel_handle_t Kernel) { return UR_RESULT_SUCCESS; } +ur_result_t +SanitizerInterceptor::insertMemBuffer(std::shared_ptr MemBuffer) { + std::scoped_lock Guard(m_MemBufferMapMutex); + assert(m_MemBufferMap.find(ur_cast(MemBuffer.get())) == + m_MemBufferMap.end()); + m_MemBufferMap.emplace(reinterpret_cast(MemBuffer.get()), + MemBuffer); + return UR_RESULT_SUCCESS; +} + +ur_result_t SanitizerInterceptor::eraseMemBuffer(ur_mem_handle_t MemHandle) { + std::scoped_lock Guard(m_MemBufferMapMutex); + assert(m_MemBufferMap.find(MemHandle) != m_MemBufferMap.end()); + m_MemBufferMap.erase(MemHandle); + return UR_RESULT_SUCCESS; +} + +std::shared_ptr +SanitizerInterceptor::getMemBuffer(ur_mem_handle_t MemHandle) { + std::shared_lock Guard(m_MemBufferMapMutex); + if (m_MemBufferMap.find(MemHandle) != m_MemBufferMap.end()) { + return m_MemBufferMap[MemHandle]; + } + return nullptr; +} + ur_result_t SanitizerInterceptor::prepareLaunch( ur_context_handle_t Context, std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, @@ 
-669,6 +698,21 @@ ur_result_t SanitizerInterceptor::prepareLaunch( auto Program = GetProgram(Kernel); do { + // Set membuffer arguments + auto KernelInfo = getKernelInfo(Kernel); + for (const auto &[ArgIndex, MemBuffer] : KernelInfo->BufferArgs) { + char *ArgPointer = nullptr; + UR_CALL(MemBuffer->getHandle(DeviceInfo->Handle, ArgPointer)); + ur_result_t URes = context.urDdiTable.Kernel.pfnSetArgPointer( + Kernel, ArgIndex, nullptr, &ArgPointer); + if (URes != UR_RESULT_SUCCESS) { + context.logger.error( + "Failed to set buffer {} as the {} arg to kernel {}: {}", + ur_cast(MemBuffer.get()), ArgIndex, Kernel, + URes); + } + } + // Set launch info argument auto ArgNums = GetKernelNumArgs(Kernel); if (ArgNums) { diff --git a/source/loader/layers/sanitizer/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan_interceptor.hpp index f02ca3b370..3d6ba9bdf2 100644 --- a/source/loader/layers/sanitizer/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan_interceptor.hpp @@ -13,6 +13,7 @@ #pragma once #include "asan_allocator.hpp" +#include "asan_buffer.hpp" #include "asan_libdevice.hpp" #include "common.hpp" #include "ur_sanitizer_layer.hpp" @@ -81,8 +82,10 @@ struct QueueInfo { struct KernelInfo { ur_kernel_handle_t Handle; - ur_shared_mutex Mutex; + std::atomic RefCount = 1; + std::unordered_map> BufferArgs; + // Need preserve the order of local arguments std::map LocalArgs; @@ -128,7 +131,7 @@ struct ContextInfo { }; struct USMLaunchInfo { - LaunchInfo *Data; + LaunchInfo *Data = nullptr; ur_context_handle_t Context = nullptr; ur_device_handle_t Device = nullptr; @@ -194,6 +197,10 @@ class SanitizerInterceptor { ur_result_t insertKernel(ur_kernel_handle_t Kernel); ur_result_t eraseKernel(ur_kernel_handle_t Kernel); + ur_result_t insertMemBuffer(std::shared_ptr MemBuffer); + ur_result_t eraseMemBuffer(ur_mem_handle_t MemHandle); + std::shared_ptr getMemBuffer(ur_mem_handle_t MemHandle); + std::optional findAllocInfoByAddress(uptr Address); std::shared_ptr 
getContextInfo(ur_context_handle_t Context) { @@ -245,6 +252,10 @@ class SanitizerInterceptor { m_KernelMap; ur_shared_mutex m_KernelMapMutex; + std::unordered_map> + m_MemBufferMap; + ur_shared_mutex m_MemBufferMapMutex; + /// Assumption: all USM chunks are allocated in one VA AllocationMap m_AllocationMap; ur_shared_mutex m_AllocationMapMutex; diff --git a/source/loader/layers/sanitizer/ur_sanddi.cpp b/source/loader/layers/sanitizer/ur_sanddi.cpp index 58f54c9338..e352ad69b8 100644 --- a/source/loader/layers/sanitizer/ur_sanddi.cpp +++ b/source/loader/layers/sanitizer/ur_sanddi.cpp @@ -376,6 +376,730 @@ __urdlllocal ur_result_t UR_APICALL urContextRelease( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urMemBufferCreate +__urdlllocal ur_result_t UR_APICALL urMemBufferCreate( + ur_context_handle_t hContext, ///< [in] handle of the context object + ur_mem_flags_t flags, ///< [in] allocation and usage information flags + size_t size, ///< [in] size in bytes of the memory object to be allocated + const ur_buffer_properties_t + *pProperties, ///< [in][optional] pointer to buffer creation properties + ur_mem_handle_t + *phBuffer ///< [out] pointer to handle of the memory buffer created +) { + auto pfnBufferCreate = context.urDdiTable.Mem.pfnBufferCreate; + + if (nullptr == pfnBufferCreate) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + if (nullptr == phBuffer) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + context.logger.debug("==== urMemBufferCreate"); + + void *Host = nullptr; + if (pProperties) { + Host = pProperties->pHost; + } + + char *hostPtrOrNull = (flags & UR_MEM_FLAG_USE_HOST_POINTER) + ? 
ur_cast(Host) + : nullptr; + + std::shared_ptr pMemBuffer = + std::make_shared(hContext, size, hostPtrOrNull); + ur_result_t result = context.interceptor->insertMemBuffer(pMemBuffer); + *phBuffer = ur_cast(pMemBuffer.get()); + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urMemGetInfo +__urdlllocal ur_result_t UR_APICALL urMemGetInfo( + ur_mem_handle_t + hMemory, ///< [in] handle to the memory object being queried. + ur_mem_info_t propName, ///< [in] type of the info to retrieve. + size_t + propSize, ///< [in] the number of bytes of memory pointed to by pPropValue. + void * + pPropValue, ///< [out][optional][typename(propName, propSize)] array of bytes holding + ///< the info. + ///< If propSize is less than the real number of bytes needed to return + ///< the info then the ::UR_RESULT_ERROR_INVALID_SIZE error is returned and + ///< pPropValue is not used. + size_t * + pPropSizeRet ///< [out][optional] pointer to the actual size in bytes of the queried propName. 
+) { + auto pfnGetInfo = context.urDdiTable.Mem.pfnGetInfo; + + if (nullptr == pfnGetInfo) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urMemGetInfo"); + + if (auto MemBuffer = context.interceptor->getMemBuffer(hMemory)) { + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + switch (propName) { + case UR_MEM_INFO_CONTEXT: { + return ReturnValue(MemBuffer->Context); + } + case UR_MEM_INFO_SIZE: { + return ReturnValue(size_t{MemBuffer->Size}); + } + default: { + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + } + } else { + UR_CALL( + pfnGetInfo(hMemory, propName, propSize, pPropValue, pPropSizeRet)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urMemRetain +__urdlllocal ur_result_t UR_APICALL urMemRetain( + ur_mem_handle_t hMem ///< [in] handle of the memory object to get access +) { + auto pfnRetain = context.urDdiTable.Mem.pfnRetain; + + if (nullptr == pfnRetain) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urMemRetain"); + + if (auto MemBuffer = context.interceptor->getMemBuffer(hMem)) { + MemBuffer->RefCount++; + } else { + UR_CALL(pfnRetain(hMem)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urMemRelease +__urdlllocal ur_result_t UR_APICALL urMemRelease( + ur_mem_handle_t hMem ///< [in] handle of the memory object to release +) { + auto pfnRelease = context.urDdiTable.Mem.pfnRelease; + + if (nullptr == pfnRelease) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urMemRelease"); + + if (auto MemBuffer = context.interceptor->getMemBuffer(hMem)) { + if (--MemBuffer->RefCount != 0) { + return UR_RESULT_SUCCESS; + } + UR_CALL(MemBuffer->free()); + UR_CALL(context.interceptor->eraseMemBuffer(hMem)); + } else { + 
UR_CALL(pfnRelease(hMem)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urMemBufferPartition +__urdlllocal ur_result_t UR_APICALL urMemBufferPartition( + ur_mem_handle_t + hBuffer, ///< [in] handle of the buffer object to allocate from + ur_mem_flags_t flags, ///< [in] allocation and usage information flags + ur_buffer_create_type_t bufferCreateType, ///< [in] buffer creation type + const ur_buffer_region_t + *pRegion, ///< [in] pointer to buffer create region information + ur_mem_handle_t + *phMem ///< [out] pointer to the handle of sub buffer created +) { + auto pfnBufferPartition = context.urDdiTable.Mem.pfnBufferPartition; + + if (nullptr == pfnBufferPartition) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urMemBufferPartition"); + + if (auto ParentBuffer = context.interceptor->getMemBuffer(hBuffer)) { + if (ParentBuffer->Size < (pRegion->origin + pRegion->size)) { + return UR_RESULT_ERROR_INVALID_BUFFER_SIZE; + } + std::shared_ptr SubBuffer = std::make_shared( + ParentBuffer, pRegion->origin, pRegion->size); + UR_CALL(context.interceptor->insertMemBuffer(SubBuffer)); + *phMem = reinterpret_cast(SubBuffer.get()); + } else { + UR_CALL(pfnBufferPartition(hBuffer, flags, bufferCreateType, pRegion, + phMem)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urMemGetNativeHandle +__urdlllocal ur_result_t UR_APICALL urMemGetNativeHandle( + ur_mem_handle_t hMem, ///< [in] handle of the mem. + ur_device_handle_t hDevice, + ur_native_handle_t + *phNativeMem ///< [out] a pointer to the native handle of the mem. 
+) { + auto pfnGetNativeHandle = context.urDdiTable.Mem.pfnGetNativeHandle; + + if (nullptr == pfnGetNativeHandle) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urMemGetNativeHandle"); + + if (auto MemBuffer = context.interceptor->getMemBuffer(hMem)) { + char *Handle = nullptr; + UR_CALL(MemBuffer->getHandle(hDevice, Handle)); + *phNativeMem = ur_cast(Handle); + } else { + UR_CALL(pfnGetNativeHandle(hMem, hDevice, phNativeMem)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueMemBufferRead +__urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferRead( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_mem_handle_t + hBuffer, ///< [in][bounds(offset, size)] handle of the buffer object + bool blockingRead, ///< [in] indicates blocking (true), non-blocking (false) + size_t offset, ///< [in] offset in bytes in the buffer object + size_t size, ///< [in] size in bytes of data being read + void *pDst, ///< [in] pointer to host memory where data is to be read into + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before this command can be executed. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that this + ///< command does not wait on any event to complete. + ur_event_handle_t * + phEvent ///< [out][optional] return an event object that identifies this particular + ///< command instance. 
+) { + auto pfnMemBufferRead = context.urDdiTable.Enqueue.pfnMemBufferRead; + + if (nullptr == pfnMemBufferRead) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urEnqueueMemBufferRead"); + + if (auto MemBuffer = context.interceptor->getMemBuffer(hBuffer)) { + ur_device_handle_t Device = GetDevice(hQueue); + char *pSrc = nullptr; + UR_CALL(MemBuffer->getHandle(Device, pSrc)); + UR_CALL(context.urDdiTable.Enqueue.pfnUSMMemcpy( + hQueue, blockingRead, pDst, pSrc + offset, size, + numEventsInWaitList, phEventWaitList, phEvent)); + } else { + UR_CALL(pfnMemBufferRead(hQueue, hBuffer, blockingRead, offset, size, + pDst, numEventsInWaitList, phEventWaitList, + phEvent)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueMemBufferWrite +__urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWrite( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_mem_handle_t + hBuffer, ///< [in][bounds(offset, size)] handle of the buffer object + bool + blockingWrite, ///< [in] indicates blocking (true), non-blocking (false) + size_t offset, ///< [in] offset in bytes in the buffer object + size_t size, ///< [in] size in bytes of data being written + const void + *pSrc, ///< [in] pointer to host memory where data is to be written from + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before this command can be executed. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that this + ///< command does not wait on any event to complete. + ur_event_handle_t * + phEvent ///< [out][optional] return an event object that identifies this particular + ///< command instance. 
+) { + auto pfnMemBufferWrite = context.urDdiTable.Enqueue.pfnMemBufferWrite; + + if (nullptr == pfnMemBufferWrite) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urEnqueueMemBufferWrite"); + + if (auto MemBuffer = context.interceptor->getMemBuffer(hBuffer)) { + ur_device_handle_t Device = GetDevice(hQueue); + char *pDst = nullptr; + UR_CALL(MemBuffer->getHandle(Device, pDst)); + UR_CALL(context.urDdiTable.Enqueue.pfnUSMMemcpy( + hQueue, blockingWrite, pDst + offset, pSrc, size, + numEventsInWaitList, phEventWaitList, phEvent)); + } else { + UR_CALL(pfnMemBufferWrite(hQueue, hBuffer, blockingWrite, offset, size, + pSrc, numEventsInWaitList, phEventWaitList, + phEvent)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueMemBufferReadRect +__urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferReadRect( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_mem_handle_t + hBuffer, ///< [in][bounds(bufferOrigin, region)] handle of the buffer object + bool blockingRead, ///< [in] indicates blocking (true), non-blocking (false) + ur_rect_offset_t bufferOrigin, ///< [in] 3D offset in the buffer + ur_rect_offset_t hostOrigin, ///< [in] 3D offset in the host region + ur_rect_region_t + region, ///< [in] 3D rectangular region descriptor: width, height, depth + size_t + bufferRowPitch, ///< [in] length of each row in bytes in the buffer object + size_t + bufferSlicePitch, ///< [in] length of each 2D slice in bytes in the buffer object being read + size_t + hostRowPitch, ///< [in] length of each row in bytes in the host memory region pointed by + ///< dst + size_t + hostSlicePitch, ///< [in] length of each 2D slice in bytes in the host memory region + ///< pointed by dst + void *pDst, ///< [in] pointer to host memory where data is to be read into + uint32_t numEventsInWaitList, ///< [in] size of the event wait list 
+ const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before this command can be executed. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that this + ///< command does not wait on any event to complete. + ur_event_handle_t * + phEvent ///< [out][optional] return an event object that identifies this particular + ///< command instance. +) { + auto pfnMemBufferReadRect = context.urDdiTable.Enqueue.pfnMemBufferReadRect; + + if (nullptr == pfnMemBufferReadRect) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urEnqueueMemBufferReadRect"); + + if (auto MemBuffer = context.interceptor->getMemBuffer(hBuffer)) { + char *SrcHandle = nullptr; + ur_device_handle_t Device = GetDevice(hQueue); + UR_CALL(MemBuffer->getHandle(Device, SrcHandle)); + + UR_CALL(EnqueueMemCopyRectHelper( + hQueue, SrcHandle, ur_cast(pDst), bufferOrigin, hostOrigin, + region, bufferRowPitch, bufferSlicePitch, hostRowPitch, + hostSlicePitch, blockingRead, numEventsInWaitList, phEventWaitList, + phEvent)); + } else { + UR_CALL(pfnMemBufferReadRect( + hQueue, hBuffer, blockingRead, bufferOrigin, hostOrigin, region, + bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, + pDst, numEventsInWaitList, phEventWaitList, phEvent)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueMemBufferWriteRect +__urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferWriteRect( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_mem_handle_t + hBuffer, ///< [in][bounds(bufferOrigin, region)] handle of the buffer object + bool + blockingWrite, ///< [in] indicates blocking (true), non-blocking (false) + ur_rect_offset_t bufferOrigin, ///< [in] 3D offset in the buffer + ur_rect_offset_t hostOrigin, ///< [in] 3D offset in the host region + 
ur_rect_region_t + region, ///< [in] 3D rectangular region descriptor: width, height, depth + size_t + bufferRowPitch, ///< [in] length of each row in bytes in the buffer object + size_t + bufferSlicePitch, ///< [in] length of each 2D slice in bytes in the buffer object being + ///< written + size_t + hostRowPitch, ///< [in] length of each row in bytes in the host memory region pointed by + ///< src + size_t + hostSlicePitch, ///< [in] length of each 2D slice in bytes in the host memory region + ///< pointed by src + void + *pSrc, ///< [in] pointer to host memory where data is to be written from + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] points to a list of + ///< events that must be complete before this command can be executed. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that this + ///< command does not wait on any event to complete. + ur_event_handle_t * + phEvent ///< [out][optional] return an event object that identifies this particular + ///< command instance. 
+) { + auto pfnMemBufferWriteRect = + context.urDdiTable.Enqueue.pfnMemBufferWriteRect; + + if (nullptr == pfnMemBufferWriteRect) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urEnqueueMemBufferWriteRect"); + + if (auto MemBuffer = context.interceptor->getMemBuffer(hBuffer)) { + char *DstHandle = nullptr; + ur_device_handle_t Device = GetDevice(hQueue); + UR_CALL(MemBuffer->getHandle(Device, DstHandle)); + + UR_CALL(EnqueueMemCopyRectHelper( + hQueue, ur_cast(pSrc), DstHandle, hostOrigin, bufferOrigin, + region, hostRowPitch, hostSlicePitch, bufferRowPitch, + bufferSlicePitch, blockingWrite, numEventsInWaitList, + phEventWaitList, phEvent)); + } else { + UR_CALL(pfnMemBufferWriteRect( + hQueue, hBuffer, blockingWrite, bufferOrigin, hostOrigin, region, + bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, + pSrc, numEventsInWaitList, phEventWaitList, phEvent)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueMemBufferCopy +__urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopy( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_mem_handle_t + hBufferSrc, ///< [in][bounds(srcOffset, size)] handle of the src buffer object + ur_mem_handle_t + hBufferDst, ///< [in][bounds(dstOffset, size)] handle of the dest buffer object + size_t srcOffset, ///< [in] offset into hBufferSrc to begin copying from + size_t dstOffset, ///< [in] offset info hBufferDst to begin copying into + size_t size, ///< [in] size in bytes of data being copied + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before this command can be executed. 
+ ///< If nullptr, the numEventsInWaitList must be 0, indicating that this + ///< command does not wait on any event to complete. + ur_event_handle_t * + phEvent ///< [out][optional] return an event object that identifies this particular + ///< command instance. +) { + auto pfnMemBufferCopy = context.urDdiTable.Enqueue.pfnMemBufferCopy; + + if (nullptr == pfnMemBufferCopy) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urEnqueueMemBufferCopy"); + + auto SrcBuffer = context.interceptor->getMemBuffer(hBufferSrc); + auto DstBuffer = context.interceptor->getMemBuffer(hBufferDst); + + UR_ASSERT((SrcBuffer && DstBuffer) || (!SrcBuffer && !DstBuffer), + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + if (SrcBuffer && DstBuffer) { + ur_device_handle_t Device = GetDevice(hQueue); + char *SrcHandle = nullptr; + UR_CALL(SrcBuffer->getHandle(Device, SrcHandle)); + + char *DstHandle = nullptr; + UR_CALL(DstBuffer->getHandle(Device, DstHandle)); + + UR_CALL(context.urDdiTable.Enqueue.pfnUSMMemcpy( + hQueue, false, DstHandle + dstOffset, SrcHandle + srcOffset, size, + numEventsInWaitList, phEventWaitList, phEvent)); + } else { + UR_CALL(pfnMemBufferCopy(hQueue, hBufferSrc, hBufferDst, srcOffset, + dstOffset, size, numEventsInWaitList, + phEventWaitList, phEvent)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueMemBufferCopyRect +__urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferCopyRect( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_mem_handle_t + hBufferSrc, ///< [in][bounds(srcOrigin, region)] handle of the source buffer object + ur_mem_handle_t + hBufferDst, ///< [in][bounds(dstOrigin, region)] handle of the dest buffer object + ur_rect_offset_t srcOrigin, ///< [in] 3D offset in the source buffer + ur_rect_offset_t dstOrigin, ///< [in] 3D offset in the destination buffer + ur_rect_region_t + region, ///< 
[in] source 3D rectangular region descriptor: width, height, depth + size_t + srcRowPitch, ///< [in] length of each row in bytes in the source buffer object + size_t + srcSlicePitch, ///< [in] length of each 2D slice in bytes in the source buffer object + size_t + dstRowPitch, ///< [in] length of each row in bytes in the destination buffer object + size_t + dstSlicePitch, ///< [in] length of each 2D slice in bytes in the destination buffer object + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before this command can be executed. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that this + ///< command does not wait on any event to complete. + ur_event_handle_t * + phEvent ///< [out][optional] return an event object that identifies this particular + ///< command instance. +) { + auto pfnMemBufferCopyRect = context.urDdiTable.Enqueue.pfnMemBufferCopyRect; + + if (nullptr == pfnMemBufferCopyRect) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urEnqueueMemBufferCopyRect"); + + auto SrcBuffer = context.interceptor->getMemBuffer(hBufferSrc); + auto DstBuffer = context.interceptor->getMemBuffer(hBufferDst); + + UR_ASSERT((SrcBuffer && DstBuffer) || (!SrcBuffer && !DstBuffer), + UR_RESULT_ERROR_INVALID_MEM_OBJECT); + + if (SrcBuffer && DstBuffer) { + ur_device_handle_t Device = GetDevice(hQueue); + char *SrcHandle = nullptr; + UR_CALL(SrcBuffer->getHandle(Device, SrcHandle)); + + char *DstHandle = nullptr; + UR_CALL(DstBuffer->getHandle(Device, DstHandle)); + + UR_CALL(EnqueueMemCopyRectHelper( + hQueue, SrcHandle, DstHandle, srcOrigin, dstOrigin, region, + srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, false, + numEventsInWaitList, phEventWaitList, phEvent)); + } else { + UR_CALL(pfnMemBufferCopyRect( + hQueue, hBufferSrc, hBufferDst, 
srcOrigin, dstOrigin, region, + srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, + numEventsInWaitList, phEventWaitList, phEvent)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueMemBufferFill +__urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferFill( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_mem_handle_t + hBuffer, ///< [in][bounds(offset, size)] handle of the buffer object + const void *pPattern, ///< [in] pointer to the fill pattern + size_t patternSize, ///< [in] size in bytes of the pattern + size_t offset, ///< [in] offset into the buffer + size_t size, ///< [in] fill size in bytes, must be a multiple of patternSize + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before this command can be executed. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that this + ///< command does not wait on any event to complete. + ur_event_handle_t * + phEvent ///< [out][optional] return an event object that identifies this particular + ///< command instance. 
+) { + auto pfnMemBufferFill = context.urDdiTable.Enqueue.pfnMemBufferFill; + + if (nullptr == pfnMemBufferFill) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urEnqueueMemBufferFill"); + + if (auto MemBuffer = context.interceptor->getMemBuffer(hBuffer)) { + char *Handle = nullptr; + ur_device_handle_t Device = GetDevice(hQueue); + UR_CALL(MemBuffer->getHandle(Device, Handle)); + UR_CALL(context.urDdiTable.Enqueue.pfnUSMFill( + hQueue, Handle + offset, patternSize, pPattern, size, + numEventsInWaitList, phEventWaitList, phEvent)); + } else { + UR_CALL(pfnMemBufferFill(hQueue, hBuffer, pPattern, patternSize, offset, + size, numEventsInWaitList, phEventWaitList, + phEvent)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueMemBufferMap +__urdlllocal ur_result_t UR_APICALL urEnqueueMemBufferMap( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_mem_handle_t + hBuffer, ///< [in][bounds(offset, size)] handle of the buffer object + bool blockingMap, ///< [in] indicates blocking (true), non-blocking (false) + ur_map_flags_t mapFlags, ///< [in] flags for read, write, readwrite mapping + size_t offset, ///< [in] offset in bytes of the buffer region being mapped + size_t size, ///< [in] size in bytes of the buffer region being mapped + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before this command can be executed. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that this + ///< command does not wait on any event to complete. + ur_event_handle_t * + phEvent, ///< [out][optional] return an event object that identifies this particular + ///< command instance. + void **ppRetMap ///< [out] return mapped pointer. 
TODO: move it before + ///< numEventsInWaitList? +) { + auto pfnMemBufferMap = context.urDdiTable.Enqueue.pfnMemBufferMap; + + if (nullptr == pfnMemBufferMap) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urEnqueueMemBufferMap"); + + if (auto MemBuffer = context.interceptor->getMemBuffer(hBuffer)) { + + // Translate the host access mode info. + MemBuffer::AccessMode AccessMode = MemBuffer::UNKNOWN; + if (mapFlags & UR_MAP_FLAG_WRITE_INVALIDATE_REGION) { + AccessMode = MemBuffer::WRITE_ONLY; + } else { + if (mapFlags & UR_MAP_FLAG_READ) { + AccessMode = MemBuffer::READ_ONLY; + if (mapFlags & UR_MAP_FLAG_WRITE) { + AccessMode = MemBuffer::READ_WRITE; + } + } else if (mapFlags & UR_MAP_FLAG_WRITE) { + AccessMode = MemBuffer::WRITE_ONLY; + } + } + + UR_ASSERT(AccessMode != MemBuffer::UNKNOWN, + UR_RESULT_ERROR_INVALID_ARGUMENT); + + ur_device_handle_t Device = GetDevice(hQueue); + // If the buffer used host pointer, then we just reuse it. If not, we + // need to manually allocate a new host USM. + if (MemBuffer->HostPtr) { + *ppRetMap = MemBuffer->HostPtr + offset; + } else { + ur_context_handle_t Context = GetContext(hQueue); + ur_usm_desc_t USMDesc{}; + USMDesc.align = MemBuffer->getAlignment(); + ur_usm_pool_handle_t Pool{}; + UR_CALL(context.interceptor->allocateMemory( + Context, nullptr, &USMDesc, Pool, size, AllocType::HOST_USM, + ppRetMap)); + } + + // Actually, if the access mode is write only, we don't need to do this + // copy. However, in that way, we cannot generate a event to user. So, + // we'll aways do copy here. 
+ char *SrcHandle = nullptr; + UR_CALL(MemBuffer->getHandle(Device, SrcHandle)); + UR_CALL(context.urDdiTable.Enqueue.pfnUSMMemcpy( + hQueue, blockingMap, *ppRetMap, SrcHandle + offset, size, + numEventsInWaitList, phEventWaitList, phEvent)); + + { + std::scoped_lock Guard(MemBuffer->Mutex); + UR_ASSERT(MemBuffer->Mappings.find(*ppRetMap) == + MemBuffer->Mappings.end(), + UR_RESULT_ERROR_INVALID_VALUE); + MemBuffer->Mappings[*ppRetMap] = {offset, size}; + } + } else { + UR_CALL(pfnMemBufferMap(hQueue, hBuffer, blockingMap, mapFlags, offset, + size, numEventsInWaitList, phEventWaitList, + phEvent, ppRetMap)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urEnqueueMemUnmap +__urdlllocal ur_result_t UR_APICALL urEnqueueMemUnmap( + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + ur_mem_handle_t + hMem, ///< [in] handle of the memory (buffer or image) object + void *pMappedPtr, ///< [in] mapped host address + uint32_t numEventsInWaitList, ///< [in] size of the event wait list + const ur_event_handle_t * + phEventWaitList, ///< [in][optional][range(0, numEventsInWaitList)] pointer to a list of + ///< events that must be complete before this command can be executed. + ///< If nullptr, the numEventsInWaitList must be 0, indicating that this + ///< command does not wait on any event to complete. + ur_event_handle_t * + phEvent ///< [out][optional] return an event object that identifies this particular + ///< command instance. 
+) { + auto pfnMemUnmap = context.urDdiTable.Enqueue.pfnMemUnmap; + + if (nullptr == pfnMemUnmap) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urEnqueueMemUnmap"); + + if (auto MemBuffer = context.interceptor->getMemBuffer(hMem)) { + MemBuffer::Mapping Mapping{}; + { + std::scoped_lock Guard(MemBuffer->Mutex); + auto It = MemBuffer->Mappings.find(pMappedPtr); + UR_ASSERT(It != MemBuffer->Mappings.end(), + UR_RESULT_ERROR_INVALID_VALUE); + Mapping = It->second; + MemBuffer->Mappings.erase(It); + } + + // Write back mapping memory data to device and release mapping memory + // if we allocated a host USM. But for now, UR doesn't support event + // call back, we can only do blocking copy here. + char *DstHandle = nullptr; + ur_context_handle_t Context = GetContext(hQueue); + ur_device_handle_t Device = GetDevice(hQueue); + UR_CALL(MemBuffer->getHandle(Device, DstHandle)); + UR_CALL(context.urDdiTable.Enqueue.pfnUSMMemcpy( + hQueue, true, DstHandle + Mapping.Offset, pMappedPtr, Mapping.Size, + numEventsInWaitList, phEventWaitList, phEvent)); + + if (!MemBuffer->HostPtr) { + UR_CALL(context.interceptor->releaseMemory(Context, pMappedPtr)); + } + } else { + UR_CALL(pfnMemUnmap(hQueue, hMem, pMappedPtr, numEventsInWaitList, + phEventWaitList, phEvent)); + } + + return UR_RESULT_SUCCESS; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urKernelCreate __urdlllocal ur_result_t UR_APICALL urKernelCreate( @@ -398,6 +1122,28 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreate( return UR_RESULT_SUCCESS; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelRetain +__urdlllocal ur_result_t UR_APICALL urKernelRetain( + ur_kernel_handle_t hKernel ///< [in] handle for the Kernel to retain +) { + auto pfnRetain = context.urDdiTable.Kernel.pfnRetain; + + if (nullptr == pfnRetain) { + return 
UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urKernelRetain"); + + UR_CALL(pfnRetain(hKernel)); + + if (auto KernelInfo = context.interceptor->getKernelInfo(hKernel)) { + KernelInfo->RefCount++; + } + + return UR_RESULT_SUCCESS; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urKernelRelease __urdlllocal ur_result_t urKernelRelease( @@ -413,13 +1159,72 @@ __urdlllocal ur_result_t urKernelRelease( UR_CALL(pfnRelease(hKernel)); if (auto KernelInfo = context.interceptor->getKernelInfo(hKernel)) { - uint32_t RefCount; - UR_CALL(context.urDdiTable.Kernel.pfnGetInfo( - hKernel, UR_KERNEL_INFO_REFERENCE_COUNT, sizeof(RefCount), - &RefCount, nullptr)); - if (RefCount == 1) { - UR_CALL(context.interceptor->eraseKernel(hKernel)); + if (--KernelInfo->RefCount != 0) { + return UR_RESULT_SUCCESS; } + UR_CALL(context.interceptor->eraseKernel(hKernel)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelSetArgValue +__urdlllocal ur_result_t UR_APICALL urKernelSetArgValue( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + uint32_t argIndex, ///< [in] argument index in range [0, num args - 1] + size_t argSize, ///< [in] size of argument type + const ur_kernel_arg_value_properties_t + *pProperties, ///< [in][optional] pointer to value properties. + const void + *pArgValue ///< [in] argument value represented as matching arg type. 
+) { + auto pfnSetArgValue = context.urDdiTable.Kernel.pfnSetArgValue; + + if (nullptr == pfnSetArgValue) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urKernelSetArgValue"); + + std::shared_ptr MemBuffer; + if (argSize == sizeof(ur_mem_handle_t) && + (MemBuffer = context.interceptor->getMemBuffer( + *ur_cast(pArgValue)))) { + auto KernelInfo = context.interceptor->getKernelInfo(hKernel); + std::scoped_lock Guard(KernelInfo->Mutex); + KernelInfo->BufferArgs[argIndex] = std::move(MemBuffer); + } else { + UR_CALL( + pfnSetArgValue(hKernel, argIndex, argSize, pProperties, pArgValue)); + } + + return UR_RESULT_SUCCESS; +} + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelSetArgMemObj +__urdlllocal ur_result_t UR_APICALL urKernelSetArgMemObj( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object + uint32_t argIndex, ///< [in] argument index in range [0, num args - 1] + const ur_kernel_arg_mem_obj_properties_t + *pProperties, ///< [in][optional] pointer to Memory object properties. + ur_mem_handle_t hArgValue ///< [in][optional] handle of Memory object. 
+) { + auto pfnSetArgMemObj = context.urDdiTable.Kernel.pfnSetArgMemObj; + + if (nullptr == pfnSetArgMemObj) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + context.logger.debug("==== urKernelSetArgMemObj"); + + if (auto MemBuffer = context.interceptor->getMemBuffer(hArgValue)) { + auto KernelInfo = context.interceptor->getKernelInfo(hKernel); + std::scoped_lock Guard(KernelInfo->Mutex); + KernelInfo->BufferArgs[argIndex] = std::move(MemBuffer); + } else { + UR_CALL(pfnSetArgMemObj(hKernel, argIndex, pProperties, hArgValue)); } return UR_RESULT_SUCCESS; @@ -525,16 +1330,16 @@ __urdlllocal ur_result_t UR_APICALL urGetProgramProcAddrTable( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Exported function for filling application's ProgramExp table +/// @brief Exported function for filling application's Kernel table /// with current process' addresses /// /// @returns /// - ::UR_RESULT_SUCCESS /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION -__urdlllocal ur_result_t UR_APICALL urGetProgramExpProcAddrTable( +__urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable( ur_api_version_t version, ///< [in] API version requested - ur_program_exp_dditable_t + ur_kernel_dditable_t *pDdiTable ///< [in,out] pointer to table of DDI function pointers ) { if (nullptr == pDdiTable) { @@ -550,23 +1355,26 @@ __urdlllocal ur_result_t UR_APICALL urGetProgramExpProcAddrTable( ur_result_t result = UR_RESULT_SUCCESS; - pDdiTable->pfnBuildExp = ur_sanitizer_layer::urProgramBuildExp; - pDdiTable->pfnLinkExp = ur_sanitizer_layer::urProgramLinkExp; + pDdiTable->pfnCreate = ur_sanitizer_layer::urKernelCreate; + pDdiTable->pfnRetain = ur_sanitizer_layer::urKernelRetain; + pDdiTable->pfnRelease = ur_sanitizer_layer::urKernelRelease; + pDdiTable->pfnSetArgValue = ur_sanitizer_layer::urKernelSetArgValue; + pDdiTable->pfnSetArgMemObj = ur_sanitizer_layer::urKernelSetArgMemObj; + pDdiTable->pfnSetArgLocal 
= ur_sanitizer_layer::urKernelSetArgLocal; return result; } - /////////////////////////////////////////////////////////////////////////////// -/// @brief Exported function for filling application's Kernel table +/// @brief Exported function for filling application's Mem table /// with current process' addresses /// /// @returns /// - ::UR_RESULT_SUCCESS /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION -__urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable( +__urdlllocal ur_result_t UR_APICALL urGetMemProcAddrTable( ur_api_version_t version, ///< [in] API version requested - ur_kernel_dditable_t + ur_mem_dditable_t *pDdiTable ///< [in,out] pointer to table of DDI function pointers ) { if (nullptr == pDdiTable) { @@ -582,9 +1390,42 @@ __urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable( ur_result_t result = UR_RESULT_SUCCESS; - pDdiTable->pfnCreate = ur_sanitizer_layer::urKernelCreate; - pDdiTable->pfnRelease = ur_sanitizer_layer::urKernelRelease; - pDdiTable->pfnSetArgLocal = ur_sanitizer_layer::urKernelSetArgLocal; + pDdiTable->pfnBufferCreate = ur_sanitizer_layer::urMemBufferCreate; + pDdiTable->pfnRetain = ur_sanitizer_layer::urMemRetain; + pDdiTable->pfnRelease = ur_sanitizer_layer::urMemRelease; + pDdiTable->pfnBufferPartition = ur_sanitizer_layer::urMemBufferPartition; + pDdiTable->pfnGetNativeHandle = ur_sanitizer_layer::urMemGetNativeHandle; + pDdiTable->pfnGetInfo = ur_sanitizer_layer::urMemGetInfo; + + return result; +} +/// @brief Exported function for filling application's ProgramExp table +/// with current process' addresses +/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// - ::UR_RESULT_ERROR_UNSUPPORTED_VERSION +__urdlllocal ur_result_t UR_APICALL urGetProgramExpProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_program_exp_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + if (nullptr 
== pDdiTable) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (UR_MAJOR_VERSION(ur_sanitizer_layer::context.version) != + UR_MAJOR_VERSION(version) || + UR_MINOR_VERSION(ur_sanitizer_layer::context.version) > + UR_MINOR_VERSION(version)) { + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + + ur_result_t result = UR_RESULT_SUCCESS; + + pDdiTable->pfnBuildExp = ur_sanitizer_layer::urProgramBuildExp; + pDdiTable->pfnLinkExp = ur_sanitizer_layer::urProgramLinkExp; return result; } @@ -614,6 +1455,18 @@ __urdlllocal ur_result_t UR_APICALL urGetEnqueueProcAddrTable( ur_result_t result = UR_RESULT_SUCCESS; + pDdiTable->pfnMemBufferRead = ur_sanitizer_layer::urEnqueueMemBufferRead; + pDdiTable->pfnMemBufferWrite = ur_sanitizer_layer::urEnqueueMemBufferWrite; + pDdiTable->pfnMemBufferReadRect = + ur_sanitizer_layer::urEnqueueMemBufferReadRect; + pDdiTable->pfnMemBufferWriteRect = + ur_sanitizer_layer::urEnqueueMemBufferWriteRect; + pDdiTable->pfnMemBufferCopy = ur_sanitizer_layer::urEnqueueMemBufferCopy; + pDdiTable->pfnMemBufferCopyRect = + ur_sanitizer_layer::urEnqueueMemBufferCopyRect; + pDdiTable->pfnMemBufferFill = ur_sanitizer_layer::urEnqueueMemBufferFill; + pDdiTable->pfnMemBufferMap = ur_sanitizer_layer::urEnqueueMemBufferMap; + pDdiTable->pfnMemUnmap = ur_sanitizer_layer::urEnqueueMemUnmap; pDdiTable->pfnKernelLaunch = ur_sanitizer_layer::urEnqueueKernelLaunch; return result; @@ -698,6 +1551,16 @@ ur_result_t context_t::init(ur_dditable_t *dditable, UR_API_VERSION_CURRENT, &dditable->Program); } + if (UR_RESULT_SUCCESS == result) { + result = ur_sanitizer_layer::urGetKernelProcAddrTable( + UR_API_VERSION_CURRENT, &dditable->Kernel); + } + + if (UR_RESULT_SUCCESS == result) { + result = ur_sanitizer_layer::urGetMemProcAddrTable( + UR_API_VERSION_CURRENT, &dditable->Mem); + } + if (UR_RESULT_SUCCESS == result) { result = ur_sanitizer_layer::urGetProgramExpProcAddrTable( UR_API_VERSION_CURRENT, &dditable->ProgramExp); diff --git 
a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 56f270a9d9..e5cc8f81e2 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -352,6 +352,8 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetNativeHandle( __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( ur_native_handle_t hNativePlatform, ///< [in][nocheck] the native handle of the platform. + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter associated with the native backend. const ur_platform_native_properties_t * pProperties, ///< [in][optional] pointer to native platform properties struct. ur_platform_handle_t * @@ -365,15 +367,15 @@ __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( } ur_platform_create_with_native_handle_params_t params = { - &hNativePlatform, &pProperties, &phPlatform}; + &hNativePlatform, &hAdapter, &pProperties, &phPlatform}; uint64_t instance = context.notify_begin(UR_FUNCTION_PLATFORM_CREATE_WITH_NATIVE_HANDLE, "urPlatformCreateWithNativeHandle", ¶ms); context.logger.info("---> urPlatformCreateWithNativeHandle"); - ur_result_t result = - pfnCreateWithNativeHandle(hNativePlatform, pProperties, phPlatform); + ur_result_t result = pfnCreateWithNativeHandle(hNativePlatform, hAdapter, + pProperties, phPlatform); context.notify_end(UR_FUNCTION_PLATFORM_CREATE_WITH_NATIVE_HANDLE, "urPlatformCreateWithNativeHandle", ¶ms, &result, @@ -3420,6 +3422,57 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreateWithNativeHandle( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelGetSuggestedLocalWorkSize +__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + uint32_t + numWorkDim, ///< [in] number of dimensions, from 1 
to 3, to specify the global + ///< and work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the offset used to calculate the global ID of a work-item + const size_t * + pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the number of global work-items in workDim that will execute the + ///< kernel function + size_t * + pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that specify + ///< suggested local work size that will contain the result of the query +) { + auto pfnGetSuggestedLocalWorkSize = + context.urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize; + + if (nullptr == pfnGetSuggestedLocalWorkSize) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + ur_kernel_get_suggested_local_work_size_params_t params = { + &hKernel, &hQueue, &numWorkDim, + &pGlobalWorkOffset, &pGlobalWorkSize, &pSuggestedLocalWorkSize}; + uint64_t instance = + context.notify_begin(UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE, + "urKernelGetSuggestedLocalWorkSize", ¶ms); + + context.logger.info("---> urKernelGetSuggestedLocalWorkSize"); + + ur_result_t result = pfnGetSuggestedLocalWorkSize( + hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize, + pSuggestedLocalWorkSize); + + context.notify_end(UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE, + "urKernelGetSuggestedLocalWorkSize", ¶ms, &result, + instance); + + std::ostringstream args_str; + ur::extras::printFunctionParams( + args_str, UR_FUNCTION_KERNEL_GET_SUGGESTED_LOCAL_WORK_SIZE, ¶ms); + context.logger.info("({}) -> {};\n", args_str.str(), result); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urQueueGetInfo __urdlllocal ur_result_t UR_APICALL urQueueGetInfo( @@ -8348,6 +8401,11 @@ __urdlllocal ur_result_t UR_APICALL urGetKernelProcAddrTable( 
pDdiTable->pfnCreateWithNativeHandle = ur_tracing_layer::urKernelCreateWithNativeHandle; + dditable.pfnGetSuggestedLocalWorkSize = + pDdiTable->pfnGetSuggestedLocalWorkSize; + pDdiTable->pfnGetSuggestedLocalWorkSize = + ur_tracing_layer::urKernelGetSuggestedLocalWorkSize; + dditable.pfnSetArgValue = pDdiTable->pfnSetArgValue; pDdiTable->pfnSetArgValue = ur_tracing_layer::urKernelSetArgValue; diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index 4bdd801c1a..5ac97dfc1d 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -347,6 +347,8 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetNativeHandle( __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( ur_native_handle_t hNativePlatform, ///< [in][nocheck] the native handle of the platform. + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter associated with the native backend. const ur_platform_native_properties_t * pProperties, ///< [in][optional] pointer to native platform properties struct. 
ur_platform_handle_t * @@ -360,13 +362,22 @@ __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( } if (context.enableParameterValidation) { + if (NULL == hAdapter) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + if (NULL == phPlatform) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } } - ur_result_t result = - pfnCreateWithNativeHandle(hNativePlatform, pProperties, phPlatform); + if (context.enableLifetimeValidation && + !refCountContext.isReferenceValid(hAdapter)) { + refCountContext.logInvalidReference(hAdapter); + } + + ur_result_t result = pfnCreateWithNativeHandle(hNativePlatform, hAdapter, + pProperties, phPlatform); return result; } @@ -1067,7 +1078,7 @@ __urdlllocal ur_result_t UR_APICALL urMemImageCreate( return UR_RESULT_ERROR_INVALID_ENUMERATION; } - if (pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type) { + if (pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type) { return UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR; } @@ -3857,6 +3868,71 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreateWithNativeHandle( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelGetSuggestedLocalWorkSize +__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + uint32_t + numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global + ///< and work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the offset used to calculate the global ID of a work-item + const size_t * + pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the number of global work-items in workDim that will execute the + ///< kernel function + size_t * + pSuggestedLocalWorkSize ///< [out] 
pointer to an array of numWorkDim unsigned values that specify + ///< suggested local work size that will contain the result of the query +) { + auto pfnGetSuggestedLocalWorkSize = + context.urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize; + + if (nullptr == pfnGetSuggestedLocalWorkSize) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + if (context.enableParameterValidation) { + if (NULL == hKernel) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == hQueue) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + if (NULL == pGlobalWorkOffset) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL == pGlobalWorkSize) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + + if (NULL == pSuggestedLocalWorkSize) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + } + + if (context.enableLifetimeValidation && + !refCountContext.isReferenceValid(hKernel)) { + refCountContext.logInvalidReference(hKernel); + } + + if (context.enableLifetimeValidation && + !refCountContext.isReferenceValid(hQueue)) { + refCountContext.logInvalidReference(hQueue); + } + + ur_result_t result = pfnGetSuggestedLocalWorkSize( + hKernel, hQueue, numWorkDim, pGlobalWorkOffset, pGlobalWorkSize, + pSuggestedLocalWorkSize); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urQueueGetInfo __urdlllocal ur_result_t UR_APICALL urQueueGetInfo( @@ -6887,7 +6963,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type) { + if (pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type) { return UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR; } } @@ -6996,7 +7072,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type) { + if 
(pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type) { return UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR; } } @@ -7067,7 +7143,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type) { + if (pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type) { return UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR; } } @@ -7159,7 +7235,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesImageCopyExp( return UR_RESULT_ERROR_INVALID_ENUMERATION; } - if (pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type) { + if (pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type) { return UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR; } @@ -7412,7 +7488,7 @@ __urdlllocal ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - if (pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type) { + if (pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type) { return UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR; } } @@ -10006,6 +10082,11 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = ur_validation_layer::urKernelCreateWithNativeHandle; + dditable.pfnGetSuggestedLocalWorkSize = + pDdiTable->pfnGetSuggestedLocalWorkSize; + pDdiTable->pfnGetSuggestedLocalWorkSize = + ur_validation_layer::urKernelGetSuggestedLocalWorkSize; + dditable.pfnSetArgValue = pDdiTable->pfnSetArgValue; pDdiTable->pfnSetArgValue = ur_validation_layer::urKernelSetArgValue; diff --git a/source/loader/loader.def.in b/source/loader/loader.def.in new file mode 100644 index 0000000000..b68c1ab6c1 --- /dev/null +++ b/source/loader/loader.def.in @@ -0,0 +1,538 @@ +LIBRARY @TARGET_LIBNAME@ +EXPORTS + urAdapterGet + urAdapterGetInfo + urAdapterGetLastError + urAdapterRelease + urAdapterRetain + urBindlessImagesDestroyExternalSemaphoreExp + 
urBindlessImagesImageAllocateExp + urBindlessImagesImageCopyExp + urBindlessImagesImageFreeExp + urBindlessImagesImageGetInfoExp + urBindlessImagesImportExternalSemaphoreOpaqueFDExp + urBindlessImagesImportOpaqueFDExp + urBindlessImagesMapExternalArrayExp + urBindlessImagesMipmapFreeExp + urBindlessImagesMipmapGetLevelExp + urBindlessImagesReleaseInteropExp + urBindlessImagesSampledImageCreateExp + urBindlessImagesSampledImageHandleDestroyExp + urBindlessImagesSignalExternalSemaphoreExp + urBindlessImagesUnsampledImageCreateExp + urBindlessImagesUnsampledImageHandleDestroyExp + urBindlessImagesWaitExternalSemaphoreExp + urCommandBufferAppendKernelLaunchExp + urCommandBufferAppendMemBufferCopyExp + urCommandBufferAppendMemBufferCopyRectExp + urCommandBufferAppendMemBufferFillExp + urCommandBufferAppendMemBufferReadExp + urCommandBufferAppendMemBufferReadRectExp + urCommandBufferAppendMemBufferWriteExp + urCommandBufferAppendMemBufferWriteRectExp + urCommandBufferAppendUSMAdviseExp + urCommandBufferAppendUSMFillExp + urCommandBufferAppendUSMMemcpyExp + urCommandBufferAppendUSMPrefetchExp + urCommandBufferCommandGetInfoExp + urCommandBufferCreateExp + urCommandBufferEnqueueExp + urCommandBufferFinalizeExp + urCommandBufferGetInfoExp + urCommandBufferReleaseCommandExp + urCommandBufferReleaseExp + urCommandBufferRetainCommandExp + urCommandBufferRetainExp + urCommandBufferUpdateKernelLaunchExp + urContextCreate + urContextCreateWithNativeHandle + urContextGetInfo + urContextGetNativeHandle + urContextRelease + urContextRetain + urContextSetExtendedDeleter + urDeviceCreateWithNativeHandle + urDeviceGet + urDeviceGetGlobalTimestamps + urDeviceGetInfo + urDeviceGetNativeHandle + urDeviceGetSelected + urDevicePartition + urDeviceRelease + urDeviceRetain + urDeviceSelectBinary + urEnqueueCooperativeKernelLaunchExp + urEnqueueDeviceGlobalVariableRead + urEnqueueDeviceGlobalVariableWrite + urEnqueueEventsWait + urEnqueueEventsWaitWithBarrier + urEnqueueKernelLaunch + 
urEnqueueKernelLaunchCustomExp + urEnqueueMemBufferCopy + urEnqueueMemBufferCopyRect + urEnqueueMemBufferFill + urEnqueueMemBufferMap + urEnqueueMemBufferRead + urEnqueueMemBufferReadRect + urEnqueueMemBufferWrite + urEnqueueMemBufferWriteRect + urEnqueueMemImageCopy + urEnqueueMemImageRead + urEnqueueMemImageWrite + urEnqueueMemUnmap + urEnqueueReadHostPipe + urEnqueueTimestampRecordingExp + urEnqueueUSMAdvise + urEnqueueUSMFill + urEnqueueUSMFill2D + urEnqueueUSMMemcpy + urEnqueueUSMMemcpy2D + urEnqueueUSMPrefetch + urEnqueueWriteHostPipe + urEventCreateWithNativeHandle + urEventGetInfo + urEventGetNativeHandle + urEventGetProfilingInfo + urEventRelease + urEventRetain + urEventSetCallback + urEventWait + urGetBindlessImagesExpProcAddrTable + urGetCommandBufferExpProcAddrTable + urGetContextProcAddrTable + urGetDeviceProcAddrTable + urGetEnqueueExpProcAddrTable + urGetEnqueueProcAddrTable + urGetEventProcAddrTable + urGetGlobalProcAddrTable + urGetKernelExpProcAddrTable + urGetKernelProcAddrTable + urGetMemProcAddrTable + urGetPhysicalMemProcAddrTable + urGetPlatformProcAddrTable + urGetProgramExpProcAddrTable + urGetProgramProcAddrTable + urGetQueueProcAddrTable + urGetSamplerProcAddrTable + urGetUSMExpProcAddrTable + urGetUSMProcAddrTable + urGetUsmP2PExpProcAddrTable + urGetVirtualMemProcAddrTable + urKernelCreate + urKernelCreateWithNativeHandle + urKernelGetGroupInfo + urKernelGetInfo + urKernelGetNativeHandle + urKernelGetSubGroupInfo + urKernelGetSuggestedLocalWorkSize + urKernelRelease + urKernelRetain + urKernelSetArgLocal + urKernelSetArgMemObj + urKernelSetArgPointer + urKernelSetArgSampler + urKernelSetArgValue + urKernelSetExecInfo + urKernelSetSpecializationConstants + urKernelSuggestMaxCooperativeGroupCountExp + urLoaderConfigCreate + urLoaderConfigEnableLayer + urLoaderConfigGetInfo + urLoaderConfigRelease + urLoaderConfigRetain + urLoaderConfigSetCodeLocationCallback + urLoaderInit + urLoaderTearDown + urMemBufferCreate + 
urMemBufferCreateWithNativeHandle + urMemBufferPartition + urMemGetInfo + urMemGetNativeHandle + urMemImageCreate + urMemImageCreateWithNativeHandle + urMemImageGetInfo + urMemRelease + urMemRetain + urPhysicalMemCreate + urPhysicalMemRelease + urPhysicalMemRetain + urPlatformCreateWithNativeHandle + urPlatformGet + urPlatformGetApiVersion + urPlatformGetBackendOption + urPlatformGetInfo + urPlatformGetNativeHandle + urPrintAdapterBackend + urPrintAdapterGetInfoParams + urPrintAdapterGetLastErrorParams + urPrintAdapterGetParams + urPrintAdapterInfo + urPrintAdapterReleaseParams + urPrintAdapterRetainParams + urPrintApiVersion + urPrintBaseDesc + urPrintBaseProperties + urPrintBindlessImagesDestroyExternalSemaphoreExpParams + urPrintBindlessImagesImageAllocateExpParams + urPrintBindlessImagesImageCopyExpParams + urPrintBindlessImagesImageFreeExpParams + urPrintBindlessImagesImageGetInfoExpParams + urPrintBindlessImagesImportExternalSemaphoreOpaqueFdExpParams + urPrintBindlessImagesImportOpaqueFdExpParams + urPrintBindlessImagesMapExternalArrayExpParams + urPrintBindlessImagesMipmapFreeExpParams + urPrintBindlessImagesMipmapGetLevelExpParams + urPrintBindlessImagesReleaseInteropExpParams + urPrintBindlessImagesSampledImageCreateExpParams + urPrintBindlessImagesSampledImageHandleDestroyExpParams + urPrintBindlessImagesSignalExternalSemaphoreExpParams + urPrintBindlessImagesUnsampledImageCreateExpParams + urPrintBindlessImagesUnsampledImageHandleDestroyExpParams + urPrintBindlessImagesWaitExternalSemaphoreExpParams + urPrintBufferAllocLocationProperties + urPrintBufferChannelProperties + urPrintBufferCreateType + urPrintBufferProperties + urPrintBufferRegion + urPrintCodeLocation + urPrintCommand + urPrintCommandBufferAppendKernelLaunchExpParams + urPrintCommandBufferAppendMemBufferCopyExpParams + urPrintCommandBufferAppendMemBufferCopyRectExpParams + urPrintCommandBufferAppendMemBufferFillExpParams + urPrintCommandBufferAppendMemBufferReadExpParams + 
urPrintCommandBufferAppendMemBufferReadRectExpParams + urPrintCommandBufferAppendMemBufferWriteExpParams + urPrintCommandBufferAppendMemBufferWriteRectExpParams + urPrintCommandBufferAppendUsmAdviseExpParams + urPrintCommandBufferAppendUsmFillExpParams + urPrintCommandBufferAppendUsmMemcpyExpParams + urPrintCommandBufferAppendUsmPrefetchExpParams + urPrintCommandBufferCommandGetInfoExpParams + urPrintCommandBufferCreateExpParams + urPrintCommandBufferEnqueueExpParams + urPrintCommandBufferFinalizeExpParams + urPrintCommandBufferGetInfoExpParams + urPrintCommandBufferReleaseCommandExpParams + urPrintCommandBufferReleaseExpParams + urPrintCommandBufferRetainCommandExpParams + urPrintCommandBufferRetainExpParams + urPrintCommandBufferUpdateKernelLaunchExpParams + urPrintContextCreateParams + urPrintContextCreateWithNativeHandleParams + urPrintContextFlags + urPrintContextGetInfoParams + urPrintContextGetNativeHandleParams + urPrintContextInfo + urPrintContextNativeProperties + urPrintContextProperties + urPrintContextReleaseParams + urPrintContextRetainParams + urPrintContextSetExtendedDeleterParams + urPrintDeviceAffinityDomainFlags + urPrintDeviceBinary + urPrintDeviceCreateWithNativeHandleParams + urPrintDeviceExecCapabilityFlags + urPrintDeviceFpCapabilityFlags + urPrintDeviceGetGlobalTimestampsParams + urPrintDeviceGetInfoParams + urPrintDeviceGetNativeHandleParams + urPrintDeviceGetParams + urPrintDeviceGetSelectedParams + urPrintDeviceInfo + urPrintDeviceInitFlags + urPrintDeviceLocalMemType + urPrintDeviceMemCacheType + urPrintDeviceNativeProperties + urPrintDevicePartition + urPrintDevicePartitionParams + urPrintDevicePartitionProperties + urPrintDevicePartitionProperty + urPrintDeviceReleaseParams + urPrintDeviceRetainParams + urPrintDeviceSelectBinaryParams + urPrintDeviceType + urPrintDeviceUsmAccessCapabilityFlags + urPrintEnqueueCooperativeKernelLaunchExpParams + urPrintEnqueueDeviceGlobalVariableReadParams + urPrintEnqueueDeviceGlobalVariableWriteParams 
+ urPrintEnqueueEventsWaitParams + urPrintEnqueueEventsWaitWithBarrierParams + urPrintEnqueueKernelLaunchCustomExpParams + urPrintEnqueueKernelLaunchParams + urPrintEnqueueMemBufferCopyParams + urPrintEnqueueMemBufferCopyRectParams + urPrintEnqueueMemBufferFillParams + urPrintEnqueueMemBufferMapParams + urPrintEnqueueMemBufferReadParams + urPrintEnqueueMemBufferReadRectParams + urPrintEnqueueMemBufferWriteParams + urPrintEnqueueMemBufferWriteRectParams + urPrintEnqueueMemImageCopyParams + urPrintEnqueueMemImageReadParams + urPrintEnqueueMemImageWriteParams + urPrintEnqueueMemUnmapParams + urPrintEnqueueReadHostPipeParams + urPrintEnqueueTimestampRecordingExpParams + urPrintEnqueueUsmAdviseParams + urPrintEnqueueUsmFillParams + urPrintEnqueueUsmFill_2dParams + urPrintEnqueueUsmMemcpyParams + urPrintEnqueueUsmMemcpy_2dParams + urPrintEnqueueUsmPrefetchParams + urPrintEnqueueWriteHostPipeParams + urPrintEventCreateWithNativeHandleParams + urPrintEventGetInfoParams + urPrintEventGetNativeHandleParams + urPrintEventGetProfilingInfoParams + urPrintEventInfo + urPrintEventNativeProperties + urPrintEventReleaseParams + urPrintEventRetainParams + urPrintEventSetCallbackParams + urPrintEventStatus + urPrintEventWaitParams + urPrintExecutionInfo + urPrintExpCommandBufferCommandInfo + urPrintExpCommandBufferDesc + urPrintExpCommandBufferInfo + urPrintExpCommandBufferUpdateKernelLaunchDesc + urPrintExpCommandBufferUpdateMemobjArgDesc + urPrintExpCommandBufferUpdatePointerArgDesc + urPrintExpCommandBufferUpdateValueArgDesc + urPrintExpFileDescriptor + urPrintExpImageCopyFlags + urPrintExpInteropMemDesc + urPrintExpInteropSemaphoreDesc + urPrintExpLaunchProperty + urPrintExpLaunchPropertyId + urPrintExpPeerInfo + urPrintExpSamplerAddrModes + urPrintExpSamplerCubemapFilterMode + urPrintExpSamplerCubemapProperties + urPrintExpSamplerMipProperties + urPrintExpWin32Handle + urPrintFunction + urPrintFunctionParams + urPrintImageChannelOrder + urPrintImageChannelType + urPrintImageDesc 
+ urPrintImageFormat + urPrintImageInfo + urPrintKernelArgLocalProperties + urPrintKernelArgMemObjProperties + urPrintKernelArgPointerProperties + urPrintKernelArgSamplerProperties + urPrintKernelArgValueProperties + urPrintKernelCacheConfig + urPrintKernelCreateParams + urPrintKernelCreateWithNativeHandleParams + urPrintKernelExecInfo + urPrintKernelExecInfoProperties + urPrintKernelGetGroupInfoParams + urPrintKernelGetInfoParams + urPrintKernelGetNativeHandleParams + urPrintKernelGetSubGroupInfoParams + urPrintKernelGetSuggestedLocalWorkSizeParams + urPrintKernelGroupInfo + urPrintKernelInfo + urPrintKernelNativeProperties + urPrintKernelReleaseParams + urPrintKernelRetainParams + urPrintKernelSetArgLocalParams + urPrintKernelSetArgMemObjParams + urPrintKernelSetArgPointerParams + urPrintKernelSetArgSamplerParams + urPrintKernelSetArgValueParams + urPrintKernelSetExecInfoParams + urPrintKernelSetSpecializationConstantsParams + urPrintKernelSubGroupInfo + urPrintKernelSuggestMaxCooperativeGroupCountExpParams + urPrintLoaderConfigCreateParams + urPrintLoaderConfigEnableLayerParams + urPrintLoaderConfigGetInfoParams + urPrintLoaderConfigInfo + urPrintLoaderConfigReleaseParams + urPrintLoaderConfigRetainParams + urPrintLoaderConfigSetCodeLocationCallbackParams + urPrintLoaderInitParams + urPrintLoaderTearDownParams + urPrintMapFlags + urPrintMemBufferCreateParams + urPrintMemBufferCreateWithNativeHandleParams + urPrintMemBufferPartitionParams + urPrintMemFlags + urPrintMemGetInfoParams + urPrintMemGetNativeHandleParams + urPrintMemImageCreateParams + urPrintMemImageCreateWithNativeHandleParams + urPrintMemImageGetInfoParams + urPrintMemInfo + urPrintMemNativeProperties + urPrintMemReleaseParams + urPrintMemRetainParams + urPrintMemType + urPrintMemoryOrderCapabilityFlags + urPrintMemoryScopeCapabilityFlags + urPrintPhysicalMemCreateParams + urPrintPhysicalMemFlags + urPrintPhysicalMemProperties + urPrintPhysicalMemReleaseParams + urPrintPhysicalMemRetainParams + 
urPrintPlatformBackend + urPrintPlatformCreateWithNativeHandleParams + urPrintPlatformGetApiVersionParams + urPrintPlatformGetBackendOptionParams + urPrintPlatformGetInfoParams + urPrintPlatformGetNativeHandleParams + urPrintPlatformGetParams + urPrintPlatformInfo + urPrintPlatformNativeProperties + urPrintProfilingInfo + urPrintProgramBinaryType + urPrintProgramBuildExpParams + urPrintProgramBuildInfo + urPrintProgramBuildParams + urPrintProgramBuildStatus + urPrintProgramCompileExpParams + urPrintProgramCompileParams + urPrintProgramCreateWithBinaryParams + urPrintProgramCreateWithIlParams + urPrintProgramCreateWithNativeHandleParams + urPrintProgramGetBuildInfoParams + urPrintProgramGetFunctionPointerParams + urPrintProgramGetGlobalVariablePointerParams + urPrintProgramGetInfoParams + urPrintProgramGetNativeHandleParams + urPrintProgramInfo + urPrintProgramLinkExpParams + urPrintProgramLinkParams + urPrintProgramMetadata + urPrintProgramMetadataType + urPrintProgramNativeProperties + urPrintProgramProperties + urPrintProgramReleaseParams + urPrintProgramRetainParams + urPrintProgramSetSpecializationConstantsParams + urPrintQueueCreateParams + urPrintQueueCreateWithNativeHandleParams + urPrintQueueFinishParams + urPrintQueueFlags + urPrintQueueFlushParams + urPrintQueueGetInfoParams + urPrintQueueGetNativeHandleParams + urPrintQueueIndexProperties + urPrintQueueInfo + urPrintQueueNativeDesc + urPrintQueueNativeProperties + urPrintQueueProperties + urPrintQueueReleaseParams + urPrintQueueRetainParams + urPrintRectOffset + urPrintRectRegion + urPrintResult + urPrintSamplerAddressingMode + urPrintSamplerCreateParams + urPrintSamplerCreateWithNativeHandleParams + urPrintSamplerDesc + urPrintSamplerFilterMode + urPrintSamplerGetInfoParams + urPrintSamplerGetNativeHandleParams + urPrintSamplerInfo + urPrintSamplerNativeProperties + urPrintSamplerReleaseParams + urPrintSamplerRetainParams + urPrintSpecializationConstantInfo + urPrintStructureType + urPrintUsmAdviceFlags 
+ urPrintUsmAllocInfo + urPrintUsmAllocLocationDesc + urPrintUsmDesc + urPrintUsmDeviceAllocParams + urPrintUsmDeviceDesc + urPrintUsmDeviceMemFlags + urPrintUsmFreeParams + urPrintUsmGetMemAllocInfoParams + urPrintUsmHostAllocParams + urPrintUsmHostDesc + urPrintUsmHostMemFlags + urPrintUsmImportExpParams + urPrintUsmMigrationFlags + urPrintUsmP2pDisablePeerAccessExpParams + urPrintUsmP2pEnablePeerAccessExpParams + urPrintUsmP2pPeerAccessGetInfoExpParams + urPrintUsmPitchedAllocExpParams + urPrintUsmPoolCreateParams + urPrintUsmPoolDesc + urPrintUsmPoolFlags + urPrintUsmPoolGetInfoParams + urPrintUsmPoolInfo + urPrintUsmPoolLimitsDesc + urPrintUsmPoolReleaseParams + urPrintUsmPoolRetainParams + urPrintUsmReleaseExpParams + urPrintUsmSharedAllocParams + urPrintUsmType + urPrintVirtualMemAccessFlags + urPrintVirtualMemFreeParams + urPrintVirtualMemGetInfoParams + urPrintVirtualMemGranularityGetInfoParams + urPrintVirtualMemGranularityInfo + urPrintVirtualMemInfo + urPrintVirtualMemMapParams + urPrintVirtualMemReserveParams + urPrintVirtualMemSetAccessParams + urPrintVirtualMemUnmapParams + urProgramBuild + urProgramBuildExp + urProgramCompile + urProgramCompileExp + urProgramCreateWithBinary + urProgramCreateWithIL + urProgramCreateWithNativeHandle + urProgramGetBuildInfo + urProgramGetFunctionPointer + urProgramGetGlobalVariablePointer + urProgramGetInfo + urProgramGetNativeHandle + urProgramLink + urProgramLinkExp + urProgramRelease + urProgramRetain + urProgramSetSpecializationConstants + urQueueCreate + urQueueCreateWithNativeHandle + urQueueFinish + urQueueFlush + urQueueGetInfo + urQueueGetNativeHandle + urQueueRelease + urQueueRetain + urSamplerCreate + urSamplerCreateWithNativeHandle + urSamplerGetInfo + urSamplerGetNativeHandle + urSamplerRelease + urSamplerRetain + urUSMDeviceAlloc + urUSMFree + urUSMGetMemAllocInfo + urUSMHostAlloc + urUSMImportExp + urUSMPitchedAllocExp + urUSMPoolCreate + urUSMPoolGetInfo + urUSMPoolRelease + urUSMPoolRetain + 
urUSMReleaseExp + urUSMSharedAlloc + urUsmP2PDisablePeerAccessExp + urUsmP2PEnablePeerAccessExp + urUsmP2PPeerAccessGetInfoExp + urVirtualMemFree + urVirtualMemGetInfo + urVirtualMemGranularityGetInfo + urVirtualMemMap + urVirtualMemReserve + urVirtualMemSetAccess + urVirtualMemUnmap diff --git a/source/loader/loader.map.in b/source/loader/loader.map.in new file mode 100644 index 0000000000..fd390f7fc4 --- /dev/null +++ b/source/loader/loader.map.in @@ -0,0 +1,541 @@ +@TARGET_LIBNAME@ { + global: + urAdapterGet; + urAdapterGetInfo; + urAdapterGetLastError; + urAdapterRelease; + urAdapterRetain; + urBindlessImagesDestroyExternalSemaphoreExp; + urBindlessImagesImageAllocateExp; + urBindlessImagesImageCopyExp; + urBindlessImagesImageFreeExp; + urBindlessImagesImageGetInfoExp; + urBindlessImagesImportExternalSemaphoreOpaqueFDExp; + urBindlessImagesImportOpaqueFDExp; + urBindlessImagesMapExternalArrayExp; + urBindlessImagesMipmapFreeExp; + urBindlessImagesMipmapGetLevelExp; + urBindlessImagesReleaseInteropExp; + urBindlessImagesSampledImageCreateExp; + urBindlessImagesSampledImageHandleDestroyExp; + urBindlessImagesSignalExternalSemaphoreExp; + urBindlessImagesUnsampledImageCreateExp; + urBindlessImagesUnsampledImageHandleDestroyExp; + urBindlessImagesWaitExternalSemaphoreExp; + urCommandBufferAppendKernelLaunchExp; + urCommandBufferAppendMemBufferCopyExp; + urCommandBufferAppendMemBufferCopyRectExp; + urCommandBufferAppendMemBufferFillExp; + urCommandBufferAppendMemBufferReadExp; + urCommandBufferAppendMemBufferReadRectExp; + urCommandBufferAppendMemBufferWriteExp; + urCommandBufferAppendMemBufferWriteRectExp; + urCommandBufferAppendUSMAdviseExp; + urCommandBufferAppendUSMFillExp; + urCommandBufferAppendUSMMemcpyExp; + urCommandBufferAppendUSMPrefetchExp; + urCommandBufferCommandGetInfoExp; + urCommandBufferCreateExp; + urCommandBufferEnqueueExp; + urCommandBufferFinalizeExp; + urCommandBufferGetInfoExp; + urCommandBufferReleaseCommandExp; + urCommandBufferReleaseExp; 
+ urCommandBufferRetainCommandExp; + urCommandBufferRetainExp; + urCommandBufferUpdateKernelLaunchExp; + urContextCreate; + urContextCreateWithNativeHandle; + urContextGetInfo; + urContextGetNativeHandle; + urContextRelease; + urContextRetain; + urContextSetExtendedDeleter; + urDeviceCreateWithNativeHandle; + urDeviceGet; + urDeviceGetGlobalTimestamps; + urDeviceGetInfo; + urDeviceGetNativeHandle; + urDeviceGetSelected; + urDevicePartition; + urDeviceRelease; + urDeviceRetain; + urDeviceSelectBinary; + urEnqueueCooperativeKernelLaunchExp; + urEnqueueDeviceGlobalVariableRead; + urEnqueueDeviceGlobalVariableWrite; + urEnqueueEventsWait; + urEnqueueEventsWaitWithBarrier; + urEnqueueKernelLaunch; + urEnqueueKernelLaunchCustomExp; + urEnqueueMemBufferCopy; + urEnqueueMemBufferCopyRect; + urEnqueueMemBufferFill; + urEnqueueMemBufferMap; + urEnqueueMemBufferRead; + urEnqueueMemBufferReadRect; + urEnqueueMemBufferWrite; + urEnqueueMemBufferWriteRect; + urEnqueueMemImageCopy; + urEnqueueMemImageRead; + urEnqueueMemImageWrite; + urEnqueueMemUnmap; + urEnqueueReadHostPipe; + urEnqueueTimestampRecordingExp; + urEnqueueUSMAdvise; + urEnqueueUSMFill; + urEnqueueUSMFill2D; + urEnqueueUSMMemcpy; + urEnqueueUSMMemcpy2D; + urEnqueueUSMPrefetch; + urEnqueueWriteHostPipe; + urEventCreateWithNativeHandle; + urEventGetInfo; + urEventGetNativeHandle; + urEventGetProfilingInfo; + urEventRelease; + urEventRetain; + urEventSetCallback; + urEventWait; + urGetBindlessImagesExpProcAddrTable; + urGetCommandBufferExpProcAddrTable; + urGetContextProcAddrTable; + urGetDeviceProcAddrTable; + urGetEnqueueExpProcAddrTable; + urGetEnqueueProcAddrTable; + urGetEventProcAddrTable; + urGetGlobalProcAddrTable; + urGetKernelExpProcAddrTable; + urGetKernelProcAddrTable; + urGetMemProcAddrTable; + urGetPhysicalMemProcAddrTable; + urGetPlatformProcAddrTable; + urGetProgramExpProcAddrTable; + urGetProgramProcAddrTable; + urGetQueueProcAddrTable; + urGetSamplerProcAddrTable; + urGetUSMExpProcAddrTable; + 
urGetUSMProcAddrTable; + urGetUsmP2PExpProcAddrTable; + urGetVirtualMemProcAddrTable; + urKernelCreate; + urKernelCreateWithNativeHandle; + urKernelGetGroupInfo; + urKernelGetInfo; + urKernelGetNativeHandle; + urKernelGetSubGroupInfo; + urKernelGetSuggestedLocalWorkSize; + urKernelRelease; + urKernelRetain; + urKernelSetArgLocal; + urKernelSetArgMemObj; + urKernelSetArgPointer; + urKernelSetArgSampler; + urKernelSetArgValue; + urKernelSetExecInfo; + urKernelSetSpecializationConstants; + urKernelSuggestMaxCooperativeGroupCountExp; + urLoaderConfigCreate; + urLoaderConfigEnableLayer; + urLoaderConfigGetInfo; + urLoaderConfigRelease; + urLoaderConfigRetain; + urLoaderConfigSetCodeLocationCallback; + urLoaderInit; + urLoaderTearDown; + urMemBufferCreate; + urMemBufferCreateWithNativeHandle; + urMemBufferPartition; + urMemGetInfo; + urMemGetNativeHandle; + urMemImageCreate; + urMemImageCreateWithNativeHandle; + urMemImageGetInfo; + urMemRelease; + urMemRetain; + urPhysicalMemCreate; + urPhysicalMemRelease; + urPhysicalMemRetain; + urPlatformCreateWithNativeHandle; + urPlatformGet; + urPlatformGetApiVersion; + urPlatformGetBackendOption; + urPlatformGetInfo; + urPlatformGetNativeHandle; + urPrintAdapterBackend; + urPrintAdapterGetInfoParams; + urPrintAdapterGetLastErrorParams; + urPrintAdapterGetParams; + urPrintAdapterInfo; + urPrintAdapterReleaseParams; + urPrintAdapterRetainParams; + urPrintApiVersion; + urPrintBaseDesc; + urPrintBaseProperties; + urPrintBindlessImagesDestroyExternalSemaphoreExpParams; + urPrintBindlessImagesImageAllocateExpParams; + urPrintBindlessImagesImageCopyExpParams; + urPrintBindlessImagesImageFreeExpParams; + urPrintBindlessImagesImageGetInfoExpParams; + urPrintBindlessImagesImportExternalSemaphoreOpaqueFdExpParams; + urPrintBindlessImagesImportOpaqueFdExpParams; + urPrintBindlessImagesMapExternalArrayExpParams; + urPrintBindlessImagesMipmapFreeExpParams; + urPrintBindlessImagesMipmapGetLevelExpParams; + 
urPrintBindlessImagesReleaseInteropExpParams; + urPrintBindlessImagesSampledImageCreateExpParams; + urPrintBindlessImagesSampledImageHandleDestroyExpParams; + urPrintBindlessImagesSignalExternalSemaphoreExpParams; + urPrintBindlessImagesUnsampledImageCreateExpParams; + urPrintBindlessImagesUnsampledImageHandleDestroyExpParams; + urPrintBindlessImagesWaitExternalSemaphoreExpParams; + urPrintBufferAllocLocationProperties; + urPrintBufferChannelProperties; + urPrintBufferCreateType; + urPrintBufferProperties; + urPrintBufferRegion; + urPrintCodeLocation; + urPrintCommand; + urPrintCommandBufferAppendKernelLaunchExpParams; + urPrintCommandBufferAppendMemBufferCopyExpParams; + urPrintCommandBufferAppendMemBufferCopyRectExpParams; + urPrintCommandBufferAppendMemBufferFillExpParams; + urPrintCommandBufferAppendMemBufferReadExpParams; + urPrintCommandBufferAppendMemBufferReadRectExpParams; + urPrintCommandBufferAppendMemBufferWriteExpParams; + urPrintCommandBufferAppendMemBufferWriteRectExpParams; + urPrintCommandBufferAppendUsmAdviseExpParams; + urPrintCommandBufferAppendUsmFillExpParams; + urPrintCommandBufferAppendUsmMemcpyExpParams; + urPrintCommandBufferAppendUsmPrefetchExpParams; + urPrintCommandBufferCommandGetInfoExpParams; + urPrintCommandBufferCreateExpParams; + urPrintCommandBufferEnqueueExpParams; + urPrintCommandBufferFinalizeExpParams; + urPrintCommandBufferGetInfoExpParams; + urPrintCommandBufferReleaseCommandExpParams; + urPrintCommandBufferReleaseExpParams; + urPrintCommandBufferRetainCommandExpParams; + urPrintCommandBufferRetainExpParams; + urPrintCommandBufferUpdateKernelLaunchExpParams; + urPrintContextCreateParams; + urPrintContextCreateWithNativeHandleParams; + urPrintContextFlags; + urPrintContextGetInfoParams; + urPrintContextGetNativeHandleParams; + urPrintContextInfo; + urPrintContextNativeProperties; + urPrintContextProperties; + urPrintContextReleaseParams; + urPrintContextRetainParams; + urPrintContextSetExtendedDeleterParams; + 
urPrintDeviceAffinityDomainFlags; + urPrintDeviceBinary; + urPrintDeviceCreateWithNativeHandleParams; + urPrintDeviceExecCapabilityFlags; + urPrintDeviceFpCapabilityFlags; + urPrintDeviceGetGlobalTimestampsParams; + urPrintDeviceGetInfoParams; + urPrintDeviceGetNativeHandleParams; + urPrintDeviceGetParams; + urPrintDeviceGetSelectedParams; + urPrintDeviceInfo; + urPrintDeviceInitFlags; + urPrintDeviceLocalMemType; + urPrintDeviceMemCacheType; + urPrintDeviceNativeProperties; + urPrintDevicePartition; + urPrintDevicePartitionParams; + urPrintDevicePartitionProperties; + urPrintDevicePartitionProperty; + urPrintDeviceReleaseParams; + urPrintDeviceRetainParams; + urPrintDeviceSelectBinaryParams; + urPrintDeviceType; + urPrintDeviceUsmAccessCapabilityFlags; + urPrintEnqueueCooperativeKernelLaunchExpParams; + urPrintEnqueueDeviceGlobalVariableReadParams; + urPrintEnqueueDeviceGlobalVariableWriteParams; + urPrintEnqueueEventsWaitParams; + urPrintEnqueueEventsWaitWithBarrierParams; + urPrintEnqueueKernelLaunchCustomExpParams; + urPrintEnqueueKernelLaunchParams; + urPrintEnqueueMemBufferCopyParams; + urPrintEnqueueMemBufferCopyRectParams; + urPrintEnqueueMemBufferFillParams; + urPrintEnqueueMemBufferMapParams; + urPrintEnqueueMemBufferReadParams; + urPrintEnqueueMemBufferReadRectParams; + urPrintEnqueueMemBufferWriteParams; + urPrintEnqueueMemBufferWriteRectParams; + urPrintEnqueueMemImageCopyParams; + urPrintEnqueueMemImageReadParams; + urPrintEnqueueMemImageWriteParams; + urPrintEnqueueMemUnmapParams; + urPrintEnqueueReadHostPipeParams; + urPrintEnqueueTimestampRecordingExpParams; + urPrintEnqueueUsmAdviseParams; + urPrintEnqueueUsmFillParams; + urPrintEnqueueUsmFill_2dParams; + urPrintEnqueueUsmMemcpyParams; + urPrintEnqueueUsmMemcpy_2dParams; + urPrintEnqueueUsmPrefetchParams; + urPrintEnqueueWriteHostPipeParams; + urPrintEventCreateWithNativeHandleParams; + urPrintEventGetInfoParams; + urPrintEventGetNativeHandleParams; + urPrintEventGetProfilingInfoParams; + 
urPrintEventInfo; + urPrintEventNativeProperties; + urPrintEventReleaseParams; + urPrintEventRetainParams; + urPrintEventSetCallbackParams; + urPrintEventStatus; + urPrintEventWaitParams; + urPrintExecutionInfo; + urPrintExpCommandBufferCommandInfo; + urPrintExpCommandBufferDesc; + urPrintExpCommandBufferInfo; + urPrintExpCommandBufferUpdateKernelLaunchDesc; + urPrintExpCommandBufferUpdateMemobjArgDesc; + urPrintExpCommandBufferUpdatePointerArgDesc; + urPrintExpCommandBufferUpdateValueArgDesc; + urPrintExpFileDescriptor; + urPrintExpImageCopyFlags; + urPrintExpInteropMemDesc; + urPrintExpInteropSemaphoreDesc; + urPrintExpLaunchProperty; + urPrintExpLaunchPropertyId; + urPrintExpPeerInfo; + urPrintExpSamplerAddrModes; + urPrintExpSamplerCubemapFilterMode; + urPrintExpSamplerCubemapProperties; + urPrintExpSamplerMipProperties; + urPrintExpWin32Handle; + urPrintFunction; + urPrintFunctionParams; + urPrintImageChannelOrder; + urPrintImageChannelType; + urPrintImageDesc; + urPrintImageFormat; + urPrintImageInfo; + urPrintKernelArgLocalProperties; + urPrintKernelArgMemObjProperties; + urPrintKernelArgPointerProperties; + urPrintKernelArgSamplerProperties; + urPrintKernelArgValueProperties; + urPrintKernelCacheConfig; + urPrintKernelCreateParams; + urPrintKernelCreateWithNativeHandleParams; + urPrintKernelExecInfo; + urPrintKernelExecInfoProperties; + urPrintKernelGetGroupInfoParams; + urPrintKernelGetInfoParams; + urPrintKernelGetNativeHandleParams; + urPrintKernelGetSubGroupInfoParams; + urPrintKernelGetSuggestedLocalWorkSizeParams; + urPrintKernelGroupInfo; + urPrintKernelInfo; + urPrintKernelNativeProperties; + urPrintKernelReleaseParams; + urPrintKernelRetainParams; + urPrintKernelSetArgLocalParams; + urPrintKernelSetArgMemObjParams; + urPrintKernelSetArgPointerParams; + urPrintKernelSetArgSamplerParams; + urPrintKernelSetArgValueParams; + urPrintKernelSetExecInfoParams; + urPrintKernelSetSpecializationConstantsParams; + urPrintKernelSubGroupInfo; + 
urPrintKernelSuggestMaxCooperativeGroupCountExpParams; + urPrintLoaderConfigCreateParams; + urPrintLoaderConfigEnableLayerParams; + urPrintLoaderConfigGetInfoParams; + urPrintLoaderConfigInfo; + urPrintLoaderConfigReleaseParams; + urPrintLoaderConfigRetainParams; + urPrintLoaderConfigSetCodeLocationCallbackParams; + urPrintLoaderInitParams; + urPrintLoaderTearDownParams; + urPrintMapFlags; + urPrintMemBufferCreateParams; + urPrintMemBufferCreateWithNativeHandleParams; + urPrintMemBufferPartitionParams; + urPrintMemFlags; + urPrintMemGetInfoParams; + urPrintMemGetNativeHandleParams; + urPrintMemImageCreateParams; + urPrintMemImageCreateWithNativeHandleParams; + urPrintMemImageGetInfoParams; + urPrintMemInfo; + urPrintMemNativeProperties; + urPrintMemReleaseParams; + urPrintMemRetainParams; + urPrintMemType; + urPrintMemoryOrderCapabilityFlags; + urPrintMemoryScopeCapabilityFlags; + urPrintPhysicalMemCreateParams; + urPrintPhysicalMemFlags; + urPrintPhysicalMemProperties; + urPrintPhysicalMemReleaseParams; + urPrintPhysicalMemRetainParams; + urPrintPlatformBackend; + urPrintPlatformCreateWithNativeHandleParams; + urPrintPlatformGetApiVersionParams; + urPrintPlatformGetBackendOptionParams; + urPrintPlatformGetInfoParams; + urPrintPlatformGetNativeHandleParams; + urPrintPlatformGetParams; + urPrintPlatformInfo; + urPrintPlatformNativeProperties; + urPrintProfilingInfo; + urPrintProgramBinaryType; + urPrintProgramBuildExpParams; + urPrintProgramBuildInfo; + urPrintProgramBuildParams; + urPrintProgramBuildStatus; + urPrintProgramCompileExpParams; + urPrintProgramCompileParams; + urPrintProgramCreateWithBinaryParams; + urPrintProgramCreateWithIlParams; + urPrintProgramCreateWithNativeHandleParams; + urPrintProgramGetBuildInfoParams; + urPrintProgramGetFunctionPointerParams; + urPrintProgramGetGlobalVariablePointerParams; + urPrintProgramGetInfoParams; + urPrintProgramGetNativeHandleParams; + urPrintProgramInfo; + urPrintProgramLinkExpParams; + urPrintProgramLinkParams; + 
urPrintProgramMetadata; + urPrintProgramMetadataType; + urPrintProgramNativeProperties; + urPrintProgramProperties; + urPrintProgramReleaseParams; + urPrintProgramRetainParams; + urPrintProgramSetSpecializationConstantsParams; + urPrintQueueCreateParams; + urPrintQueueCreateWithNativeHandleParams; + urPrintQueueFinishParams; + urPrintQueueFlags; + urPrintQueueFlushParams; + urPrintQueueGetInfoParams; + urPrintQueueGetNativeHandleParams; + urPrintQueueIndexProperties; + urPrintQueueInfo; + urPrintQueueNativeDesc; + urPrintQueueNativeProperties; + urPrintQueueProperties; + urPrintQueueReleaseParams; + urPrintQueueRetainParams; + urPrintRectOffset; + urPrintRectRegion; + urPrintResult; + urPrintSamplerAddressingMode; + urPrintSamplerCreateParams; + urPrintSamplerCreateWithNativeHandleParams; + urPrintSamplerDesc; + urPrintSamplerFilterMode; + urPrintSamplerGetInfoParams; + urPrintSamplerGetNativeHandleParams; + urPrintSamplerInfo; + urPrintSamplerNativeProperties; + urPrintSamplerReleaseParams; + urPrintSamplerRetainParams; + urPrintSpecializationConstantInfo; + urPrintStructureType; + urPrintUsmAdviceFlags; + urPrintUsmAllocInfo; + urPrintUsmAllocLocationDesc; + urPrintUsmDesc; + urPrintUsmDeviceAllocParams; + urPrintUsmDeviceDesc; + urPrintUsmDeviceMemFlags; + urPrintUsmFreeParams; + urPrintUsmGetMemAllocInfoParams; + urPrintUsmHostAllocParams; + urPrintUsmHostDesc; + urPrintUsmHostMemFlags; + urPrintUsmImportExpParams; + urPrintUsmMigrationFlags; + urPrintUsmP2pDisablePeerAccessExpParams; + urPrintUsmP2pEnablePeerAccessExpParams; + urPrintUsmP2pPeerAccessGetInfoExpParams; + urPrintUsmPitchedAllocExpParams; + urPrintUsmPoolCreateParams; + urPrintUsmPoolDesc; + urPrintUsmPoolFlags; + urPrintUsmPoolGetInfoParams; + urPrintUsmPoolInfo; + urPrintUsmPoolLimitsDesc; + urPrintUsmPoolReleaseParams; + urPrintUsmPoolRetainParams; + urPrintUsmReleaseExpParams; + urPrintUsmSharedAllocParams; + urPrintUsmType; + urPrintVirtualMemAccessFlags; + urPrintVirtualMemFreeParams; + 
urPrintVirtualMemGetInfoParams; + urPrintVirtualMemGranularityGetInfoParams; + urPrintVirtualMemGranularityInfo; + urPrintVirtualMemInfo; + urPrintVirtualMemMapParams; + urPrintVirtualMemReserveParams; + urPrintVirtualMemSetAccessParams; + urPrintVirtualMemUnmapParams; + urProgramBuild; + urProgramBuildExp; + urProgramCompile; + urProgramCompileExp; + urProgramCreateWithBinary; + urProgramCreateWithIL; + urProgramCreateWithNativeHandle; + urProgramGetBuildInfo; + urProgramGetFunctionPointer; + urProgramGetGlobalVariablePointer; + urProgramGetInfo; + urProgramGetNativeHandle; + urProgramLink; + urProgramLinkExp; + urProgramRelease; + urProgramRetain; + urProgramSetSpecializationConstants; + urQueueCreate; + urQueueCreateWithNativeHandle; + urQueueFinish; + urQueueFlush; + urQueueGetInfo; + urQueueGetNativeHandle; + urQueueRelease; + urQueueRetain; + urSamplerCreate; + urSamplerCreateWithNativeHandle; + urSamplerGetInfo; + urSamplerGetNativeHandle; + urSamplerRelease; + urSamplerRetain; + urUSMDeviceAlloc; + urUSMFree; + urUSMGetMemAllocInfo; + urUSMHostAlloc; + urUSMImportExp; + urUSMPitchedAllocExp; + urUSMPoolCreate; + urUSMPoolGetInfo; + urUSMPoolRelease; + urUSMPoolRetain; + urUSMReleaseExp; + urUSMSharedAlloc; + urUsmP2PDisablePeerAccessExp; + urUsmP2PEnablePeerAccessExp; + urUsmP2PPeerAccessGetInfoExp; + urVirtualMemFree; + urVirtualMemGetInfo; + urVirtualMemGranularityGetInfo; + urVirtualMemMap; + urVirtualMemReserve; + urVirtualMemSetAccess; + urVirtualMemUnmap; + local: + *; +}; diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index d7a9447b06..ed7e4299b1 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -22,7 +22,6 @@ ur_event_factory_t ur_event_factory; ur_program_factory_t ur_program_factory; ur_kernel_factory_t ur_kernel_factory; ur_queue_factory_t ur_queue_factory; -ur_native_factory_t ur_native_factory; ur_sampler_factory_t ur_sampler_factory; ur_mem_factory_t ur_mem_factory; ur_physical_mem_factory_t 
ur_physical_mem_factory; @@ -364,6 +363,8 @@ __urdlllocal ur_result_t UR_APICALL urPlatformGetNativeHandle( __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( ur_native_handle_t hNativePlatform, ///< [in][nocheck] the native handle of the platform. + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter associated with the native backend. const ur_platform_native_properties_t * pProperties, ///< [in][optional] pointer to native platform properties struct. ur_platform_handle_t * @@ -372,8 +373,7 @@ __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( ur_result_t result = UR_RESULT_SUCCESS; // extract platform's function pointer table - auto dditable = - reinterpret_cast(hNativePlatform)->dditable; + auto dditable = reinterpret_cast(hAdapter)->dditable; auto pfnCreateWithNativeHandle = dditable->ur.Platform.pfnCreateWithNativeHandle; if (nullptr == pfnCreateWithNativeHandle) { @@ -381,12 +381,11 @@ __urdlllocal ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( } // convert loader handle to platform handle - hNativePlatform = - reinterpret_cast(hNativePlatform)->handle; + hAdapter = reinterpret_cast(hAdapter)->handle; // forward to device-platform - result = - pfnCreateWithNativeHandle(hNativePlatform, pProperties, phPlatform); + result = pfnCreateWithNativeHandle(hNativePlatform, hAdapter, pProperties, + phPlatform); if (UR_RESULT_SUCCESS != result) { return result; @@ -3449,6 +3448,49 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreateWithNativeHandle( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Intercept function for urKernelGetSuggestedLocalWorkSize +__urdlllocal ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + uint32_t + numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global + ///< and 
work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the offset used to calculate the global ID of a work-item + const size_t * + pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the number of global work-items in workDim that will execute the + ///< kernel function + size_t * + pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that specify + ///< suggested local work size that will contain the result of the query +) { + ur_result_t result = UR_RESULT_SUCCESS; + + // extract platform's function pointer table + auto dditable = reinterpret_cast(hKernel)->dditable; + auto pfnGetSuggestedLocalWorkSize = + dditable->ur.Kernel.pfnGetSuggestedLocalWorkSize; + if (nullptr == pfnGetSuggestedLocalWorkSize) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + // convert loader handle to platform handle + hKernel = reinterpret_cast(hKernel)->handle; + + // convert loader handle to platform handle + hQueue = reinterpret_cast(hQueue)->handle; + + // forward to device-platform + result = pfnGetSuggestedLocalWorkSize(hKernel, hQueue, numWorkDim, + pGlobalWorkOffset, pGlobalWorkSize, + pSuggestedLocalWorkSize); + + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Intercept function for urQueueGetInfo __urdlllocal ur_result_t UR_APICALL urQueueGetInfo( @@ -8599,6 +8641,8 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( pDdiTable->pfnGetNativeHandle = ur_loader::urKernelGetNativeHandle; pDdiTable->pfnCreateWithNativeHandle = ur_loader::urKernelCreateWithNativeHandle; + pDdiTable->pfnGetSuggestedLocalWorkSize = + ur_loader::urKernelGetSuggestedLocalWorkSize; pDdiTable->pfnSetArgValue = ur_loader::urKernelSetArgValue; pDdiTable->pfnSetArgLocal = ur_loader::urKernelSetArgLocal; pDdiTable->pfnSetArgPointer = ur_loader::urKernelSetArgPointer; diff 
--git a/source/loader/ur_ldrddi.hpp b/source/loader/ur_ldrddi.hpp index d98b99a655..af5b5d49b5 100644 --- a/source/loader/ur_ldrddi.hpp +++ b/source/loader/ur_ldrddi.hpp @@ -49,10 +49,6 @@ using ur_queue_object_t = object_t; using ur_queue_factory_t = singleton_factory_t; -using ur_native_object_t = object_t; -using ur_native_factory_t = - singleton_factory_t; - using ur_sampler_object_t = object_t; using ur_sampler_factory_t = singleton_factory_t; diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 35e5d68e36..1bc1fc82f0 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -653,6 +653,8 @@ ur_result_t UR_APICALL urPlatformGetNativeHandle( /// - ::UR_RESULT_ERROR_UNINITIALIZED /// - ::UR_RESULT_ERROR_DEVICE_LOST /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hAdapter` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phPlatform` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE @@ -660,6 +662,8 @@ ur_result_t UR_APICALL urPlatformGetNativeHandle( ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( ur_native_handle_t hNativePlatform, ///< [in][nocheck] the native handle of the platform. + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter associated with the native backend. const ur_platform_native_properties_t * pProperties, ///< [in][optional] pointer to native platform properties struct. ur_platform_handle_t * @@ -671,7 +675,8 @@ ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( return UR_RESULT_ERROR_UNINITIALIZED; } - return pfnCreateWithNativeHandle(hNativePlatform, pProperties, phPlatform); + return pfnCreateWithNativeHandle(hNativePlatform, hAdapter, pProperties, + phPlatform); } catch (...) 
{ return exceptionToResult(std::current_exception()); } @@ -1470,6 +1475,22 @@ ur_result_t UR_APICALL urContextSetExtendedDeleter( /////////////////////////////////////////////////////////////////////////////// /// @brief Create an image object /// +/// @details +/// - The primary ::ur_image_format_t that must be supported by all the +/// adapters are {UR_IMAGE_CHANNEL_ORDER_RGBA, +/// UR_IMAGE_CHANNEL_TYPE_UNORM_INT8}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT16}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT8}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT16}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_FLOAT}. 
+/// /// @remarks /// _Analogues_ /// - **clCreateImage** @@ -1490,12 +1511,13 @@ ur_result_t UR_APICALL urContextSetExtendedDeleter( /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR -/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type` +/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type` /// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE /// - ::UR_RESULT_ERROR_INVALID_OPERATION /// - ::UR_RESULT_ERROR_INVALID_HOST_PTR /// + `pHost == NULL && (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) != 0` /// + `pHost != NULL && (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) == 0` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urMemImageCreate( @@ -4096,6 +4118,58 @@ ur_result_t UR_APICALL urKernelCreateWithNativeHandle( return exceptionToResult(std::current_exception()); } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Get the suggested local work size for a kernel. +/// +/// @details +/// - Query a suggested local work size for a kernel given a global size for +/// each dimension. +/// - The application may call this function from simultaneous threads for +/// the same context. 
+/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hKernel` +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkOffset` +/// + `NULL == pGlobalWorkSize` +/// + `NULL == pSuggestedLocalWorkSize` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + uint32_t + numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global + ///< and work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the offset used to calculate the global ID of a work-item + const size_t * + pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the number of global work-items in workDim that will execute the + ///< kernel function + size_t * + pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that specify + ///< suggested local work size that will contain the result of the query + ) try { + auto pfnGetSuggestedLocalWorkSize = + ur_lib::context->urDdiTable.Kernel.pfnGetSuggestedLocalWorkSize; + if (nullptr == pfnGetSuggestedLocalWorkSize) { + return UR_RESULT_ERROR_UNINITIALIZED; + } + + return pfnGetSuggestedLocalWorkSize(hKernel, hQueue, numWorkDim, + pGlobalWorkOffset, pGlobalWorkSize, + pSuggestedLocalWorkSize); +} catch (...) 
{ + return exceptionToResult(std::current_exception()); +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Query information about a command queue /// @@ -4120,7 +4194,7 @@ ur_result_t UR_APICALL urKernelCreateWithNativeHandle( /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `propSize != 0 && pPropValue == NULL` /// + `pPropValue == NULL && pPropSizeRet == NULL` -/// - ::UR_RESULT_ERROR_INVALID_QUEUE +/// - ::UR_RESULT_ERROR_INVALID_QUEUE - "If `hQueue` isn't a valid queue handle or if `propName` isn't supported by `hQueue`." /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urQueueGetInfo( @@ -6525,7 +6599,7 @@ ur_result_t UR_APICALL urBindlessImagesSampledImageHandleDestroyExp( /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR -/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type` +/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type` /// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE /// - ::UR_RESULT_ERROR_INVALID_OPERATION ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( @@ -6607,7 +6681,7 @@ ur_result_t UR_APICALL urBindlessImagesImageFreeExp( /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR -/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type` +/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type` /// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE /// - ::UR_RESULT_ERROR_INVALID_OPERATION ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( @@ -6658,7 +6732,7 @@ ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR -/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type` +/// + 
`pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type` /// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE /// - ::UR_RESULT_ERROR_INVALID_SAMPLER /// - ::UR_RESULT_ERROR_INVALID_OPERATION @@ -6713,7 +6787,7 @@ ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( /// - ::UR_RESULT_ERROR_INVALID_QUEUE /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR -/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type` +/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type` /// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE /// - ::UR_RESULT_ERROR_INVALID_OPERATION ur_result_t UR_APICALL urBindlessImagesImageCopyExp( @@ -6936,7 +7010,7 @@ ur_result_t UR_APICALL urBindlessImagesImportOpaqueFDExp( /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR -/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type` +/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type` /// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE /// - ::UR_RESULT_ERROR_INVALID_OPERATION /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES @@ -8065,7 +8139,9 @@ ur_result_t UR_APICALL urCommandBufferReleaseCommandExp( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Update a kernel launch command in a finalized command-buffer. +/// @brief Update a kernel launch command in a finalized command-buffer. This +/// entry-point is synchronous and may block if the command-buffer is +/// executing when the entry-point is called. 
/// /// @returns /// - ::UR_RESULT_SUCCESS diff --git a/source/loader/ur_print.cpp b/source/loader/ur_print.cpp index 5af2165ea4..79107c733d 100644 --- a/source/loader/ur_print.cpp +++ b/source/loader/ur_print.cpp @@ -1821,6 +1821,14 @@ ur_result_t urPrintKernelCreateWithNativeHandleParams( return str_copy(&ss, buffer, buff_size, out_size); } +ur_result_t urPrintKernelGetSuggestedLocalWorkSizeParams( + const struct ur_kernel_get_suggested_local_work_size_params_t *params, + char *buffer, const size_t buff_size, size_t *out_size) { + std::stringstream ss; + ss << params; + return str_copy(&ss, buffer, buff_size, out_size); +} + ur_result_t urPrintKernelSetArgValueParams( const struct ur_kernel_set_arg_value_params_t *params, char *buffer, const size_t buff_size, size_t *out_size) { diff --git a/source/loader/windows/adapter_search.cpp b/source/loader/windows/adapter_search.cpp index 609d5576f2..b514897d91 100644 --- a/source/loader/windows/adapter_search.cpp +++ b/source/loader/windows/adapter_search.cpp @@ -23,8 +23,14 @@ namespace fs = filesystem; namespace ur_loader { std::optional getLoaderLibPath() { + HMODULE hModule = NULL; char pathStr[MAX_PATH_LEN_WIN]; - if (GetModuleFileNameA(nullptr, pathStr, MAX_PATH_LEN_WIN)) { + + if (GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | + GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT, + reinterpret_cast(&getLoaderLibPath), + &hModule) && + GetModuleFileNameA(hModule, pathStr, MAX_PATH_LEN_WIN)) { auto libPath = fs::path(pathStr); if (fs::exists(libPath)) { return fs::absolute(libPath).parent_path(); diff --git a/source/ur/ur.hpp b/source/ur/ur.hpp index ccca6a756b..629ba277c8 100644 --- a/source/ur/ur.hpp +++ b/source/ur/ur.hpp @@ -337,7 +337,7 @@ roundToHighestFactorOfGlobalSize(size_t &ThreadsPerBlockInDim, // Returns whether or not Value is a power of 2 template >> -bool isPowerOf2(const T &Value) { +bool isPowerOf2(const T Value) { return Value && !(Value & (Value - 1)); } diff --git a/source/ur_api.cpp 
b/source/ur_api.cpp index bf90700e7d..07202a8aad 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -569,6 +569,8 @@ ur_result_t UR_APICALL urPlatformGetNativeHandle( /// - ::UR_RESULT_ERROR_UNINITIALIZED /// - ::UR_RESULT_ERROR_DEVICE_LOST /// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hAdapter` /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `NULL == phPlatform` /// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE @@ -576,6 +578,8 @@ ur_result_t UR_APICALL urPlatformGetNativeHandle( ur_result_t UR_APICALL urPlatformCreateWithNativeHandle( ur_native_handle_t hNativePlatform, ///< [in][nocheck] the native handle of the platform. + ur_adapter_handle_t + hAdapter, ///< [in] handle of the adapter associated with the native backend. const ur_platform_native_properties_t * pProperties, ///< [in][optional] pointer to native platform properties struct. ur_platform_handle_t * @@ -1265,6 +1269,22 @@ ur_result_t UR_APICALL urContextSetExtendedDeleter( /////////////////////////////////////////////////////////////////////////////// /// @brief Create an image object /// +/// @details +/// - The primary ::ur_image_format_t that must be supported by all the +/// adapters are {UR_IMAGE_CHANNEL_ORDER_RGBA, +/// UR_IMAGE_CHANNEL_TYPE_UNORM_INT8}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT16}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT8}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT16}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, 
UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT}, +/// {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_FLOAT}. +/// /// @remarks /// _Analogues_ /// - **clCreateImage** @@ -1285,12 +1305,13 @@ ur_result_t UR_APICALL urContextSetExtendedDeleter( /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR -/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type` +/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type` /// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE /// - ::UR_RESULT_ERROR_INVALID_OPERATION /// - ::UR_RESULT_ERROR_INVALID_HOST_PTR /// + `pHost == NULL && (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) != 0` /// + `pHost != NULL && (flags & (UR_MEM_FLAG_USE_HOST_POINTER | UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER)) == 0` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT /// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urMemImageCreate( @@ -3472,6 +3493,49 @@ ur_result_t UR_APICALL urKernelCreateWithNativeHandle( return result; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Get the suggested local work size for a kernel. +/// +/// @details +/// - Query a suggested local work size for a kernel given a global size for +/// each dimension. +/// - The application may call this function from simultaneous threads for +/// the same context. 
+/// +/// @returns +/// - ::UR_RESULT_SUCCESS +/// - ::UR_RESULT_ERROR_UNINITIALIZED +/// - ::UR_RESULT_ERROR_DEVICE_LOST +/// - ::UR_RESULT_ERROR_ADAPTER_SPECIFIC +/// - ::UR_RESULT_ERROR_INVALID_NULL_HANDLE +/// + `NULL == hKernel` +/// + `NULL == hQueue` +/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkOffset` +/// + `NULL == pGlobalWorkSize` +/// + `NULL == pSuggestedLocalWorkSize` +/// - ::UR_RESULT_ERROR_UNSUPPORTED_FEATURE +ur_result_t UR_APICALL urKernelGetSuggestedLocalWorkSize( + ur_kernel_handle_t hKernel, ///< [in] handle of the kernel + ur_queue_handle_t hQueue, ///< [in] handle of the queue object + uint32_t + numWorkDim, ///< [in] number of dimensions, from 1 to 3, to specify the global + ///< and work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the offset used to calculate the global ID of a work-item + const size_t * + pGlobalWorkSize, ///< [in] pointer to an array of numWorkDim unsigned values that specify + ///< the number of global work-items in workDim that will execute the + ///< kernel function + size_t * + pSuggestedLocalWorkSize ///< [out] pointer to an array of numWorkDim unsigned values that specify + ///< suggested local work size that will contain the result of the query +) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Query information about a command queue /// @@ -3496,7 +3560,7 @@ ur_result_t UR_APICALL urKernelCreateWithNativeHandle( /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER /// + `propSize != 0 && pPropValue == NULL` /// + `pPropValue == NULL && pPropSizeRet == NULL` -/// - ::UR_RESULT_ERROR_INVALID_QUEUE +/// - ::UR_RESULT_ERROR_INVALID_QUEUE - "If `hQueue` isn't a valid queue handle or if `propName` isn't supported by `hQueue`." 
/// - ::UR_RESULT_ERROR_OUT_OF_HOST_MEMORY /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES ur_result_t UR_APICALL urQueueGetInfo( @@ -5572,7 +5636,7 @@ ur_result_t UR_APICALL urBindlessImagesSampledImageHandleDestroyExp( /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR -/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type` +/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type` /// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE /// - ::UR_RESULT_ERROR_INVALID_OPERATION ur_result_t UR_APICALL urBindlessImagesImageAllocateExp( @@ -5639,7 +5703,7 @@ ur_result_t UR_APICALL urBindlessImagesImageFreeExp( /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR -/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type` +/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type` /// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE /// - ::UR_RESULT_ERROR_INVALID_OPERATION ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( @@ -5681,7 +5745,7 @@ ur_result_t UR_APICALL urBindlessImagesUnsampledImageCreateExp( /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR -/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type` +/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type` /// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE /// - ::UR_RESULT_ERROR_INVALID_SAMPLER /// - ::UR_RESULT_ERROR_INVALID_OPERATION @@ -5728,7 +5792,7 @@ ur_result_t UR_APICALL urBindlessImagesSampledImageCreateExp( /// - ::UR_RESULT_ERROR_INVALID_QUEUE /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR -/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type` +/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type` /// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE 
/// - ::UR_RESULT_ERROR_INVALID_OPERATION ur_result_t UR_APICALL urBindlessImagesImageCopyExp( @@ -5911,7 +5975,7 @@ ur_result_t UR_APICALL urBindlessImagesImportOpaqueFDExp( /// - ::UR_RESULT_ERROR_INVALID_CONTEXT /// - ::UR_RESULT_ERROR_INVALID_VALUE /// - ::UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR -/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_BUFFER < pImageDesc->type` +/// + `pImageDesc && UR_MEM_TYPE_IMAGE1D_ARRAY < pImageDesc->type` /// - ::UR_RESULT_ERROR_INVALID_IMAGE_SIZE /// - ::UR_RESULT_ERROR_INVALID_OPERATION /// - ::UR_RESULT_ERROR_OUT_OF_RESOURCES @@ -6825,7 +6889,9 @@ ur_result_t UR_APICALL urCommandBufferReleaseCommandExp( } /////////////////////////////////////////////////////////////////////////////// -/// @brief Update a kernel launch command in a finalized command-buffer. +/// @brief Update a kernel launch command in a finalized command-buffer. This +/// entry-point is synchronous and may block if the command-buffer is +/// executing when the entry-point is called. 
/// /// @returns /// - ::UR_RESULT_SUCCESS diff --git a/test/adapters/level_zero/urKernelCreateWithNativeHandle.cpp b/test/adapters/level_zero/urKernelCreateWithNativeHandle.cpp index 19e4de21ec..62c667b242 100644 --- a/test/adapters/level_zero/urKernelCreateWithNativeHandle.cpp +++ b/test/adapters/level_zero/urKernelCreateWithNativeHandle.cpp @@ -3,9 +3,9 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include "level_zero/ze_api.h" #include "ur_api.h" #include "uur/checks.h" +#include "ze_api.h" #include using urLevelZeroKernelNativeHandleTest = uur::urContextTest; diff --git a/test/conformance/CMakeLists.txt b/test/conformance/CMakeLists.txt index 79cefdd06f..439b9b7a79 100644 --- a/test/conformance/CMakeLists.txt +++ b/test/conformance/CMakeLists.txt @@ -5,6 +5,10 @@ set(UR_CONFORMANCE_TEST_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(UR_CONFORMANCE_DEVICE_BINARIES_DIR + "${CMAKE_CURRENT_BINARY_DIR}/device_binaries" CACHE INTERNAL + "Internal cache variable for device binaries directory") + function(add_test_adapter name adapter) set(TEST_TARGET_NAME test-${name}) set(TEST_NAME ${name}-${adapter}) @@ -140,6 +144,7 @@ if(UR_DPCXX) add_subdirectory(exp_command_buffer) add_subdirectory(exp_usm_p2p) add_subdirectory(exp_launch_properties) + add_subdirectory(memory-migrate) else() message(WARNING "UR_DPCXX is not defined, the following conformance test executables \ diff --git a/test/conformance/context/context_adapter_hip.match b/test/conformance/context/context_adapter_hip.match index 0c9d1530b7..cecf5c8747 100644 --- a/test/conformance/context/context_adapter_hip.match +++ b/test/conformance/context/context_adapter_hip.match @@ -1,4 +1,3 @@ urContextCreateWithNativeHandleTest.Success/AMD_HIP_BACKEND___{{.*}}_ urContextCreateWithNativeHandleTest.SuccessWithOwnedNativeHandle/AMD_HIP_BACKEND___{{.*}}_ urContextCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle/AMD_HIP_BACKEND___{{.*}}_ 
-urContextGetInfoTestWithInfoParam.Success/AMD_HIP_BACKEND___{{.*}} diff --git a/test/conformance/context/urContextGetInfo.cpp b/test/conformance/context/urContextGetInfo.cpp index 1d75718d87..f9f699d511 100644 --- a/test/conformance/context/urContextGetInfo.cpp +++ b/test/conformance/context/urContextGetInfo.cpp @@ -2,7 +2,6 @@ // Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include #include struct urContextGetInfoTestWithInfoParam @@ -14,9 +13,7 @@ struct urContextGetInfoTestWithInfoParam ctx_info_size_map = { {UR_CONTEXT_INFO_NUM_DEVICES, sizeof(uint32_t)}, - {UR_CONTEXT_INFO_DEVICES, - sizeof(ur_device_handle_t) * - uur::DevicesEnvironment::instance->devices.size()}, + {UR_CONTEXT_INFO_DEVICES, sizeof(ur_device_handle_t)}, {UR_CONTEXT_INFO_REFERENCE_COUNT, sizeof(uint32_t)}, {UR_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT, sizeof(bool)}, {UR_CONTEXT_INFO_USM_FILL2D_SUPPORT, sizeof(bool)}, diff --git a/test/conformance/device/device_adapter_hip.match b/test/conformance/device/device_adapter_hip.match index f64efa4bac..9989fbd774 100644 --- a/test/conformance/device/device_adapter_hip.match +++ b/test/conformance/device/device_adapter_hip.match @@ -1,4 +1,2 @@ -urDeviceCreateWithNativeHandleTest.Success -urDeviceCreateWithNativeHandleTest.SuccessWithOwnedNativeHandle urDeviceCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle {{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime diff --git a/test/conformance/device/device_adapter_level_zero.match b/test/conformance/device/device_adapter_level_zero.match index 1f735d7a77..b65a51b9cb 100644 --- a/test/conformance/device/device_adapter_level_zero.match +++ b/test/conformance/device/device_adapter_level_zero.match @@ -1,10 +1,7 @@ urDeviceCreateWithNativeHandleTest.SuccessWithUnOwnedNativeHandle +{{OPT}}urDeviceGetGlobalTimestampTest.SuccessSynchronizedTime 
urDeviceGetInfoTest.Success/UR_DEVICE_INFO_GLOBAL_MEM_FREE -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ERROR_CORRECTION_SUPPORT -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_HOST_UNIFIED_MEMORY -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_AVAILABLE urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH -urDeviceGetInfoTest.Success/UR_DEVICE_INFO_BUILD_ON_SUBDEVICE urDeviceGetInfoTest.Success/UR_DEVICE_INFO_ASYNC_BARRIER urDeviceGetInfoTest.Success/UR_DEVICE_INFO_HOST_PIPE_READ_WRITE_SUPPORTED urDeviceGetInfoTest.Success/UR_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP diff --git a/test/conformance/device_code/CMakeLists.txt b/test/conformance/device_code/CMakeLists.txt index 1419604b9d..24c437e853 100644 --- a/test/conformance/device_code/CMakeLists.txt +++ b/test/conformance/device_code/CMakeLists.txt @@ -135,6 +135,7 @@ add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill_3d.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/fill_usm.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/foo.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/image_copy.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/inc.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/mean.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/cpy_and_mult.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/cpy_and_mult_usm.cpp) diff --git a/test/conformance/device_code/cpy_and_mult.cpp b/test/conformance/device_code/cpy_and_mult.cpp index a2bdaccf55..644b151fd3 100644 --- a/test/conformance/device_code/cpy_and_mult.cpp +++ b/test/conformance/device_code/cpy_and_mult.cpp @@ -3,24 +3,24 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { size_t array_size = 16; - cl::sycl::queue sycl_queue; + sycl::queue sycl_queue; std::vector src(array_size, 1); std::vector dst(array_size, 1); auto src_buff = - cl::sycl::buffer(src.data(), cl::sycl::range<1>(array_size)); + sycl::buffer(src.data(), sycl::range<1>(array_size)); auto dst_buff = - 
cl::sycl::buffer(dst.data(), cl::sycl::range<1>(array_size)); + sycl::buffer(dst.data(), sycl::range<1>(array_size)); - sycl_queue.submit([&](cl::sycl::handler &cgh) { - auto src_acc = src_buff.get_access(cgh); - auto dst_acc = dst_buff.get_access(cgh); + sycl_queue.submit([&](sycl::handler &cgh) { + auto src_acc = src_buff.get_access(cgh); + auto dst_acc = dst_buff.get_access(cgh); cgh.parallel_for( - cl::sycl::range<1>{array_size}, - [src_acc, dst_acc](cl::sycl::item<1> itemId) { + sycl::range<1>{array_size}, + [src_acc, dst_acc](sycl::item<1> itemId) { auto id = itemId.get_id(0); dst_acc[id] = src_acc[id] * 2; }); diff --git a/test/conformance/device_code/cpy_and_mult_usm.cpp b/test/conformance/device_code/cpy_and_mult_usm.cpp index e253dfe14b..f34041b6c2 100644 --- a/test/conformance/device_code/cpy_and_mult_usm.cpp +++ b/test/conformance/device_code/cpy_and_mult_usm.cpp @@ -3,17 +3,16 @@ // See LICENSE.TXT // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include +#include int main() { size_t array_size = 16; - cl::sycl::queue sycl_queue; - uint32_t *src = cl::sycl::malloc_device(array_size, sycl_queue); - uint32_t *dst = cl::sycl::malloc_device(array_size, sycl_queue); - sycl_queue.submit([&](cl::sycl::handler &cgh) { + sycl::queue sycl_queue; + uint32_t *src = sycl::malloc_device(array_size, sycl_queue); + uint32_t *dst = sycl::malloc_device(array_size, sycl_queue); + sycl_queue.submit([&](sycl::handler &cgh) { cgh.parallel_for( - cl::sycl::range<1>{array_size}, - [src, dst](cl::sycl::item<1> itemId) { + sycl::range<1>{array_size}, [src, dst](sycl::item<1> itemId) { auto id = itemId.get_id(0); dst[id] = src[id] * 2; }); diff --git a/test/conformance/device_code/inc.cpp b/test/conformance/device_code/inc.cpp new file mode 100644 index 0000000000..d41c07366d --- /dev/null +++ b/test/conformance/device_code/inc.cpp @@ -0,0 +1,18 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with 
LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +class inc; + +int main() { + uint32_t *ptr; + sycl::buffer buf{ptr, 1}; + sycl::queue{}.submit([&](sycl::handler &cgh) { + sycl::accessor acc{buf, cgh}; + auto kernel = [acc](sycl::item<1> it) { acc[it]++; }; + cgh.parallel_for(sycl::range<1>{1}, kernel); + }); +} diff --git a/test/conformance/enqueue/enqueue_adapter_hip.match b/test/conformance/enqueue/enqueue_adapter_hip.match index 56376ff37d..443628e36e 100644 --- a/test/conformance/enqueue/enqueue_adapter_hip.match +++ b/test/conformance/enqueue/enqueue_adapter_hip.match @@ -1,44 +1,15 @@ +# HIP can't check kernel arguments urEnqueueKernelLaunchTest.InvalidKernelArgs/AMD_HIP_BACKEND___{{.*}}_ urEnqueueKernelLaunchKernelWgSizeTest.NonMatchingLocalSize/AMD_HIP_BACKEND___{{.*}}_ urEnqueueKernelLaunchKernelSubGroupTest.Success/AMD_HIP_BACKEND___{{.*}}_ urEnqueueKernelLaunchUSMLinkedList.Success/AMD_HIP_BACKEND___{{.*}}___UsePoolEnabled {{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___copy_row_2D {{OPT}}urEnqueueMemBufferCopyRectTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___copy_3d_2d -urEnqueueMemBufferFillTest.Success/AMD_HIP_BACKEND___{{.*}}___size__256__patternSize__256 -urEnqueueMemBufferFillTest.Success/AMD_HIP_BACKEND___{{.*}}___size__1024__patternSize__256 {{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___write_row_2D {{OPT}}urEnqueueMemBufferWriteRectTestWithParam.Success/AMD_HIP_BACKEND___{{.*}}___write_3d_2d -urEnqueueMemImageCopyTest.Success/AMD_HIP_BACKEND___{{.*}}___1D -urEnqueueMemImageCopyTest.Success/AMD_HIP_BACKEND___{{.*}}___2D -urEnqueueMemImageCopyTest.Success/AMD_HIP_BACKEND___{{.*}}___3D -urEnqueueMemImageCopyTest.SuccessPartialCopy/AMD_HIP_BACKEND___{{.*}}___1D -urEnqueueMemImageCopyTest.SuccessPartialCopy/AMD_HIP_BACKEND___{{.*}}___2D -urEnqueueMemImageCopyTest.SuccessPartialCopy/AMD_HIP_BACKEND___{{.*}}___3D 
-urEnqueueMemImageCopyTest.SuccessPartialCopyWithSrcOffset/AMD_HIP_BACKEND___{{.*}}___1D -urEnqueueMemImageCopyTest.SuccessPartialCopyWithSrcOffset/AMD_HIP_BACKEND___{{.*}}___2D -urEnqueueMemImageCopyTest.SuccessPartialCopyWithSrcOffset/AMD_HIP_BACKEND___{{.*}}___3D -urEnqueueMemImageCopyTest.SuccessPartialCopyWithDstOffset/AMD_HIP_BACKEND___{{.*}}___1D -urEnqueueMemImageCopyTest.SuccessPartialCopyWithDstOffset/AMD_HIP_BACKEND___{{.*}}___2D -urEnqueueMemImageCopyTest.SuccessPartialCopyWithDstOffset/AMD_HIP_BACKEND___{{.*}}___3D -urEnqueueMemImageCopyTest.InvalidNullHandleQueue/AMD_HIP_BACKEND___{{.*}}___1D -urEnqueueMemImageCopyTest.InvalidNullHandleQueue/AMD_HIP_BACKEND___{{.*}}___3D -urEnqueueMemImageCopyTest.InvalidNullHandleImageSrc/AMD_HIP_BACKEND___{{.*}}___1D -urEnqueueMemImageCopyTest.InvalidNullHandleImageSrc/AMD_HIP_BACKEND___{{.*}}___3D -urEnqueueMemImageCopyTest.InvalidNullHandleImageDst/AMD_HIP_BACKEND___{{.*}}___1D -urEnqueueMemImageCopyTest.InvalidNullHandleImageDst/AMD_HIP_BACKEND___{{.*}}___3D -urEnqueueMemImageCopyTest.InvalidNullPtrEventWaitList/AMD_HIP_BACKEND___{{.*}}___1D -urEnqueueMemImageCopyTest.InvalidNullPtrEventWaitList/AMD_HIP_BACKEND___{{.*}}___3D -urEnqueueMemImageCopyTest.InvalidSize/AMD_HIP_BACKEND___{{.*}}___1D -urEnqueueMemImageCopyTest.InvalidSize/AMD_HIP_BACKEND___{{.*}}___3D -urEnqueueMemImageReadTest.Success1D/AMD_HIP_BACKEND___{{.*}}_ -urEnqueueMemImageReadTest.Success3D/AMD_HIP_BACKEND___{{.*}}_ -urEnqueueMemImageWriteTest.Success1D/AMD_HIP_BACKEND___{{.*}}_ -urEnqueueMemImageWriteTest.Success3D/AMD_HIP_BACKEND___{{.*}}_ -urEnqueueUSMFill2DNegativeTest.OutOfBounds/AMD_HIP_BACKEND___{{.*}}_ urEnqueueUSMAdviseWithParamTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_USM_ADVICE_FLAG_DEFAULT urEnqueueUSMAdviseTest.MultipleParamsSuccess/AMD_HIP_BACKEND___{{.*}}_ urEnqueueUSMAdviseTest.NonCoherentDeviceMemorySuccessOrWarning/AMD_HIP_BACKEND___{{.*}}_ 
-urEnqueueUSMMemcpy2DNegativeTest.InvalidSize/AMD_HIP_BACKEND___{{.*}}___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE urEnqueueUSMPrefetchWithParamTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_USM_MIGRATION_FLAG_DEFAULT urEnqueueUSMPrefetchWithParamTest.CheckWaitEvent/AMD_HIP_BACKEND___{{.*}}___UR_USM_MIGRATION_FLAG_DEFAULT urEnqueueTimestampRecordingExpTest.Success/AMD_HIP_BACKEND___{{.*}} diff --git a/test/conformance/enqueue/enqueue_adapter_native_cpu.match b/test/conformance/enqueue/enqueue_adapter_native_cpu.match index 155a400e89..8d6cf18c3e 100644 --- a/test/conformance/enqueue/enqueue_adapter_native_cpu.match +++ b/test/conformance/enqueue/enqueue_adapter_native_cpu.match @@ -46,4 +46,328 @@ {{OPT}}urEnqueueKernelLaunchTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D_256_79_8 {{OPT}}urEnqueueKernelLaunchWithVirtualMemory.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ {{OPT}}urEnqueueKernelLaunchMultiDeviceTest.KernelLaunchReadDifferentQueues/SYCL_NATIVE_CPU___SYCL_Native_CPU_ -{{Segmentation fault|Aborted}} +{{OPT}}urEnqueueKernelLaunchUSMLinkedList.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UsePoolEnabled +{{OPT}}urEnqueueKernelLaunchUSMLinkedList.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UsePoolDisabled +{{OPT}}urEnqueueMemBufferCopyRectTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueMemBufferCopyRectTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueMemBufferCopyTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024 +{{OPT}}urEnqueueMemBufferCopyTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500 +{{OPT}}urEnqueueMemBufferCopyTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096 +{{OPT}}urEnqueueMemBufferCopyTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000 
+{{OPT}}urEnqueueMemBufferCopyTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024 +{{OPT}}urEnqueueMemBufferCopyTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500 +{{OPT}}urEnqueueMemBufferCopyTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096 +{{OPT}}urEnqueueMemBufferCopyTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000 +{{OPT}}urEnqueueMemBufferFillNegativeTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_ONLY 
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_ONLY 
+{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferMapTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_ONLY 
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_WRITE_ONLY 
+{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferReadTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferReadRectTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueMemBufferReadRectTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_WRITE_ONLY 
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_WRITE 
+{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferWriteTestWithParam.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemBufferWriteRectTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueMemBufferWriteRectTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueMemImageCopyTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D +{{OPT}}urEnqueueMemImageCopyTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___2D +{{OPT}}urEnqueueMemImageCopyTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D 
+{{OPT}}urEnqueueMemImageCopyTest.SuccessPartialCopy/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D +{{OPT}}urEnqueueMemImageCopyTest.SuccessPartialCopy/SYCL_NATIVE_CPU___SYCL_Native_CPU___2D +{{OPT}}urEnqueueMemImageCopyTest.SuccessPartialCopy/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D +{{OPT}}urEnqueueMemImageCopyTest.SuccessPartialCopyWithSrcOffset/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D +{{OPT}}urEnqueueMemImageCopyTest.SuccessPartialCopyWithSrcOffset/SYCL_NATIVE_CPU___SYCL_Native_CPU___2D +{{OPT}}urEnqueueMemImageCopyTest.SuccessPartialCopyWithSrcOffset/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D +{{OPT}}urEnqueueMemImageCopyTest.SuccessPartialCopyWithDstOffset/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D +{{OPT}}urEnqueueMemImageCopyTest.SuccessPartialCopyWithDstOffset/SYCL_NATIVE_CPU___SYCL_Native_CPU___2D +{{OPT}}urEnqueueMemImageCopyTest.SuccessPartialCopyWithDstOffset/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D +{{OPT}}urEnqueueMemImageCopyTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D +{{OPT}}urEnqueueMemImageCopyTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU___2D +{{OPT}}urEnqueueMemImageCopyTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D +{{OPT}}urEnqueueMemImageCopyTest.InvalidNullHandleImageSrc/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D +{{OPT}}urEnqueueMemImageCopyTest.InvalidNullHandleImageSrc/SYCL_NATIVE_CPU___SYCL_Native_CPU___2D +{{OPT}}urEnqueueMemImageCopyTest.InvalidNullHandleImageSrc/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D +{{OPT}}urEnqueueMemImageCopyTest.InvalidNullHandleImageDst/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D +{{OPT}}urEnqueueMemImageCopyTest.InvalidNullHandleImageDst/SYCL_NATIVE_CPU___SYCL_Native_CPU___2D +{{OPT}}urEnqueueMemImageCopyTest.InvalidNullHandleImageDst/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D +{{OPT}}urEnqueueMemImageCopyTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D +{{OPT}}urEnqueueMemImageCopyTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2D 
+{{OPT}}urEnqueueMemImageCopyTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D +{{OPT}}urEnqueueMemImageCopyTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___1D +{{OPT}}urEnqueueMemImageCopyTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___2D +{{OPT}}urEnqueueMemImageCopyTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___3D +{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_WRITE +{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_WRITE_ONLY +{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_READ_ONLY +{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_READ_ONLY 
+{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___1024_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___2500_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___4096_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueMemUnmapTestWithParam.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___6000_UR_MEM_FLAG_ALLOC_HOST_POINTER +{{OPT}}urEnqueueUSMFillTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___size__1__patternSize__1 +{{OPT}}urEnqueueUSMFillTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___size__256__patternSize__256 +{{OPT}}urEnqueueUSMFillTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___size__1024__patternSize__256 +{{OPT}}urEnqueueUSMFillTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___size__256__patternSize__4 +{{OPT}}urEnqueueUSMFillTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___size__256__patternSize__8 +{{OPT}}urEnqueueUSMFillTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___size__256__patternSize__16 +{{OPT}}urEnqueueUSMFillTestWithParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___size__256__patternSize__32 +{{OPT}}urEnqueueUSMFillNegativeTest.InvalidNullQueueHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMFillNegativeTest.InvalidNullPtr/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMFillNegativeTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMFillNegativeTest.OutOfBounds/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMFillNegativeTest.invalidPatternSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMFillNegativeTest.InvalidEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMFill2DNegativeTest.InvalidNullQueueHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU_ 
+{{OPT}}urEnqueueUSMFill2DNegativeTest.InvalidNullPtr/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMFill2DNegativeTest.InvalidPitch/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMFill2DNegativeTest.InvalidWidth/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMFill2DNegativeTest.InvalidHeight/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMFill2DNegativeTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMFill2DNegativeTest.OutOfBounds/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMFill2DNegativeTest.invalidPatternSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMFill2DNegativeTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMAdviseWithParamTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_USM_ADVICE_FLAG_DEFAULT +{{OPT}}urEnqueueUSMAdviseTest.MultipleParamsSuccess/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMAdviseTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMAdviseTest.InvalidNullPointerMem/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMAdviseTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMAdviseTest.InvalidSizeZero/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMAdviseTest.InvalidSizeTooLarge/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMAdviseTest.NonCoherentDeviceMemorySuccessOrWarning/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMMemcpyTest.Blocking/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMMemcpyTest.BlockingWithEvent/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMMemcpyTest.NonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMMemcpyTest.WaitForDependencies/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMMemcpyTest.InvalidNullQueueHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMMemcpyTest.InvalidNullDst/SYCL_NATIVE_CPU___SYCL_Native_CPU_ 
+{{OPT}}urEnqueueUSMMemcpyTest.InvalidNullSrc/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMMemcpyTest.InvalidNullPtrEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE 
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED 
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST 
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE 
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED 
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST 
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE 
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__1024__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST 
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1024__width__256__height__256__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE 
+{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__23__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_HOST__dst__UR_USM_TYPE_SHARED +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_HOST +{{OPT}}urEnqueueUSMMemcpy2DTestWithParam.SuccessNonBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__234__width__233__height__1__src__UR_USM_TYPE_SHARED__dst__UR_USM_TYPE_SHARED 
+{{OPT}}urEnqueueUSMMemcpy2DNegativeTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DNegativeTest.InvalidNullPointer/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DNegativeTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMMemcpy2DNegativeTest.InvalidEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU___pitch__1__width__1__height__1__src__UR_USM_TYPE_DEVICE__dst__UR_USM_TYPE_DEVICE +{{OPT}}urEnqueueUSMPrefetchWithParamTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_USM_MIGRATION_FLAG_DEFAULT +{{OPT}}urEnqueueUSMPrefetchWithParamTest.CheckWaitEvent/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_USM_MIGRATION_FLAG_DEFAULT +{{OPT}}urEnqueueUSMPrefetchTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMPrefetchTest.InvalidNullPointerMem/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMPrefetchTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMPrefetchTest.InvalidSizeZero/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMPrefetchTest.InvalidSizeTooLarge/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueUSMPrefetchTest.InvalidEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueReadHostPipeTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueReadHostPipeTest.InvalidNullHandleProgram/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueReadHostPipeTest.InvalidNullPointerPipeSymbol/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueReadHostPipeTest.InvalidNullPointerBuffer/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueReadHostPipeTest.InvalidEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_ 
+{{OPT}}urEnqueueWriteHostPipeTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueWriteHostPipeTest.InvalidNullHandleProgram/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueWriteHostPipeTest.InvalidNullPointerPipeSymbol/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueWriteHostPipeTest.InvalidNullPointerBuffer/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}urEnqueueWriteHostPipeTest.InvalidEventWaitList/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}{{Segmentation fault|Aborted}} diff --git a/test/conformance/enqueue/urEnqueueKernelLaunch.cpp b/test/conformance/enqueue/urEnqueueKernelLaunch.cpp index 79c8ac14da..3f9d6cb996 100644 --- a/test/conformance/enqueue/urEnqueueKernelLaunch.cpp +++ b/test/conformance/enqueue/urEnqueueKernelLaunch.cpp @@ -480,7 +480,7 @@ struct urEnqueueKernelLaunchUSMLinkedList const int num_nodes = 4; bool use_pool = false; ur_usm_pool_handle_t pool = nullptr; - ur_queue_handle_t queue; + ur_queue_handle_t queue = nullptr; }; UUR_TEST_SUITE_P( diff --git a/test/conformance/event/event_adapter_cuda.match b/test/conformance/event/event_adapter_cuda.match index e40ea36db6..3cffb24c5f 100644 --- a/test/conformance/event/event_adapter_cuda.match +++ b/test/conformance/event/event_adapter_cuda.match @@ -1,3 +1,5 @@ +urEventGetProfilingInfoTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}___UR_PROFILING_INFO_COMMAND_COMPLETE +urEventGetProfilingInfoWithTimingComparisonTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}_ urEventSetCallbackTest.Success/NVIDIA_CUDA_BACKEND___{{.*}}_ urEventSetCallbackTest.ValidateParameters/NVIDIA_CUDA_BACKEND___{{.*}}_ urEventSetCallbackTest.AllStates/NVIDIA_CUDA_BACKEND___{{.*}}_ diff --git a/test/conformance/event/event_adapter_hip.match b/test/conformance/event/event_adapter_hip.match index 8682cdf4a6..b25428a187 100644 --- a/test/conformance/event/event_adapter_hip.match +++ b/test/conformance/event/event_adapter_hip.match @@ -1,3 +1,5 @@ 
+urEventGetProfilingInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_PROFILING_INFO_COMMAND_COMPLETE +urEventGetProfilingInfoWithTimingComparisonTest.Success/AMD_HIP_BACKEND___{{.*}}_ urEventSetCallbackTest.Success/AMD_HIP_BACKEND___{{.*}}_ urEventSetCallbackTest.ValidateParameters/AMD_HIP_BACKEND___{{.*}}_ urEventSetCallbackTest.AllStates/AMD_HIP_BACKEND___{{.*}}_ diff --git a/test/conformance/event/event_adapter_level_zero.match b/test/conformance/event/event_adapter_level_zero.match index c29f67cbc6..32ffbeaf1e 100644 --- a/test/conformance/event/event_adapter_level_zero.match +++ b/test/conformance/event/event_adapter_level_zero.match @@ -1,4 +1,6 @@ {{OPT}}urEventGetInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_EVENT_INFO_COMMAND_TYPE {{OPT}}urEventGetProfilingInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_PROFILING_INFO_COMMAND_QUEUED {{OPT}}urEventGetProfilingInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_PROFILING_INFO_COMMAND_SUBMIT +{{OPT}}urEventGetProfilingInfoTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_PROFILING_INFO_COMMAND_COMPLETE +{{OPT}}urEventGetProfilingInfoWithTimingComparisonTest.Success/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ {{OPT}}{{Segmentation fault|Aborted}} diff --git a/test/conformance/event/event_adapter_native_cpu.match b/test/conformance/event/event_adapter_native_cpu.match index f5769d271f..9d43898b09 100644 --- a/test/conformance/event/event_adapter_native_cpu.match +++ b/test/conformance/event/event_adapter_native_cpu.match @@ -13,6 +13,8 @@ urEventGetProfilingInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_PROFI urEventGetProfilingInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_PROFILING_INFO_COMMAND_SUBMIT urEventGetProfilingInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_PROFILING_INFO_COMMAND_START 
urEventGetProfilingInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_PROFILING_INFO_COMMAND_END +urEventGetProfilingInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_PROFILING_INFO_COMMAND_COMPLETE +urEventGetProfilingInfoWithTimingComparisonTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urEventGetProfilingInfoNegativeTest.InvalidNullHandle/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urEventGetProfilingInfoNegativeTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urEventGetProfilingInfoNegativeTest.InvalidValue/SYCL_NATIVE_CPU___SYCL_Native_CPU_ diff --git a/test/conformance/event/urEventGetProfilingInfo.cpp b/test/conformance/event/urEventGetProfilingInfo.cpp index b7ec52ffca..7b91679dad 100644 --- a/test/conformance/event/urEventGetProfilingInfo.cpp +++ b/test/conformance/event/urEventGetProfilingInfo.cpp @@ -30,9 +30,58 @@ UUR_TEST_SUITE_P(urEventGetProfilingInfoTest, ::testing::Values(UR_PROFILING_INFO_COMMAND_QUEUED, UR_PROFILING_INFO_COMMAND_SUBMIT, UR_PROFILING_INFO_COMMAND_START, - UR_PROFILING_INFO_COMMAND_END), + UR_PROFILING_INFO_COMMAND_END, + UR_PROFILING_INFO_COMMAND_COMPLETE), uur::deviceTestWithParamPrinter); +using urEventGetProfilingInfoWithTimingComparisonTest = uur::event::urEventTest; + +TEST_P(urEventGetProfilingInfoWithTimingComparisonTest, Success) { + uint8_t size = 8; + + std::vector queued_data(size); + ASSERT_SUCCESS(urEventGetProfilingInfo(event, + UR_PROFILING_INFO_COMMAND_QUEUED, + size, queued_data.data(), nullptr)); + auto queued_timing = reinterpret_cast(queued_data.data()); + ASSERT_NE(*queued_timing, 0); + + std::vector submit_data(size); + ASSERT_SUCCESS(urEventGetProfilingInfo(event, + UR_PROFILING_INFO_COMMAND_SUBMIT, + size, submit_data.data(), nullptr)); + auto submit_timing = reinterpret_cast(submit_data.data()); + ASSERT_NE(*submit_timing, 0); + + std::vector start_data(size); + ASSERT_SUCCESS(urEventGetProfilingInfo(event, + UR_PROFILING_INFO_COMMAND_START, + size, start_data.data(), nullptr)); + auto 
start_timing = reinterpret_cast(start_data.data()); + ASSERT_NE(*start_timing, 0); + + std::vector end_data(size); + ASSERT_SUCCESS(urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_END, + size, end_data.data(), nullptr)); + auto end_timing = reinterpret_cast(end_data.data()); + ASSERT_NE(*end_timing, 0); + + std::vector complete_data(size); + ASSERT_SUCCESS( + urEventGetProfilingInfo(event, UR_PROFILING_INFO_COMMAND_COMPLETE, size, + complete_data.data(), nullptr)); + auto complete_timing = reinterpret_cast(complete_data.data()); + ASSERT_NE(*complete_timing, 0); + + ASSERT_LE(*queued_timing, *submit_timing); + ASSERT_LT(*submit_timing, *start_timing); + ASSERT_LT(*start_timing, *end_timing); + ASSERT_LE(*end_timing, *complete_timing); +} + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P( + urEventGetProfilingInfoWithTimingComparisonTest); + using urEventGetProfilingInfoNegativeTest = uur::event::urEventTest; TEST_P(urEventGetProfilingInfoNegativeTest, InvalidNullHandle) { diff --git a/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp b/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp index 80561cc9e1..78e1ffd009 100644 --- a/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp +++ b/test/conformance/exp_command_buffer/buffer_fill_kernel_update.cpp @@ -148,10 +148,6 @@ TEST_P(BufferFillCommandTest, UpdateParameters) { // Test updating the global size so that the fill outputs to a larger buffer TEST_P(BufferFillCommandTest, UpdateGlobalSize) { - if (!updatable_execution_range_support) { - GTEST_SKIP() << "Execution range update is not supported."; - } - ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); @@ -208,8 +204,7 @@ TEST_P(BufferFillCommandTest, SeparateUpdateCalls) { ASSERT_SUCCESS(urQueueFinish(queue)); ValidateBuffer(buffer, sizeof(val) * global_size, val); - size_t new_global_size = - updatable_execution_range_support ? 
64 : global_size; + size_t new_global_size = global_size * 2; const size_t new_buffer_size = sizeof(val) * new_global_size; ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, new_buffer_size, nullptr, &new_buffer)); @@ -272,26 +267,24 @@ TEST_P(BufferFillCommandTest, SeparateUpdateCalls) { ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp(command_handle, &input_update_desc)); - if (updatable_execution_range_support) { - ur_exp_command_buffer_update_kernel_launch_desc_t - global_size_update_desc = { - UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype - nullptr, // pNext - 0, // numNewMemObjArgs - 0, // numNewPointerArgs - 0, // numNewValueArgs - 0, // newWorkDim - nullptr, // pNewMemObjArgList - nullptr, // pNewPointerArgList - nullptr, // pNewValueArgList - nullptr, // pNewGlobalWorkOffset - &new_global_size, // pNewGlobalWorkSize - nullptr, // pNewLocalWorkSize - }; - - ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp( - command_handle, &global_size_update_desc)); - } + size_t new_local_size = local_size; + ur_exp_command_buffer_update_kernel_launch_desc_t global_size_update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + 0, // numNewMemObjArgs + 0, // numNewPointerArgs + 0, // numNewValueArgs + static_cast(n_dimensions), // newWorkDim + nullptr, // pNewMemObjArgList + nullptr, // pNewPointerArgList + nullptr, // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + &new_global_size, // pNewGlobalWorkSize + &new_local_size, // pNewLocalWorkSize + }; + + ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp( + command_handle, &global_size_update_desc)); ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match index f9c9025ea1..8422cc5d2f 100644 --- 
a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_native_cpu.match @@ -4,9 +4,12 @@ {{OPT}}BufferFillCommandTest.OverrideUpdate/SYCL_NATIVE_CPU___SYCL_Native_CPU_ {{OPT}}BufferFillCommandTest.OverrideArgList/SYCL_NATIVE_CPU___SYCL_Native_CPU_ {{OPT}}USMFillCommandTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}USMFillCommandTest.UpdateBeforeEnqueue/SYCL_NATIVE_CPU___SYCL_Native_CPU_ {{OPT}}USMMultipleFillCommandTest.UpdateAllKernels/SYCL_NATIVE_CPU___SYCL_Native_CPU_ {{OPT}}BufferSaxpyKernelTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_ {{OPT}}USMSaxpyKernelTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}USMMultiSaxpyKernelTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +{{OPT}}USMMultiSaxpyKernelTest.UpdateWithoutBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU_ {{OPT}}NDRangeUpdateTest.Update3D/SYCL_NATIVE_CPU___SYCL_Native_CPU_ {{OPT}}NDRangeUpdateTest.Update2D/SYCL_NATIVE_CPU___SYCL_Native_CPU_ {{OPT}}NDRangeUpdateTest.Update1D/SYCL_NATIVE_CPU___SYCL_Native_CPU_ diff --git a/test/conformance/exp_command_buffer/fixtures.h b/test/conformance/exp_command_buffer/fixtures.h index 7e5367aa9c..eeb0a5d5d8 100644 --- a/test/conformance/exp_command_buffer/fixtures.h +++ b/test/conformance/exp_command_buffer/fixtures.h @@ -112,11 +112,6 @@ struct urUpdatableCommandBufferExpExecutionTest GTEST_SKIP() << "Updating EXP command-buffers is not supported."; } - // Currently level zero driver doesn't support updating execution range. - if (backend == UR_PLATFORM_BACKEND_LEVEL_ZERO) { - updatable_execution_range_support = false; - } - // Create a command-buffer with update enabled. 
ur_exp_command_buffer_desc_t desc{ UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC, nullptr, true}; @@ -154,7 +149,6 @@ struct urUpdatableCommandBufferExpExecutionTest } ur_exp_command_buffer_handle_t updatable_cmd_buf_handle = nullptr; - ur_bool_t updatable_execution_range_support = true; ur_queue_handle_t queue = nullptr; }; diff --git a/test/conformance/exp_command_buffer/ndrange_update.cpp b/test/conformance/exp_command_buffer/ndrange_update.cpp index 486837df85..8f4edad095 100644 --- a/test/conformance/exp_command_buffer/ndrange_update.cpp +++ b/test/conformance/exp_command_buffer/ndrange_update.cpp @@ -15,10 +15,6 @@ struct NDRangeUpdateTest UUR_RETURN_ON_FATAL_FAILURE( urUpdatableCommandBufferExpExecutionTest::SetUp()); - if (!updatable_execution_range_support) { - GTEST_SKIP() << "Execution range update is not supported."; - } - ur_device_usm_access_capability_flags_t shared_usm_flags; ASSERT_SUCCESS( uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); diff --git a/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp b/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp index cf0259c7ab..413b555623 100644 --- a/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp +++ b/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp @@ -88,8 +88,7 @@ TEST_P(USMFillCommandTest, UpdateParameters) { Validate((uint32_t *)shared_ptr, global_size, val); // Allocate a new USM pointer of larger size if feature is supported. - size_t new_global_size = - updatable_execution_range_support ? 
64 : global_size; + size_t new_global_size = global_size * 2; const size_t new_allocation_size = sizeof(val) * new_global_size; ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, new_allocation_size, &new_shared_ptr)); @@ -116,6 +115,60 @@ TEST_P(USMFillCommandTest, UpdateParameters) { &new_val, // hArgValue }; + size_t new_local_size = local_size; + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + 0, // numNewMemObjArgs + 1, // numNewPointerArgs + 1, // numNewValueArgs + static_cast(n_dimensions), // newWorkDim + nullptr, // pNewMemObjArgList + &new_output_desc, // pNewPointerArgList + &new_input_desc, // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + &new_global_size, // pNewGlobalWorkSize + &new_local_size, // pNewLocalWorkSize + }; + + // Update kernel and enqueue command-buffer again + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that update occurred correctly + Validate((uint32_t *)new_shared_ptr, new_global_size, new_val); +} + +// Test updating a command-buffer which hasn't been enqueued yet +TEST_P(USMFillCommandTest, UpdateBeforeEnqueue) { + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &new_shared_ptr)); + ASSERT_NE(new_shared_ptr, nullptr); + std::memset(new_shared_ptr, 0, allocation_size); + + // Set new USM pointer as kernel output at index 0 + ur_exp_command_buffer_update_pointer_arg_desc_t new_output_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 0, // argIndex + nullptr, // pProperties + &new_shared_ptr, // pArgValue + }; + + // Set new value to use for fill at kernel index 1 + uint32_t new_val = 33; + 
ur_exp_command_buffer_update_value_arg_desc_t new_input_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1, // argIndex + sizeof(new_val), // argSize + nullptr, // pProperties + &new_val, // hArgValue + }; + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype nullptr, // pNext @@ -127,12 +180,11 @@ TEST_P(USMFillCommandTest, UpdateParameters) { &new_output_desc, // pNewPointerArgList &new_input_desc, // pNewValueArgList nullptr, // pNewGlobalWorkOffset - updatable_execution_range_support ? &new_global_size - : nullptr, // pNewGlobalWorkSize - nullptr, // pNewLocalWorkSize + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize }; - // Update kernel and enqueue command-buffer again + // Update kernel and enqueue command-buffer ASSERT_SUCCESS( urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, @@ -140,7 +192,7 @@ TEST_P(USMFillCommandTest, UpdateParameters) { ASSERT_SUCCESS(urQueueFinish(queue)); // Verify that update occurred correctly - Validate((uint32_t *)new_shared_ptr, new_global_size, new_val); + Validate((uint32_t *)new_shared_ptr, global_size, new_val); } // Test updating a command-buffer with multiple USM fill kernel commands diff --git a/test/conformance/exp_command_buffer/usm_saxpy_kernel_update.cpp b/test/conformance/exp_command_buffer/usm_saxpy_kernel_update.cpp index 8f213e8b24..d44fef3011 100644 --- a/test/conformance/exp_command_buffer/usm_saxpy_kernel_update.cpp +++ b/test/conformance/exp_command_buffer/usm_saxpy_kernel_update.cpp @@ -8,9 +8,10 @@ // Test that updating a command-buffer with a single kernel command // taking USM & scalar arguments works correctly. 
-struct USMSaxpyKernelTest + +struct USMSaxpyKernelTestBase : uur::command_buffer::urUpdatableCommandBufferExpExecutionTest { - void SetUp() override { + virtual void SetUp() override { program_name = "saxpy_usm"; UUR_RETURN_ON_FATAL_FAILURE( urUpdatableCommandBufferExpExecutionTest::SetUp()); @@ -44,14 +45,6 @@ struct USMSaxpyKernelTest // Index 3 is Y ASSERT_SUCCESS( urKernelSetArgPointer(kernel, 3, nullptr, &shared_ptrs[2])); - - // Append kernel command to command-buffer and close command-buffer - ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( - updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, nullptr, &command_handle)); - ASSERT_NE(command_handle, nullptr); - - ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); } void Validate(uint32_t *output, uint32_t *X, uint32_t *Y, uint32_t A, @@ -62,17 +55,13 @@ struct USMSaxpyKernelTest } } - void TearDown() override { + virtual void TearDown() override { for (auto &shared_ptr : shared_ptrs) { if (shared_ptr) { EXPECT_SUCCESS(urUSMFree(context, shared_ptr)); } } - if (command_handle) { - EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(command_handle)); - } - UUR_RETURN_ON_FATAL_FAILURE( urUpdatableCommandBufferExpExecutionTest::TearDown()); } @@ -83,6 +72,29 @@ struct USMSaxpyKernelTest static constexpr size_t n_dimensions = 1; static constexpr uint32_t A = 42; std::array shared_ptrs = {nullptr, nullptr, nullptr, nullptr}; +}; + +struct USMSaxpyKernelTest : USMSaxpyKernelTestBase { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(USMSaxpyKernelTestBase::SetUp()); + + // Append kernel command to command-buffer and close command-buffer + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, + &global_size, &local_size, 0, nullptr, nullptr, &command_handle)); + ASSERT_NE(command_handle, nullptr); + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + } 
+ + void TearDown() override { + if (command_handle) { + EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(command_handle)); + } + + UUR_RETURN_ON_FATAL_FAILURE(USMSaxpyKernelTestBase::TearDown()); + } + ur_exp_command_buffer_command_handle_t command_handle = nullptr; }; @@ -160,3 +172,179 @@ TEST_P(USMSaxpyKernelTest, UpdateParameters) { uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; Validate(new_output, new_X, new_Y, new_A, global_size); } + +struct USMMultiSaxpyKernelTest : USMSaxpyKernelTestBase { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(USMSaxpyKernelTestBase::SetUp()); + + // Append kernel command to command-buffer and close command-buffer + for (unsigned node = 0; node < nodes; node++) { + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, + &global_size, &local_size, 0, nullptr, nullptr, + &command_handles[node])); + ASSERT_NE(command_handles[node], nullptr); + } + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + } + + void TearDown() override { + for (auto &handle : command_handles) { + if (handle) { + EXPECT_SUCCESS(urCommandBufferReleaseCommandExp(handle)); + } + } + UUR_RETURN_ON_FATAL_FAILURE(USMSaxpyKernelTestBase::TearDown()); + } + + static constexpr size_t nodes = 1024; + static constexpr uint32_t A = 42; + std::array command_handles{}; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(USMMultiSaxpyKernelTest); + +TEST_P(USMMultiSaxpyKernelTest, UpdateParameters) { + // Run command-buffer prior to update and verify output + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptrs[0]; + uint32_t *X = (uint32_t *)shared_ptrs[1]; + uint32_t *Y = (uint32_t *)shared_ptrs[2]; + Validate(output, X, Y, A, global_size); + + // Update inputs + ur_exp_command_buffer_update_pointer_arg_desc_t new_input_descs[2]; + + // New X at index 2 + 
new_input_descs[0] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 2, // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue + }; + + // New Y at index 3 + new_input_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 3, // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue + }; + + // New A at index 1 + uint32_t new_A = 33; + ur_exp_command_buffer_update_value_arg_desc_t new_A_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1, // argIndex + sizeof(new_A), // argSize + nullptr, // pProperties + &new_A, // hArgValue + }; + + // Update kernel inputs + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + 0, // numNewMemObjArgs + 2, // numNewPointerArgs + 1, // numNewValueArgs + 0, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs, // pNewPointerArgList + &new_A_desc, // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize + }; + + // Update kernel and enqueue command-buffer again + for (auto &handle : command_handles) { + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(handle, &update_desc)); + } + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that update occurred correctly + uint32_t *new_output = (uint32_t *)shared_ptrs[0]; + uint32_t *new_X = (uint32_t *)shared_ptrs[3]; + uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; + Validate(new_output, new_X, new_Y, new_A, global_size); +} + +TEST_P(USMMultiSaxpyKernelTest, UpdateWithoutBlocking) { + // Prepare new inputs + ur_exp_command_buffer_update_pointer_arg_desc_t new_input_descs[2]; + + // New X at index 2 + new_input_descs[0] = { + 
UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 2, // argIndex + nullptr, // pProperties + &shared_ptrs[3], // pArgValue + }; + + // New Y at index 3 + new_input_descs[1] = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext + 3, // argIndex + nullptr, // pProperties + &shared_ptrs[4], // pArgValue + }; + + // New A at index 1 + uint32_t new_A = 33; + ur_exp_command_buffer_update_value_arg_desc_t new_A_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext + 1, // argIndex + sizeof(new_A), // argSize + nullptr, // pProperties + &new_A, // hArgValue + }; + + // Update kernel inputs + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + 0, // numNewMemObjArgs + 2, // numNewPointerArgs + 1, // numNewValueArgs + 0, // newWorkDim + nullptr, // pNewMemObjArgList + new_input_descs, // pNewPointerArgList + &new_A_desc, // pNewValueArgList + nullptr, // pNewGlobalWorkOffset + nullptr, // pNewGlobalWorkSize + nullptr, // pNewLocalWorkSize + }; + + // Run command-buffer prior to update without doing a blocking wait after + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + // Update kernel and enqueue command-buffer again + for (auto &handle : command_handles) { + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(handle, &update_desc)); + } + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + // Verify that update occurred correctly + uint32_t *new_output = (uint32_t *)shared_ptrs[0]; + uint32_t *new_X = (uint32_t *)shared_ptrs[3]; + uint32_t *new_Y = (uint32_t *)shared_ptrs[4]; + Validate(new_output, new_X, new_Y, new_A, global_size); +} diff --git a/test/conformance/kernel/CMakeLists.txt 
b/test/conformance/kernel/CMakeLists.txt index df19ba2550..73ab3f1101 100644 --- a/test/conformance/kernel/CMakeLists.txt +++ b/test/conformance/kernel/CMakeLists.txt @@ -18,4 +18,5 @@ add_conformance_test_with_kernels_environment(kernel urKernelSetArgSampler.cpp urKernelSetArgValue.cpp urKernelSetExecInfo.cpp - urKernelSetSpecializationConstants.cpp) + urKernelSetSpecializationConstants.cpp + urKernelGetSuggestedLocalWorkSize.cpp) diff --git a/test/conformance/kernel/kernel_adapter_hip.match b/test/conformance/kernel/kernel_adapter_hip.match index 2c728224f2..eb023ff6a5 100644 --- a/test/conformance/kernel/kernel_adapter_hip.match +++ b/test/conformance/kernel/kernel_adapter_hip.match @@ -12,7 +12,3 @@ urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/AMD_HIP_BACKEND___{{.*}}_ urKernelSetArgPointerNegativeTest.InvalidKernelArgumentIndex/AMD_HIP_BACKEND___{{.*}}_ urKernelSetArgValueTest.InvalidKernelArgumentIndex/AMD_HIP_BACKEND___{{.*}}_ urKernelSetArgValueTest.InvalidKernelArgumentSize/AMD_HIP_BACKEND___{{.*}}_ -urKernelSetSpecializationConstantsTest.Success/AMD_HIP_BACKEND___{{.*}}_ -urKernelSetSpecializationConstantsTest.InvalidNullHandleKernel/AMD_HIP_BACKEND___{{.*}}_ -urKernelSetSpecializationConstantsTest.InvalidNullPointerSpecConstants/AMD_HIP_BACKEND___{{.*}}_ -urKernelSetSpecializationConstantsTest.InvalidSizeCount/AMD_HIP_BACKEND___{{.*}}_ diff --git a/test/conformance/kernel/kernel_adapter_level_zero.match b/test/conformance/kernel/kernel_adapter_level_zero.match index 2668b6821a..82c92e3f28 100644 --- a/test/conformance/kernel/kernel_adapter_level_zero.match +++ b/test/conformance/kernel/kernel_adapter_level_zero.match @@ -8,9 +8,6 @@ urKernelGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_ urKernelGetInfoTest.InvalidSizeSmall/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}___UR_KERNEL_INFO_NUM_REGS urKernelSetArgLocalTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ 
urKernelSetArgMemObjTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgPointerTest.SuccessHost/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgPointerTest.SuccessDevice/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ -urKernelSetArgPointerTest.SuccessShared/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgPointerNegativeTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgSamplerTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ urKernelSetArgValueTest.InvalidKernelArgumentIndex/Intel_R__oneAPI_Unified_Runtime_over_Level_Zero___{{.*}}_ diff --git a/test/conformance/kernel/kernel_adapter_native_cpu.match b/test/conformance/kernel/kernel_adapter_native_cpu.match index 93e3ddd67d..818c625e92 100644 --- a/test/conformance/kernel/kernel_adapter_native_cpu.match +++ b/test/conformance/kernel/kernel_adapter_native_cpu.match @@ -162,3 +162,12 @@ urKernelSetSpecializationConstantsTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU urKernelSetSpecializationConstantsTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urKernelSetSpecializationConstantsTest.InvalidNullPointerSpecConstants/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urKernelSetSpecializationConstantsTest.InvalidSizeCount/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urKernelGetSuggestedLocalWorkSizeTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urKernelGetSuggestedLocalWorkSizeTest.Success2D/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urKernelGetSuggestedLocalWorkSizeTest.Success3D/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urKernelGetSuggestedLocalWorkSizeTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urKernelGetSuggestedLocalWorkSizeTest.InvalidNullHandleQueue/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urKernelGetSuggestedLocalWorkSizeTest.InvalidWorkDimension/SYCL_NATIVE_CPU___SYCL_Native_CPU_ 
+urKernelGetSuggestedLocalWorkSizeTest.InvalidGlobalOffset/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urKernelGetSuggestedLocalWorkSizeTest.InvalidGlobalSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urKernelGetSuggestedLocalWorkSizeTest.InvalidSuggestedLocalWorkSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_ diff --git a/test/conformance/kernel/urKernelGetSuggestedLocalWorkSize.cpp b/test/conformance/kernel/urKernelGetSuggestedLocalWorkSize.cpp new file mode 100644 index 0000000000..4eeabf5573 --- /dev/null +++ b/test/conformance/kernel/urKernelGetSuggestedLocalWorkSize.cpp @@ -0,0 +1,112 @@ +// Copyright (C) 2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +struct urKernelGetSuggestedLocalWorkSizeTest : uur::urKernelExecutionTest { + void SetUp() override { + program_name = "bar"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + } + size_t global_size = 32; + size_t global_offset = 0; + size_t n_dimensions = 1; + + size_t suggested_local_work_size; +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urKernelGetSuggestedLocalWorkSizeTest); + +TEST_P(urKernelGetSuggestedLocalWorkSizeTest, Success) { + suggested_local_work_size = SIZE_MAX; + auto result = urKernelGetSuggestedLocalWorkSize( + kernel, queue, n_dimensions, &global_offset, &global_size, + &suggested_local_work_size); + if (result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { + GTEST_SKIP(); + } + ASSERT_SUCCESS(result); + ASSERT_LE(suggested_local_work_size, global_size); +} + +TEST_P(urKernelGetSuggestedLocalWorkSizeTest, Success2D) { + size_t global_size_2d[2] = {32, 32}; + size_t global_offset_2d[2] = {0, 0}; + size_t suggested_local_work_size_2d[2] = {SIZE_MAX, SIZE_MAX}; + auto result = urKernelGetSuggestedLocalWorkSize( + kernel, queue, 2, global_offset_2d, global_size_2d, + suggested_local_work_size_2d); + if (result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { 
+ GTEST_SKIP(); + } + ASSERT_SUCCESS(result); + for (int I = 0; I < 2; ++I) { + ASSERT_LE(suggested_local_work_size_2d[I], global_size_2d[I]); + } +} + +TEST_P(urKernelGetSuggestedLocalWorkSizeTest, Success3D) { + size_t global_size_3d[3] = {32, 32, 32}; + size_t global_offset_3d[3] = {0, 0, 0}; + size_t suggested_local_work_size_3d[3] = {SIZE_MAX, SIZE_MAX, SIZE_MAX}; + auto result = urKernelGetSuggestedLocalWorkSize( + kernel, queue, 3, global_offset_3d, global_size_3d, + suggested_local_work_size_3d); + if (result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { + GTEST_SKIP(); + } + ASSERT_SUCCESS(result); + for (int I = 0; I < 3; ++I) { + ASSERT_LE(suggested_local_work_size_3d[I], global_size_3d[I]); + } +} + +TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidNullHandleKernel) { + ASSERT_EQ_RESULT(urKernelGetSuggestedLocalWorkSize( + nullptr, queue, n_dimensions, &global_offset, + &global_size, &suggested_local_work_size), + UR_RESULT_ERROR_INVALID_NULL_HANDLE); +} + +TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidNullHandleQueue) { + ASSERT_EQ_RESULT(urKernelGetSuggestedLocalWorkSize( + kernel, nullptr, n_dimensions, &global_offset, + &global_size, &suggested_local_work_size), + UR_RESULT_ERROR_INVALID_NULL_HANDLE); +} + +TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidWorkDimension) { + uint32_t max_work_item_dimensions = 0; + ASSERT_SUCCESS(urDeviceGetInfo( + device, UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS, + sizeof(max_work_item_dimensions), &max_work_item_dimensions, nullptr)); + auto result = urKernelGetSuggestedLocalWorkSize( + kernel, queue, max_work_item_dimensions + 1, &global_offset, + &global_size, &suggested_local_work_size); + if (result == UR_RESULT_ERROR_UNSUPPORTED_FEATURE) { + GTEST_SKIP(); + } + ASSERT_EQ_RESULT(result, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); +} + +TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidGlobalOffset) { + ASSERT_EQ_RESULT(urKernelGetSuggestedLocalWorkSize( + kernel, queue, n_dimensions, nullptr, 
&global_size, + &suggested_local_work_size), + UR_RESULT_ERROR_INVALID_NULL_POINTER); +} + +TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidGlobalSize) { + ASSERT_EQ_RESULT( + urKernelGetSuggestedLocalWorkSize(kernel, queue, n_dimensions, + &global_offset, nullptr, nullptr), + UR_RESULT_ERROR_INVALID_NULL_POINTER); +} + +TEST_P(urKernelGetSuggestedLocalWorkSizeTest, InvalidSuggestedLocalWorkSize) { + ASSERT_EQ_RESULT( + urKernelGetSuggestedLocalWorkSize( + kernel, queue, n_dimensions, &global_offset, &global_size, nullptr), + UR_RESULT_ERROR_INVALID_NULL_POINTER); +} diff --git a/test/conformance/memory-migrate/CMakeLists.txt b/test/conformance/memory-migrate/CMakeLists.txt new file mode 100644 index 0000000000..785a99aa84 --- /dev/null +++ b/test/conformance/memory-migrate/CMakeLists.txt @@ -0,0 +1,7 @@ +# Copyright (C) 2022-2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +add_conformance_test_with_kernels_environment(memory-migrate + urMemBufferMigrateAcrossDevices.cpp) diff --git a/test/conformance/memory-migrate/memory-migrate_adapter_cuda.match b/test/conformance/memory-migrate/memory-migrate_adapter_cuda.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/memory-migrate/memory-migrate_adapter_hip.match b/test/conformance/memory-migrate/memory-migrate_adapter_hip.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/memory-migrate/memory-migrate_adapter_level_zero.match b/test/conformance/memory-migrate/memory-migrate_adapter_level_zero.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/memory-migrate/memory-migrate_adapter_native_cpu.match b/test/conformance/memory-migrate/memory-migrate_adapter_native_cpu.match new file mode 100644 index 0000000000..e69de29bb2 diff --git 
a/test/conformance/memory-migrate/memory-migrate_adapter_opencl.match b/test/conformance/memory-migrate/memory-migrate_adapter_opencl.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp b/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp new file mode 100644 index 0000000000..2e8856ac97 --- /dev/null +++ b/test/conformance/memory-migrate/urMemBufferMigrateAcrossDevices.cpp @@ -0,0 +1,263 @@ +// Copyright (C) 2023 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// Some tests to ensure implicit memory migration of buffers across devices +// in the same context. + +#include "uur/fixtures.h" + +using T = uint32_t; + +struct urMultiDeviceContextTest : uur::urPlatformTest { + void SetUp() { + uur::urPlatformTest::SetUp(); + ASSERT_SUCCESS(urDeviceGet(platform, UR_DEVICE_TYPE_ALL, 0, nullptr, + &num_devices)); + ASSERT_NE(num_devices, 0); + if (num_devices == 1) { + return; + } + + devices = std::vector(num_devices); + ASSERT_SUCCESS(urDeviceGet(platform, UR_DEVICE_TYPE_ALL, num_devices, + devices.data(), nullptr)); + ASSERT_SUCCESS( + urContextCreate(num_devices, devices.data(), nullptr, &context)); + + queues = std::vector(num_devices); + for (auto i = 0u; i < num_devices; ++i) { + ASSERT_SUCCESS( + urQueueCreate(context, devices[i], nullptr, &queues[i])); + } + } + + void TearDown() { + uur::urPlatformTest::TearDown(); + if (num_devices == 1) { + return; + } + for (auto i = 0u; i < num_devices; ++i) { + urDeviceRelease(devices[i]); + urQueueRelease(queues[i]); + } + urContextRelease(context); + } + + uint32_t num_devices = 0; + ur_context_handle_t context; + std::vector devices; + std::vector queues; +}; + +struct urMultiDeviceContextMemBufferTest : urMultiDeviceContextTest { + void SetUp() { + 
urMultiDeviceContextTest::SetUp(); + if (num_devices == 1) { + return; + } + ASSERT_SUCCESS(urMemBufferCreate(context, 0 /*flags=*/, + buffer_size_bytes, + nullptr /*pProperties=*/, &buffer)); + + UUR_RETURN_ON_FATAL_FAILURE( + uur::KernelsEnvironment::instance->LoadSource(program_name, + il_binary)); + + programs = std::vector(num_devices); + kernels = std::vector(num_devices); + + const ur_program_properties_t properties = { + UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES, nullptr, + static_cast(metadatas.size()), + metadatas.empty() ? nullptr : metadatas.data()}; + for (auto i = 0u; i < num_devices; ++i) { + ASSERT_SUCCESS(uur::KernelsEnvironment::instance->CreateProgram( + platform, context, devices[i], *il_binary, &properties, + &programs[i])); + ASSERT_SUCCESS(urProgramBuild(context, programs[i], nullptr)); + auto kernel_names = + uur::KernelsEnvironment::instance->GetEntryPointNames( + program_name); + kernel_name = kernel_names[0]; + ASSERT_FALSE(kernel_name.empty()); + ASSERT_SUCCESS( + urKernelCreate(programs[i], kernel_name.data(), &kernels[i])); + } + } + + // Adds a kernel arg representing a sycl buffer constructed with a 1D range. + void AddBuffer1DArg(ur_kernel_handle_t kernel, size_t current_arg_index, + ur_mem_handle_t buffer) { + ASSERT_SUCCESS( + urKernelSetArgMemObj(kernel, current_arg_index, nullptr, buffer)); + + // SYCL device kernels have different interfaces depending on the + // backend being used. Typically a kernel which takes a buffer argument + // will take a pointer to the start of the buffer and a sycl::id param + // which is a struct that encodes the accessor to the buffer. However + // the AMD backend handles this differently and uses three separate + // arguments for each of the three dimensions of the accessor. 
+ + ur_platform_backend_t backend; + ASSERT_SUCCESS(urPlatformGetInfo(platform, UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + // this emulates the three offset params for buffer accessor on AMD. + size_t val = 0; + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1, + sizeof(size_t), nullptr, &val)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 2, + sizeof(size_t), nullptr, &val)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 3, + sizeof(size_t), nullptr, &val)); + current_arg_index += 4; + } else { + // This emulates the offset struct sycl adds for a 1D buffer accessor. + struct { + size_t offsets[1] = {0}; + } accessor; + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1, + sizeof(accessor), nullptr, + &accessor)); + current_arg_index += 2; + } + } + + void TearDown() { + if (num_devices > 1) { + for (auto i = 0u; i < num_devices; ++i) { + ASSERT_SUCCESS(urKernelRelease(kernels[i])); + ASSERT_SUCCESS(urProgramRelease(programs[i])); + } + urMemRelease(buffer); + } + urMultiDeviceContextTest::TearDown(); + } + + size_t buffer_size = 4096; + size_t buffer_size_bytes = 4096 * sizeof(T); + ur_mem_handle_t buffer; + + // Program stuff so we can launch kernels + std::shared_ptr> il_binary; + std::string program_name = "inc"; + std::string kernel_name; + std::vector programs; + std::vector kernels; + std::vector metadatas{}; +}; + +TEST_F(urMultiDeviceContextMemBufferTest, WriteRead) { + if (num_devices == 1) { + GTEST_SKIP(); + } + T fill_val = 42; + std::vector in_vec(buffer_size, fill_val); + std::vector out_vec(buffer_size, 0); + ur_event_handle_t e1; + + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queues[0], buffer, false, 0, + buffer_size_bytes, in_vec.data(), 0, + nullptr, &e1)); + + ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, false, 0, + buffer_size_bytes, out_vec.data(), 1, + &e1, nullptr)); + for (auto &a : out_vec) { + 
ASSERT_EQ(a, fill_val); + } +} + +TEST_F(urMultiDeviceContextMemBufferTest, FillRead) { + if (num_devices == 1) { // test needs both queues[0] and queues[1] + GTEST_SKIP(); + } + T fill_val = 42; + std::vector<T> in_vec(buffer_size, fill_val); + std::vector<T> out_vec(buffer_size); + ur_event_handle_t e1; + + ASSERT_SUCCESS(urEnqueueMemBufferFill(queues[0], buffer, &fill_val, + sizeof(fill_val), 0, + buffer_size_bytes, 0, nullptr, &e1)); + + ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, true /*blockingRead=*/, 0, + buffer_size_bytes, out_vec.data(), 1, + &e1, nullptr)); // blocking: out_vec is inspected on the host right below + for (auto &a : out_vec) { + ASSERT_EQ(a, fill_val); + } +} + +TEST_F(urMultiDeviceContextMemBufferTest, WriteKernelRead) { + if (num_devices == 1) { // test needs both queues[0] and queues[1] + GTEST_SKIP(); + } + + // Kernel to run on queues[1] + AddBuffer1DArg(kernels[1], 0, buffer); + + T fill_val = 42; + std::vector<T> in_vec(buffer_size, fill_val); + std::vector<T> out_vec(buffer_size); + ur_event_handle_t e1, e2; + + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queues[0], buffer, false, 0, + buffer_size_bytes, in_vec.data(), 0, + nullptr, &e1)); + + size_t work_dims[3] = {buffer_size, 1, 1}; + size_t offset[3] = {0, 0, 0}; + + // Kernel increments the fill val by 1; it waits on the write event e1 + ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[1], kernels[1], 1 /*workDim=*/, + offset, work_dims, nullptr, 1, &e1, + &e2)); + + ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[0], buffer, true /*blockingRead=*/, 0, + buffer_size_bytes, out_vec.data(), 1, + &e2, nullptr)); // blocking: out_vec is inspected on the host right below + for (auto &a : out_vec) { + ASSERT_EQ(a, fill_val + 1); + } +} + +TEST_F(urMultiDeviceContextMemBufferTest, WriteKernelKernelRead) { + if (num_devices == 1) { // test needs both queues[0] and queues[1] + GTEST_SKIP(); + } + + AddBuffer1DArg(kernels[0], 0, buffer); + AddBuffer1DArg(kernels[1], 0, buffer); + + T fill_val = 42; + std::vector<T> in_vec(buffer_size, fill_val); + std::vector<T> out_vec(buffer_size); + ur_event_handle_t e1, e2, e3; + + ASSERT_SUCCESS(urEnqueueMemBufferWrite(queues[0], buffer, false, 0, + buffer_size_bytes, in_vec.data(), 0, + nullptr, &e1)); + + size_t work_dims[3] = {buffer_size, 1, 1}; + size_t offset[3] =
{0, 0, 0}; + + // First kernel (queues[1]) increments the fill val by 1; it waits on the write event e1 + ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[1], kernels[1], 1 /*workDim=*/, + offset, work_dims, nullptr, 1, &e1, + &e2)); + + // Second kernel (queues[0]) increments the value by 1 again; it waits on e2 + ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[0], kernels[0], 1 /*workDim=*/, + offset, work_dims, nullptr, 1, &e2, + &e3)); + + ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, true /*blockingRead=*/, 0, + buffer_size_bytes, out_vec.data(), 1, + &e3, nullptr)); // blocking: out_vec is inspected on the host right below + for (auto &a : out_vec) { + ASSERT_EQ(a, fill_val + 2); + } +} diff --git a/test/conformance/memory/CMakeLists.txt b/test/conformance/memory/CMakeLists.txt index 64de8ef059..041f73a079 100644 --- a/test/conformance/memory/CMakeLists.txt +++ b/test/conformance/memory/CMakeLists.txt @@ -10,6 +10,7 @@ add_conformance_test_with_devices_environment(memory urMemGetInfo.cpp urMemGetNativeHandle.cpp urMemImageCreate.cpp + urMemImageCreateWithImageFormatParam.cpp urMemImageCreateWithNativeHandle.cpp urMemImageGetInfo.cpp urMemRelease.cpp diff --git a/test/conformance/memory/memory_adapter_cuda.match b/test/conformance/memory/memory_adapter_cuda.match index b9353f20be..7d2e6a1c01 100644 --- a/test/conformance/memory/memory_adapter_cuda.match +++ b/test/conformance/memory/memory_adapter_cuda.match @@ -1 +1,3 @@ urMemImageCreateTest.InvalidSize/NVIDIA_CUDA_BACKEND___{{.*}}_ +{{OPT}}urMemImageCreateTestWith1DMemoryTypeParam.Success/NVIDIA_CUDA_BACKEND___{{.*}}___UR_MEM_TYPE_IMAGE1D_ARRAY +{{OPT}}urMemImageCreateTestWith2DMemoryTypeParam.Success/NVIDIA_CUDA_BACKEND___{{.*}}___UR_MEM_TYPE_IMAGE2D_ARRAY diff --git a/test/conformance/memory/memory_adapter_native_cpu.match b/test/conformance/memory/memory_adapter_native_cpu.match index d0cb7644d9..27e3d859e0 100644 --- a/test/conformance/memory/memory_adapter_native_cpu.match +++ b/test/conformance/memory/memory_adapter_native_cpu.match @@ -4,14 +4,243 @@ urMemGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_INFO_SIZE
urMemGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_INFO_CONTEXT urMemGetInfoTest.InvalidSizeSmall/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_INFO_SIZE urMemGetInfoTest.InvalidSizeSmall/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_INFO_CONTEXT -urMemImageCreateTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urMemImageCreateTest.SuccessWith3DImageType/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urMemImageCreateTest.InvalidSize/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urMemImageCreateTest.InvalidImageDescStype/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urMemImageCreateTest.InvalidImageDescNumMipLevel/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urMemImageCreateTest.InvalidImageDescNumSamples/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urMemImageCreateTest.InvalidImageDescRowPitch/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urMemImageCreateTest.InvalidImageDescSlicePitch/SYCL_NATIVE_CPU___SYCL_Native_CPU_ +urMemImageCreateTestWith1DMemoryTypeParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_TYPE_IMAGE1D +urMemImageCreateTestWith1DMemoryTypeParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_TYPE_IMAGE1D_ARRAY +urMemImageCreateTestWith2DMemoryTypeParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_TYPE_IMAGE2D +urMemImageCreateTestWith2DMemoryTypeParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_TYPE_IMAGE2D_ARRAY urMemImageCreateWithHostPtrFlagsTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER urMemImageCreateWithHostPtrFlagsTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_MEM_FLAG_USE_HOST_POINTER +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_INT_101010 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_A__UR_IMAGE_CHANNEL_TYPE_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_INT_101010 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT 
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_R__UR_IMAGE_CHANNEL_TYPE_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_INT_101010 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RG__UR_IMAGE_CHANNEL_TYPE_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_INT_101010 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RA__UR_IMAGE_CHANNEL_TYPE_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_INT_101010 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGB__UR_IMAGE_CHANNEL_TYPE_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_INT_101010 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBA__UR_IMAGE_CHANNEL_TYPE_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_INT_101010 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_BGRA__UR_IMAGE_CHANNEL_TYPE_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_INT_101010 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ARGB__UR_IMAGE_CHANNEL_TYPE_FLOAT 
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_INT_101010 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_ABGR__UR_IMAGE_CHANNEL_TYPE_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_INT_101010 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_INTENSITY__UR_IMAGE_CHANNEL_TYPE_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_INT_101010 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_LUMINANCE__UR_IMAGE_CHANNEL_TYPE_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_INT_101010 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RX__UR_IMAGE_CHANNEL_TYPE_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_INT_101010 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGX__UR_IMAGE_CHANNEL_TYPE_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_INT_101010 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_RGBX__UR_IMAGE_CHANNEL_TYPE_FLOAT 
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_SNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_INT_101010 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16 +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32 
+urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT +urMemImageCreateTestWithImageFormatParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_IMAGE_CHANNEL_ORDER_SRGBA__UR_IMAGE_CHANNEL_TYPE_FLOAT urMemReleaseTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urMemRetainTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ diff --git a/test/conformance/memory/urMemImageCreate.cpp b/test/conformance/memory/urMemImageCreate.cpp index 7b19ed7232..482bff466e 100644 --- a/test/conformance/memory/urMemImageCreate.cpp +++ b/test/conformance/memory/urMemImageCreate.cpp @@ -25,11 +25,87 @@ static ur_image_desc_t image_desc{ 0 ///< [in] number of samples }; -TEST_P(urMemImageCreateTest, Success) { +using urMemImageCreateTestWith1DMemoryTypeParam = + uur::urContextTestWithParam; + +UUR_TEST_SUITE_P(urMemImageCreateTestWith1DMemoryTypeParam, + ::testing::Values(UR_MEM_TYPE_IMAGE1D, + UR_MEM_TYPE_IMAGE1D_ARRAY), + uur::deviceTestWithParamPrinter); + +TEST_P(urMemImageCreateTestWith1DMemoryTypeParam, Success) { + ur_image_desc_t image_desc_with_param{ + UR_STRUCTURE_TYPE_IMAGE_DESC, ///< [in] type of this structure + nullptr, ///< [in][optional] pointer to extension-specific structure + getParam(), ///< [in] memory object type + 1, ///< [in] image width + 0, ///< [in] image height + 0, ///< [in] image depth + 1, ///< [in] image array size + 0, ///< [in] image row pitch + 0, ///< [in] image slice pitch + 0, ///< [in] number of MIP levels + 0 ///< [in] number of samples + }; + ur_mem_handle_t image_handle = nullptr; ASSERT_SUCCESS(urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, - &image_format, &image_desc, nullptr, - &image_handle)); + &image_format, &image_desc_with_param, + nullptr, &image_handle)); + ASSERT_NE(nullptr, image_handle); + ASSERT_SUCCESS(urMemRelease(image_handle)); +} + +using urMemImageCreateTestWith2DMemoryTypeParam = + uur::urContextTestWithParam; + 
+UUR_TEST_SUITE_P(urMemImageCreateTestWith2DMemoryTypeParam, + ::testing::Values(UR_MEM_TYPE_IMAGE2D, + UR_MEM_TYPE_IMAGE2D_ARRAY), + uur::deviceTestWithParamPrinter); + +TEST_P(urMemImageCreateTestWith2DMemoryTypeParam, Success) { + ur_image_desc_t image_desc_with_param{ + UR_STRUCTURE_TYPE_IMAGE_DESC, ///< [in] type of this structure + nullptr, ///< [in][optional] pointer to extension-specific structure + getParam(), ///< [in] memory object type + 1, ///< [in] image width + 1, ///< [in] image height + 0, ///< [in] image depth + 1, ///< [in] image array size + 0, ///< [in] image row pitch + 0, ///< [in] image slice pitch + 0, ///< [in] number of MIP levels + 0 ///< [in] number of samples + }; + + ur_mem_handle_t image_handle = nullptr; + ASSERT_SUCCESS(urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, + &image_format, &image_desc_with_param, + nullptr, &image_handle)); + ASSERT_NE(nullptr, image_handle); + ASSERT_SUCCESS(urMemRelease(image_handle)); +} + +TEST_P(urMemImageCreateTest, SuccessWith3DImageType) { + ur_image_desc_t image_desc_with_param{ + UR_STRUCTURE_TYPE_IMAGE_DESC, ///< [in] type of this structure + nullptr, ///< [in][optional] pointer to extension-specific structure + UR_MEM_TYPE_IMAGE3D, ///< [in] memory object type + 1, ///< [in] image width + 1, ///< [in] image height + 1, ///< [in] image depth + 1, ///< [in] image array size + 0, ///< [in] image row pitch + 0, ///< [in] image slice pitch + 0, ///< [in] number of MIP levels + 0 ///< [in] number of samples + }; + + ur_mem_handle_t image_handle = nullptr; + ASSERT_SUCCESS(urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, + &image_format, &image_desc_with_param, + nullptr, &image_handle)); ASSERT_NE(nullptr, image_handle); ASSERT_SUCCESS(urMemRelease(image_handle)); } diff --git a/test/conformance/memory/urMemImageCreateWithImageFormatParam.cpp b/test/conformance/memory/urMemImageCreateWithImageFormatParam.cpp new file mode 100644 index 0000000000..c305f58f00 --- /dev/null +++ 
b/test/conformance/memory/urMemImageCreateWithImageFormatParam.cpp @@ -0,0 +1,120 @@ +// Copyright (C) 2024 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +// See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include +#include + +static ur_image_desc_t image_desc{ + UR_STRUCTURE_TYPE_IMAGE_DESC, ///< [in] type of this structure + nullptr, ///< [in][optional] pointer to extension-specific structure + UR_MEM_TYPE_IMAGE3D, ///< [in] memory object type + 1, ///< [in] image width + 1, ///< [in] image height + 1, ///< [in] image depth + 1, ///< [in] image array size + 0, ///< [in] image row pitch + 0, ///< [in] image slice pitch + 0, ///< [in] number of MIP levels + 0 ///< [in] number of samples +}; + +const std::vector primary_image_formats = { + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT8}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNORM_INT16}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT8}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SNORM_INT16}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT}, + {UR_IMAGE_CHANNEL_ORDER_RGBA, UR_IMAGE_CHANNEL_TYPE_FLOAT}}; + +const std::vector channel_orders = { + UR_IMAGE_CHANNEL_ORDER_A, UR_IMAGE_CHANNEL_ORDER_R, + UR_IMAGE_CHANNEL_ORDER_RG, UR_IMAGE_CHANNEL_ORDER_RA, + UR_IMAGE_CHANNEL_ORDER_RGB, UR_IMAGE_CHANNEL_ORDER_RGBA, + UR_IMAGE_CHANNEL_ORDER_BGRA, UR_IMAGE_CHANNEL_ORDER_ARGB, + UR_IMAGE_CHANNEL_ORDER_ABGR, UR_IMAGE_CHANNEL_ORDER_INTENSITY, + 
UR_IMAGE_CHANNEL_ORDER_LUMINANCE, UR_IMAGE_CHANNEL_ORDER_RX, + UR_IMAGE_CHANNEL_ORDER_RGX, UR_IMAGE_CHANNEL_ORDER_RGBX, + UR_IMAGE_CHANNEL_ORDER_SRGBA}; + +const std::vector channel_types = { + UR_IMAGE_CHANNEL_TYPE_SNORM_INT8, + UR_IMAGE_CHANNEL_TYPE_SNORM_INT16, + UR_IMAGE_CHANNEL_TYPE_UNORM_INT8, + UR_IMAGE_CHANNEL_TYPE_UNORM_INT16, + UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_565, + UR_IMAGE_CHANNEL_TYPE_UNORM_SHORT_555, + UR_IMAGE_CHANNEL_TYPE_INT_101010, + UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8, + UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16, + UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32, + UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8, + UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16, + UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32, + UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT, + UR_IMAGE_CHANNEL_TYPE_FLOAT}; + +std::vector all_image_formats; + +struct urMemImageCreateTestWithImageFormatParam + : uur::urContextTestWithParam { + void SetUp() { + UUR_RETURN_ON_FATAL_FAILURE( + uur::urContextTestWithParam::SetUp()); + } + void TearDown() { + UUR_RETURN_ON_FATAL_FAILURE( + uur::urContextTestWithParam::TearDown()); + } + + static std::vector makeImageFormats() { + for (auto channel_order : channel_orders) { + for (auto channel_type : channel_types) { + all_image_formats.push_back({channel_order, channel_type}); + } + } + return all_image_formats; + } +}; + +UUR_TEST_SUITE_P( + urMemImageCreateTestWithImageFormatParam, + ::testing::ValuesIn( + urMemImageCreateTestWithImageFormatParam::makeImageFormats()), + uur::deviceTestWithParamPrinter); + +TEST_P(urMemImageCreateTestWithImageFormatParam, Success) { + ur_image_channel_order_t channel_order = + std::get<1>(GetParam()).channelOrder; + ur_image_channel_type_t channel_type = std::get<1>(GetParam()).channelType; + + ur_image_format_t image_format{channel_order, channel_type}; + + ur_mem_handle_t image_handle = nullptr; + ur_result_t res = + urMemImageCreate(context, UR_MEM_FLAG_READ_WRITE, &image_format, + &image_desc, nullptr, &image_handle); + + bool is_primary_image_format = false; + 
for (auto primary_image_format : primary_image_formats) { + if (primary_image_format.channelOrder == image_format.channelOrder && + primary_image_format.channelType == image_format.channelType) { + is_primary_image_format = true; + break; + } + } + + if (!is_primary_image_format && + res == UR_RESULT_ERROR_UNSUPPORTED_IMAGE_FORMAT) { + GTEST_SKIP(); + } + ASSERT_SUCCESS(res); + ASSERT_NE(nullptr, image_handle); + ASSERT_SUCCESS(urMemRelease(image_handle)); +} diff --git a/test/conformance/platform/urPlatformCreateWithNativeHandle.cpp b/test/conformance/platform/urPlatformCreateWithNativeHandle.cpp index 435ac23a00..4f6684c432 100644 --- a/test/conformance/platform/urPlatformCreateWithNativeHandle.cpp +++ b/test/conformance/platform/urPlatformCreateWithNativeHandle.cpp @@ -20,8 +20,8 @@ TEST_F(urPlatformCreateWithNativeHandleTest, Success) { // We can however convert the native_handle back into a unified-runtime // handle and perform some query on it to verify that it works. ur_platform_handle_t plat = nullptr; - UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( - urPlatformCreateWithNativeHandle(native_handle, nullptr, &plat)); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urPlatformCreateWithNativeHandle( + native_handle, adapters[0], nullptr, &plat)); ASSERT_NE(plat, nullptr); std::string input_platform_name = uur::GetPlatformName(platform); @@ -45,8 +45,8 @@ TEST_F(urPlatformCreateWithNativeHandleTest, SuccessWithOwnedNativeHandle) { ur_platform_native_properties_t props = { UR_STRUCTURE_TYPE_PLATFORM_NATIVE_PROPERTIES, nullptr, true}; ur_platform_handle_t plat = nullptr; - UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( - urPlatformCreateWithNativeHandle(native_handle, &props, &plat)); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urPlatformCreateWithNativeHandle( + native_handle, adapters[0], &props, &plat)); ASSERT_NE(plat, nullptr); std::string input_platform_name = uur::GetPlatformName(platform); @@ -70,8 +70,8 @@ TEST_F(urPlatformCreateWithNativeHandleTest, SuccessWithUnOwnedNativeHandle) { 
ur_platform_native_properties_t props = { UR_STRUCTURE_TYPE_PLATFORM_NATIVE_PROPERTIES, nullptr, false}; ur_platform_handle_t plat = nullptr; - UUR_ASSERT_SUCCESS_OR_UNSUPPORTED( - urPlatformCreateWithNativeHandle(native_handle, &props, &plat)); + UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(urPlatformCreateWithNativeHandle( + native_handle, adapters[0], &props, &plat)); ASSERT_NE(plat, nullptr); std::string input_platform_name = uur::GetPlatformName(platform); @@ -84,8 +84,8 @@ TEST_F(urPlatformCreateWithNativeHandleTest, InvalidNullPointerPlatform) { for (auto platform : platforms) { ur_native_handle_t native_handle = nullptr; ASSERT_SUCCESS(urPlatformGetNativeHandle(platform, &native_handle)); - ASSERT_EQ_RESULT( - UR_RESULT_ERROR_INVALID_NULL_POINTER, - urPlatformCreateWithNativeHandle(native_handle, nullptr, nullptr)); + ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, + urPlatformCreateWithNativeHandle( + native_handle, adapters[0], nullptr, nullptr)); } } diff --git a/test/conformance/program/program_adapter_hip.match b/test/conformance/program/program_adapter_hip.match index 27bd6831a5..498d19f0b9 100644 --- a/test/conformance/program/program_adapter_hip.match +++ b/test/conformance/program/program_adapter_hip.match @@ -1,30 +1,16 @@ urProgramBuildTest.BuildFailure/AMD_HIP_BACKEND___{{.*}}_ +# HIP hasn't implemented urProgramCreateWithNativeHandleTest {{OPT}}urProgramCreateWithNativeHandleTest.Success/AMD_HIP_BACKEND___{{.*}}_ -{{OPT}}urProgramCreateWithNativeHandleTest.InvalidNullHandleContext/AMD_HIP_BACKEND___{{.*}}_ -{{OPT}}urProgramCreateWithNativeHandleTest.InvalidNullPointerProgram/AMD_HIP_BACKEND___{{.*}}_ -{{OPT}}urProgramGetBuildInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_BINARY_TYPE -{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_STATUS -{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_OPTIONS 
-{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_LOG -{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_BINARY_TYPE -{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleDevice/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_STATUS -{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleDevice/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_OPTIONS -{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleDevice/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_LOG -{{OPT}}urProgramGetBuildInfoTest.InvalidNullHandleDevice/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_BUILD_INFO_BINARY_TYPE +# This test flakily fails {{OPT}}urProgramGetBuildInfoSingleTest.LogIsNullTerminated/AMD_HIP_BACKEND___{{.*}}_ -{{OPT}}urProgramGetInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_NUM_KERNELS -{{OPT}}urProgramGetInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_KERNEL_NAMES -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_REFERENCE_COUNT -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_CONTEXT -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_NUM_DEVICES -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_DEVICES -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_SOURCE -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_BINARY_SIZES -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_BINARIES -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_NUM_KERNELS -{{OPT}}urProgramGetInfoTest.InvalidNullHandleProgram/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_KERNEL_NAMES +# HIP doesn't expose kernel numbers or 
names +urProgramGetInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_NUM_KERNELS +urProgramGetInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_PROGRAM_INFO_KERNEL_NAMES + +# HIP hasn't implemented urProgramLink {{OPT}}urProgramLinkTest.Success/AMD_HIP_BACKEND___{{.*}}_ -{{OPT}}urProgramSetSpecializationConstantsTest.Success/AMD_HIP_BACKEND___{{.*}}_ -{{OPT}}urProgramSetSpecializationConstantsTest.UseDefaultValue/AMD_HIP_BACKEND___{{.*}}_ -{{OPT}}urProgramSetMultipleSpecializationConstantsTest.MultipleCalls/AMD_HIP_BACKEND___{{.*}}_ -{{OPT}}urProgramSetMultipleSpecializationConstantsTest.SingleCall/AMD_HIP_BACKEND___{{.*}}_ + +urProgramSetSpecializationConstantsTest.Success/AMD_HIP_BACKEND___{{.*}}_ +urProgramSetSpecializationConstantsTest.UseDefaultValue/AMD_HIP_BACKEND___{{.*}}_ +urProgramSetMultipleSpecializationConstantsTest.MultipleCalls/AMD_HIP_BACKEND___{{.*}}_ +urProgramSetMultipleSpecializationConstantsTest.SingleCall/AMD_HIP_BACKEND___{{.*}}_ diff --git a/test/conformance/queue/queue_adapter_hip.match b/test/conformance/queue/queue_adapter_hip.match index d39b30aa73..e69de29bb2 100644 --- a/test/conformance/queue/queue_adapter_hip.match +++ b/test/conformance/queue/queue_adapter_hip.match @@ -1,2 +0,0 @@ -urQueueGetInfoTestWithInfoParam.Success/AMD_HIP_BACKEND___{{.*}}___UR_QUEUE_INFO_DEVICE_DEFAULT -urQueueGetInfoTestWithInfoParam.Success/AMD_HIP_BACKEND___{{.*}}___UR_QUEUE_INFO_SIZE diff --git a/test/conformance/queue/queue_adapter_native_cpu.match b/test/conformance/queue/queue_adapter_native_cpu.match index a4c2c502a4..90bc25b23b 100644 --- a/test/conformance/queue/queue_adapter_native_cpu.match +++ b/test/conformance/queue/queue_adapter_native_cpu.match @@ -26,11 +26,16 @@ urQueueFinishTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urQueueFlushTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urQueueGetInfoTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_CONTEXT 
urQueueGetInfoTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_DEVICE -urQueueGetInfoTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_DEVICE_DEFAULT urQueueGetInfoTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_FLAGS urQueueGetInfoTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_REFERENCE_COUNT -urQueueGetInfoTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_SIZE urQueueGetInfoTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_EMPTY +urQueueGetInfoDeviceQueueTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_CONTEXT +urQueueGetInfoDeviceQueueTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_DEVICE +urQueueGetInfoDeviceQueueTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_DEVICE_DEFAULT +urQueueGetInfoDeviceQueueTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_FLAGS +urQueueGetInfoDeviceQueueTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_REFERENCE_COUNT +urQueueGetInfoDeviceQueueTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_SIZE +urQueueGetInfoDeviceQueueTestWithInfoParam.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU___UR_QUEUE_INFO_EMPTY urQueueGetInfoTest.InvalidSizeSmall/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urQueueRetainTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ urQueueReleaseTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU_ diff --git a/test/conformance/queue/queue_adapter_opencl.match b/test/conformance/queue/queue_adapter_opencl.match index a374e0b4b1..e69de29bb2 100644 --- a/test/conformance/queue/queue_adapter_opencl.match +++ b/test/conformance/queue/queue_adapter_opencl.match @@ -1,2 +0,0 @@ -urQueueGetInfoTestWithInfoParam.Success/Intel_R__OpenCL___{{.*}}___UR_QUEUE_INFO_DEVICE_DEFAULT 
-urQueueGetInfoTestWithInfoParam.Success/Intel_R__OpenCL___{{.*}}___UR_QUEUE_INFO_SIZE diff --git a/test/conformance/queue/urQueueGetInfo.cpp b/test/conformance/queue/urQueueGetInfo.cpp index 9704c72f64..9ffb97e1ff 100644 --- a/test/conformance/queue/urQueueGetInfo.cpp +++ b/test/conformance/queue/urQueueGetInfo.cpp @@ -20,10 +20,9 @@ using urQueueGetInfoTestWithInfoParam = UUR_TEST_SUITE_P(urQueueGetInfoTestWithInfoParam, ::testing::Values(UR_QUEUE_INFO_CONTEXT, UR_QUEUE_INFO_DEVICE, - UR_QUEUE_INFO_DEVICE_DEFAULT, UR_QUEUE_INFO_FLAGS, UR_QUEUE_INFO_REFERENCE_COUNT, - UR_QUEUE_INFO_SIZE, UR_QUEUE_INFO_EMPTY), + UR_QUEUE_INFO_EMPTY), uur::deviceTestWithParamPrinter); TEST_P(urQueueGetInfoTestWithInfoParam, Success) { @@ -70,6 +69,65 @@ TEST_P(urQueueGetInfoTestWithInfoParam, Success) { } } +struct urQueueGetInfoDeviceQueueTestWithInfoParam + : public uur::urContextTestWithParam { + void SetUp() { + urContextTestWithParam::SetUp(); + ur_queue_flags_t deviceQueueCapabilities; + ASSERT_SUCCESS( + urDeviceGetInfo(device, UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES, + sizeof(deviceQueueCapabilities), + &deviceQueueCapabilities, nullptr)); + if (!deviceQueueCapabilities) { + GTEST_SKIP() << "Queue on device is not supported."; + } + ASSERT_SUCCESS( + urQueueCreate(context, device, &queueProperties, &queue)); + } + + void TearDown() { + if (queue) { + ASSERT_SUCCESS(urQueueRelease(queue)); + } + urContextTestWithParam::TearDown(); + } + + ur_queue_handle_t queue = nullptr; + ur_queue_properties_t queueProperties = { + UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr, + UR_QUEUE_FLAG_ON_DEVICE | UR_QUEUE_FLAG_ON_DEVICE_DEFAULT | + UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE}; +}; + +UUR_TEST_SUITE_P(urQueueGetInfoDeviceQueueTestWithInfoParam, + ::testing::Values(UR_QUEUE_INFO_CONTEXT, UR_QUEUE_INFO_DEVICE, + UR_QUEUE_INFO_DEVICE_DEFAULT, + UR_QUEUE_INFO_FLAGS, + UR_QUEUE_INFO_REFERENCE_COUNT, + UR_QUEUE_INFO_SIZE, UR_QUEUE_INFO_EMPTY), + uur::deviceTestWithParamPrinter); + 
+TEST_P(urQueueGetInfoDeviceQueueTestWithInfoParam, Success) { + ur_queue_info_t info_type = getParam(); + size_t size = 0; + auto result = urQueueGetInfo(queue, info_type, 0, nullptr, &size); + + if (result == UR_RESULT_SUCCESS) { + ASSERT_NE(size, 0); + + if (const auto expected_size = queue_info_size_map.find(info_type); + expected_size != queue_info_size_map.end()) { + ASSERT_EQ(expected_size->second, size); + } + + std::vector data(size); + ASSERT_SUCCESS( + urQueueGetInfo(queue, info_type, size, data.data(), nullptr)); + } else { + ASSERT_EQ_RESULT(result, UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION); + } +} + using urQueueGetInfoTest = uur::urQueueTest; UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urQueueGetInfoTest); diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h index cf64aa13d3..bcdc94d524 100644 --- a/test/conformance/testing/include/uur/fixtures.h +++ b/test/conformance/testing/include/uur/fixtures.h @@ -406,7 +406,7 @@ template struct urQueueTestWithParam : urContextTestWithParam { } ur_queue_properties_t queue_properties = { UR_STRUCTURE_TYPE_QUEUE_PROPERTIES, nullptr, 0}; - ur_queue_handle_t queue; + ur_queue_handle_t queue = nullptr; }; template @@ -1098,6 +1098,11 @@ std::string deviceTestWithParamPrinter( return uur::GetPlatformAndDeviceName(device) + "__" + ss.str(); } +template <> +std::string deviceTestWithParamPrinter( + const ::testing::TestParamInfo< + std::tuple> &info); + // Helper struct to allow bool param tests with meaningful names. 
struct BoolTestParam { std::string name; diff --git a/test/conformance/testing/include/uur/utils.h b/test/conformance/testing/include/uur/utils.h index 34e5f7768b..e9a1efce8e 100644 --- a/test/conformance/testing/include/uur/utils.h +++ b/test/conformance/testing/include/uur/utils.h @@ -189,9 +189,13 @@ inline std::string GetPlatformName(ur_platform_handle_t hPlatform) { } inline std::string GetDeviceName(ur_device_handle_t device) { - std::string device_name; + std::string device_name, device_uuid; GetDeviceInfo(device, UR_DEVICE_INFO_NAME, device_name); - return GTestSanitizeString(device_name); + GetDeviceInfo(device, UR_DEVICE_INFO_UUID, device_uuid); + if (!device_uuid.empty()) { + device_uuid += "____"; + } + return GTestSanitizeString(device_name + device_uuid); } inline std::string GetPlatformAndDeviceName(ur_device_handle_t device) { diff --git a/test/conformance/testing/source/fixtures.cpp b/test/conformance/testing/source/fixtures.cpp index a0349181eb..bdb80c60be 100644 --- a/test/conformance/testing/source/fixtures.cpp +++ b/test/conformance/testing/source/fixtures.cpp @@ -39,4 +39,18 @@ std::string deviceTestWithParamPrinter( ss << addr_mode << "_" << filter_mode; return uur::GetPlatformAndDeviceName(device) + "__" + ss.str(); } + +template <> +std::string deviceTestWithParamPrinter( + const ::testing::TestParamInfo< + std::tuple> &info) { + auto device = std::get<0>(info.param); + auto param = std::get<1>(info.param); + auto ChannelOrder = param.channelOrder; + auto ChannelType = param.channelType; + + std::stringstream ss; + ss << ChannelOrder << "__" << ChannelType; + return uur::GetPlatformAndDeviceName(device) + "__" + ss.str(); +} } // namespace uur diff --git a/test/conformance/usm/urUSMFree.cpp b/test/conformance/usm/urUSMFree.cpp index 6dc1f9ffd5..f5502c89a6 100644 --- a/test/conformance/usm/urUSMFree.cpp +++ b/test/conformance/usm/urUSMFree.cpp @@ -42,7 +42,7 @@ TEST_P(urUSMFreeTest, SuccessHostAlloc) { void *ptr = nullptr; size_t 
allocation_size = sizeof(int); ASSERT_SUCCESS( - urUSMHostAlloc(context, nullptr, nullptr, sizeof(int), &ptr)); + urUSMHostAlloc(context, nullptr, nullptr, allocation_size, &ptr)); ur_event_handle_t event = nullptr; uint8_t pattern = 0; diff --git a/test/conformance/usm/usm_adapter_hip.match b/test/conformance/usm/usm_adapter_hip.match index 19b88b8b75..2dfdaf7253 100644 --- a/test/conformance/usm/usm_adapter_hip.match +++ b/test/conformance/usm/usm_adapter_hip.match @@ -21,8 +21,6 @@ urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/AMD_HIP_BACKEND___{{.*}} urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/AMD_HIP_BACKEND___{{.*}}___UsePoolEnabled_64_8 urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/AMD_HIP_BACKEND___{{.*}}___UsePoolEnabled_64_512 urUSMDeviceAllocAlignmentTest.SuccessAlignedAllocations/AMD_HIP_BACKEND___{{.*}}___UsePoolEnabled_64_2048 -urUSMGetMemAllocInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_USM_ALLOC_INFO_BASE_PTR -urUSMGetMemAllocInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_USM_ALLOC_INFO_SIZE urUSMGetMemAllocInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_USM_ALLOC_INFO_POOL urUSMHostAllocTest.Success/AMD_HIP_BACKEND___{{.*}}___UsePoolEnabled urUSMHostAllocTest.SuccessWithDescriptors/AMD_HIP_BACKEND___{{.*}}___UsePoolEnabled