diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 196fffa19c569..01ccc049321dc 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -170,11 +170,16 @@ sycl/test-e2e/LLVMIntrinsicLowering/ @intel/dpcpp-spirv-reviewers # Sanitizer clang/lib/Driver/SanitizerArgs.cpp @intel/dpcpp-sanitizers-review libdevice/include/asan_rtl.hpp @intel/dpcpp-sanitizers-review +libdevice/include/msan_rtl.hpp @intel/dpcpp-sanitizers-review libdevice/include/sanitizer_defs.hpp @intel/dpcpp-sanitizers-review libdevice/sanitizer/ @intel/dpcpp-sanitizers-review llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h @intel/dpcpp-sanitizers-review llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h @intel/dpcpp-sanitizers-review llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerOptions.h @intel/dpcpp-sanitizers-review +llvm/include/llvm/Transforms/Instrumentation/MemorySanitizer.h @intel/dpcpp-sanitizers-review llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @intel/dpcpp-sanitizers-review +llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @intel/dpcpp-sanitizers-review llvm/test/Instrumentation/AddressSanitizer/ @intel/dpcpp-sanitizers-review +llvm/test/Instrumentation/MemorySanitizer/ @intel/dpcpp-sanitizers-review sycl/test-e2e/AddressSanitizer/ @intel/dpcpp-sanitizers-review +sycl/test-e2e/MemorySanitizer/ @intel/dpcpp-sanitizers-review diff --git a/.github/workflows/sycl-containers-igc-dev.yaml b/.github/workflows/sycl-containers-igc-dev.yaml index 8a8f73285ae15..ae2a660a79e93 100644 --- a/.github/workflows/sycl-containers-igc-dev.yaml +++ b/.github/workflows/sycl-containers-igc-dev.yaml @@ -33,7 +33,7 @@ jobs: imagefile: ubuntu2404_intel_drivers tag: devigc build_args: | - "use_latest=false" + "use_unstable_driver=false" "use_igc_dev=true" steps: - name: Checkout diff --git a/.github/workflows/sycl-containers.yaml b/.github/workflows/sycl-containers.yaml index 1c280109f674e..a33839022c952 100644 --- 
a/.github/workflows/sycl-containers.yaml +++ b/.github/workflows/sycl-containers.yaml @@ -43,10 +43,14 @@ jobs: file: ubuntu2404_base tag: latest build_args: "" - - name: Build Ubuntu Docker image + - name: Build Ubuntu 22.04 Docker image file: ubuntu2204_build tag: latest build_args: "" + - name: Build Ubuntu 24.04 Docker image + file: ubuntu2404_build + tag: latest + build_args: "" - name: Build Ubuntu 24.04 oneAPI Docker image file: ubuntu2404_build_oneapi tag: latest @@ -54,18 +58,22 @@ jobs: - name: Intel Drivers Ubuntu 22.04 Docker image file: ubuntu2204_intel_drivers tag: latest - build_args: "use_latest=false" + build_args: "use_unstable_driver=false" + - name: Intel Drivers Ubuntu 24.04 Docker image + file: ubuntu2404_intel_drivers + tag: latest + build_args: "use_unstable_driver=false" - name: Intel Drivers (unstable) Ubuntu 24.04 Docker image file: ubuntu2404_intel_drivers tag: unstable - build_args: "use_latest=true" + build_args: "use_unstable_driver=true" - name: Build + Intel Drivers Ubuntu 22.04 Docker image file: ubuntu2204_intel_drivers tag: alldeps build_args: | base_image=ghcr.io/intel/llvm/ubuntu2204_build base_tag=latest - use_latest=false + use_unstable_driver=false steps: - name: Checkout uses: actions/checkout@v4 @@ -78,7 +86,6 @@ jobs: file: ${{ matrix.file }} username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} - sycl_ci_passwd: ${{ secrets.DOCKER_SUDO_PASSWORD }} tags: | ghcr.io/${{ github.repository }}/${{ matrix.file }}:${{ matrix.tag }}-${{ github.sha }} ghcr.io/${{ github.repository }}/${{ matrix.file }}:${{ matrix.tag }} diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml index cf1a488c87c81..8baa70e8b878b 100644 --- a/.github/workflows/sycl-linux-precommit.yml +++ b/.github/workflows/sycl-linux-precommit.yml @@ -72,6 +72,53 @@ jobs: else echo 'arc_tests="Matrix/"' >> "$GITHUB_OUTPUT" fi + + build_e2e_tests: + needs: [build] + if: ${{ always() && !cancelled() 
&& needs.build.outputs.build_conclusion == 'success' }} + uses: ./.github/workflows/sycl-linux-run-tests.yml + with: + name: Build e2e tests + runner: '["Linux", "build"]' + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest + image_options: -u 1001 + ref: ${{ github.sha }} + merge_ref: '' + sycl_toolchain_artifact: sycl_linux_default + sycl_toolchain_archive: ${{ needs.build.outputs.artifact_archive_name }} + sycl_toolchain_decompress_command: ${{ needs.build.outputs.artifact_decompress_command }} + e2e_testing_mode: 'build-only' + run_prebuilt_e2e_tests: + needs: [build, build_e2e_tests] + if: ${{ always() && !cancelled() && needs.build.outputs.build_conclusion == 'success' }} + strategy: + fail-fast: false + matrix: + include: + - name: Intel + runner: '["Linux", "gen12"]' + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest + image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN + target_devices: level_zero:gpu;opencl:gpu;opencl:cpu + reset_intel_gpu: true + extra_lit_opts: --param gpu-intel-gen12=True + uses: ./.github/workflows/sycl-linux-run-tests.yml + with: + name: ${{ matrix.name }} + runner: ${{ matrix.runner }} + image: ${{ matrix.image }} + image_options: ${{ matrix.image_options }} + target_devices: ${{ matrix.target_devices }} + extra_lit_opts: --param fallback-to-build-if-requires-build-and-run=True ${{ matrix.extra_lit_opts }} + reset_intel_gpu: ${{ matrix.reset_intel_gpu }} + ref: ${{ github.sha }} + merge_ref: '' + sycl_toolchain_artifact: sycl_linux_default + sycl_toolchain_archive: ${{ needs.build.outputs.artifact_archive_name }} + sycl_toolchain_decompress_command: ${{ needs.build.outputs.artifact_decompress_command }} + e2e_binaries_artifact: sycl_e2e_bin_default + e2e_testing_mode: 'run-only' + test: needs: [build, detect_changes, determine_arc_tests] if: ${{ always() && !cancelled() && needs.build.outputs.build_conclusion == 'success' }} @@ -90,16 +137,9 @@ jobs: 
image_options: -u 1001 --device=/dev/dri --device=/dev/kfd target_devices: ext_oneapi_hip:gpu reset_intel_gpu: false - - name: Intel - runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest - image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN - target_devices: level_zero:gpu;opencl:gpu;opencl:cpu - reset_intel_gpu: true - extra_lit_opts: --param gpu-intel-gen12=True - name: E2E tests on Intel Arc A-Series Graphics runner: '["Linux", "arc"]' - image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu;opencl:gpu reset_intel_gpu: true @@ -161,12 +201,12 @@ jobs: include: - name: Intel GEN12 Graphics system runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_extra_opts: --device=/dev/dri reset_intel_gpu: true - name: Intel Arc A-Series Graphics system runner: '["Linux", "arc"]' - image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_extra_opts: --device=/dev/dri reset_intel_gpu: true - name: AMD system diff --git a/.github/workflows/sycl-linux-run-tests.yml b/.github/workflows/sycl-linux-run-tests.yml index 73b2a1f336db7..8e54f90348427 100644 --- a/.github/workflows/sycl-linux-run-tests.yml +++ b/.github/workflows/sycl-linux-run-tests.yml @@ -19,7 +19,7 @@ on: target_devices: type: string - required: True + required: False extra_cmake_args: type: string required: False @@ -59,6 +59,30 @@ on: default: '' required: False + e2e_binaries_artifact: + description: | + By setting this the E2E binaries folder will not be created, rather it + will be downloaded and extracted from the specified artifact. 
When + running tests in `run-only` mode this must be provided. + type: string + default: '' + required: False + e2e_testing_mode: + description: | + Testing mode to run E2E tests in, can be either `full`, `build-only` + or `run-only`. In `build-only` mode an artifact of the E2E binaries + will be uploaded. + type: string + default: 'full' + artifact_suffix: + description: 'Suffix for E2E binaries artifact that is output when in `build-only`.' + type: string + default: 'default' + retention-days: + description: 'E2E binaries artifact retention period.' + type: string + default: 1 + reset_intel_gpu: type: string required: False @@ -86,7 +110,9 @@ on: - '["Linux", "gen12"]' - '["amdgpu"]' - '["Linux", "arc"]' + - '["Linux", "pvc"]' - '["cts-cpu"]' + - '["Linux", "build"]' image: description: | Use option ending with ":build" for AMDGPU, ":latest" for the rest. @@ -142,6 +168,11 @@ on: options: - false - true + e2e_testing_mode: + type: choice + options: + - "full" + - "build-only" permissions: contents: read @@ -270,8 +301,19 @@ jobs: cat /usr/local/lib/igc/IGCTAG.txt fi + - name: Download E2E Binaries + if: inputs.e2e_binaries_artifact != '' + uses: actions/download-artifact@v4 + with: + name: ${{ inputs.e2e_binaries_artifact }} + - name: Extract E2E Binaries + if: inputs.e2e_binaries_artifact != '' + run: | + mkdir build-e2e + tar -I 'zstd' -xf e2e_binaries.tar.zst -C build-e2e + - name: Deduce E2E CMake options - if: inputs.tests_selector == 'e2e' + if: inputs.tests_selector == 'e2e' && inputs.e2e_binaries_artifact == '' id: cmake_opts shell: bash env: @@ -281,14 +323,14 @@ jobs: echo "opts=$CMAKE_EXTRA_ARGS" >> $GITHUB_OUTPUT fi - name: Configure E2E tests - if: inputs.tests_selector == 'e2e' + if: inputs.tests_selector == 'e2e' && inputs.e2e_binaries_artifact == '' run: | - cmake -GNinja -B./build-e2e -S./llvm/sycl/test-e2e -DSYCL_TEST_E2E_TARGETS="${{ inputs.target_devices }}" -DCMAKE_CXX_COMPILER="$(which clang++)" -DLLVM_LIT="$PWD/llvm/llvm/utils/lit/lit.py" 
${{ steps.cmake_opts.outputs.opts }} + cmake -GNinja -B./build-e2e -S./llvm/sycl/test-e2e -DCMAKE_CXX_COMPILER="$(which clang++)" -DLLVM_LIT="$PWD/llvm/llvm/utils/lit/lit.py" ${{ steps.cmake_opts.outputs.opts }} - name: SYCL End-to-end tests shell: bash {0} if: inputs.tests_selector == 'e2e' env: - LIT_OPTS: -v --no-progress-bar --show-unsupported --show-pass --show-xfail --max-time 3600 --time-tests ${{ inputs.extra_lit_opts }} + LIT_OPTS: -v --no-progress-bar --show-unsupported --show-pass --show-xfail --max-time 3600 --time-tests --param test-mode=${{ inputs.e2e_testing_mode }} --param sycl_devices=${{ inputs.target_devices }} ${{ inputs.extra_lit_opts }} run: | ninja -C build-e2e check-sycl-e2e > e2e.log 2>&1 exit_code=$? @@ -375,3 +417,13 @@ jobs: grep 'exit code: [^0]' -r logs >> $GITHUB_STEP_SUMMARY exit $ret + - name: Pack E2E binaries + if: ${{ always() && !cancelled() && inputs.e2e_testing_mode == 'build-only'}} + run: tar -I 'zstd -9' -cf e2e_binaries.tar.zst -C ./build-e2e . 
+ - name: Upload E2E binaries + if: ${{ always() && !cancelled() && inputs.e2e_testing_mode == 'build-only'}} + uses: actions/upload-artifact@v4 + with: + name: sycl_e2e_bin_${{ inputs.artifact_suffix }} + path: e2e_binaries.tar.zst + retention-days: ${{ inputs.retention-days }} diff --git a/.github/workflows/sycl-nightly.yml b/.github/workflows/sycl-nightly.yml index 74129a444d2c7..1c1c5c6d1f398 100644 --- a/.github/workflows/sycl-nightly.yml +++ b/.github/workflows/sycl-nightly.yml @@ -52,7 +52,7 @@ jobs: - name: Intel L0 GPU runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu reset_intel_gpu: true @@ -61,7 +61,7 @@ jobs: - name: Intel OCL GPU runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: opencl:gpu reset_intel_gpu: true @@ -70,35 +70,35 @@ jobs: - name: OCL CPU (AMD) runner: '["Linux", "amdgpu"]' - image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 target_devices: opencl:cpu tests_selector: e2e - name: OCL CPU (Intel/GEN12) runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --privileged --cap-add SYS_ADMIN target_devices: opencl:cpu tests_selector: e2e - name: OCL CPU (Intel/Arc) runner: '["Linux", "arc"]' - image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 target_devices: opencl:cpu tests_selector: e2e - 
name: SYCL-CTS on OCL CPU runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN target_devices: opencl:cpu tests_selector: cts - name: SYCL-CTS on L0 gen12 runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu tests_selector: cts @@ -229,7 +229,7 @@ jobs: username: ${{ github.repository_owner }} password: ${{ secrets.GITHUB_TOKEN }} build-args: | - base_image=ghcr.io/intel/llvm/ubuntu2204_intel_drivers + base_image=ghcr.io/intel/llvm/ubuntu2404_intel_drivers base_tag=latest tags: | ghcr.io/${{ github.repository }}/sycl_ubuntu2204_nightly:${{ github.sha }} diff --git a/.github/workflows/sycl-post-commit.yml b/.github/workflows/sycl-post-commit.yml index 89e293736a1bd..816cb313a1284 100644 --- a/.github/workflows/sycl-post-commit.yml +++ b/.github/workflows/sycl-post-commit.yml @@ -60,6 +60,21 @@ jobs: image_options: -u 1001 --device=/dev/dri --device=/dev/kfd target_devices: ext_oneapi_hip:gpu reset_intel_gpu: false + - name: E2E tests on Intel Ponte Vecchio GPU + runner: '["Linux", "pvc"]' + env: '{"LIT_FILTER_OUT":"ESIMD/unified_memory_api/"}' + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest + image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN + target_devices: ext_oneapi_level_zero:gpu;opencl:gpu + extra_lit_opts: -j 50 + - name: E2E tests with dev igc on Intel Ponte Vecchio GPU + runner: '["Linux", "pvc"]' + env: '{"LIT_FILTER_OUT":"ESIMD/unified_memory_api/"}' + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:devigc + image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN + target_devices: ext_oneapi_level_zero:gpu;opencl:gpu + use_igc_dev: 
true + extra_lit_opts: -j 50 # Performance tests below. Specifics: # - only run performance tests (use LIT_FILTER env) # - ask llvm-lit to show all the output, even for PASS (-a) @@ -82,7 +97,7 @@ jobs: with: name: ${{ matrix.name }} runner: ${{ matrix. runner }} - image: ${{ matrix.image || 'ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest' }} + image: ${{ matrix.image || 'ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest' }} image_options: ${{ matrix.image_options || '-u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN' }} target_devices: ${{ matrix.target_devices || 'level_zero:gpu' }} reset_intel_gpu: ${{ matrix.reset_intel_gpu }} diff --git a/.github/workflows/sycl-rel-nightly.yml b/.github/workflows/sycl-rel-nightly.yml index 01ecd64882b74..3563b14bcb88c 100644 --- a/.github/workflows/sycl-rel-nightly.yml +++ b/.github/workflows/sycl-rel-nightly.yml @@ -54,14 +54,14 @@ jobs: include: - name: AMD/HIP runner: '["Linux", "amdgpu"]' - image: ghcr.io/intel/llvm/ubuntu2204_build:latest + image: ghcr.io/intel/llvm/ubuntu2404_build:latest image_options: -u 1001 --device=/dev/dri --device=/dev/kfd target_devices: ext_oneapi_hip:gpu tests_selector: e2e - name: Intel L0 GPU runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu reset_intel_gpu: true @@ -70,7 +70,7 @@ jobs: - name: Intel OCL GPU runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: opencl:gpu reset_intel_gpu: true @@ -79,21 +79,21 @@ jobs: - name: Intel OCL CPU runner: '["Linux", "gen12"]' - image: 
ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --privileged --cap-add SYS_ADMIN target_devices: opencl:cpu tests_selector: e2e - name: SYCL-CTS on OCL CPU runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN target_devices: opencl:cpu tests_selector: cts - name: SYCL-CTS on L0 gen12 runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu tests_selector: cts diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index ee36cb414175c..c32cb7fb2811a 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -823,8 +823,10 @@ static bool isValidSYCLTriple(llvm::Triple T) { T.getOS() == llvm::Triple::CUDA && !T.hasEnvironment()) return true; - // AMDGCN is valid for SYCL - if (T.isAMDGCN()) + // 'amdgcn-amd-amdhsa' is the valid SYCL triple for AMD GPUs. + if (T.getArch() == llvm::Triple::amdgcn && + T.getVendor() == llvm::Triple::AMD && T.getOS() == llvm::Triple::AMDHSA && + !T.hasEnvironment()) return true; // Check for invalid SYCL device triple values. 
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 26ead5783e8e9..df98170dc528a 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -5669,9 +5669,27 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-Wno-sycl-strict"); } - // Set O2 optimization level by default - if (!Args.getLastArg(options::OPT_O_Group)) - CmdArgs.push_back("-O2"); + // If no optimization controlling flags (-O) are provided, check if + // any debug information flags(-g) are passed. + // "-fintelfpga" implies "-g" and we preserve the default optimization for + // this flow(-O2). + // if "-g" is explicitly passed from the command-line, set default + // optimization to -O0. + + if (!Args.hasArgNoClaim(options::OPT_O_Group, options::OPT__SLASH_O)) { + StringRef OptLevel = "-O2"; + const Arg *DebugInfoGroup = Args.getLastArg(options::OPT_g_Group); + // -fintelfpga -g case + if ((Args.hasArg(options::OPT_fintelfpga) && + Args.hasMultipleArgs(options::OPT_g_Group)) || + /* -fsycl -g case */ (!Args.hasArg(options::OPT_fintelfpga) && + DebugInfoGroup)) { + if (!DebugInfoGroup->getOption().matches(options::OPT_g0)) { + OptLevel = "-O0"; + } + } + CmdArgs.push_back(OptLevel.data()); + } // Add the integration header option to generate the header. StringRef Header(D.getIntegrationHeader(Input.getBaseInput())); @@ -10935,7 +10953,25 @@ static std::string getSYCLPostLinkOptimizationLevel(const ArgList &Args) { [=](char c) { return c == S[0]; })) return std::string("-O") + S[0]; } - + // If no optimization controlling flags (-O) are provided, check if + // any debug information flags(-g) are passed. + // "-fintelfpga" implies "-g" and we preserve the default optimization for + // this flow(-O2). + // if "-g" is explicitly passed from the command-line, set default + // optimization to -O0. 
+ + if (!Args.hasArg(options::OPT_O_Group)) { + const Arg *DebugInfoGroup = Args.getLastArg(options::OPT_g_Group); + // -fintelfpga -g case + if ((Args.hasArg(options::OPT_fintelfpga) && + Args.hasMultipleArgs(options::OPT_g_Group)) || + /* -fsycl -g case */ + (!Args.hasArg(options::OPT_fintelfpga) && DebugInfoGroup)) { + if (!DebugInfoGroup->getOption().matches(options::OPT_g0)) { + return "-O0"; + } + } + } // The default for SYCL device code optimization return "-O2"; } diff --git a/clang/test/Driver/sycl-device-optimizations.cpp b/clang/test/Driver/sycl-device-optimizations.cpp index 7741b9586aff4..dfe7ca354c09f 100644 --- a/clang/test/Driver/sycl-device-optimizations.cpp +++ b/clang/test/Driver/sycl-device-optimizations.cpp @@ -53,3 +53,59 @@ // RUN: %clang -### -fsycl --offload-new-driver %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHECK-NO-THRESH %s // CHECK-NO-THRESH-NOT: "-mllvm" "-inline-threshold + +/// Check that optimizations for sycl device are disabled with -g passed: +// RUN: %clang -### -fsycl -g %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-DEBUG %s +// RUN: %clang_cl -### -fsycl -g %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-DEBUG %s +// CHECK-DEBUG: clang{{.*}} "-fsycl-is-device{{.*}}" "-O0" +// CHECK-DEBUG: sycl-post-link{{.*}} "-O0" +// CHECK-DEBUG-NOT: "-O2" + +/// Check that optimizations for sycl device are enabled with -g and O2 passed: +// RUN: %clang -### -fsycl -O2 -g %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-G-O2 %s +// For clang_cl, -O2 maps to -O3 +// RUN: %clang_cl -### -fsycl -O2 -g %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-G-O3 %s +// CHECK-G-O2: clang{{.*}} "-fsycl-is-device{{.*}}" "-O2" +// CHECK-G-O2: sycl-post-link{{.*}} "-O2" +// CHECK-G-O2-NOT: "-O0" +// CHECK-G-O3: clang{{.*}} "-fsycl-is-device{{.*}}" "-O3" +// CHECK-G-O3: sycl-post-link{{.*}} "-O3" +// CHECK-G-O3-NOT: "-O0" + +/// Check that -O2 is passed for FPGA +// RUN: %clang -### -fintelfpga -fsycl-early-optimizations %s 2>&1 \ +// RUN: | 
FileCheck -check-prefix=CHECK-FPGA %s +// RUN: %clang_cl -### -fintelfpga -fsycl-early-optimizations %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-FPGA %s +// CHECK-FPGA: clang{{.*}} "-fsycl-is-device{{.*}}" "-O2" +// CHECK-FPGA: sycl-post-link{{.*}} "-O2" +// CHECK-FPGA-NOT: "-O0" + +/// Check that -O2 preserves for FPGA when it's explicitly passed +// RUN: %clang -### -O2 -fintelfpga -fsycl-early-optimizations %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-FPGA-O2 %s +// For clang_cl, -O2 maps to -O3 +// RUN: %clang_cl -### -O2 -fintelfpga -fsycl-early-optimizations %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-FPGA-O3 %s +// CHECK-FPGA-O2: clang{{.*}} "-fsycl-is-device{{.*}}" "-O2" +// CHECK-FPGA-O2: sycl-post-link{{.*}} "-O2" +// CHECK-FPGA-O2-NOT: "-O0" +// CHECK-FPGA-O3: clang{{.*}} "-fsycl-is-device{{.*}}" "-O3" +// CHECK-FPGA-O3: sycl-post-link{{.*}} "-O3" +// CHECK-FPGA-O3-NOT: "-O0" + +/// Check that -O0 is passed for FPGA when -g is explicitly passed +// RUN: %clang -### -fintelfpga -g %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-FPGA-O0 %s +// RUN: %clang_cl -### -fintelfpga -g %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-FPGA-O0 %s +// CHECK-FPGA-O0: clang{{.*}} "-fsycl-is-device{{.*}}" "-O0" +// CHECK-FPGA-O0: sycl-post-link{{.*}} "-O0" +// CHECK-FPGA-O0-NOT: "-O2" + + + diff --git a/clang/test/Driver/sycl-offload-intelfpga.cpp b/clang/test/Driver/sycl-offload-intelfpga.cpp index 21d62f8c29b92..a00579a5b5fa7 100644 --- a/clang/test/Driver/sycl-offload-intelfpga.cpp +++ b/clang/test/Driver/sycl-offload-intelfpga.cpp @@ -228,8 +228,8 @@ // RUN: | FileCheck -check-prefix=CHK-TOOLS-IMPLIED-OPTS %s // RUN: %clang_cl -### -fintelfpga -Zi -Od -Xs "-DFOO1 -DFOO2" -Xshardware %s 2>&1 \ // RUN: | FileCheck -check-prefix=CHK-TOOLS-IMPLIED-OPTS %s -// CHK-TOOLS-IMPLIED-OPTS-NOT: clang{{.*}} "-fsycl-is-device"{{.*}} "-O0" -// CHK-TOOLS-IMPLIED-OPTS: sycl-post-link{{.*}} "-O2" +// CHK-TOOLS-IMPLIED-OPTS: clang{{.*}} 
"-fsycl-is-device"{{.*}} "-fno-sycl-early-optimizations"{{.*}} "-O0" +// CHK-TOOLS-IMPLIED-OPTS: sycl-post-link{{.*}} "-O0" // CHK-TOOLS-IMPLIED-OPTS: aoc{{.*}} "-g" "-DFOO1" "-DFOO2" /// shared objects should not be checked for FPGA contents diff --git a/clang/test/Driver/sycl-offload-new-driver.c b/clang/test/Driver/sycl-offload-new-driver.c index dd656192b80f3..e03709f97cb2a 100644 --- a/clang/test/Driver/sycl-offload-new-driver.c +++ b/clang/test/Driver/sycl-offload-new-driver.c @@ -163,12 +163,12 @@ // WRAPPER_OPTIONS_BACKEND_AOT-SAME: "--cpu-tool-arg=-backend-cpu-opt" /// Verify arch settings for nvptx and amdgcn targets -// RUN: %clangxx -fsycl -### -fsycl-targets=amdgcn-amd-gpu -fno-sycl-libspirv \ +// RUN: %clangxx -fsycl -### -fsycl-targets=amdgcn-amd-amdhsa -fno-sycl-libspirv \ // RUN: -nocudalib --offload-new-driver \ -// RUN: -Xsycl-target-backend=amdgcn-amd-gpu --offload-arch=gfx600 \ +// RUN: -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx600 \ // RUN: %s 2>&1 \ // RUN: | FileCheck -check-prefix AMD_ARCH %s -// AMD_ARCH: clang-offload-packager{{.*}} "--image=file={{.*}},triple=amdgcn-amd-gpu,arch=gfx600,kind=sycl,compile-opts=--offload-arch=gfx600" +// AMD_ARCH: clang-offload-packager{{.*}} "--image=file={{.*}},triple=amdgcn-amd-amdhsa,arch=gfx600,kind=sycl,compile-opts=--offload-arch=gfx600" // RUN: %clangxx -fsycl -### -fsycl-targets=nvptx64-nvidia-cuda \ // RUN: -fno-sycl-libspirv -nocudalib --offload-new-driver %s 2>&1 \ diff --git a/clang/test/Driver/sycl-oneapi-gpu-amdgpu.cpp b/clang/test/Driver/sycl-oneapi-gpu-amdgpu.cpp index e6c8530c53130..4258f5b1f04cb 100644 --- a/clang/test/Driver/sycl-oneapi-gpu-amdgpu.cpp +++ b/clang/test/Driver/sycl-oneapi-gpu-amdgpu.cpp @@ -136,6 +136,27 @@ // RUN: FileCheck %s --check-prefix=BAD_AMD_INPUT // BAD_AMD_INPUT: error: SYCL target is invalid: 'amd_gpu_bad' +// Check the SYCL triple for AMD GPUs. 
+// RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amdgcn-amd-amdhsa -### \ +// RUN: -Xsycl-target-backend --offload-arch=gfx908 \ +// RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc %s 2>&1 | \ +// RUN: FileCheck %s --check-prefixes=AMD-TRIPLE +// AMD-TRIPLE: clang{{.*}} "-triple" "amdgcn-amd-amdhsa" + +// Check if SYCL triples with 'Environment' component are rejected for AMD GPUs. +// RUN: not %clangxx -c -fsycl -nogpulib -fsycl-targets=amdgcn-amd-amdhsa-sycl -### %s 2>&1 | \ + // RUN: FileCheck %s --check-prefix=BAD_TARGET_TRIPLE_ENV +// RUN: not %clang_cl -c -fsycl -fsycl-targets=amdgcn-amd-amdhsa-sycl -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=BAD_TARGET_TRIPLE_ENV +// BAD_TARGET_TRIPLE_ENV: error: SYCL target is invalid: 'amdgcn-amd-amdhsa-sycl' + +// Check for invalid SYCL triple for AMD GPUs. +// RUN: not %clangxx -c -fsycl -nogpulib -fsycl-targets=amdgcn -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=BAD_TARGET_TRIPLE +// RUN: not %clang_cl -c -fsycl -fsycl-targets=amdgcn-amd -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefix=BAD_TARGET_TRIPLE +// BAD_TARGET_TRIPLE: error: SYCL target is invalid: 'amdgcn{{.*}}' + /// Test for proper creation of fat object // RUN: %clangxx -fsycl -nogpulib -fsycl-targets=amd_gpu_gfx700 \ // RUN: -fsycl-libspirv-path=%S/Inputs/SYCL/libspirv.bc \ diff --git a/clang/tools/clang-offload-bundler/CMakeLists.txt b/clang/tools/clang-offload-bundler/CMakeLists.txt index e75292ab2a5bd..a466cf31d82db 100644 --- a/clang/tools/clang-offload-bundler/CMakeLists.txt +++ b/clang/tools/clang-offload-bundler/CMakeLists.txt @@ -18,6 +18,9 @@ add_clang_tool(clang-offload-bundler intrinsics_gen ) +setup_host_tool(clang-offload-bundler CLANG_OFFLOAD_BUNDLER + clang-offload-bundler_exe clang-offload-bundler_target) + set(CLANG_OFFLOAD_BUNDLER_LIB_DEPS clangBasic clangDriver diff --git a/clang/tools/clang-offload-packager/CMakeLists.txt b/clang/tools/clang-offload-packager/CMakeLists.txt index cd7e1b0181e2a..3518788c55af1 
100644 --- a/clang/tools/clang-offload-packager/CMakeLists.txt +++ b/clang/tools/clang-offload-packager/CMakeLists.txt @@ -12,6 +12,9 @@ add_clang_tool(clang-offload-packager ${tablegen_deps} ) +setup_host_tool(clang-offload-packager CLANG_OFFLOAD_PACKAGER_EXE + clang-offload-packager_exe clang-offload-packager_target) + clang_target_link_libraries(clang-offload-packager PRIVATE clangBasic diff --git a/devops/actions/build_container/action.yml b/devops/actions/build_container/action.yml index 8b992b5a2c050..5ff5f3e209bff 100644 --- a/devops/actions/build_container/action.yml +++ b/devops/actions/build_container/action.yml @@ -21,9 +21,6 @@ inputs: file: description: "Dockerfile" required: true - sycl_ci_passwd: - description: "Password to assign to sycl_ci user within a container" - required: true runs: using: "composite" @@ -44,4 +41,3 @@ runs: file: ${{ github.workspace }}/devops/containers/${{ inputs.file }}.Dockerfile secrets: | github_token=${{ github.token }} - sycl_ci_passwd=${{ inputs.sycl_ci_passwd }} diff --git a/devops/containers/ubuntu2204_base.Dockerfile b/devops/containers/ubuntu2204_base.Dockerfile index 07bb343cae93c..d08b6025e9908 100644 --- a/devops/containers/ubuntu2204_base.Dockerfile +++ b/devops/containers/ubuntu2204_base.Dockerfile @@ -8,25 +8,14 @@ USER root COPY scripts/install_build_tools.sh /install.sh RUN /install.sh -# By default Ubuntu sets an arbitrary UID value, that is different from host -# system. When CI passes default UID value of 1001, some of LLVM tools fail to -# discover user home directory and fail a few LIT tests. Fixes UID and GID to -# 1001, that is used as default by GitHub Actions. -RUN groupadd -g 1001 sycl && useradd sycl -u 1001 -g 1001 -m -s /bin/bash -# Add sycl user to video/irc groups so that it can access GPU -RUN usermod -aG video sycl -RUN usermod -aG irc sycl - -# group 109 is required for sycl user to access PVC card. 
-RUN groupadd -g 109 render -RUN usermod -aG render sycl - -# Allow sycl user to run as sudo -RUN echo "sycl ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers +COPY scripts/create-sycl-user.sh /user-setup.sh +RUN /user-setup.sh COPY actions/cached_checkout /actions/cached_checkout COPY actions/cleanup /actions/cleanup COPY scripts/docker_entrypoint.sh /docker_entrypoint.sh COPY scripts/install_drivers.sh /opt/install_drivers.sh +USER sycl + ENTRYPOINT ["/docker_entrypoint.sh"] diff --git a/devops/containers/ubuntu2204_build.Dockerfile b/devops/containers/ubuntu2204_build.Dockerfile index 313b455dbc25b..1aa814aaa775c 100644 --- a/devops/containers/ubuntu2204_build.Dockerfile +++ b/devops/containers/ubuntu2204_build.Dockerfile @@ -24,23 +24,18 @@ gpg --dearmor | tee /etc/apt/keyrings/rocm.gpg > /dev/null && \ # Add rocm repo echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/6.1.1 jammy main" \ | tee --append /etc/apt/sources.list.d/rocm.list && \ -printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | tee /etc/apt/preferences.d/rocm-pin-600 && \ -apt update +printf 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | tee /etc/apt/preferences.d/rocm-pin-600 # Install the kernel driver -RUN apt install -yqq rocm-dev && \ +RUN apt update && apt install -yqq rocm-dev && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -# By default Ubuntu sets an arbitrary UID value, that is different from host -# system. When CI passes default UID value of 1001, some of LLVM tools fail to -# discover user home directory and fail a few LIT tests. Fixes UID and GID to -# 1001, that is used as default by GitHub Actions. 
-RUN groupadd -g 1001 sycl && useradd sycl -u 1001 -g 1001 -m -s /bin/bash -# Add sycl user to video/irc groups so that it can access GPU -RUN usermod -aG video sycl -RUN usermod -aG irc sycl +COPY scripts/create-sycl-user.sh /user-setup.sh +RUN /user-setup.sh COPY scripts/docker_entrypoint.sh /docker_entrypoint.sh +USER sycl + ENTRYPOINT ["/docker_entrypoint.sh"] diff --git a/devops/containers/ubuntu2204_intel_drivers.Dockerfile b/devops/containers/ubuntu2204_intel_drivers.Dockerfile index fb018f195a48f..58ec09bb5feb2 100644 --- a/devops/containers/ubuntu2204_intel_drivers.Dockerfile +++ b/devops/containers/ubuntu2204_intel_drivers.Dockerfile @@ -5,7 +5,9 @@ FROM $base_image:$base_tag ENV DEBIAN_FRONTEND=noninteractive -ARG use_latest=true +ARG use_unstable_driver=true + +USER root RUN apt update && apt install -yqq wget @@ -16,7 +18,7 @@ COPY dependencies.json / RUN mkdir /runtimes ENV INSTALL_LOCATION=/runtimes RUN --mount=type=secret,id=github_token \ - if [ "$use_latest" = "true" ]; then \ + if [ "$use_unstable_driver" = "true" ]; then \ install_driver_opt=" --use-latest"; \ else \ install_driver_opt=" dependencies.json"; \ @@ -25,5 +27,7 @@ RUN --mount=type=secret,id=github_token \ COPY scripts/drivers_entrypoint.sh /drivers_entrypoint.sh +USER sycl_ci + ENTRYPOINT ["/bin/bash", "/drivers_entrypoint.sh"] diff --git a/devops/containers/ubuntu2204_preinstalled.Dockerfile b/devops/containers/ubuntu2204_preinstalled.Dockerfile index 4d39cca4d6baa..18ec7de01acb3 100644 --- a/devops/containers/ubuntu2204_preinstalled.Dockerfile +++ b/devops/containers/ubuntu2204_preinstalled.Dockerfile @@ -3,6 +3,8 @@ ARG base_image=ghcr.io/intel/llvm/ubuntu2204_intel_drivers FROM $base_image:$base_tag +USER root + COPY scripts/drivers_entrypoint.sh /drivers_entrypoint.sh RUN mkdir -p /opt/sycl ADD sycl_linux.tar.gz /opt/sycl/ @@ -10,5 +12,7 @@ ADD sycl_linux.tar.gz /opt/sycl/ ENV PATH /opt/sycl/bin:$PATH ENV LD_LIBRARY_PATH /opt/sycl/lib:$LD_LIBRARY_PATH +USER sycl_ci + ENTRYPOINT 
["/bin/bash", "/drivers_entrypoint.sh"] diff --git a/devops/containers/ubuntu2404_base.Dockerfile b/devops/containers/ubuntu2404_base.Dockerfile index c30e562c630e8..7af9ccfec1e5f 100644 --- a/devops/containers/ubuntu2404_base.Dockerfile +++ b/devops/containers/ubuntu2404_base.Dockerfile @@ -8,25 +8,14 @@ USER root COPY scripts/install_build_tools.sh /install.sh RUN /install.sh -# By default Ubuntu sets an arbitrary UID value, that is different from host -# system. When CI passes default UID value of 1001, some of LLVM tools fail to -# discover user home directory and fail a few LIT tests. Fixes UID and GID to -# 1001, that is used as default by GitHub Actions. -RUN groupadd -g 1001 sycl && useradd sycl -u 1001 -g 1001 -m -s /bin/bash -# Add sycl user to video/irc groups so that it can access GPU -RUN usermod -aG video sycl -RUN usermod -aG irc sycl - -# group 109 is required for sycl user to access PVC card. -RUN groupadd -g 109 render -RUN usermod -aG render sycl - -# Allow sycl user to run as sudo -RUN echo "sycl ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers +COPY scripts/create-sycl-user.sh /user-setup.sh +RUN /user-setup.sh COPY actions/cached_checkout /actions/cached_checkout COPY actions/cleanup /actions/cleanup COPY scripts/docker_entrypoint.sh /docker_entrypoint.sh COPY scripts/install_drivers.sh /opt/install_drivers.sh +USER sycl + ENTRYPOINT ["/docker_entrypoint.sh"] diff --git a/devops/containers/ubuntu2404_build.Dockerfile b/devops/containers/ubuntu2404_build.Dockerfile new file mode 100644 index 0000000000000..5391030df006a --- /dev/null +++ b/devops/containers/ubuntu2404_build.Dockerfile @@ -0,0 +1,42 @@ +FROM nvidia/cuda:12.6.3-devel-ubuntu24.04 + +ENV DEBIAN_FRONTEND=noninteractive + +USER root + +# Install SYCL prerequisites +COPY scripts/install_build_tools.sh /install.sh +RUN /install.sh + +SHELL ["/bin/bash", "-ec"] + +# Make the directory if it doesn't exist yet. +# This location is recommended by the distribution maintainers. 
+RUN mkdir --parents --mode=0755 /etc/apt/keyrings +# Download the key, convert the signing-key to a full +# keyring required by apt and store in the keyring directory +RUN wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | \ +gpg --dearmor | tee /etc/apt/keyrings/rocm.gpg > /dev/null && \ +# Add rocm repo +echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/amdgpu/6.3/ubuntu noble main" \ + | tee /etc/apt/sources.list.d/amdgpu.list && \ +echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/6.3 noble main" \ + | tee --append /etc/apt/sources.list.d/rocm.list && \ +echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \ + | tee /etc/apt/preferences.d/rocm-pin-600 && \ +echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \ + | tee /etc/apt/preferences.d/rocm-pin-600 +# Install the ROCM kernel driver +RUN apt update && apt install -yqq rocm-dev && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +COPY scripts/create-sycl-user.sh /user-setup.sh +RUN /user-setup.sh + +COPY scripts/docker_entrypoint.sh /docker_entrypoint.sh + +USER sycl + +ENTRYPOINT ["/docker_entrypoint.sh"] + diff --git a/devops/containers/ubuntu2404_build_oneapi.Dockerfile b/devops/containers/ubuntu2404_build_oneapi.Dockerfile index e5c576016f398..48e617328539d 100644 --- a/devops/containers/ubuntu2404_build_oneapi.Dockerfile +++ b/devops/containers/ubuntu2404_build_oneapi.Dockerfile @@ -31,23 +31,19 @@ echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \ wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor \ | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \ echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \ -| tee /etc/apt/sources.list.d/oneAPI.list && \ -apt update +| tee /etc/apt/sources.list.d/oneAPI.list + # Install the ROCM 
kernel driver and oneAPI -RUN apt install -yqq rocm-dev intel-oneapi-compiler-dpcpp-cpp && \ +RUN apt update && apt install -yqq rocm-dev intel-oneapi-compiler-dpcpp-cpp && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -# By default Ubuntu sets an arbitrary UID value, that is different from host -# system. When CI passes default UID value of 1001, some of LLVM tools fail to -# discover user home directory and fail a few LIT tests. Fixes UID and GID to -# 1001, that is used as default by GitHub Actions. -RUN groupadd -g 1001 sycl && useradd sycl -u 1001 -g 1001 -m -s /bin/bash -# Add sycl user to video/irc groups so that it can access GPU -RUN usermod -aG video sycl -RUN usermod -aG irc sycl +COPY scripts/create-sycl-user.sh /user-setup.sh +RUN /user-setup.sh COPY scripts/docker_entrypoint.sh /docker_entrypoint.sh +USER sycl + ENTRYPOINT ["/docker_entrypoint.sh"] diff --git a/devops/containers/ubuntu2404_intel_drivers.Dockerfile b/devops/containers/ubuntu2404_intel_drivers.Dockerfile index b45e4a8adbb51..2e9997c3a3770 100644 --- a/devops/containers/ubuntu2404_intel_drivers.Dockerfile +++ b/devops/containers/ubuntu2404_intel_drivers.Dockerfile @@ -5,7 +5,9 @@ FROM $base_image:$base_tag ENV DEBIAN_FRONTEND=noninteractive -ARG use_latest=true +ARG use_unstable_driver=true + +USER root RUN apt update && apt install -yqq wget @@ -16,7 +18,7 @@ COPY dependencies.json / RUN mkdir /runtimes ENV INSTALL_LOCATION=/runtimes RUN --mount=type=secret,id=github_token \ - if [ "$use_latest" = "true" ]; then \ + if [ "$use_unstable_driver" = "true" ]; then \ install_driver_opt=" --use-latest"; \ else \ install_driver_opt=" dependencies.json"; \ @@ -25,5 +27,7 @@ RUN --mount=type=secret,id=github_token \ COPY scripts/drivers_entrypoint.sh /drivers_entrypoint.sh +USER sycl_ci + ENTRYPOINT ["/bin/bash", "/drivers_entrypoint.sh"] diff --git a/devops/containers/ubuntu2404_intel_drivers_igc_dev.Dockerfile b/devops/containers/ubuntu2404_intel_drivers_igc_dev.Dockerfile index 
fd40361a379f5..8ab6f2b5fdb7c 100644 --- a/devops/containers/ubuntu2404_intel_drivers_igc_dev.Dockerfile +++ b/devops/containers/ubuntu2404_intel_drivers_igc_dev.Dockerfile @@ -5,6 +5,8 @@ FROM $base_image:$base_tag ENV DEBIAN_FRONTEND=noninteractive +USER root + RUN apt update && apt install -yqq libllvm14 COPY scripts/get_release.py / @@ -20,5 +22,7 @@ RUN --mount=type=secret,id=github_token \ COPY scripts/drivers_entrypoint.sh /drivers_entrypoint.sh +USER sycl_ci + ENTRYPOINT ["/bin/bash", "/drivers_entrypoint.sh"] diff --git a/devops/dependencies.json b/devops/dependencies.json index 755a1f10625fb..fc9003ec4ad54 100644 --- a/devops/dependencies.json +++ b/devops/dependencies.json @@ -1,15 +1,15 @@ { "linux": { "compute_runtime": { - "github_tag": "24.39.31294.12", - "version": "24.39.31294.12", - "url": "https://github.com/intel/compute-runtime/releases/tag/24.39.31294.12", + "github_tag": "24.45.31740.9", + "version": "24.45.31740.9", + "url": "https://github.com/intel/compute-runtime/releases/tag/24.45.31740.9", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, "igc": { - "github_tag": "igc-1.0.17791.9", - "version": "1.0.17791.9", - "url": "https://github.com/intel/intel-graphics-compiler/releases/tag/igc-1.0.17791.9", + "github_tag": "v2.1.12", + "version": "2.1.12", + "url": "https://github.com/intel/intel-graphics-compiler/releases/tag/v2.1.12", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, "cm": { @@ -19,9 +19,9 @@ "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, "level_zero": { - "github_tag": "v1.18.5", - "version": "v1.18.5", - "url": "https://github.com/oneapi-src/level-zero/releases/tag/v1.18.5", + "github_tag": "v1.19.2", + "version": "v1.19.2", + "url": "https://github.com/oneapi-src/level-zero/releases/tag/v1.19.2", "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu" }, "tbb": { diff --git a/devops/scripts/create-sycl-user.sh b/devops/scripts/create-sycl-user.sh new file mode 100755 index 0000000000000..7199f1eaf10cd --- /dev/null +++ 
b/devops/scripts/create-sycl-user.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +set -e + +USER_NAME=sycl + +# By default Ubuntu sets an arbitrary UID value, that is different from host +# system. When CI passes default UID value of 1001, some of LLVM tools fail to +# discover user home directory and fail a few LIT tests. Fixes UID and GID to +# 1001, that is used as default by GitHub Actions. +USER_ID=1001 + +groupadd -g $USER_ID $USER_NAME && useradd $USER_NAME -u $USER_ID -g $USER_ID -m -s /bin/bash +# Add user to video/irc groups so that it can access GPU +usermod -aG video $USER_NAME +usermod -aG irc $USER_NAME + +# group 109 is required for user to access PVC card. +groupadd -f -g 109 render +usermod -aG render $USER_NAME + +# Allow user to run as sudo (without a password) +echo "$USER_NAME ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers diff --git a/devops/scripts/docker_entrypoint.sh b/devops/scripts/docker_entrypoint.sh index f0e89244d013f..5fc44481d2355 100755 --- a/devops/scripts/docker_entrypoint.sh +++ b/devops/scripts/docker_entrypoint.sh @@ -1,8 +1,3 @@ #!/bin/bash -if [ -d "$GITHUB_WORKSPACE" ]; then - chown -R sycl:sycl $GITHUB_WORKSPACE - su sycl -fi - exec "$@" diff --git a/devops/scripts/install_drivers.sh b/devops/scripts/install_drivers.sh index e790691462d94..570f78091d9f2 100755 --- a/devops/scripts/install_drivers.sh +++ b/devops/scripts/install_drivers.sh @@ -120,10 +120,7 @@ InstallIGFX () { # # Of course, this also installed the libopencl-clang so that we can copy and use later as a temporariy workaround. 
IS_IGC_DEV=$(CheckIGCdevTag $IGCTAG) - UBUNTU_VER="u22\.04" - if [ "$IS_IGC_DEV" == "Yes" ] || [ "$L0_TAG" == "latest" ]; then - UBUNTU_VER="u24\.04" - fi + UBUNTU_VER="u24\.04" get_release intel/intel-graphics-compiler $IGC_TAG \ | grep ".*deb" \ | wget -qi - @@ -143,7 +140,7 @@ InstallIGFX () { get_release oneapi-src/level-zero $L0_TAG \ | grep ".*$UBUNTU_VER.*deb" \ | wget -qi - - dpkg -i *.deb && rm *.deb *.sum + dpkg -i --force-overwrite *.deb && rm *.deb *.sum mkdir -p /usr/local/lib/igc/ echo "$IGC_TAG" > /usr/local/lib/igc/IGCTAG.txt if [ "$IS_IGC_DEV" == "Yes" ]; then diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 4fc818e548a19..dbdea692685c4 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -70,7 +70,7 @@ if( LIBCLC_STANDALONE_BUILD OR CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DI # Import required tools if( NOT EXISTS ${LIBCLC_CUSTOM_LLVM_TOOLS_BINARY_DIR} ) - foreach( tool IN ITEMS clang llvm-as llvm-link opt ) + foreach( tool IN ITEMS clang llvm-as llvm-link llvm-spirv opt ) find_program( LLVM_TOOL_${tool} ${tool} PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH ) set( ${tool}_exe ${LLVM_TOOL_${tool}} ) set( ${tool}_target ) @@ -93,6 +93,7 @@ else() get_host_tool_path( clang CLANG clang_exe clang_target ) get_host_tool_path( llvm-as LLVM_AS llvm-as_exe llvm-as_target ) get_host_tool_path( llvm-link LLVM_LINK llvm-link_exe llvm-link_target ) + get_host_tool_path( llvm-spirv LLVM_SPIRV llvm-spirv_exe llvm-spirv_target ) get_host_tool_path( opt OPT opt_exe opt_target ) endif() endif() @@ -114,13 +115,15 @@ if( EXISTS ${LIBCLC_CUSTOM_LLVM_TOOLS_BINARY_DIR} ) # If we've requested a custom binary directory, there are some otherwise # optional tools which we want to ensure are present. - if( NOT TARGET libclc::llvm-spirv OR NOT TARGET libclc::libclc-remangler ) - message( FATAL_ERROR "libclc toolchain incomplete!" 
) - endif() + foreach( tool IN ITEMS llvm-spirv libclc-remangler ) + if( NOT EXISTS "${${tool}_exe}" AND "${${tool}_target}" STREQUAL "" ) + message( FATAL_ERROR "libclc toolchain incomplete!" ) + endif() + endforeach() endif() foreach( tool IN ITEMS clang opt llvm-as llvm-link ) - if( NOT EXISTS "${${tool}_exe}" AND "${tool}_target" STREQUAL "" ) + if( NOT EXISTS "${${tool}_exe}" AND "${${tool}_target}" STREQUAL "" ) message( FATAL_ERROR "libclc toolchain incomplete - missing tool ${tool}!" ) endif() endforeach() diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake index f174a03efee91..17d9760866bac 100644 --- a/libclc/cmake/modules/AddLibclc.cmake +++ b/libclc/cmake/modules/AddLibclc.cmake @@ -456,13 +456,13 @@ function(add_libclc_builtin_set) "${LIBCLC_LIBRARY_OUTPUT_INTDIR}/remangled-${long_width}-${signedness}_char.${obj_suffix_mangled}" ) add_custom_command( OUTPUT "${builtins_remangle_path}" COMMAND ${CMAKE_COMMAND} -E make_directory ${LIBCLC_LIBRARY_OUTPUT_INTDIR} - COMMAND libclc::libclc-remangler + COMMAND ${libclc-remangler_exe} -o "${builtins_remangle_path}" --long-width=${long_width} --char-signedness=${signedness} --input-ir=${builtins_lib} ${dummy_in} - DEPENDS ${builtins_lib} libclc::libclc-remangler ${dummy_in}) + DEPENDS ${builtins_lib} ${libclc-remangler_target} ${dummy_in}) add_custom_target( "remangled-${long_width}-${signedness}_char.${obj_suffix_mangled}" ALL DEPENDS "${builtins_remangle_path}" "${dummy_in}") set_target_properties("remangled-${long_width}-${signedness}_char.${obj_suffix_mangled}" @@ -489,12 +489,12 @@ function(add_libclc_builtin_set) math(EXPR libclc-remangler-test-no "${libclc-remangler-test-no}+1") set(current-test "libclc-remangler-test-${obj_suffix}-${libclc-remangler-test-no}") add_custom_target(${current-test} - COMMAND libclc::libclc-remangler + COMMAND ${libclc-remangler_exe} --long-width=l32 --char-signedness=signed --input-ir=${target-ir} ${dummy_in} -t -o - - DEPENDS 
${builtins_lib} "${dummy_in}" libclc::libclc-remangler) + DEPENDS ${builtins_lib} "${dummy_in}" ${libclc-remangler_target}) list(APPEND libclc-remangler-tests ${current-test}) endforeach() endif() diff --git a/libclc/utils/libclc-remangler/CMakeLists.txt b/libclc/utils/libclc-remangler/CMakeLists.txt index 9e50722cbbb7b..e5783f134feab 100644 --- a/libclc/utils/libclc-remangler/CMakeLists.txt +++ b/libclc/utils/libclc-remangler/CMakeLists.txt @@ -11,6 +11,9 @@ set(LLVM_LINK_COMPONENTS add_clang_tool(libclc-remangler LibclcRemangler.cpp) +setup_host_tool( libclc-remangler LIBCLC_REMANGLER + libclc-remangler_exe libclc-remangler_target ) + target_include_directories(libclc-remangler PRIVATE ${CMAKE_SOURCE_DIR}/../clang/include ${CMAKE_BINARY_DIR}/tools/clang/include) @@ -24,9 +27,3 @@ clang_target_link_libraries(libclc-remangler clangSerialization LLVMOption ) - -# If we've not already imported a libclc-remangler tool from an external build, -# set it up now. -if(NOT TARGET libclc::libclc-remangler) - add_executable(libclc::libclc-remangler ALIAS libclc-remangler) -endif() diff --git a/libdevice/CMakeLists.txt b/libdevice/CMakeLists.txt index a985c101a6f94..d96c4fe69201d 100644 --- a/libdevice/CMakeLists.txt +++ b/libdevice/CMakeLists.txt @@ -19,6 +19,25 @@ else() will not build libdevice sanitizer") endif() +if(NOT EXISTS ${LIBCLC_CUSTOM_LLVM_TOOLS_BINARY_DIR}) + get_host_tool_path(clang CLANG clang_exe clang_target) + get_host_tool_path(llvm-ar LLVM_AR llvm-ar_exe llvm-ar_target) + get_host_tool_path(append-file APPEND_FILE append-file_exe append-file_target) + get_host_tool_path(clang-offload-bundler CLANG_OFFLOAD_BUNDLER clang-offload-bundler_exe clang-offload-bundler_target) + get_host_tool_path(clang-offload-packager CLANG_OFFLOAD_PACKAGER clang-offload-packager_exe clang-offload-packager_target) + get_host_tool_path(file-table-tform FILE_TABLE_TFORM file-table-tform_exe file-table-tform_target) + get_host_tool_path(llvm-foreach LLVM_FOREACH llvm-foreach_exe 
llvm-foreach_target) + get_host_tool_path(llvm-spirv LLVM_SPIRV llvm-spirv_exe llvm-spirv_target) + get_host_tool_path(sycl-post-link SYCL_POST_LINK sycl-post-link_exe sycl-post-link_target) +else() + foreach(tool IN ITEMS clang llvm-ar append-file clang-offload-bundler clang-offload-packager file-table-tform llvm-foreach llvm-spirv sycl-post-link) + find_program(LLVM_CUSTOM_TOOL_${tool} ${tool} + PATHS ${LIBCLC_CUSTOM_LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH) + set(${tool}_exe ${LLVM_CUSTOM_TOOL_${tool}}) + set(${tool}_target) + endforeach() +endif() + # Build libdevice for SYCL. include(SYCLLibdevice) diff --git a/libdevice/cmake/modules/SYCLLibdevice.cmake b/libdevice/cmake/modules/SYCLLibdevice.cmake index d2afc546d69d9..63a079fdc0d40 100644 --- a/libdevice/cmake/modules/SYCLLibdevice.cmake +++ b/libdevice/cmake/modules/SYCLLibdevice.cmake @@ -22,11 +22,6 @@ set(install_dest_obj lib${LLVM_LIBDIR_SUFFIX}) set(install_dest_obj-new-offload lib${LLVM_LIBDIR_SUFFIX}) set(install_dest_bc lib${LLVM_LIBDIR_SUFFIX}) -set(clang $) -set(llvm-ar $) -set(llvm-link $) -set(llvm-opt $) - string(CONCAT sycl_targets_opt "-fsycl-targets=" "spir64_x86_64-unknown-unknown," @@ -55,6 +50,7 @@ set(compile_opts # we declare all functions as 'static'. 
-Wno-undefined-internal -sycl-std=2020 + --target=${LLVM_HOST_TRIPLE} ) set(SYCL_LIBDEVICE_GCC_TOOLCHAIN "" CACHE PATH "Path to GCC installation") @@ -146,7 +142,8 @@ function(compile_lib_ext filename) add_custom_command( OUTPUT ${devicelib-file} - COMMAND ${clang} ${ARG_OPTS} + COMMAND ${clang_exe} -I ${PROJECT_BINARY_DIR}/include + ${ARG_OPTS} ${CMAKE_CURRENT_SOURCE_DIR}/${ARG_SRC} -o ${devicelib-file} MAIN_DEPENDENCY ${ARG_SRC} DEPENDS ${ARG_DEPENDENCIES} @@ -224,20 +221,26 @@ function(add_devicelibs filename) endforeach() endfunction() -# Set up the dependency lists for the libdevice libraries -set(crt_obj_deps wrapper.h device.h spirv_vars.h sycl-compiler) -set(complex_obj_deps device_complex.h device.h sycl-compiler) -set(cmath_obj_deps device_math.h device.h sycl-compiler) -set(imf_obj_deps device_imf.hpp imf_half.hpp imf_bf16.hpp imf_rounding_op.hpp imf_impl_utils.hpp device.h sycl-compiler) -set(itt_obj_deps device_itt.h spirv_vars.h device.h sycl-compiler) -set(bfloat16_obj_deps sycl-headers sycl-compiler) +# For native builds, sycl-compiler will already include everything we need. +# For cross builds, we also need native versions of the tools. 
+set(sycl-compiler_deps + sycl-compiler ${clang_target} ${append-file_target} + ${clang-offload-bundler_target} ${clang-offload-packager_target} + ${file-table-tform_target} ${llvm-foreach_target} ${llvm-spirv_target} + ${sycl-post-link_target}) +set(crt_obj_deps wrapper.h device.h spirv_vars.h ${sycl-compiler_deps}) +set(complex_obj_deps device_complex.h device.h ${sycl-compiler_deps}) +set(cmath_obj_deps device_math.h device.h ${sycl-compiler_deps}) +set(imf_obj_deps device_imf.hpp imf_half.hpp imf_bf16.hpp imf_rounding_op.hpp imf_impl_utils.hpp device.h ${sycl-compiler_deps}) +set(itt_obj_deps device_itt.h spirv_vars.h device.h ${sycl-compiler_deps}) +set(bfloat16_obj_deps sycl-headers ${sycl-compiler_deps}) if (NOT MSVC AND UR_SANITIZER_INCLUDE_DIR) set(asan_obj_deps device.h atomic.hpp spirv_vars.h ${UR_SANITIZER_INCLUDE_DIR}/asan/asan_libdevice.hpp include/asan_rtl.hpp include/spir_global_var.hpp - sycl-compiler) + ${sycl-compiler_deps}) set(sanitizer_generic_compile_opts ${compile_opts} -fno-sycl-instrument-device-code @@ -442,6 +445,7 @@ set(imf_fp64_fallback_src ${imf_fallback_src_dir}/imf_fp64_fallback.cpp) set(imf_bf16_fallback_src ${imf_fallback_src_dir}/imf_bf16_fallback.cpp) set(imf_host_cxx_flags -c + --target=${LLVM_HOST_TRIPLE} -D__LIBDEVICE_HOST_IMPL__ ) @@ -535,13 +539,13 @@ function(add_lib_imf name) add_custom_command( OUTPUT ${ARG_DIR}/${name}.${${ARG_FTYPE}-suffix} - COMMAND ${clang} ${compile_opts} ${ARG_EXTRA_OPTS} + COMMAND ${clang_exe} ${compile_opts} ${ARG_EXTRA_OPTS} -I ${CMAKE_CURRENT_SOURCE_DIR}/imf ${imf_${ARG_DTYPE}_fallback_src} -o ${ARG_DIR}/${name}.${${ARG_FTYPE}-suffix} DEPENDS ${imf_fallback_${ARG_DTYPE}_deps} - get_imf_fallback_${ARG_DTYPE} sycl-compiler + get_imf_fallback_${ARG_DTYPE} ${sycl-compiler_deps} VERBATIM) add_custom_target(${ARG_TGT_NAME} @@ -635,7 +639,7 @@ foreach(dtype IN ITEMS bf16 fp32 fp64) endif() add_custom_command( OUTPUT ${${ftype}_binary_dir}/imf-${dtype}-host.${${ftype}-suffix} - COMMAND ${clang} 
${${ftype}_host_compile_opts} + COMMAND ${clang_exe} ${${ftype}_host_compile_opts} ${CMAKE_CURRENT_SOURCE_DIR}/${wrapper_name} -o ${${ftype}_binary_dir}/imf-${dtype}-host.${${ftype}-suffix} MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/${wrapper_name} @@ -652,7 +656,7 @@ foreach(ftype IN ITEMS obj obj-new-offload) DEPENDS ${${ftype}_binary_dir}/${devicelib_host_static_${ftype}}) add_custom_command( OUTPUT ${${ftype}_binary_dir}/${devicelib_host_static_${ftype}} - COMMAND ${llvm-ar} rcs + COMMAND ${llvm-ar_exe} rcs ${${ftype}_binary_dir}/${devicelib_host_static_${ftype}} ${${ftype}_binary_dir}/imf-fp32-host.${${ftype}-suffix} ${${ftype}_binary_dir}/fallback-imf-fp32-host.${${ftype}-suffix} @@ -663,7 +667,7 @@ foreach(ftype IN ITEMS obj obj-new-offload) DEPENDS imf_fp32_host_${ftype} imf_fallback_fp32_host_${ftype} DEPENDS imf_fp64_host_${ftype} imf_fallback_fp64_host_${ftype} DEPENDS imf_bf16_host_${ftype} imf_fallback_bf16_host_${ftype} - DEPENDS sycl-compiler + DEPENDS ${llvm-ar_target} VERBATIM) add_dependencies(libsycldevice-obj imf_host_${ftype}) diff --git a/llvm/lib/SYCLLowerIR/SYCLJointMatrixTransform.cpp b/llvm/lib/SYCLLowerIR/SYCLJointMatrixTransform.cpp index 231ec9a818c19..c5c03b2ae1c16 100644 --- a/llvm/lib/SYCLLowerIR/SYCLJointMatrixTransform.cpp +++ b/llvm/lib/SYCLLowerIR/SYCLJointMatrixTransform.cpp @@ -22,6 +22,74 @@ namespace { static constexpr char ACCESS_CHAIN[] = "_Z19__spirv_AccessChain"; static constexpr char MATRIX_TYPE[] = "spirv.CooperativeMatrixKHR"; +Type *getInnermostType(Type *Ty) { + while (auto *ArrayTy = dyn_cast(Ty)) + Ty = ArrayTy->getElementType(); + return Ty; +} + +Type *replaceInnermostType(Type *Ty, Type *NewInnermostTy) { + if (auto *ArrayTy = dyn_cast(Ty)) + return ArrayType::get( + replaceInnermostType(ArrayTy->getElementType(), NewInnermostTy), + ArrayTy->getNumElements()); + return NewInnermostTy; +} + +// This function is a copy of stripPointerCastsAndOffsets from Value.cpp, +// simplified and modified to strip non-zero 
GEP indices as well and also +// find nearest GEP instruction. +Value *stripPointerCastsAndOffsets(Value *V, bool StopOnGEP = false) { + if (!V->getType()->isPointerTy()) + return V; + + // Even though we don't look through PHI nodes, we could be called on an + // instruction in an unreachable block, which may be on a cycle. + SmallPtrSet Visited; + + Visited.insert(V); + do { + if (auto *GEP = dyn_cast(V)) { + if (StopOnGEP && isa(GEP)) + return V; + V = GEP->getPointerOperand(); + } else if (auto *BC = dyn_cast(V)) { + Value *NewV = BC->getOperand(0); + if (!NewV->getType()->isPointerTy()) + return V; + V = NewV; + } else if (auto *ASC = dyn_cast(V)) { + V = ASC->getOperand(0); + } else { + if (auto *Call = dyn_cast(V)) { + if (Value *RV = Call->getReturnedArgOperand()) { + V = RV; + // Strip the call instruction, since callee returns its RV + // argument as return value. So, we need to continue stripping. + continue; + } + } + return V; + } + assert(V->getType()->isPointerTy() && "Unexpected operand type!"); + } while (Visited.insert(V).second); + + return V; +} + +TargetExtType *extractMatrixType(StructType *WrapperMatrixTy) { + if (!WrapperMatrixTy) + return nullptr; + TargetExtType *MatrixTy = + dyn_cast(WrapperMatrixTy->getElementType(0)); + + if (!MatrixTy) + return nullptr; + if (MatrixTy->getName() != MATRIX_TYPE) + return nullptr; + return MatrixTy; +} + // This function finds all calls to __spirv_AccessChain function and transforms // its users and operands to make LLVM IR more SPIR-V friendly. bool transformAccessChain(Function *F) { @@ -60,33 +128,59 @@ bool transformAccessChain(Function *F) { // from sycl::joint_matrix class object if it's used in __spirv_AccessChain // function call. It's necessary because otherwise OpAccessChain indices // would be wrong. 
- Instruction *Ptr = - dyn_cast(CI->getArgOperand(0)->stripPointerCasts()); + Instruction *Ptr = dyn_cast( + stripPointerCastsAndOffsets(CI->getArgOperand(0))); if (!Ptr || !isa(Ptr)) continue; - StructType *WrapperMatrixTy = - dyn_cast(cast(Ptr)->getAllocatedType()); - if (!WrapperMatrixTy) - continue; - TargetExtType *MatrixTy = - dyn_cast(WrapperMatrixTy->getElementType(0)); - if (!MatrixTy) + + Type *AllocaTy = cast(Ptr)->getAllocatedType(); + // It may happen that sycl::joint_matrix class object is wrapped into + // nested arrays. We need to find the innermost type to extract + if (StructType *WrapperMatrixTy = + dyn_cast(getInnermostType(AllocaTy))) { + TargetExtType *MatrixTy = extractMatrixType(WrapperMatrixTy); + if (!MatrixTy) + continue; + + AllocaInst *Alloca = nullptr; + { + IRBuilder Builder(CI); + IRBuilderBase::InsertPointGuard IG(Builder); + Builder.SetInsertPointPastAllocas(CI->getFunction()); + Alloca = Builder.CreateAlloca(replaceInnermostType(AllocaTy, MatrixTy)); + Alloca->takeName(Ptr); + } + Ptr->replaceAllUsesWith(Alloca); + Ptr->dropAllReferences(); + Ptr->eraseFromParent(); + ModuleChanged = true; + } + + // In case spirv.CooperativeMatrixKHR is used in arrays, we also need to + // insert GEP to get pointer to target extension type and use it instead of + // pointer to sycl::joint_matrix class object when it is passed to + // __spirv_AccessChain + // First we check if the argument came from a GEP instruction + GetElementPtrInst *GEP = dyn_cast( + stripPointerCastsAndOffsets(CI->getArgOperand(0), /*StopOnGEP=*/true)); + if (!GEP) continue; - StringRef Name = MatrixTy->getName(); - if (Name != MATRIX_TYPE) + + // Check if GEP return type is a pointer to sycl::joint_matrix class object + StructType *WrapperMatrixTy = + dyn_cast(GEP->getResultElementType()); + if (!extractMatrixType(WrapperMatrixTy)) continue; - AllocaInst *Alloca = nullptr; + // Insert GEP right before the __spirv_AccessChain call { IRBuilder Builder(CI); -
IRBuilderBase::InsertPointGuard IG(Builder); - Builder.SetInsertPointPastAllocas(CI->getFunction()); - Alloca = Builder.CreateAlloca(MatrixTy); + Value *NewGEP = + Builder.CreateInBoundsGEP(WrapperMatrixTy, CI->getArgOperand(0), + {Builder.getInt64(0), Builder.getInt32(0)}); + CI->setArgOperand(0, NewGEP); + ModuleChanged = true; } - Ptr->replaceAllUsesWith(Alloca); - Ptr->dropAllReferences(); - Ptr->eraseFromParent(); - ModuleChanged = true; } return ModuleChanged; } diff --git a/llvm/lib/SYCLLowerIR/SanitizerKernelMetadata.cpp b/llvm/lib/SYCLLowerIR/SanitizerKernelMetadata.cpp index c99fc0ed83509..90bde2ee73837 100644 --- a/llvm/lib/SYCLLowerIR/SanitizerKernelMetadata.cpp +++ b/llvm/lib/SYCLLowerIR/SanitizerKernelMetadata.cpp @@ -38,6 +38,29 @@ PreservedAnalyses SanitizerKernelMetadataPass::run(Module &M, auto &DL = M.getDataLayout(); auto &Ctx = M.getContext(); + // Fix device global type, by wrapping a structure type + { + assert(KernelMetadata->getValueType()->isArrayTy()); + + auto *KernelMetadataOld = KernelMetadata; + + StructType *StructTypeWithArray = StructType::create(Ctx); + StructTypeWithArray->setBody(KernelMetadataOld->getValueType()); + + KernelMetadata = new GlobalVariable( + M, StructTypeWithArray, false, GlobalValue::ExternalLinkage, + ConstantStruct::get(StructTypeWithArray, + KernelMetadataOld->getInitializer()), + "", nullptr, GlobalValue::NotThreadLocal, 1); // Global AddressSpace + KernelMetadata->takeName(KernelMetadataOld); + KernelMetadata->setUnnamedAddr(GlobalValue::UnnamedAddr::Local); + KernelMetadata->setDSOLocal(true); + KernelMetadata->copyAttributesFrom(KernelMetadataOld); + KernelMetadata->copyMetadata(KernelMetadataOld, 0); + + KernelMetadataOld->eraseFromParent(); + } + // Fix attributes KernelMetadata->addAttribute( "sycl-device-global-size", diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 2ffbb67722a6c..c218951387bbd 100644 --- 
a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1333,18 +1333,32 @@ static void ExtendSpirKernelArgs(Module &M, FunctionAnalysisManager &FAM, if (!HasESIMD) for (Function &F : M) { - if (F.getCallingConv() != CallingConv::SPIR_KERNEL) - continue; - if (!F.hasFnAttribute(Attribute::SanitizeAddress) || F.hasFnAttribute(Attribute::DisableSanitizerInstrumentation)) continue; + if (F.getName().contains("__sycl_service_kernel__")) { + F.addFnAttr(Attribute::DisableSanitizerInstrumentation); + continue; + } + + // Skip referenced-indirectly function as we insert access to shared + // local memory (SLM) __AsanLaunchInfo and access to SLM in + // referenced-indirectly function isn't supported yet in + // intel-graphics-compiler. + if (F.hasFnAttribute("referenced-indirectly")) { + F.addFnAttr(Attribute::DisableSanitizerInstrumentation); + continue; + } + + if (F.getCallingConv() != CallingConv::SPIR_KERNEL) + continue; + SpirFixupKernels.emplace_back(&F); auto KernelName = F.getName(); auto *KernelNameGV = GetOrCreateGlobalString( - M, "__asan_kernel", KernelName, kSpirOffloadGlobalAS); + M, "__asan_kernel", KernelName, kSpirOffloadConstantAS); SpirKernelsMetadata.emplace_back(ConstantStruct::get( StructTy, ConstantExpr::getPointerCast(KernelNameGV, IntptrTy), ConstantInt::get(IntptrTy, KernelName.size()))); @@ -3685,16 +3699,6 @@ bool AddressSanitizer::instrumentFunction(Function &F, if (F.isPresplitCoroutine()) return false; - if (TargetTriple.isSPIROrSPIRV()) { - if (F.getName().contains("__sycl_service_kernel__")) - return false; - // Skip referenced-indirectly function as we insert access to shared local - // memory (SLM) __AsanLaunchInfo and access to SLM in referenced-indirectly - // function isn't supported yet in intel-graphics-compiler. 
- if (F.hasFnAttribute("referenced-indirectly")) - return false; - } - bool FunctionModified = false; // Do not apply any instrumentation for naked functions. diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index b49b5da07b2ab..8ed60d73209e2 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -763,7 +763,7 @@ Constant *getOrCreateGlobalString(Module &M, StringRef Name, StringRef Value, static void extendSpirKernelArgs(Module &M) { SmallVector SpirKernelsMetadata; - auto DL = M.getDataLayout(); + const auto &DL = M.getDataLayout(); Type *IntptrTy = DL.getIntPtrType(M.getContext()); // SpirKernelsMetadata only saves fixed kernels, and is described by @@ -781,7 +781,7 @@ static void extendSpirKernelArgs(Module &M) { auto KernelName = F.getName(); auto *KernelNameGV = getOrCreateGlobalString(M, "__msan_kernel", KernelName, - kSpirOffloadGlobalAS); + kSpirOffloadConstantAS); SpirKernelsMetadata.emplace_back(ConstantStruct::get( StructTy, ConstantExpr::getPointerCast(KernelNameGV, IntptrTy), ConstantInt::get(IntptrTy, KernelName.size()))); diff --git a/llvm/test/Instrumentation/AddressSanitizer/SPIRV/skip_referenced_indirectly.ll b/llvm/test/Instrumentation/AddressSanitizer/SPIRV/skip_referenced_indirectly.ll index c2203ae7f242b..a22b8d347e178 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/SPIRV/skip_referenced_indirectly.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/SPIRV/skip_referenced_indirectly.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 | FileCheck %s +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -S | FileCheck %s ; Check referenced-indirectly function isn't instrumented. 
@@ -9,6 +9,7 @@ target triple = "spir64-unknown-unknown" @_ZTV8Derived1 = linkonce_odr addrspace(1) constant %structtype { [3 x ptr addrspace(4)] [ptr addrspace(4) null, ptr addrspace(4) null, ptr addrspace(4) addrspacecast (ptr @_ZN8Derived17displayEv to ptr addrspace(4))] }, align 8, !spirv.Decorations !0 define linkonce_odr spir_func i32 @_ZN8Derived17displayEv(ptr addrspace(4) align 8 %this) sanitize_address "referenced-indirectly" { +; CHECK: @_ZN8Derived17displayEv{{.*}}#1 entry: ; CHECK-NOT: call void @__asan_load @@ -17,6 +18,8 @@ entry: ret i32 %1 } +; CHECK: #1 {{.*}} disable_sanitizer_instrumentation + !0 = !{!1, !2, !3} !1 = !{i32 22} !2 = !{i32 41, !"_ZTV8Derived1", i32 2} diff --git a/llvm/test/Instrumentation/AddressSanitizer/SPIRV/skip_sycl_service_kernel.ll b/llvm/test/Instrumentation/AddressSanitizer/SPIRV/skip_sycl_service_kernel.ll new file mode 100644 index 0000000000000..0bd832e25b746 --- /dev/null +++ b/llvm/test/Instrumentation/AddressSanitizer/SPIRV/skip_sycl_service_kernel.ll @@ -0,0 +1,19 @@ +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -S | FileCheck %s + +; Check "sycl_service_kernel" function isn't instrumented. 
+ +target triple = "spir64-unknown-unknown" + +%structtype = type { [3 x ptr addrspace(4)] } +%class.Base = type <{ ptr addrspace(4), i32, [4 x i8] }> + +define linkonce_odr spir_func i32 @_ZTSN4sycl3_V16detail23__sycl_service_kernel__16AssertInfoCopierE(ptr addrspace(4) align 8 %this) sanitize_address "referenced-indirectly" { +; CHECK: @_ZTSN4sycl3_V16detail23__sycl_service_kernel__16AssertInfoCopierE{{.*}}#1 +entry: +; CHECK-NOT: call void @__asan_load + %base_data = getelementptr inbounds %class.Base, ptr addrspace(4) %this, i64 0, i32 1 + %1 = load i32, ptr addrspace(4) %base_data, align 8 + ret i32 %1 +} + +; CHECK: #1 {{.*}} disable_sanitizer_instrumentation diff --git a/llvm/test/SYCLLowerIR/JointMatrixTransform/access_chain.ll b/llvm/test/SYCLLowerIR/JointMatrixTransform/access_chain.ll index 5373938405717..7a68d9d537a57 100644 --- a/llvm/test/SYCLLowerIR/JointMatrixTransform/access_chain.ll +++ b/llvm/test/SYCLLowerIR/JointMatrixTransform/access_chain.ll @@ -3,24 +3,69 @@ ; RUN: opt -passes=sycl-joint-matrix-transform < %s -S | FileCheck %s -; CHECK: %[[#Alloca:]] = alloca target("spirv.CooperativeMatrixKHR", i8, 3, 16, 64, 0) -; CHECK: %[[#Cast:]] = addrspacecast ptr %[[#Alloca]] to ptr addrspace(4) -; CHECK: call spir_func ptr addrspace(4) @_Z19__spirv_AccessChain{{.*}}(ptr addrspace(4) noundef %[[#Cast]], i64 noundef 0) - ; ModuleID = 'test.bc' source_filename = "test.cpp" target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" target triple = "spir64-unknown-unknown" -%"struct.sycl::_V1::ext::oneapi::experimental::matrix::joint_matrix" = type { target("spirv.CooperativeMatrixKHR", i8, 3, 16, 64, 0) } +%"struct.sycl::joint_matrix" = type { target("spirv.CooperativeMatrixKHR", i8, 3, 16, 64, 0) } +%"struct.sycl::_V1::long" = type { i64 } + +define weak_odr dso_local spir_kernel void @test(i64 %ind) { +; CHECK-LABEL: define weak_odr dso_local spir_kernel void @test( +; CHECK-SAME: i64 
[[IND:%.*]]) { + +; non-matrix alloca not touched +; CHECK: [[NOT_MATR:%.*]] = alloca [2 x [4 x %"struct.sycl::_V1::long"]] +; both matrix-related allocas updated to use target extension types +; CHECK-NEXT: [[MATR:%.*]] = alloca target("spirv.CooperativeMatrixKHR", i8, 3, 16, 64, 0) +; CHECK-NEXT: [[MATR_ARR:%.*]] = alloca [2 x [4 x target("spirv.CooperativeMatrixKHR", i8, 3, 16, 64, 0)]] + +; CHECK-NEXT: [[ASCAST:%.*]] = addrspacecast ptr [[MATR]] to ptr addrspace(4) +; no gep inserted, since not needed +; CHECK-NEXT: call spir_func ptr addrspace(4) @_Z19__spirv_AccessChain{{.*}}(ptr addrspace(4) noundef [[ASCAST]], i64 noundef 0) + +; CHECK: [[GEP:%.*]] = getelementptr inbounds [2 x [4 x %"struct.sycl::joint_matrix"]], ptr [[MATR_ARR]], i64 0, i64 [[IND]], i64 [[IND]] +; CHECK-NEXT: [[ASCAST_1:%.*]] = addrspacecast ptr [[GEP]] to ptr addrspace(4) +; CHECK-NEXT: [[ASCAST_2:%.*]] = addrspacecast ptr [[GEP]] to ptr addrspace(4) +; gep is inserted for each of the accesschain calls to extract target extension type +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds %"struct.sycl::joint_matrix", ptr addrspace(4) [[ASCAST_1]], i64 0, i32 0 +; CHECK-NEXT: call spir_func ptr addrspace(4) @_Z19__spirv_AccessChain{{.*}}(ptr addrspace(4) noundef [[TMP2]], i64 noundef 0) +; CHECK: [[TMP5:%.*]] = getelementptr inbounds %"struct.sycl::joint_matrix", ptr addrspace(4) [[ASCAST_2]], i64 0, i32 0 +; CHECK-NEXT: call spir_func ptr addrspace(4) @_Z19__spirv_AccessChain{{.*}}(ptr addrspace(4) noundef [[TMP5]], i64 noundef 0) + +; negative test - not touching non-matrix code +; CHECK: [[GEP_1:%.*]] = getelementptr inbounds [2 x [4 x %"struct.sycl::_V1::long"]], ptr [[NOT_MATR]], i64 0, i64 [[IND]], i64 [[IND]] +; CHECK-NEXT: [[ASCAST_3:%.*]] = addrspacecast ptr [[GEP_1]] to ptr addrspace(4) +; CHECK-NEXT: call spir_func ptr addrspace(4) @_Z19__spirv_AccessChain{{.*}}(ptr addrspace(4) noundef [[ASCAST_3]], i64 noundef 0) -define weak_odr dso_local spir_kernel void @test() { entry: - 
%0 = alloca %"struct.sycl::_V1::ext::oneapi::experimental::matrix::joint_matrix", align 8 - %1 = addrspacecast ptr %0 to ptr addrspace(4) - %2 = call spir_func ptr addrspace(4) @_Z19__spirv_AccessChainIiiLm16ELm16ELN5__spv9MatrixUseE2ELNS0_5Scope4FlagE3EEPT_PPNS0_28__spirv_CooperativeMatrixKHRIT0_XT4_EXT1_EXT2_EXT3_EEEm(ptr addrspace(4) noundef %1, i64 noundef 0) + ; allocas + %matr = alloca %"struct.sycl::joint_matrix", align 8 + %matr.arr = alloca [2 x [4 x %"struct.sycl::joint_matrix"]], align 8 + %not.matr = alloca [2 x [4 x %"struct.sycl::_V1::long"]], align 8 + + ; simple case + %ascast = addrspacecast ptr %matr to ptr addrspace(4) + %0 = call spir_func ptr addrspace(4) @_Z19__spirv_AccessChainIiiLm16ELm16ELN5__spv9MatrixUseE2ELNS0_5Scope4FlagE3EEPT_PPNS0_28__spirv_CooperativeMatrixKHRIT0(ptr addrspace(4) noundef %ascast, i64 noundef 0) + %1 = load i8, ptr addrspace(4) %0 + + ; gep with non-zero indices and multiple access chains per 1 alloca + %gep = getelementptr inbounds [2 x [4 x %"struct.sycl::joint_matrix"]], ptr %matr.arr, i64 0, i64 %ind, i64 %ind + %ascast.1 = addrspacecast ptr %gep to ptr addrspace(4) + %ascast.2 = addrspacecast ptr %gep to ptr addrspace(4) + %2 = call spir_func ptr addrspace(4) @_Z19__spirv_AccessChainIiiLm16ELm16ELN5__spv9MatrixUseE2ELNS0_5Scope4FlagE3EEPT_PPNS0_28__spirv_CooperativeMatrixKHRIT0(ptr addrspace(4) noundef %ascast.1, i64 noundef 0) %3 = load i8, ptr addrspace(4) %2 + %4 = call spir_func ptr addrspace(4) @_Z19__spirv_AccessChainIiiLm16ELm16ELN5__spv9MatrixUseE2ELNS0_5Scope4FlagE3EEPT_PPNS0_28__spirv_CooperativeMatrixKHRIT0(ptr addrspace(4) noundef %ascast.2, i64 noundef 0) + %5 = load i8, ptr addrspace(4) %4 + + ; negative test - not touching non-matrix code + %gep.1 = getelementptr inbounds [2 x [4 x %"struct.sycl::_V1::long"]], ptr %not.matr, i64 0, i64 %ind, i64 %ind + %ascast.3 = addrspacecast ptr %gep.1 to ptr addrspace(4) + %6 = call spir_func ptr addrspace(4)
@_Z19__spirv_AccessChainIiiLm16ELm16ELN5__spv9MatrixUseE2ELNS0_5Scope4FlagE3EEPT_PPNS0_28__spirv_CooperativeMatrixKHRIT0(ptr addrspace(4) noundef %ascast.3, i64 noundef 0) + %7 = load i8, ptr addrspace(4) %6 + ret void } -declare dso_local spir_func ptr addrspace(4) @_Z19__spirv_AccessChainIiiLm16ELm16ELN5__spv9MatrixUseE2ELNS0_5Scope4FlagE3EEPT_PPNS0_28__spirv_CooperativeMatrixKHRIT0_XT4_EXT1_EXT2_EXT3_EEEm(ptr addrspace(4) noundef, i64 noundef) +declare dso_local spir_func ptr addrspace(4) @_Z19__spirv_AccessChainIiiLm16ELm16ELN5__spv9MatrixUseE2ELNS0_5Scope4FlagE3EEPT_PPNS0_28__spirv_CooperativeMatrixKHRIT0(ptr addrspace(4) noundef, i64 noundef) diff --git a/llvm/test/tools/sycl-post-link/device-sanitizer/asan.ll b/llvm/test/tools/sycl-post-link/device-sanitizer/asan.ll index e956876032084..f9ac67c26e93e 100644 --- a/llvm/test/tools/sycl-post-link/device-sanitizer/asan.ll +++ b/llvm/test/tools/sycl-post-link/device-sanitizer/asan.ll @@ -18,7 +18,7 @@ $_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E8MyKernel = comdat any @__asan_kernel = internal addrspace(1) constant [55 x i8] c"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E8MyKernel\00" @__AsanKernelMetadata = appending dso_local local_unnamed_addr addrspace(1) global [1 x { i64, i64 }] [{ i64, i64 } { i64 ptrtoint (ptr addrspace(1) @__asan_kernel to i64), i64 54 }] #2 -; CHECK-IR: @__AsanKernelMetadata {{.*}} !spirv.Decorations +; CHECK-IR: @__AsanKernelMetadata = dso_local local_unnamed_addr addrspace(1) global %0 { {{.*}} }, !spirv.Decorations @__spirv_BuiltInGlobalInvocationId = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32 @__asan_func = internal addrspace(2) constant [106 x i8] c"typeinfo name for main::'lambda'(sycl::_V1::handler&)::operator()(sycl::_V1::handler&) const::MyKernelR_4\00" diff --git a/llvm/test/tools/sycl-post-link/device-sanitizer/msan.ll b/llvm/test/tools/sycl-post-link/device-sanitizer/msan.ll index 41110bb30af2f..756874b37d079 100644 --- 
a/llvm/test/tools/sycl-post-link/device-sanitizer/msan.ll +++ b/llvm/test/tools/sycl-post-link/device-sanitizer/msan.ll @@ -18,7 +18,7 @@ $_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E8MyKernel = comdat any @__msan_kernel = internal addrspace(1) constant [55 x i8] c"_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E8MyKernel\00" @__MsanKernelMetadata = appending dso_local local_unnamed_addr addrspace(1) global [1 x { i64, i64 }] [{ i64, i64 } { i64 ptrtoint (ptr addrspace(1) @__msan_kernel to i64), i64 54 }] #0 -; CHECK-IR: @__MsanKernelMetadata {{.*}} !spirv.Decorations +; CHECK-IR: @__MsanKernelMetadata = dso_local local_unnamed_addr addrspace(1) global %0 { {{.*}} }, !spirv.Decorations @__spirv_BuiltInGlobalInvocationId = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32 @__asan_func = internal addrspace(2) constant [106 x i8] c"typeinfo name for main::'lambda'(sycl::_V1::handler&)::operator()(sycl::_V1::handler&) const::MyKernelR_4\00" diff --git a/llvm/tools/append-file/CMakeLists.txt b/llvm/tools/append-file/CMakeLists.txt index 21a9260d3491a..8479a2c3a21ce 100644 --- a/llvm/tools/append-file/CMakeLists.txt +++ b/llvm/tools/append-file/CMakeLists.txt @@ -9,3 +9,5 @@ add_llvm_tool(append-file DEPENDS intrinsics_gen ) + +setup_host_tool(append-file APPEND_FILE append_file_exe append_file_target) diff --git a/llvm/tools/file-table-tform/CMakeLists.txt b/llvm/tools/file-table-tform/CMakeLists.txt index b96e494508fe8..7a7b4305c2965 100644 --- a/llvm/tools/file-table-tform/CMakeLists.txt +++ b/llvm/tools/file-table-tform/CMakeLists.txt @@ -5,3 +5,5 @@ set(LLVM_LINK_COMPONENTS add_llvm_tool(file-table-tform file-table-tform.cpp ) + +setup_host_tool(file-table-tform FILE_TABLE_TFORM file-table-tform_exe file-table-tform_target) diff --git a/llvm/tools/llvm-ar/CMakeLists.txt b/llvm/tools/llvm-ar/CMakeLists.txt index 4d0718f8cefe6..30e2507c6e6f2 100644 --- a/llvm/tools/llvm-ar/CMakeLists.txt +++ b/llvm/tools/llvm-ar/CMakeLists.txt @@ -19,6 
+19,8 @@ add_llvm_tool(llvm-ar GENERATE_DRIVER ) +setup_host_tool(llvm-ar LLVM_AR llvm_ar_exe llvm_ar_target) + add_llvm_tool_symlink(llvm-ranlib llvm-ar) add_llvm_tool_symlink(llvm-lib llvm-ar) add_llvm_tool_symlink(llvm-dlltool llvm-ar) diff --git a/llvm/tools/llvm-foreach/CMakeLists.txt b/llvm/tools/llvm-foreach/CMakeLists.txt index 12f037101a6d8..7258c29fd2a68 100644 --- a/llvm/tools/llvm-foreach/CMakeLists.txt +++ b/llvm/tools/llvm-foreach/CMakeLists.txt @@ -5,3 +5,5 @@ set(LLVM_LINK_COMPONENTS add_llvm_tool(llvm-foreach llvm-foreach.cpp ) + +setup_host_tool(llvm-foreach LLVM_FOREACH llvm-foreach_exe llvm-foreach_target) diff --git a/llvm/tools/sycl-post-link/CMakeLists.txt b/llvm/tools/sycl-post-link/CMakeLists.txt index 2a8f99795d599..f95d159d0174a 100644 --- a/llvm/tools/sycl-post-link/CMakeLists.txt +++ b/llvm/tools/sycl-post-link/CMakeLists.txt @@ -33,4 +33,7 @@ add_llvm_tool(sycl-post-link LLVMGenXIntrinsics ) +setup_host_tool(sycl-post-link SYCL_POST_LINK + sycl-post-link_exe sycl-post-link_target) + target_link_libraries(sycl-post-link PRIVATE LLVMGenXIntrinsics) diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 6a43c3098288b..ed520578b22f0 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit 39df0317814c164f5242eda8d6f08550f6268492 -# Merge: 68d93efd be27d8f0 -# Author: Kenneth Benzie (Benie) -# Date: Mon Dec 16 13:53:13 2024 +0000 -# Merge pull request #2467 from nrspruit/fix_external_import_function_call -# [L0] Fix external semaphore import function calls to match the header -set(UNIFIED_RUNTIME_TAG 39df0317814c164f5242eda8d6f08550f6268492) +# commit 93fc1331edfd5519ea1fab42a8b96abe9e53672f +# Merge: 259b67db d2c04518 +# Author: aarongreig +# Date: Thu Dec 26 16:52:33 2024 +0000 +# Merge pull request #2402 from yingcong-wu/yc/1202-mmap-failure +# [DevASAN][CPU] bugfix for mmap return value check. 
+set(UNIFIED_RUNTIME_TAG 93fc1331edfd5519ea1fab42a8b96abe9e53672f) diff --git a/sycl/doc/UsersManual.md b/sycl/doc/UsersManual.md index d8f48ee1b5205..4b22acba6af76 100644 --- a/sycl/doc/UsersManual.md +++ b/sycl/doc/UsersManual.md @@ -40,6 +40,7 @@ and not recommended to use in production environment. spir64_fpga-unknown-unknown, spir64_gen-unknown-unknown Available in special build configuration: * nvptx64-nvidia-cuda - generate code ahead of time for CUDA target; + * amdgcn-amd-amdhsa - generate code ahead of time for HIP target; * native_cpu - allows to run SYCL applications with no need of an additional backend (note that this feature is WIP and experimental, and currently overrides all the other specified SYCL targets when enabled.) diff --git a/sycl/doc/design/KernelProgramCache.md b/sycl/doc/design/KernelProgramCache.md index fd242e7d9749d..487e427980f16 100644 --- a/sycl/doc/design/KernelProgramCache.md +++ b/sycl/doc/design/KernelProgramCache.md @@ -415,15 +415,16 @@ When adding a new program to cache, we check if the size of the program cache ex #### Persistent cache eviction -Persistent cache eviction is going to be applied based on file last access -(read/write) date (access time). On SYCL application shutdown phase cache -eviction process is initiated which walks through cache directories as follows: - -- if the file is locked, go to the next file; -- otherwise check file access time: - - if file access time is above threshold, delete the file and remove parent - directory while they are unlocked and empty; - - otherwise do nothing. +Persistent cache eviction can be enabled using the SYCL_CACHE_MAX_SIZE environment variable and is based on the LRU strategy. + +- A new file, called `cache_size.txt`, is created at the root of the persistent cache directory. This file contains the total size of the cache in bytes. When a new item is added to the cache, the size of the item is added to the total size in the `cache_size.txt` file. 
When the total size exceeds the threshold, the eviction process is initiated. + +- Whenever a cache entry is added or accessed, the corresponding cache item directory is updated with the current time. This is done by creating a new file, called `_access_time.txt`, in the cache item directory. This file contains the current time in nanoseconds since the epoch. When the eviction process is initiated, we use this file to determine the last access time of the cache item. + +- When a new item is added to the cache, we check if the total size exceeds the threshold. If so, we iterate through the cache item directories and delete the least recently accessed items until the total size is below half the cache size. + +Note that once the eviction is triggered, the cache size is reduced to half the cache size to avoid frequent eviction. + ## Cache limitations diff --git a/sycl/doc/developer/DockerBKMs.md b/sycl/doc/developer/DockerBKMs.md index f03254a5fb6db..918967ded8fd3 100644 --- a/sycl/doc/developer/DockerBKMs.md +++ b/sycl/doc/developer/DockerBKMs.md @@ -36,33 +36,46 @@ identical for Docker and Podman. Choose whatever is available on your system. The following containers are publicly available for DPC++ compiler development: -- `ghcr.io/intel/llvm/ubuntu2204_base`: contains basic Ubuntu 22.04 environment - setup for building DPC++ compiler from source. -- `ghcr.io/intel/llvm/ubuntu2404_base`: contains basic Ubuntu 24.04 environment +### Ubuntu 22.04-based images + +- `ghcr.io/intel/llvm/ubuntu2204_base`: contains basic environment setup for building DPC++ compiler from source. - `ghcr.io/intel/llvm/ubuntu2204_intel_drivers`: contains everything from the - Ubuntu 22.04 base container + pre-installed Intel drivers. + base container + pre-installed Intel drivers. The image comes in two flavors/tags: * `latest`: Intel drivers are downloaded from release/tag and saved in dependencies.json. The drivers are tested/validated everytime we upgrade the driver. 
* `alldeps`: Includes the same Intel drivers as `latest`, as well as the development kits for NVidia/AMD from the `ubuntu2204_build` container. +- `ghcr.io/intel/llvm/ubuntu2204_build`: has development kits installed for + NVidia/AMD and can be used for building DPC++ + compiler from source with all backends enabled or for end-to-end testing + with HIP/CUDA on machines with corresponding GPUs available. + - `ghcr.io/intel/llvm/sycl_ubuntu2204_nightly`: contains the latest successfully + built nightly build of DPC++ compiler. The image comes in three flavors: + with pre-installed Intel drivers (`latest`), without them (`no-drivers`) and + with development kits installed (`build`). + +### Ubuntu 24.04-based images + +- `ghcr.io/intel/llvm/ubuntu2404_base`: contains basic environment + setup for building DPC++ compiler from source. - `ghcr.io/intel/llvm/ubuntu2404_intel_drivers`: contains everything from the - Ubuntu 24.04 base container + pre-installed Intel drivers. - The image comes in two flavors/tags: + base container + pre-installed Intel drivers. + The image comes in three flavors/tags: + * `latest`: Intel drivers are downloaded from release/tag and saved in + dependencies.json. The drivers are tested/validated everytime we upgrade + the driver. * `devigc`: Intel Graphics Compiler driver from github actions artifacts, other drivers are downloaded from release/tag and saved in dependencies.json. * `unstable`: Intel drivers are downloaded from release/latest. The drivers are installed as it is, not tested or validated. -- `ghcr.io/intel/llvm/ubuntu2204_build`: has development kits installed for - NVidia/AMD and can be used for building DPC++ compiler from source with all - backends enabled or for end-to-end testing with HIP/CUDA on machines with - corresponding GPUs available. -- `ghcr.io/intel/llvm/sycl_ubuntu2204_nightly`: contains the latest successfully - built nightly build of DPC++ compiler. 
The image comes in three flavors: - with pre-installed Intel drivers (`latest`), without them (`no-drivers`) and - with development kits installed (`build`). +- `ghcr.io/intel/llvm/ubuntu2404_build`: has development kits installed for + NVidia/AMD and can be used for building DPC++ + compiler from source with all backends enabled or for end-to-end testing + with HIP/CUDA on machines with corresponding GPUs available. + ## Running Docker container interactively @@ -140,10 +153,11 @@ instructions. ## Changing Docker user -By default all processes inside Docker run as root. Some LLVM or Clang tests -expect your user to be anything but root. You can change the user by specifying -`-u ` option. All Docker containers come with user `sycl` -created. +By default all processes within our containers are run as the `sycl` user which +has password-less `sudo` access. + +If you want to change the user, you can do that by specifying the +`-u ` option when running the container. ## Managing downloaded Docker images diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index 1cdcb2860432d..649c00fb474b5 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -2075,6 +2075,45 @@ recording mode, as opposed to throwing. This section defines the interaction of `sycl_ext_oneapi_graph` with other extensions. +==== sycl_ext_codeplay_enqueue_native_command + +`ext_codeplay_enqueue_native_command`, defined in +link:../experimental/sycl_ext_codeplay_enqueue_native_command.asciidoc[sycl_ext_codeplay_enqueue_native_command] +cannot be used in graph nodes. A synchronous exception will be thrown with error +code `invalid` if a user tries to add them to a graph. + +Removing this restriction is something we may look at for future revisions of +`sycl_ext_oneapi_graph`. 
+ +==== sycl_ext_intel_queue_index + +The compute index queue property defined by +link:../supported/sycl_ext_intel_queue_index.asciidoc[sycl_ext_intel_queue_index] +is ignored during queue recording. + +Using this information is something we may look at for future revisions of +`sycl_ext_oneapi_graph`. + +==== sycl_ext_oneapi_bindless_images + +The new handler methods, and queue shortcuts, defined by +link:../experimental/sycl_ext_oneapi_bindless_images.asciidoc[sycl_ext_oneapi_bindless_images] +cannot be used in graph nodes. A synchronous exception will be thrown with error +code `invalid` if a user tries to add them to a graph. + +Removing this restriction is something we may look at for future revisions of +`sycl_ext_oneapi_graph`. + +==== sycl_ext_oneapi_device_global + +The new handler methods, and queue shortcuts, defined by +link:../experimental/sycl_ext_oneapi_device_global.asciidoc[sycl_ext_oneapi_device_global] +cannot be used in graph nodes. A synchronous exception will be thrown with error +code `invalid` if a user tries to add them to a graph. + +Removing this restriction is something we may look at for future revisions of +`sycl_ext_oneapi_graph`. + ==== sycl_ext_oneapi_discard_queue_events When recording a `sycl::queue` which has been created with the @@ -2108,37 +2147,25 @@ nodes that are recorded from multiple queues and/or added by the explicit API: * The only commands which have an implicit dependency on the barrier command are those recorded from the same queue the barrier command was submitted to. -==== sycl_ext_oneapi_memcpy2d - -The new handler methods, and queue shortcuts, defined by -link:../supported/sycl_ext_oneapi_memcpy2d.asciidoc[sycl_ext_oneapi_memcpy2d] -cannot be used in graph nodes. A synchronous exception will be thrown with -error code `invalid` if a user tries to add them to a graph. - -Removing this restriction is something we may look at for future revisions of -`sycl_ext_oneapi_graph`.
- -==== sycl_ext_oneapi_queue_priority - -The queue priority property defined by -link:../supported/sycl_ext_oneapi_queue_priority.asciidoc[sycl_ext_oneapi_queue_priority] -is ignored during queue recording. +==== sycl_ext_oneapi_enqueue_functions -==== sycl_ext_oneapi_queue_empty +The command submission functions defined in +link:../experimental/sycl_ext_oneapi_enqueue_functions.asciidoc[sycl_ext_oneapi_enqueue_functions] +can be used to add nodes to a graph when creating a graph from queue recording. +New methods are also defined that enable submitting an executable graph, +e.g. directly to a queue without returning an event. -The `queue::ext_oneapi_empty()` query defined by the -link:../supported/sycl_ext_oneapi_queue_empty.asciidoc[sycl_ext_oneapi_queue_empty] -extension behaves as normal during queue recording and is not captured to the graph. -Recorded commands are not counted as submitted for the purposes of this query. +==== sycl_ext_oneapi_free_function_kernels -==== sycl_ext_intel_queue_index +`sycl_ext_oneapi_free_function_kernels`, defined in +link:../proposed/sycl_ext_oneapi_free_function_kernels.asciidoc[sycl_ext_oneapi_free_function_kernels] +can be used with SYCL Graphs. -The compute index queue property defined by -link:../supported/sycl_ext_intel_queue_index.asciidoc[sycl_ext_intel_queue_index] -is ignored during queue recording. +==== sycl_ext_oneapi_kernel_compiler_spirv -Using this information is something we may look at for future revisions of -`sycl_ext_oneapi_graph`. +The kernels loaded using +link:../experimental/sycl_ext_oneapi_kernel_compiler_spirv.asciidoc[sycl_ext_oneapi_kernel_compiler_spirv] +behave as normal when used in graph nodes. ==== sycl_ext_oneapi_kernel_properties @@ -2147,62 +2174,41 @@ link:../experimental/sycl_ext_oneapi_kernel_properties.asciidoc[sycl_ext_oneapi_ can be used in graph nodes in the same way as they are used in normal queue submission.
-==== sycl_ext_oneapi_prod +==== sycl_ext_oneapi_local_memory -The new `sycl::queue::ext_oneapi_prod()` method added by -link:../proposed/sycl_ext_oneapi_prod.asciidoc[sycl_ext_oneapi_prod] -behaves as normal during queue recording and is not captured to the graph. -Recorded commands are not counted as submitted for the purposes of its operation. +Allocating local memory inside a graph kernel node with `sycl::ext::oneapi::group_local_memory()` or +`sycl::ext::oneapi::group_local_memory_for_overwrite()` is supported. These methods are defined by +link:../supported/sycl_ext_oneapi_local_memory.asciidoc[sycl_ext_oneapi_local_memory]. -==== sycl_ext_oneapi_device_global - -The new handler methods, and queue shortcuts, defined by -link:../experimental/sycl_ext_oneapi_device_global.asciidoc[sycl_ext_oneapi_device_global]. -cannot be used in graph nodes. A synchronous exception will be thrown with error -code `invalid` if a user tries to add them to a graph. - -Removing this restriction is something we may look at for future revisions of -`sycl_ext_oneapi_graph`. - -==== sycl_ext_oneapi_bindless_images +==== sycl_ext_oneapi_memcpy2d The new handler methods, and queue shortcuts, defined by -link:../experimental/sycl_ext_oneapi_bindless_images.asciidoc[sycl_ext_oneapi_bindless_images] -cannot be used in graph nodes. A synchronous exception will be thrown with error -code `invalid` if a user tries to add them to a graph. +link:../supported/sycl_ext_oneapi_memcpy2d.asciidoc[sycl_ext_oneapi_memcpy2d] +cannot be used in graph nodes. A synchronous exception will be thrown with +error code `invalid` if a user tries to add them to a graph. Removing this restriction is something we may look at for future revisions of `sycl_ext_oneapi_graph`. -==== sycl_ext_oneapi_kernel_compiler_spirv - -The kernels loaded using -link:../experimental/sycl_ext_oneapi_kernel_compiler_spirv.asciidoc[sycl_ext_oneapi_kernel_compiler_spirv] -behave as normal when used in graph nodes.
- -==== sycl_ext_codeplay_enqueue_native_command - -`ext_codeplay_enqueue_native_command`, defined in -link:../experimental/sycl_ext_codeplay_enqueue_native_command.asciidoc[sycl_ext_codeplay_enqueue_native_command] -cannot be used in graph nodes. A synchronous exception will be thrown with error -code `invalid` if a user tries to add them to a graph. +==== sycl_ext_oneapi_prod -Removing this restriction is something we may look at for future revisions of -`sycl_ext_oneapi_graph`. +The new `sycl::queue::ext_oneapi_prod()` method added by +link:../proposed/sycl_ext_oneapi_prod.asciidoc[sycl_ext_oneapi_prod] +behaves as normal during queue recording and is not captured to the graph. +Recorded commands are not counted as submitted for the purposes of its operation. -==== sycl_ext_oneapi_enqueue_functions +==== sycl_ext_oneapi_queue_empty -The command submission functions defined in -link:../experimental/sycl_ext_oneapi_enqueue_functions.asciidoc[sycl_ext_oneapi_enqueue_functions] -can be used adding nodes to a graph when creating a graph from queue recording. -New methods are also defined that enable submitting an executable graph, -e.g. directly to a queue without returning an event. +The `queue::ext_oneapi_empty()` query defined by the +link:../supported/sycl_ext_oneapi_queue_empty.asciidoc[sycl_ext_oneapi_queue_empty] +extension behaves as normal during queue recording and is not captured to the graph. +Recorded commands are not counted as submitted for the purposes of this query. -==== sycl_ext_oneapi_free_function_kernels +==== sycl_ext_oneapi_queue_priority -`sycl_ext_oneapi_free_function_kernels`, defined in -link:../proposed/sycl_ext_oneapi_free_function_kernels.asciidoc[sycl_ext_oneapi_free_function_kernels] -can be used with SYCL Graphs. +The queue priority property defined by +link:../supported/sycl_ext_oneapi_queue_priority.asciidoc[sycl_ext_oneapi_queue_priority] +is ignored during queue recording. 
==== sycl_ext_oneapi_work_group_memory diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_group_load_store.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_group_load_store.asciidoc index b8ad67f332f12..4aa58a7df3eac 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_group_load_store.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_group_load_store.asciidoc @@ -99,6 +99,7 @@ in the group. * Value type of `InputIteratorT` must be convertible to `OutputT`. * Value type of `InputIteratorT` and `OutputT` must be trivially copyable and default constructible. +* `Properties` is an instance of `sycl::ext::oneapi::experimental::properties` _Effects_: Loads single element from `in_iter` to `out` by using the `g` group object to identify memory location as `in_iter` + `g.get_local_linear_id()`. @@ -129,6 +130,7 @@ in the group. * Value type of `InputIteratorT` must be convertible to `OutputT`. * Value type of `InputIteratorT` and `OutputT` must be trivially copyable and default constructible. +* `Properties` is an instance of `sycl::ext::oneapi::experimental::properties` _Effects_: Loads `N` elements from `in_iter` to `out` using the `g` group object. @@ -165,6 +167,7 @@ work-group or sub-group. * Value type of `InputIteratorT` must be convertible to `OutputT`. * Value type of `InputIteratorT` and `OutputT` must be trivially copyable and default constructible. +* `Properties` is an instance of `sycl::ext::oneapi::experimental::properties` _Effects_: Loads `ElementsPerWorkItem` elements from `in_iter` to `out` using the `g` group object. @@ -204,6 +207,7 @@ in the group. * `InputT` must be convertible to value type of `OutputIteratorT`. * `InputT` and value type of `OutputIteratorT` must be trivially copyable and default constructible. 
+* `Properties` is an instance of `sycl::ext::oneapi::experimental::properties` _Effects_: Stores single element `in` to `out_iter` by using the `g` group object to identify memory location as `out_iter` + `g.get_local_linear_id()` @@ -235,6 +239,7 @@ in the group. * `InputT` must be convertible to value type of `OutputIteratorT`. * `InputT` and value type of `OutputIteratorT` must be trivially copyable and default constructible. +* `Properties` is an instance of `sycl::ext::oneapi::experimental::properties` _Effects_: Stores `N` elements from `in` vec to `out_iter` using the `g` group object. @@ -273,6 +278,7 @@ work-group or sub-group. * `InputT` must be convertible to value type of `OutputIteratorT`. * `InputT` and value type of `OutputIteratorT` must be trivially copyable and default constructible. +* `Properties` is an instance of `sycl::ext::oneapi::experimental::properties` _Effects_: Stores `ElementsPerWorkItem` elements from `in` span to `out_iter` using the `g` group object. @@ -370,7 +376,7 @@ Specifies data layout used in group_load/store for `sycl::vec` or fixed-size arrays functions. Example: -`group_load(g, input, output_span, data_placement_blocked);` +`group_load(g, input, output_span, properties{data_placement_blocked});` === Optimization Properties @@ -398,7 +404,7 @@ inline constexpr contiguous_memory_key::value_t contiguous_memory; ---- For example, we can assert that `input` is a contiguous iterator: -`group_load(g, input, output_span, contiguous_memory);` +`group_load(g, input, output_span, properties{contiguous_memory});` If `input` isn't a contiguous iterator, the behavior is undefined. 
@@ -432,7 +438,7 @@ inline constexpr full_group_key::value_t full_group; For example, we can assert that there is no uneven group partition, so the implementation can rely on `get_max_local_range()` range size: -`group_load(sg, input, output_span, full_group);` +`group_load(sg, input, output_span, properties{full_group});` If partition is uneven the behavior is undefined. @@ -466,11 +472,13 @@ q.submit([&](sycl::handler& cgh) { auto offset = g.get_group_id(0) * g.get_local_range(0) * items_per_thread; - sycl_exp::group_load(g, input + offset, sycl::span{ data }, sycl_exp::contiguous_memory); + auto props = sycl_exp::properties{sycl_exp::contiguous_memory}; + + sycl_exp::group_load(g, input + offset, sycl::span{ data }, props); // Work with data... - sycl_exp::group_store(g, output + offset, sycl::span{ data }, sycl_exp::contiguous_memory); + sycl_exp::group_store(g, output + offset, sycl::span{ data }, props); }); }); ---- @@ -546,11 +554,13 @@ q.submit([&](sycl::handler& cgh) { sycl_exp::group_with_scratchpad gh{ g, sycl::span{ buf_ptr, temp_memory_size } }; - sycl_exp::group_load(gh, input + offset, sycl::span{ data }, sycl_exp::contiguous_memory); + auto props = sycl_exp::properties{sycl_exp::contiguous_memory}; + + sycl_exp::group_load(gh, input + offset, sycl::span{ data }, props); // Work with data... - sycl_exp::group_store(gh, output + offset, sycl::span{ data }, sycl_exp::contiguous_memory); + sycl_exp::group_store(gh, output + offset, sycl::span{ data }, props); }); }); ---- @@ -583,11 +593,13 @@ q.submit([&](sycl::handler& cgh) { sycl_exp::group_with_scratchpad gh{ g, sycl::span{ buf_ptr, temp_memory_size } }; - sycl_exp::group_load(gh, input + offset, sycl::span{ data }, sycl_exp::data_placement_striped); + auto striped = sycl_exp::properties{sycl_exp::data_placement_striped}; + + sycl_exp::group_load(gh, input + offset, sycl::span{ data }, striped); // Work with data... 
- sycl_exp::group_store(gh, output + offset, sycl::span{ data }, sycl_exp::data_placement_striped); + sycl_exp::group_store(gh, output + offset, sycl::span{ data }, striped); }); }); ---- diff --git a/sycl/include/sycl/backend.hpp b/sycl/include/sycl/backend.hpp index 7800032db33cc..35304d06428a3 100644 --- a/sycl/include/sycl/backend.hpp +++ b/sycl/include/sycl/backend.hpp @@ -339,9 +339,17 @@ make_queue(const typename backend_traits::template input_type const context &TargetContext, const async_handler Handler = {}) { auto KeepOwnership = Backend == backend::ext_oneapi_cuda || Backend == backend::ext_oneapi_hip; - return detail::make_queue(detail::ur::cast(BackendObject), - false, TargetContext, nullptr, KeepOwnership, {}, - Handler, Backend); + if constexpr (Backend == backend::ext_oneapi_level_zero) { + return detail::make_queue( + detail::ur::cast( + std::get(BackendObject.NativeHandle)), + false, TargetContext, nullptr, KeepOwnership, {}, Handler, Backend); + } + if constexpr (Backend != backend::ext_oneapi_level_zero) { + return detail::make_queue( + detail::ur::cast(BackendObject), false, + TargetContext, nullptr, KeepOwnership, {}, Handler, Backend); + } } template @@ -350,8 +358,16 @@ std::enable_if_t::MakeEvent == true, make_event(const typename backend_traits::template input_type &BackendObject, const context &TargetContext) { - return detail::make_event(detail::ur::cast(BackendObject), - TargetContext, Backend); + if constexpr (Backend == backend::ext_oneapi_level_zero) { + return detail::make_event( + detail::ur::cast(BackendObject.NativeHandle), + TargetContext, Backend); + } + if constexpr (Backend != backend::ext_oneapi_level_zero) { + return detail::make_event( + detail::ur::cast(BackendObject), TargetContext, + Backend); + } } template diff --git a/sycl/include/sycl/detail/named_swizzles_mixin.hpp b/sycl/include/sycl/detail/named_swizzles_mixin.hpp new file mode 100644 index 0000000000000..f35656b2e6381 --- /dev/null +++ 
b/sycl/include/sycl/detail/named_swizzles_mixin.hpp @@ -0,0 +1,824 @@ +//==---------------- named_swizzles_mixin.hpp ------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +// This file implements two mixins +// `NamedSwizzlesMixinConst`/`NamedSwizzlesMixinBoth` that abstract away named +// swizzles implementation for SYCL vector and swizzles classes. + +#include + +namespace sycl { +inline namespace _V1 { +namespace detail { + +#ifndef SYCL_SIMPLE_SWIZZLES +#define __SYCL_SWIZZLE_MIXIN_SIMPLE_SWIZZLES +#else +// TODO: It might be beneficial to use partial specializations for different Ns, +// instead of making all the named swizzles templates with SFINAE conditions. +#define __SYCL_SWIZZLE_MIXIN_SIMPLE_SWIZZLES \ + /* __swizzled_vec__ XYZW_SWIZZLE() const; */ \ + __SYCL_SWIZZLE_MIXIN_METHOD(N <= 4, xx, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, xy, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xz, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xw, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, yx, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, yy, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yz, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yw, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zx, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zy, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zz, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zw, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wx, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wy, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wz, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ww, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N <= 4, xxx, 0, 0, 0) \
+ __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, xxy, 0, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xxz, 0, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xxw, 0, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, xyx, 0, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, xyy, 0, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xyz, 0, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xyw, 0, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xzx, 0, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xzy, 0, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xzz, 0, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xzw, 0, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xwx, 0, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xwy, 0, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xwz, 0, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xww, 0, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, yxx, 1, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, yxy, 1, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yxz, 1, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yxw, 1, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, yyx, 1, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, yyy, 1, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yyz, 1, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yyw, 1, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yzx, 1, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yzy, 1, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yzz, 1, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yzw, 1, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ywx, 1, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ywy, 1, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ywz, 1, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yww, 1, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zxx, 2, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zxy, 2, 0, 1) \ + 
__SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zxz, 2, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zxw, 2, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zyx, 2, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zyy, 2, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zyz, 2, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zyw, 2, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zzx, 2, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zzy, 2, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zzz, 2, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zzw, 2, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zwx, 2, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zwy, 2, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zwz, 2, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zww, 2, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxx, 3, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxy, 3, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxz, 3, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxw, 3, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wyx, 3, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wyy, 3, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wyz, 3, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wyw, 3, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzx, 3, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzy, 3, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzz, 3, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzw, 3, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wwx, 3, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wwy, 3, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wwz, 3, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, www, 3, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N <= 4, xxxx, 0, 0, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, xxxy, 0, 0, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xxxz, 0, 0, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xxxw, 0, 0, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, xxyx, 0, 0, 
1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, xxyy, 0, 0, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xxyz, 0, 0, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xxyw, 0, 0, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xxzx, 0, 0, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xxzy, 0, 0, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xxzz, 0, 0, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xxzw, 0, 0, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xxwx, 0, 0, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xxwy, 0, 0, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xxwz, 0, 0, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xxww, 0, 0, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, xyxx, 0, 1, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, xyxy, 0, 1, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xyxz, 0, 1, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xyxw, 0, 1, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, xyyx, 0, 1, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, xyyy, 0, 1, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xyyz, 0, 1, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xyyw, 0, 1, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xyzx, 0, 1, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xyzy, 0, 1, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xyzz, 0, 1, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xyzw, 0, 1, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xywx, 0, 1, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xywy, 0, 1, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xywz, 0, 1, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xyww, 0, 1, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xzxx, 0, 2, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xzxy, 0, 2, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xzxz, 0, 2, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xzxw, 0, 2, 0, 3) \ + 
__SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xzyx, 0, 2, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xzyy, 0, 2, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xzyz, 0, 2, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xzyw, 0, 2, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xzzx, 0, 2, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xzzy, 0, 2, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, xzzz, 0, 2, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xzzw, 0, 2, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xzwx, 0, 2, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xzwy, 0, 2, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xzwz, 0, 2, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xzww, 0, 2, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xwxx, 0, 3, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xwxy, 0, 3, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xwxz, 0, 3, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xwxw, 0, 3, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xwyx, 0, 3, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xwyy, 0, 3, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xwyz, 0, 3, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xwyw, 0, 3, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xwzx, 0, 3, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xwzy, 0, 3, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xwzz, 0, 3, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xwzw, 0, 3, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xwwx, 0, 3, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xwwy, 0, 3, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xwwz, 0, 3, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, xwww, 0, 3, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, yxxx, 1, 0, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, yxxy, 1, 0, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yxxz, 1, 0, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yxxw, 1, 0, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, yxyx, 
1, 0, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, yxyy, 1, 0, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yxyz, 1, 0, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yxyw, 1, 0, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yxzx, 1, 0, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yxzy, 1, 0, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yxzz, 1, 0, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yxzw, 1, 0, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yxwx, 1, 0, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yxwy, 1, 0, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yxwz, 1, 0, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yxww, 1, 0, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, yyxx, 1, 1, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, yyxy, 1, 1, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yyxz, 1, 1, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yyxw, 1, 1, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, yyyx, 1, 1, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(2 <= N && N <= 4, yyyy, 1, 1, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yyyz, 1, 1, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yyyw, 1, 1, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yyzx, 1, 1, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yyzy, 1, 1, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yyzz, 1, 1, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yyzw, 1, 1, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yywx, 1, 1, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yywy, 1, 1, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yywz, 1, 1, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yyww, 1, 1, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yzxx, 1, 2, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yzxy, 1, 2, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yzxz, 1, 2, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yzxw, 1, 2, 0, 3) \ + 
__SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yzyx, 1, 2, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yzyy, 1, 2, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yzyz, 1, 2, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yzyw, 1, 2, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yzzx, 1, 2, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yzzy, 1, 2, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, yzzz, 1, 2, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yzzw, 1, 2, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yzwx, 1, 2, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yzwy, 1, 2, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yzwz, 1, 2, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, yzww, 1, 2, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ywxx, 1, 3, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ywxy, 1, 3, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ywxz, 1, 3, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ywxw, 1, 3, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ywyx, 1, 3, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ywyy, 1, 3, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ywyz, 1, 3, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ywyw, 1, 3, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ywzx, 1, 3, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ywzy, 1, 3, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ywzz, 1, 3, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ywzw, 1, 3, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ywwx, 1, 3, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ywwy, 1, 3, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ywwz, 1, 3, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ywww, 1, 3, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zxxx, 2, 0, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zxxy, 2, 0, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zxxz, 2, 0, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zxxw, 2, 0, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zxyx, 
2, 0, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zxyy, 2, 0, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zxyz, 2, 0, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zxyw, 2, 0, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zxzx, 2, 0, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zxzy, 2, 0, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zxzz, 2, 0, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zxzw, 2, 0, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zxwx, 2, 0, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zxwy, 2, 0, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zxwz, 2, 0, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zxww, 2, 0, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zyxx, 2, 1, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zyxy, 2, 1, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zyxz, 2, 1, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zyxw, 2, 1, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zyyx, 2, 1, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zyyy, 2, 1, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zyyz, 2, 1, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zyyw, 2, 1, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zyzx, 2, 1, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zyzy, 2, 1, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zyzz, 2, 1, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zyzw, 2, 1, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zywx, 2, 1, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zywy, 2, 1, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zywz, 2, 1, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zyww, 2, 1, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zzxx, 2, 2, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zzxy, 2, 2, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zzxz, 2, 2, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zzxw, 2, 2, 0, 3) \ + 
__SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zzyx, 2, 2, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zzyy, 2, 2, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zzyz, 2, 2, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zzyw, 2, 2, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zzzx, 2, 2, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zzzy, 2, 2, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3 || N == 4, zzzz, 2, 2, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zzzw, 2, 2, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zzwx, 2, 2, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zzwy, 2, 2, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zzwz, 2, 2, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zzww, 2, 2, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zwxx, 2, 3, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zwxy, 2, 3, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zwxz, 2, 3, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zwxw, 2, 3, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zwyx, 2, 3, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zwyy, 2, 3, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zwyz, 2, 3, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zwyw, 2, 3, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zwzx, 2, 3, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zwzy, 2, 3, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zwzz, 2, 3, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zwzw, 2, 3, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zwwx, 2, 3, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zwwy, 2, 3, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zwwz, 2, 3, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, zwww, 2, 3, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxxx, 3, 0, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxxy, 3, 0, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxxz, 3, 0, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxxw, 3, 0, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxyx, 3, 0, 1, 0) \ + 
__SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxyy, 3, 0, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxyz, 3, 0, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxyw, 3, 0, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxzx, 3, 0, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxzy, 3, 0, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxzz, 3, 0, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxzw, 3, 0, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxwx, 3, 0, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxwy, 3, 0, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxwz, 3, 0, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wxww, 3, 0, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wyxx, 3, 1, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wyxy, 3, 1, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wyxz, 3, 1, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wyxw, 3, 1, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wyyx, 3, 1, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wyyy, 3, 1, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wyyz, 3, 1, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wyyw, 3, 1, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wyzx, 3, 1, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wyzy, 3, 1, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wyzz, 3, 1, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wyzw, 3, 1, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wywx, 3, 1, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wywy, 3, 1, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wywz, 3, 1, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wyww, 3, 1, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzxx, 3, 2, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzxy, 3, 2, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzxz, 3, 2, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzxw, 3, 2, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzyx, 3, 2, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzyy, 3, 2, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzyz, 3, 2, 1, 2) \ + 
__SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzyw, 3, 2, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzzx, 3, 2, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzzy, 3, 2, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzzz, 3, 2, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzzw, 3, 2, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzwx, 3, 2, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzwy, 3, 2, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzwz, 3, 2, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wzww, 3, 2, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wwxx, 3, 3, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wwxy, 3, 3, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wwxz, 3, 3, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wwxw, 3, 3, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wwyx, 3, 3, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wwyy, 3, 3, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wwyz, 3, 3, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wwyw, 3, 3, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wwzx, 3, 3, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wwzy, 3, 3, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wwzz, 3, 3, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wwzw, 3, 3, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wwwx, 3, 3, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wwwy, 3, 3, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wwwz, 3, 3, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, wwww, 3, 3, 3, 3) \ + \ + /* __swizzled_vec__ RGBA_SWIZZLE() const; */ \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rr, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rg, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rb, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ra, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gr, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gg, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gb, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ga, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, br, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bg, 2, 1) 
\ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bb, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ba, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ar, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ag, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ab, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aa, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rrr, 0, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rrg, 0, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rrb, 0, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rra, 0, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rgr, 0, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rgg, 0, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rgb, 0, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rga, 0, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rbr, 0, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rbg, 0, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rbb, 0, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rba, 0, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rar, 0, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rag, 0, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rab, 0, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, raa, 0, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, grr, 1, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, grg, 1, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, grb, 1, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gra, 1, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ggr, 1, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ggg, 1, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ggb, 1, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gga, 1, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gbr, 1, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gbg, 1, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gbb, 1, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gba, 1, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gar, 1, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gag, 1, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gab, 1, 3, 2) \ + 
__SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gaa, 1, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, brr, 2, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, brg, 2, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, brb, 2, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bra, 2, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bgr, 2, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bgg, 2, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bgb, 2, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bga, 2, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bbr, 2, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bbg, 2, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bbb, 2, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bba, 2, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bar, 2, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bag, 2, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bab, 2, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, baa, 2, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, arr, 3, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, arg, 3, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, arb, 3, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ara, 3, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, agr, 3, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, agg, 3, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, agb, 3, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aga, 3, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, abr, 3, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, abg, 3, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, abb, 3, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aba, 3, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aar, 3, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aag, 3, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aab, 3, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aaa, 3, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rrrr, 0, 0, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rrrg, 0, 0, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rrrb, 0, 0, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rrra, 
0, 0, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rrgr, 0, 0, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rrgg, 0, 0, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rrgb, 0, 0, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rrga, 0, 0, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rrbr, 0, 0, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rrbg, 0, 0, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rrbb, 0, 0, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rrba, 0, 0, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rrar, 0, 0, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rrag, 0, 0, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rrab, 0, 0, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rraa, 0, 0, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rgrr, 0, 1, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rgrg, 0, 1, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rgrb, 0, 1, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rgra, 0, 1, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rggr, 0, 1, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rggg, 0, 1, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rggb, 0, 1, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rgga, 0, 1, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rgbr, 0, 1, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rgbg, 0, 1, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rgbb, 0, 1, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rgba, 0, 1, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rgar, 0, 1, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rgag, 0, 1, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rgab, 0, 1, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rgaa, 0, 1, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rbrr, 0, 2, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rbrg, 0, 2, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rbrb, 0, 2, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rbra, 0, 2, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rbgr, 0, 2, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rbgg, 0, 2, 1, 1) \ + 
__SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rbgb, 0, 2, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rbga, 0, 2, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rbbr, 0, 2, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rbbg, 0, 2, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rbbb, 0, 2, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rbba, 0, 2, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rbar, 0, 2, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rbag, 0, 2, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rbab, 0, 2, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rbaa, 0, 2, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rarr, 0, 3, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rarg, 0, 3, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rarb, 0, 3, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rara, 0, 3, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ragr, 0, 3, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ragg, 0, 3, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ragb, 0, 3, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, raga, 0, 3, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rabr, 0, 3, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rabg, 0, 3, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, rabb, 0, 3, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, raba, 0, 3, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, raar, 0, 3, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, raag, 0, 3, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, raab, 0, 3, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, raaa, 0, 3, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, grrr, 1, 0, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, grrg, 1, 0, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, grrb, 1, 0, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, grra, 1, 0, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, grgr, 1, 0, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, grgg, 1, 0, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, grgb, 1, 0, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, grga, 1, 0, 1, 3) \ + 
__SYCL_SWIZZLE_MIXIN_METHOD(N == 4, grbr, 1, 0, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, grbg, 1, 0, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, grbb, 1, 0, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, grba, 1, 0, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, grar, 1, 0, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, grag, 1, 0, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, grab, 1, 0, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, graa, 1, 0, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ggrr, 1, 1, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ggrg, 1, 1, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ggrb, 1, 1, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ggra, 1, 1, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gggr, 1, 1, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gggg, 1, 1, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gggb, 1, 1, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ggga, 1, 1, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ggbr, 1, 1, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ggbg, 1, 1, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ggbb, 1, 1, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ggba, 1, 1, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ggar, 1, 1, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ggag, 1, 1, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ggab, 1, 1, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, ggaa, 1, 1, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gbrr, 1, 2, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gbrg, 1, 2, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gbrb, 1, 2, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gbra, 1, 2, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gbgr, 1, 2, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gbgg, 1, 2, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gbgb, 1, 2, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gbga, 1, 2, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gbbr, 1, 2, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gbbg, 1, 2, 2, 1) \ + 
__SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gbbb, 1, 2, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gbba, 1, 2, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gbar, 1, 2, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gbag, 1, 2, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gbab, 1, 2, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gbaa, 1, 2, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, garr, 1, 3, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, garg, 1, 3, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, garb, 1, 3, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gara, 1, 3, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gagr, 1, 3, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gagg, 1, 3, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gagb, 1, 3, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gaga, 1, 3, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gabr, 1, 3, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gabg, 1, 3, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gabb, 1, 3, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gaba, 1, 3, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gaar, 1, 3, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gaag, 1, 3, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gaab, 1, 3, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, gaaa, 1, 3, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, brrr, 2, 0, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, brrg, 2, 0, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, brrb, 2, 0, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, brra, 2, 0, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, brgr, 2, 0, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, brgg, 2, 0, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, brgb, 2, 0, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, brga, 2, 0, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, brbr, 2, 0, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, brbg, 2, 0, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, brbb, 2, 0, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, brba, 2, 0, 2, 3) \ + 
__SYCL_SWIZZLE_MIXIN_METHOD(N == 4, brar, 2, 0, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, brag, 2, 0, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, brab, 2, 0, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, braa, 2, 0, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bgrr, 2, 1, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bgrg, 2, 1, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bgrb, 2, 1, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bgra, 2, 1, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bggr, 2, 1, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bggg, 2, 1, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bggb, 2, 1, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bgga, 2, 1, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bgbr, 2, 1, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bgbg, 2, 1, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bgbb, 2, 1, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bgba, 2, 1, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bgar, 2, 1, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bgag, 2, 1, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bgab, 2, 1, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bgaa, 2, 1, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bbrr, 2, 2, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bbrg, 2, 2, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bbrb, 2, 2, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bbra, 2, 2, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bbgr, 2, 2, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bbgg, 2, 2, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bbgb, 2, 2, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bbga, 2, 2, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bbbr, 2, 2, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bbbg, 2, 2, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bbbb, 2, 2, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bbba, 2, 2, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bbar, 2, 2, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bbag, 2, 2, 3, 1) \ + 
__SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bbab, 2, 2, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bbaa, 2, 2, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, barr, 2, 3, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, barg, 2, 3, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, barb, 2, 3, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bara, 2, 3, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bagr, 2, 3, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bagg, 2, 3, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, bagb, 2, 3, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, baga, 2, 3, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, babr, 2, 3, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, babg, 2, 3, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, babb, 2, 3, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, baba, 2, 3, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, baar, 2, 3, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, baag, 2, 3, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, baab, 2, 3, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, baaa, 2, 3, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, arrr, 3, 0, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, arrg, 3, 0, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, arrb, 3, 0, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, arra, 3, 0, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, argr, 3, 0, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, argg, 3, 0, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, argb, 3, 0, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, arga, 3, 0, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, arbr, 3, 0, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, arbg, 3, 0, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, arbb, 3, 0, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, arba, 3, 0, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, arar, 3, 0, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, arag, 3, 0, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, arab, 3, 0, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, araa, 3, 0, 3, 3) \ + 
__SYCL_SWIZZLE_MIXIN_METHOD(N == 4, agrr, 3, 1, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, agrg, 3, 1, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, agrb, 3, 1, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, agra, 3, 1, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aggr, 3, 1, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aggg, 3, 1, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aggb, 3, 1, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, agga, 3, 1, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, agbr, 3, 1, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, agbg, 3, 1, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, agbb, 3, 1, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, agba, 3, 1, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, agar, 3, 1, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, agag, 3, 1, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, agab, 3, 1, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, agaa, 3, 1, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, abrr, 3, 2, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, abrg, 3, 2, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, abrb, 3, 2, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, abra, 3, 2, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, abgr, 3, 2, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, abgg, 3, 2, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, abgb, 3, 2, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, abga, 3, 2, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, abbr, 3, 2, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, abbg, 3, 2, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, abbb, 3, 2, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, abba, 3, 2, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, abar, 3, 2, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, abag, 3, 2, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, abab, 3, 2, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, abaa, 3, 2, 3, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aarr, 3, 3, 0, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aarg, 3, 3, 0, 1) \ + 
__SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aarb, 3, 3, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aara, 3, 3, 0, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aagr, 3, 3, 1, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aagg, 3, 3, 1, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aagb, 3, 3, 1, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aaga, 3, 3, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aabr, 3, 3, 2, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aabg, 3, 3, 2, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aabb, 3, 3, 2, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aaba, 3, 3, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aaar, 3, 3, 3, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aaag, 3, 3, 3, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aaab, 3, 3, 3, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, aaaa, 3, 3, 3, 3) +#endif + +#define __SYCL_SWIZZLE_MIXIN_ALL_SWIZZLES \ + /* __swizzled_vec__ XYZW_ACCESS() const; */ \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N <= 4, x, 0) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N == 2 || N == 3 || N == 4, y, 1) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N == 3 || N == 4, z, 2) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N == 4, w, 3) \ + \ + /* __swizzled_vec__ RGBA_ACCESS() const; */ \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N == 4, r, 0) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N == 4, g, 1) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N == 4, b, 2) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N == 4, a, 3) \ + \ + /* __swizzled_vec__ INDEX_ACCESS() const; */ \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N > 0, s0, 0) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N > 1, s1, 1) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N > 2, s2, 2) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N > 2, s3, 3) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N > 4, s4, 4) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N > 4, s5, 5) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N > 4, s6, 6) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N > 4, s7, 7) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N == 16, s8, 8) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N == 16, s9, 
9) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N == 16, sA, 10) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N == 16, sB, 11) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N == 16, sC, 12) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N == 16, sD, 13) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N == 16, sE, 14) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(N == 16, sF, 15) \ + \ + /* __swizzled_vec__ lo()/hi() const; */ \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 2, lo, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3, lo, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, lo, 0, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 8, lo, 0, 1, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 16, lo, 0, 1, 2, 3, 4, 5, 6, 7) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 2, hi, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3, hi, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, hi, 2, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 8, hi, 4, 5, 6, 7) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 16, hi, 8, 9, 10, 11, 12, 13, 14, 15) \ + /* __swizzled_vec__ odd()/even() const; */ \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 2, odd, 1) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3, odd, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, odd, 1, 3) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 8, odd, 1, 3, 5, 7) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 16, odd, 1, 3, 5, 7, 9, 11, 13, 15) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 2, even, 0) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 3, even, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 4, even, 0, 2) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 8, even, 0, 2, 4, 6) \ + __SYCL_SWIZZLE_MIXIN_METHOD(N == 16, even, 0, 2, 4, 6, 8, 10, 12, 14) \ + /* SYCL_SIMPLE_SWIZZLES */ \ + __SYCL_SWIZZLE_MIXIN_SIMPLE_SWIZZLES + +#define __SYCL_SWIZZLE_MIXIN_METHOD_NON_CONST(COND, NAME, ...) \ + template \ + std::enable_if_t< \ + (COND), decltype(std::declval().template swizzle<__VA_ARGS__>())> \ + NAME() { \ + return static_cast(this)->template swizzle<__VA_ARGS__>(); \ + } + +#define __SYCL_SWIZZLE_MIXIN_METHOD_CONST(COND, NAME, ...) 
\ + template \ + std::enable_if_t<(COND), decltype(std::declval() \ + .template swizzle<__VA_ARGS__>())> \ + NAME() const { \ + return static_cast(this)->template swizzle<__VA_ARGS__>(); \ + } + +#define __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS_NON_CONST(COND, NAME, INDEX) \ + template \ + std::enable_if_t<(COND), decltype(std::declval()[0])> NAME() { \ + return (*static_cast(this))[INDEX]; \ + } +#define __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS_CONST(COND, NAME, INDEX) \ + template \ + std::enable_if_t<(COND), decltype(std::declval()[0])> NAME() \ + const { \ + return (*static_cast(this))[INDEX]; \ + } + +template struct NamedSwizzlesMixinConst { +#define __SYCL_SWIZZLE_MIXIN_METHOD(COND, NAME, ...) \ + __SYCL_SWIZZLE_MIXIN_METHOD_CONST(COND, NAME, __VA_ARGS__) + +#define __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(COND, NAME, INDEX) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS_CONST(COND, NAME, INDEX) + + __SYCL_SWIZZLE_MIXIN_ALL_SWIZZLES + +#undef __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS +#undef __SYCL_SWIZZLE_MIXIN_METHOD +}; + +template struct NamedSwizzlesMixinBoth { +#define __SYCL_SWIZZLE_MIXIN_METHOD(COND, NAME, ...) 
\ + __SYCL_SWIZZLE_MIXIN_METHOD_NON_CONST(COND, NAME, __VA_ARGS__) \ + __SYCL_SWIZZLE_MIXIN_METHOD_CONST(COND, NAME, __VA_ARGS__) + +#define __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS(COND, NAME, INDEX) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS_NON_CONST(COND, NAME, INDEX) \ + __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS_CONST(COND, NAME, INDEX) + + __SYCL_SWIZZLE_MIXIN_ALL_SWIZZLES + +#undef __SYCL_SWIZLLE_MIXIN_SCALAR_ACCESS +#undef __SYCL_SWIZZLE_MIXIN_METHOD +}; + +#undef __SYCL_SWIZZLE_MIXIN_METHOD_CONST +#undef __SYCL_SWIZZLE_MIXIN_METHOD_NON_CONST + +#undef __SYCL_SWIZZLE_MIXIN_ALL_SWIZZLES +#undef __SYCL_SWIZZLE_MIXIN_SIMPLE_SWIZZLES + +} // namespace detail +} // namespace _V1 +} // namespace sycl diff --git a/sycl/include/sycl/detail/os_util.hpp b/sycl/include/sycl/detail/os_util.hpp index c77cdb4913c36..a0c3a8483373e 100644 --- a/sycl/include/sycl/detail/os_util.hpp +++ b/sycl/include/sycl/detail/os_util.hpp @@ -12,7 +12,8 @@ #include // for __SYCL_EXPORT -#include // for size_t +#include // for size_t +#include #include // for string #include // for stat @@ -90,6 +91,20 @@ class __SYCL_EXPORT OSUtil { } }; +// These functions are not a part of OSUtils class to prevent +// exporting them as ABI. They are only used in persistent cache +// implementation and should not be exposed to the end users. +// Get size of directory in bytes. +size_t getDirectorySize(const std::string &Path); + +// Get size of file in bytes. +size_t getFileSize(const std::string &Path); + +// Function to recursively iterate over the directory and execute +// 'Func' on each regular file. 
+void fileTreeWalk(const std::string Path, + std::function Func); + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/ext/intel/fpga_lsu.hpp b/sycl/include/sycl/ext/intel/fpga_lsu.hpp index 9f61edc4aa054..97d22acf28661 100644 --- a/sycl/include/sycl/ext/intel/fpga_lsu.hpp +++ b/sycl/include/sycl/ext/intel/fpga_lsu.hpp @@ -8,6 +8,7 @@ #pragma once #include "fpga_utils.hpp" +#include #include #include diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index bed78046140bf..8b399848d4b2f 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -340,7 +340,13 @@ class __SYCL_EXPORT modifiable_command_graph /// @param path The path to write the DOT file to. /// @param verbose If true, print additional information about the nodes such /// as kernel args or memory access where applicable. +#ifdef ___INTEL_PREVIEW_BREAKING_CHANGES + void print_graph(const std::string path, bool verbose = false) const { + print_graph(sycl::detail::string_view{path}, verbose); + } +#else void print_graph(const std::string path, bool verbose = false) const; +#endif /// Get a list of all nodes contained in this graph. std::vector get_nodes() const; @@ -387,6 +393,7 @@ class __SYCL_EXPORT modifiable_command_graph std::shared_ptr impl; }; +#ifndef ___INTEL_PREVIEW_BREAKING_CHANGES #ifdef __SYCL_GRAPH_IMPL_CPP // Magic combination found by trial and error: __SYCL_EXPORT @@ -401,6 +408,7 @@ inline bool verbose) const { print_graph(sycl::detail::string_view{path}, verbose); } +#endif // Templateless executable command-graph base class. 
class __SYCL_EXPORT executable_command_graph diff --git a/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp b/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp index a4af373753c10..7a051d7648640 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp @@ -233,7 +233,8 @@ template std::enable_if_t && - detail::is_generic_group_v> + detail::is_generic_group_v && + is_property_list_v> group_load(Group g, InputIteratorT in_ptr, span out, Properties props = {}) { constexpr bool blocked = detail::isBlocked(props); @@ -305,7 +306,8 @@ template std::enable_if_t && - detail::is_generic_group_v> + detail::is_generic_group_v && + is_property_list_v> group_store(Group g, const span in, OutputIteratorT out_ptr, Properties props = {}) { constexpr bool blocked = detail::isBlocked(props); @@ -352,7 +354,8 @@ group_store(Group g, const span in, template std::enable_if_t && - detail::is_generic_group_v> + detail::is_generic_group_v && + is_property_list_v> group_load(Group g, InputIteratorT in_ptr, OutputT &out, Properties properties = {}) { group_load(g, in_ptr, span(&out, 1), properties); @@ -362,7 +365,8 @@ group_load(Group g, InputIteratorT in_ptr, OutputT &out, template std::enable_if_t && - detail::is_generic_group_v> + detail::is_generic_group_v && + is_property_list_v> group_store(Group g, const InputT &in, OutputIteratorT out_ptr, Properties properties = {}) { group_store(g, span(&in, 1), out_ptr, properties); @@ -372,7 +376,8 @@ group_store(Group g, const InputT &in, OutputIteratorT out_ptr, template std::enable_if_t && - detail::is_generic_group_v> + detail::is_generic_group_v && + is_property_list_v> group_load(Group g, InputIteratorT in_ptr, sycl::vec &out, Properties properties = {}) { group_load(g, in_ptr, span(&out[0], N), properties); @@ -382,7 +387,8 @@ group_load(Group g, InputIteratorT in_ptr, sycl::vec &out, template std::enable_if_t && - 
detail::is_generic_group_v> + detail::is_generic_group_v && + is_property_list_v> group_store(Group g, const sycl::vec &in, OutputIteratorT out_ptr, Properties properties = {}) { group_store(g, span(&in[0], N), out_ptr, properties); diff --git a/sycl/include/sycl/swizzles.def b/sycl/include/sycl/swizzles.def deleted file mode 100644 index fd503e445d7fe..0000000000000 --- a/sycl/include/sycl/swizzles.def +++ /dev/null @@ -1,823 +0,0 @@ -//==---------------- swizzles.def --- SYCL types ---------------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// Included by types.hpp twice, once for vec<> and once for SwizzleOp<>. - -// TODO: exclude L-Value swizzle like vec.xxxx() - -#if defined(__SYCL_ACCESS) || defined(__SYCL_INDEXER) || \ - defined(__SYCL_EXPAND) || defined(__SYCL_NTH_ARG) || \ - defined(__SYCL_E0) || defined(__SYCL_E1) || defined(__SYCL_E2) || \ - defined(__SYCL_E3) || defined(__SYCL_E4) || defined(__SYCL_E5) || \ - defined(__SYCL_E6) || defined(__SYCL_E7) || defined(__SYCL_E8) -#error "Undefine __SYCL_{ACCESS, INDEXER, EXPAND, NTH_ARG, E[0-8]} macros." -#endif - -#define __SYCL_INDEXER(_X) Indexer<_X>::value - -// Accepts any number of args >= _N, but expands to just the _N-th one. -// Make N equal to the max number of args handled + 1. Here, N == 9. -#define __SYCL_NTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _N, ...) _N - -// Some compilers for such defines as '#define M1(_A, ...) M2(__VA_ARGS__)' and -// expression M1(1, 2, 3, 4) may pass (2, 3, 4) to M2 as one argument, -// not as three. To avoid that the following macro is needed. -// So, defining M1 as '#define M1(_A, ...) __SYCL_EXPAND( M2(__VA_ARGS__) )' -// solves the problem. 
-#define __SYCL_EXPAND(_X) _X - -// These are helper macros to create overrides based on the arity. -#define __SYCL_E0(_M, ...) -#define __SYCL_E1(_M, _X) _M(_X) -#define __SYCL_E2(_M, _X, ...) _M(_X), __SYCL_EXPAND(__SYCL_E1(_M, __VA_ARGS__)) -#define __SYCL_E3(_M, _X, ...) _M(_X), __SYCL_EXPAND(__SYCL_E2(_M, __VA_ARGS__)) -#define __SYCL_E4(_M, _X, ...) _M(_X), __SYCL_EXPAND(__SYCL_E3(_M, __VA_ARGS__)) -#define __SYCL_E5(_M, _X, ...) _M(_X), __SYCL_EXPAND(__SYCL_E4(_M, __VA_ARGS__)) -#define __SYCL_E6(_M, _X, ...) _M(_X), __SYCL_EXPAND(__SYCL_E5(_M, __VA_ARGS__)) -#define __SYCL_E7(_M, _X, ...) _M(_X), __SYCL_EXPAND(__SYCL_E6(_M, __VA_ARGS__)) -#define __SYCL_E8(_M, _X, ...) _M(_X), __SYCL_EXPAND(__SYCL_E7(_M, __VA_ARGS__)) - -// Creates template functions with required number of parameters. -// Each of the parameters X from __VA_ARGS__ is transformed into -// __SYCL_INDEXER(X). -// Currently the max number of parameters handled by this macro is 8. -#define __SYCL_ACCESS(_COND, _NAME, ...) 
\ - template \ - typename std::enable_if<(_COND), Swizzle< \ - __SYCL_EXPAND(__SYCL_NTH_ARG(__VA_ARGS__, \ - __SYCL_E8, __SYCL_E7, __SYCL_E6, __SYCL_E5, __SYCL_E4, \ - __SYCL_E3, __SYCL_E2, __SYCL_E1, __SYCL_E0) \ - (__SYCL_INDEXER, __VA_ARGS__))>>::type _NAME() { \ - return __SYCL_ACCESS_RETURN; \ - } \ - template \ - typename std::enable_if<(_COND), ConstSwizzle< \ - __SYCL_EXPAND(__SYCL_NTH_ARG(__VA_ARGS__, \ - __SYCL_E8, __SYCL_E7, __SYCL_E6, __SYCL_E5, __SYCL_E4, \ - __SYCL_E3, __SYCL_E2, __SYCL_E1, __SYCL_E0) \ - (__SYCL_INDEXER, __VA_ARGS__))>>::type _NAME() const { \ - return __SYCL_ACCESS_RETURN; \ - } - -#define __SYCL_SCALAR_ACCESS(_COND, _NAME, _INDEX) \ - template \ - typename std::enable_if<(_COND), DataT &>::type _NAME() { \ - return (*__SYCL_ACCESS_RETURN)[_INDEX]; \ - } \ - template \ - typename std::enable_if<(_COND), const DataT &>::type _NAME() const { \ - return (*__SYCL_ACCESS_RETURN)[_INDEX]; \ - } - -//__swizzled_vec__ XYZW_ACCESS() const; -__SYCL_SCALAR_ACCESS(N <= 4, x, 0) -__SYCL_SCALAR_ACCESS(N == 2 || N == 3 || N == 4, y, 1) -__SYCL_SCALAR_ACCESS(N == 3 || N == 4, z, 2) -__SYCL_SCALAR_ACCESS(N == 4, w, 3) - -//__swizzled_vec__ RGBA_ACCESS() const; -__SYCL_SCALAR_ACCESS(N == 4, r, 0) -__SYCL_SCALAR_ACCESS(N == 4, g, 1) -__SYCL_SCALAR_ACCESS(N == 4, b, 2) -__SYCL_SCALAR_ACCESS(N == 4, a, 3) - -//__swizzled_vec__ INDEX_ACCESS() const; -__SYCL_SCALAR_ACCESS(N > 0, s0, 0) -__SYCL_SCALAR_ACCESS(N > 1, s1, 1) -__SYCL_SCALAR_ACCESS(N > 2, s2, 2) -__SYCL_SCALAR_ACCESS(N > 2, s3, 3) -__SYCL_SCALAR_ACCESS(N > 4, s4, 4) -__SYCL_SCALAR_ACCESS(N > 4, s5, 5) -__SYCL_SCALAR_ACCESS(N > 4, s6, 6) -__SYCL_SCALAR_ACCESS(N > 4, s7, 7) -__SYCL_SCALAR_ACCESS(N == 16, s8, 8) -__SYCL_SCALAR_ACCESS(N == 16, s9, 9) -__SYCL_SCALAR_ACCESS(N == 16, sA, 10) -__SYCL_SCALAR_ACCESS(N == 16, sB, 11) -__SYCL_SCALAR_ACCESS(N == 16, sC, 12) -__SYCL_SCALAR_ACCESS(N == 16, sD, 13) -__SYCL_SCALAR_ACCESS(N == 16, sE, 14) -__SYCL_SCALAR_ACCESS(N == 16, sF, 15) - -#ifdef 
SYCL_SIMPLE_SWIZZLES -//__swizzled_vec__ XYZW_SWIZZLE() const; -__SYCL_ACCESS(N <= 4, xx, 0, 0) -__SYCL_ACCESS(2 <= N && N <= 4, xy, 0, 1) -__SYCL_ACCESS(N == 3 || N == 4, xz, 0, 2) -__SYCL_ACCESS(N == 4, xw, 0, 3) -__SYCL_ACCESS(2 <= N && N <= 4, yx, 1, 0) -__SYCL_ACCESS(2 <= N && N <= 4, yy, 1, 1) -__SYCL_ACCESS(N == 3 || N == 4, yz, 1, 2) -__SYCL_ACCESS(N == 4, yw, 1, 3) -__SYCL_ACCESS(N == 3 || N == 4, zx, 2, 0) -__SYCL_ACCESS(N == 3 || N == 4, zy, 2, 1) -__SYCL_ACCESS(N == 3 || N == 4, zz, 2, 2) -__SYCL_ACCESS(N == 4, zw, 2, 3) -__SYCL_ACCESS(N == 4, wx, 3, 0) -__SYCL_ACCESS(N == 4, wy, 3, 1) -__SYCL_ACCESS(N == 4, wz, 3, 2) -__SYCL_ACCESS(N == 4, ww, 3, 3) -__SYCL_ACCESS(N <= 4, xxx, 0, 0, 0) -__SYCL_ACCESS(2 <= N && N <= 4, xxy, 0, 0, 1) -__SYCL_ACCESS(N == 3 || N == 4, xxz, 0, 0, 2) -__SYCL_ACCESS(N == 4, xxw, 0, 0, 3) -__SYCL_ACCESS(2 <= N && N <= 4, xyx, 0, 1, 0) -__SYCL_ACCESS(2 <= N && N <= 4, xyy, 0, 1, 1) -__SYCL_ACCESS(N == 3 || N == 4, xyz, 0, 1, 2) -__SYCL_ACCESS(N == 4, xyw, 0, 1, 3) -__SYCL_ACCESS(N == 3 || N == 4, xzx, 0, 2, 0) -__SYCL_ACCESS(N == 3 || N == 4, xzy, 0, 2, 1) -__SYCL_ACCESS(N == 3 || N == 4, xzz, 0, 2, 2) -__SYCL_ACCESS(N == 4, xzw, 0, 2, 3) -__SYCL_ACCESS(N == 4, xwx, 0, 3, 0) -__SYCL_ACCESS(N == 4, xwy, 0, 3, 1) -__SYCL_ACCESS(N == 4, xwz, 0, 3, 2) -__SYCL_ACCESS(N == 4, xww, 0, 3, 3) -__SYCL_ACCESS(2 <= N && N <= 4, yxx, 1, 0, 0) -__SYCL_ACCESS(2 <= N && N <= 4, yxy, 1, 0, 1) -__SYCL_ACCESS(N == 3 || N == 4, yxz, 1, 0, 2) -__SYCL_ACCESS(N == 4, yxw, 1, 0, 3) -__SYCL_ACCESS(2 <= N && N <= 4, yyx, 1, 1, 0) -__SYCL_ACCESS(2 <= N && N <= 4, yyy, 1, 1, 1) -__SYCL_ACCESS(N == 3 || N == 4, yyz, 1, 1, 2) -__SYCL_ACCESS(N == 4, yyw, 1, 1, 3) -__SYCL_ACCESS(N == 3 || N == 4, yzx, 1, 2, 0) -__SYCL_ACCESS(N == 3 || N == 4, yzy, 1, 2, 1) -__SYCL_ACCESS(N == 3 || N == 4, yzz, 1, 2, 2) -__SYCL_ACCESS(N == 4, yzw, 1, 2, 3) -__SYCL_ACCESS(N == 4, ywx, 1, 3, 0) -__SYCL_ACCESS(N == 4, ywy, 1, 3, 1) -__SYCL_ACCESS(N == 4, ywz, 1, 3, 2) 
-__SYCL_ACCESS(N == 4, yww, 1, 3, 3) -__SYCL_ACCESS(N == 3 || N == 4, zxx, 2, 0, 0) -__SYCL_ACCESS(N == 3 || N == 4, zxy, 2, 0, 1) -__SYCL_ACCESS(N == 3 || N == 4, zxz, 2, 0, 2) -__SYCL_ACCESS(N == 4, zxw, 2, 0, 3) -__SYCL_ACCESS(N == 3 || N == 4, zyx, 2, 1, 0) -__SYCL_ACCESS(N == 3 || N == 4, zyy, 2, 1, 1) -__SYCL_ACCESS(N == 3 || N == 4, zyz, 2, 1, 2) -__SYCL_ACCESS(N == 4, zyw, 2, 1, 3) -__SYCL_ACCESS(N == 3 || N == 4, zzx, 2, 2, 0) -__SYCL_ACCESS(N == 3 || N == 4, zzy, 2, 2, 1) -__SYCL_ACCESS(N == 3 || N == 4, zzz, 2, 2, 2) -__SYCL_ACCESS(N == 4, zzw, 2, 2, 3) -__SYCL_ACCESS(N == 4, zwx, 2, 3, 0) -__SYCL_ACCESS(N == 4, zwy, 2, 3, 1) -__SYCL_ACCESS(N == 4, zwz, 2, 3, 2) -__SYCL_ACCESS(N == 4, zww, 2, 3, 3) -__SYCL_ACCESS(N == 4, wxx, 3, 0, 0) -__SYCL_ACCESS(N == 4, wxy, 3, 0, 1) -__SYCL_ACCESS(N == 4, wxz, 3, 0, 2) -__SYCL_ACCESS(N == 4, wxw, 3, 0, 3) -__SYCL_ACCESS(N == 4, wyx, 3, 1, 0) -__SYCL_ACCESS(N == 4, wyy, 3, 1, 1) -__SYCL_ACCESS(N == 4, wyz, 3, 1, 2) -__SYCL_ACCESS(N == 4, wyw, 3, 1, 3) -__SYCL_ACCESS(N == 4, wzx, 3, 2, 0) -__SYCL_ACCESS(N == 4, wzy, 3, 2, 1) -__SYCL_ACCESS(N == 4, wzz, 3, 2, 2) -__SYCL_ACCESS(N == 4, wzw, 3, 2, 3) -__SYCL_ACCESS(N == 4, wwx, 3, 3, 0) -__SYCL_ACCESS(N == 4, wwy, 3, 3, 1) -__SYCL_ACCESS(N == 4, wwz, 3, 3, 2) -__SYCL_ACCESS(N == 4, www, 3, 3, 3) -__SYCL_ACCESS(N <= 4, xxxx, 0, 0, 0, 0) -__SYCL_ACCESS(2 <= N && N <= 4, xxxy, 0, 0, 0, 1) -__SYCL_ACCESS(N == 3 || N == 4, xxxz, 0, 0, 0, 2) -__SYCL_ACCESS(N == 4, xxxw, 0, 0, 0, 3) -__SYCL_ACCESS(2 <= N && N <= 4, xxyx, 0, 0, 1, 0) -__SYCL_ACCESS(2 <= N && N <= 4, xxyy, 0, 0, 1, 1) -__SYCL_ACCESS(N == 3 || N == 4, xxyz, 0, 0, 1, 2) -__SYCL_ACCESS(N == 4, xxyw, 0, 0, 1, 3) -__SYCL_ACCESS(N == 3 || N == 4, xxzx, 0, 0, 2, 0) -__SYCL_ACCESS(N == 3 || N == 4, xxzy, 0, 0, 2, 1) -__SYCL_ACCESS(N == 3 || N == 4, xxzz, 0, 0, 2, 2) -__SYCL_ACCESS(N == 4, xxzw, 0, 0, 2, 3) -__SYCL_ACCESS(N == 4, xxwx, 0, 0, 3, 0) -__SYCL_ACCESS(N == 4, xxwy, 0, 0, 3, 1) -__SYCL_ACCESS(N == 4, xxwz, 0, 0, 
3, 2) -__SYCL_ACCESS(N == 4, xxww, 0, 0, 3, 3) -__SYCL_ACCESS(2 <= N && N <= 4, xyxx, 0, 1, 0, 0) -__SYCL_ACCESS(2 <= N && N <= 4, xyxy, 0, 1, 0, 1) -__SYCL_ACCESS(N == 3 || N == 4, xyxz, 0, 1, 0, 2) -__SYCL_ACCESS(N == 4, xyxw, 0, 1, 0, 3) -__SYCL_ACCESS(2 <= N && N <= 4, xyyx, 0, 1, 1, 0) -__SYCL_ACCESS(2 <= N && N <= 4, xyyy, 0, 1, 1, 1) -__SYCL_ACCESS(N == 3 || N == 4, xyyz, 0, 1, 1, 2) -__SYCL_ACCESS(N == 4, xyyw, 0, 1, 1, 3) -__SYCL_ACCESS(N == 3 || N == 4, xyzx, 0, 1, 2, 0) -__SYCL_ACCESS(N == 3 || N == 4, xyzy, 0, 1, 2, 1) -__SYCL_ACCESS(N == 3 || N == 4, xyzz, 0, 1, 2, 2) -__SYCL_ACCESS(N == 4, xyzw, 0, 1, 2, 3) -__SYCL_ACCESS(N == 4, xywx, 0, 1, 3, 0) -__SYCL_ACCESS(N == 4, xywy, 0, 1, 3, 1) -__SYCL_ACCESS(N == 4, xywz, 0, 1, 3, 2) -__SYCL_ACCESS(N == 4, xyww, 0, 1, 3, 3) -__SYCL_ACCESS(N == 3 || N == 4, xzxx, 0, 2, 0, 0) -__SYCL_ACCESS(N == 3 || N == 4, xzxy, 0, 2, 0, 1) -__SYCL_ACCESS(N == 3 || N == 4, xzxz, 0, 2, 0, 2) -__SYCL_ACCESS(N == 4, xzxw, 0, 2, 0, 3) -__SYCL_ACCESS(N == 3 || N == 4, xzyx, 0, 2, 1, 0) -__SYCL_ACCESS(N == 3 || N == 4, xzyy, 0, 2, 1, 1) -__SYCL_ACCESS(N == 3 || N == 4, xzyz, 0, 2, 1, 2) -__SYCL_ACCESS(N == 4, xzyw, 0, 2, 1, 3) -__SYCL_ACCESS(N == 3 || N == 4, xzzx, 0, 2, 2, 0) -__SYCL_ACCESS(N == 3 || N == 4, xzzy, 0, 2, 2, 1) -__SYCL_ACCESS(N == 3 || N == 4, xzzz, 0, 2, 2, 2) -__SYCL_ACCESS(N == 4, xzzw, 0, 2, 2, 3) -__SYCL_ACCESS(N == 4, xzwx, 0, 2, 3, 0) -__SYCL_ACCESS(N == 4, xzwy, 0, 2, 3, 1) -__SYCL_ACCESS(N == 4, xzwz, 0, 2, 3, 2) -__SYCL_ACCESS(N == 4, xzww, 0, 2, 3, 3) -__SYCL_ACCESS(N == 4, xwxx, 0, 3, 0, 0) -__SYCL_ACCESS(N == 4, xwxy, 0, 3, 0, 1) -__SYCL_ACCESS(N == 4, xwxz, 0, 3, 0, 2) -__SYCL_ACCESS(N == 4, xwxw, 0, 3, 0, 3) -__SYCL_ACCESS(N == 4, xwyx, 0, 3, 1, 0) -__SYCL_ACCESS(N == 4, xwyy, 0, 3, 1, 1) -__SYCL_ACCESS(N == 4, xwyz, 0, 3, 1, 2) -__SYCL_ACCESS(N == 4, xwyw, 0, 3, 1, 3) -__SYCL_ACCESS(N == 4, xwzx, 0, 3, 2, 0) -__SYCL_ACCESS(N == 4, xwzy, 0, 3, 2, 1) -__SYCL_ACCESS(N == 4, xwzz, 0, 3, 2, 2) 
-__SYCL_ACCESS(N == 4, xwzw, 0, 3, 2, 3) -__SYCL_ACCESS(N == 4, xwwx, 0, 3, 3, 0) -__SYCL_ACCESS(N == 4, xwwy, 0, 3, 3, 1) -__SYCL_ACCESS(N == 4, xwwz, 0, 3, 3, 2) -__SYCL_ACCESS(N == 4, xwww, 0, 3, 3, 3) -__SYCL_ACCESS(2 <= N && N <= 4, yxxx, 1, 0, 0, 0) -__SYCL_ACCESS(2 <= N && N <= 4, yxxy, 1, 0, 0, 1) -__SYCL_ACCESS(N == 3 || N == 4, yxxz, 1, 0, 0, 2) -__SYCL_ACCESS(N == 4, yxxw, 1, 0, 0, 3) -__SYCL_ACCESS(2 <= N && N <= 4, yxyx, 1, 0, 1, 0) -__SYCL_ACCESS(2 <= N && N <= 4, yxyy, 1, 0, 1, 1) -__SYCL_ACCESS(N == 3 || N == 4, yxyz, 1, 0, 1, 2) -__SYCL_ACCESS(N == 4, yxyw, 1, 0, 1, 3) -__SYCL_ACCESS(N == 3 || N == 4, yxzx, 1, 0, 2, 0) -__SYCL_ACCESS(N == 3 || N == 4, yxzy, 1, 0, 2, 1) -__SYCL_ACCESS(N == 3 || N == 4, yxzz, 1, 0, 2, 2) -__SYCL_ACCESS(N == 4, yxzw, 1, 0, 2, 3) -__SYCL_ACCESS(N == 4, yxwx, 1, 0, 3, 0) -__SYCL_ACCESS(N == 4, yxwy, 1, 0, 3, 1) -__SYCL_ACCESS(N == 4, yxwz, 1, 0, 3, 2) -__SYCL_ACCESS(N == 4, yxww, 1, 0, 3, 3) -__SYCL_ACCESS(2 <= N && N <= 4, yyxx, 1, 1, 0, 0) -__SYCL_ACCESS(2 <= N && N <= 4, yyxy, 1, 1, 0, 1) -__SYCL_ACCESS(N == 3 || N == 4, yyxz, 1, 1, 0, 2) -__SYCL_ACCESS(N == 4, yyxw, 1, 1, 0, 3) -__SYCL_ACCESS(2 <= N && N <= 4, yyyx, 1, 1, 1, 0) -__SYCL_ACCESS(2 <= N && N <= 4, yyyy, 1, 1, 1, 1) -__SYCL_ACCESS(N == 3 || N == 4, yyyz, 1, 1, 1, 2) -__SYCL_ACCESS(N == 4, yyyw, 1, 1, 1, 3) -__SYCL_ACCESS(N == 3 || N == 4, yyzx, 1, 1, 2, 0) -__SYCL_ACCESS(N == 3 || N == 4, yyzy, 1, 1, 2, 1) -__SYCL_ACCESS(N == 3 || N == 4, yyzz, 1, 1, 2, 2) -__SYCL_ACCESS(N == 4, yyzw, 1, 1, 2, 3) -__SYCL_ACCESS(N == 4, yywx, 1, 1, 3, 0) -__SYCL_ACCESS(N == 4, yywy, 1, 1, 3, 1) -__SYCL_ACCESS(N == 4, yywz, 1, 1, 3, 2) -__SYCL_ACCESS(N == 4, yyww, 1, 1, 3, 3) -__SYCL_ACCESS(N == 3 || N == 4, yzxx, 1, 2, 0, 0) -__SYCL_ACCESS(N == 3 || N == 4, yzxy, 1, 2, 0, 1) -__SYCL_ACCESS(N == 3 || N == 4, yzxz, 1, 2, 0, 2) -__SYCL_ACCESS(N == 4, yzxw, 1, 2, 0, 3) -__SYCL_ACCESS(N == 3 || N == 4, yzyx, 1, 2, 1, 0) -__SYCL_ACCESS(N == 3 || N == 4, yzyy, 1, 2, 1, 1) 
-__SYCL_ACCESS(N == 3 || N == 4, yzyz, 1, 2, 1, 2) -__SYCL_ACCESS(N == 4, yzyw, 1, 2, 1, 3) -__SYCL_ACCESS(N == 3 || N == 4, yzzx, 1, 2, 2, 0) -__SYCL_ACCESS(N == 3 || N == 4, yzzy, 1, 2, 2, 1) -__SYCL_ACCESS(N == 3 || N == 4, yzzz, 1, 2, 2, 2) -__SYCL_ACCESS(N == 4, yzzw, 1, 2, 2, 3) -__SYCL_ACCESS(N == 4, yzwx, 1, 2, 3, 0) -__SYCL_ACCESS(N == 4, yzwy, 1, 2, 3, 1) -__SYCL_ACCESS(N == 4, yzwz, 1, 2, 3, 2) -__SYCL_ACCESS(N == 4, yzww, 1, 2, 3, 3) -__SYCL_ACCESS(N == 4, ywxx, 1, 3, 0, 0) -__SYCL_ACCESS(N == 4, ywxy, 1, 3, 0, 1) -__SYCL_ACCESS(N == 4, ywxz, 1, 3, 0, 2) -__SYCL_ACCESS(N == 4, ywxw, 1, 3, 0, 3) -__SYCL_ACCESS(N == 4, ywyx, 1, 3, 1, 0) -__SYCL_ACCESS(N == 4, ywyy, 1, 3, 1, 1) -__SYCL_ACCESS(N == 4, ywyz, 1, 3, 1, 2) -__SYCL_ACCESS(N == 4, ywyw, 1, 3, 1, 3) -__SYCL_ACCESS(N == 4, ywzx, 1, 3, 2, 0) -__SYCL_ACCESS(N == 4, ywzy, 1, 3, 2, 1) -__SYCL_ACCESS(N == 4, ywzz, 1, 3, 2, 2) -__SYCL_ACCESS(N == 4, ywzw, 1, 3, 2, 3) -__SYCL_ACCESS(N == 4, ywwx, 1, 3, 3, 0) -__SYCL_ACCESS(N == 4, ywwy, 1, 3, 3, 1) -__SYCL_ACCESS(N == 4, ywwz, 1, 3, 3, 2) -__SYCL_ACCESS(N == 4, ywww, 1, 3, 3, 3) -__SYCL_ACCESS(N == 3 || N == 4, zxxx, 2, 0, 0, 0) -__SYCL_ACCESS(N == 3 || N == 4, zxxy, 2, 0, 0, 1) -__SYCL_ACCESS(N == 3 || N == 4, zxxz, 2, 0, 0, 2) -__SYCL_ACCESS(N == 4, zxxw, 2, 0, 0, 3) -__SYCL_ACCESS(N == 3 || N == 4, zxyx, 2, 0, 1, 0) -__SYCL_ACCESS(N == 3 || N == 4, zxyy, 2, 0, 1, 1) -__SYCL_ACCESS(N == 3 || N == 4, zxyz, 2, 0, 1, 2) -__SYCL_ACCESS(N == 4, zxyw, 2, 0, 1, 3) -__SYCL_ACCESS(N == 3 || N == 4, zxzx, 2, 0, 2, 0) -__SYCL_ACCESS(N == 3 || N == 4, zxzy, 2, 0, 2, 1) -__SYCL_ACCESS(N == 3 || N == 4, zxzz, 2, 0, 2, 2) -__SYCL_ACCESS(N == 4, zxzw, 2, 0, 2, 3) -__SYCL_ACCESS(N == 4, zxwx, 2, 0, 3, 0) -__SYCL_ACCESS(N == 4, zxwy, 2, 0, 3, 1) -__SYCL_ACCESS(N == 4, zxwz, 2, 0, 3, 2) -__SYCL_ACCESS(N == 4, zxww, 2, 0, 3, 3) -__SYCL_ACCESS(N == 3 || N == 4, zyxx, 2, 1, 0, 0) -__SYCL_ACCESS(N == 3 || N == 4, zyxy, 2, 1, 0, 1) -__SYCL_ACCESS(N == 3 || N == 4, zyxz, 2, 1, 
0, 2) -__SYCL_ACCESS(N == 4, zyxw, 2, 1, 0, 3) -__SYCL_ACCESS(N == 3 || N == 4, zyyx, 2, 1, 1, 0) -__SYCL_ACCESS(N == 3 || N == 4, zyyy, 2, 1, 1, 1) -__SYCL_ACCESS(N == 3 || N == 4, zyyz, 2, 1, 1, 2) -__SYCL_ACCESS(N == 4, zyyw, 2, 1, 1, 3) -__SYCL_ACCESS(N == 3 || N == 4, zyzx, 2, 1, 2, 0) -__SYCL_ACCESS(N == 3 || N == 4, zyzy, 2, 1, 2, 1) -__SYCL_ACCESS(N == 3 || N == 4, zyzz, 2, 1, 2, 2) -__SYCL_ACCESS(N == 4, zyzw, 2, 1, 2, 3) -__SYCL_ACCESS(N == 4, zywx, 2, 1, 3, 0) -__SYCL_ACCESS(N == 4, zywy, 2, 1, 3, 1) -__SYCL_ACCESS(N == 4, zywz, 2, 1, 3, 2) -__SYCL_ACCESS(N == 4, zyww, 2, 1, 3, 3) -__SYCL_ACCESS(N == 3 || N == 4, zzxx, 2, 2, 0, 0) -__SYCL_ACCESS(N == 3 || N == 4, zzxy, 2, 2, 0, 1) -__SYCL_ACCESS(N == 3 || N == 4, zzxz, 2, 2, 0, 2) -__SYCL_ACCESS(N == 4, zzxw, 2, 2, 0, 3) -__SYCL_ACCESS(N == 3 || N == 4, zzyx, 2, 2, 1, 0) -__SYCL_ACCESS(N == 3 || N == 4, zzyy, 2, 2, 1, 1) -__SYCL_ACCESS(N == 3 || N == 4, zzyz, 2, 2, 1, 2) -__SYCL_ACCESS(N == 4, zzyw, 2, 2, 1, 3) -__SYCL_ACCESS(N == 3 || N == 4, zzzx, 2, 2, 2, 0) -__SYCL_ACCESS(N == 3 || N == 4, zzzy, 2, 2, 2, 1) -__SYCL_ACCESS(N == 3 || N == 4, zzzz, 2, 2, 2, 2) -__SYCL_ACCESS(N == 4, zzzw, 2, 2, 2, 3) -__SYCL_ACCESS(N == 4, zzwx, 2, 2, 3, 0) -__SYCL_ACCESS(N == 4, zzwy, 2, 2, 3, 1) -__SYCL_ACCESS(N == 4, zzwz, 2, 2, 3, 2) -__SYCL_ACCESS(N == 4, zzww, 2, 2, 3, 3) -__SYCL_ACCESS(N == 4, zwxx, 2, 3, 0, 0) -__SYCL_ACCESS(N == 4, zwxy, 2, 3, 0, 1) -__SYCL_ACCESS(N == 4, zwxz, 2, 3, 0, 2) -__SYCL_ACCESS(N == 4, zwxw, 2, 3, 0, 3) -__SYCL_ACCESS(N == 4, zwyx, 2, 3, 1, 0) -__SYCL_ACCESS(N == 4, zwyy, 2, 3, 1, 1) -__SYCL_ACCESS(N == 4, zwyz, 2, 3, 1, 2) -__SYCL_ACCESS(N == 4, zwyw, 2, 3, 1, 3) -__SYCL_ACCESS(N == 4, zwzx, 2, 3, 2, 0) -__SYCL_ACCESS(N == 4, zwzy, 2, 3, 2, 1) -__SYCL_ACCESS(N == 4, zwzz, 2, 3, 2, 2) -__SYCL_ACCESS(N == 4, zwzw, 2, 3, 2, 3) -__SYCL_ACCESS(N == 4, zwwx, 2, 3, 3, 0) -__SYCL_ACCESS(N == 4, zwwy, 2, 3, 3, 1) -__SYCL_ACCESS(N == 4, zwwz, 2, 3, 3, 2) -__SYCL_ACCESS(N == 4, zwww, 2, 3, 3, 
3) -__SYCL_ACCESS(N == 4, wxxx, 3, 0, 0, 0) -__SYCL_ACCESS(N == 4, wxxy, 3, 0, 0, 1) -__SYCL_ACCESS(N == 4, wxxz, 3, 0, 0, 2) -__SYCL_ACCESS(N == 4, wxxw, 3, 0, 0, 3) -__SYCL_ACCESS(N == 4, wxyx, 3, 0, 1, 0) -__SYCL_ACCESS(N == 4, wxyy, 3, 0, 1, 1) -__SYCL_ACCESS(N == 4, wxyz, 3, 0, 1, 2) -__SYCL_ACCESS(N == 4, wxyw, 3, 0, 1, 3) -__SYCL_ACCESS(N == 4, wxzx, 3, 0, 2, 0) -__SYCL_ACCESS(N == 4, wxzy, 3, 0, 2, 1) -__SYCL_ACCESS(N == 4, wxzz, 3, 0, 2, 2) -__SYCL_ACCESS(N == 4, wxzw, 3, 0, 2, 3) -__SYCL_ACCESS(N == 4, wxwx, 3, 0, 3, 0) -__SYCL_ACCESS(N == 4, wxwy, 3, 0, 3, 1) -__SYCL_ACCESS(N == 4, wxwz, 3, 0, 3, 2) -__SYCL_ACCESS(N == 4, wxww, 3, 0, 3, 3) -__SYCL_ACCESS(N == 4, wyxx, 3, 1, 0, 0) -__SYCL_ACCESS(N == 4, wyxy, 3, 1, 0, 1) -__SYCL_ACCESS(N == 4, wyxz, 3, 1, 0, 2) -__SYCL_ACCESS(N == 4, wyxw, 3, 1, 0, 3) -__SYCL_ACCESS(N == 4, wyyx, 3, 1, 1, 0) -__SYCL_ACCESS(N == 4, wyyy, 3, 1, 1, 1) -__SYCL_ACCESS(N == 4, wyyz, 3, 1, 1, 2) -__SYCL_ACCESS(N == 4, wyyw, 3, 1, 1, 3) -__SYCL_ACCESS(N == 4, wyzx, 3, 1, 2, 0) -__SYCL_ACCESS(N == 4, wyzy, 3, 1, 2, 1) -__SYCL_ACCESS(N == 4, wyzz, 3, 1, 2, 2) -__SYCL_ACCESS(N == 4, wyzw, 3, 1, 2, 3) -__SYCL_ACCESS(N == 4, wywx, 3, 1, 3, 0) -__SYCL_ACCESS(N == 4, wywy, 3, 1, 3, 1) -__SYCL_ACCESS(N == 4, wywz, 3, 1, 3, 2) -__SYCL_ACCESS(N == 4, wyww, 3, 1, 3, 3) -__SYCL_ACCESS(N == 4, wzxx, 3, 2, 0, 0) -__SYCL_ACCESS(N == 4, wzxy, 3, 2, 0, 1) -__SYCL_ACCESS(N == 4, wzxz, 3, 2, 0, 2) -__SYCL_ACCESS(N == 4, wzxw, 3, 2, 0, 3) -__SYCL_ACCESS(N == 4, wzyx, 3, 2, 1, 0) -__SYCL_ACCESS(N == 4, wzyy, 3, 2, 1, 1) -__SYCL_ACCESS(N == 4, wzyz, 3, 2, 1, 2) -__SYCL_ACCESS(N == 4, wzyw, 3, 2, 1, 3) -__SYCL_ACCESS(N == 4, wzzx, 3, 2, 2, 0) -__SYCL_ACCESS(N == 4, wzzy, 3, 2, 2, 1) -__SYCL_ACCESS(N == 4, wzzz, 3, 2, 2, 2) -__SYCL_ACCESS(N == 4, wzzw, 3, 2, 2, 3) -__SYCL_ACCESS(N == 4, wzwx, 3, 2, 3, 0) -__SYCL_ACCESS(N == 4, wzwy, 3, 2, 3, 1) -__SYCL_ACCESS(N == 4, wzwz, 3, 2, 3, 2) -__SYCL_ACCESS(N == 4, wzww, 3, 2, 3, 3) -__SYCL_ACCESS(N == 4, wwxx, 
3, 3, 0, 0) -__SYCL_ACCESS(N == 4, wwxy, 3, 3, 0, 1) -__SYCL_ACCESS(N == 4, wwxz, 3, 3, 0, 2) -__SYCL_ACCESS(N == 4, wwxw, 3, 3, 0, 3) -__SYCL_ACCESS(N == 4, wwyx, 3, 3, 1, 0) -__SYCL_ACCESS(N == 4, wwyy, 3, 3, 1, 1) -__SYCL_ACCESS(N == 4, wwyz, 3, 3, 1, 2) -__SYCL_ACCESS(N == 4, wwyw, 3, 3, 1, 3) -__SYCL_ACCESS(N == 4, wwzx, 3, 3, 2, 0) -__SYCL_ACCESS(N == 4, wwzy, 3, 3, 2, 1) -__SYCL_ACCESS(N == 4, wwzz, 3, 3, 2, 2) -__SYCL_ACCESS(N == 4, wwzw, 3, 3, 2, 3) -__SYCL_ACCESS(N == 4, wwwx, 3, 3, 3, 0) -__SYCL_ACCESS(N == 4, wwwy, 3, 3, 3, 1) -__SYCL_ACCESS(N == 4, wwwz, 3, 3, 3, 2) -__SYCL_ACCESS(N == 4, wwww, 3, 3, 3, 3) - -//__swizzled_vec__ RGBA_SWIZZLE() const; -__SYCL_ACCESS(N == 4, rr, 0, 0) -__SYCL_ACCESS(N == 4, rg, 0, 1) -__SYCL_ACCESS(N == 4, rb, 0, 2) -__SYCL_ACCESS(N == 4, ra, 0, 3) -__SYCL_ACCESS(N == 4, gr, 1, 0) -__SYCL_ACCESS(N == 4, gg, 1, 1) -__SYCL_ACCESS(N == 4, gb, 1, 2) -__SYCL_ACCESS(N == 4, ga, 1, 3) -__SYCL_ACCESS(N == 4, br, 2, 0) -__SYCL_ACCESS(N == 4, bg, 2, 1) -__SYCL_ACCESS(N == 4, bb, 2, 2) -__SYCL_ACCESS(N == 4, ba, 2, 3) -__SYCL_ACCESS(N == 4, ar, 3, 0) -__SYCL_ACCESS(N == 4, ag, 3, 1) -__SYCL_ACCESS(N == 4, ab, 3, 2) -__SYCL_ACCESS(N == 4, aa, 3, 3) -__SYCL_ACCESS(N == 4, rrr, 0, 0, 0) -__SYCL_ACCESS(N == 4, rrg, 0, 0, 1) -__SYCL_ACCESS(N == 4, rrb, 0, 0, 2) -__SYCL_ACCESS(N == 4, rra, 0, 0, 3) -__SYCL_ACCESS(N == 4, rgr, 0, 1, 0) -__SYCL_ACCESS(N == 4, rgg, 0, 1, 1) -__SYCL_ACCESS(N == 4, rgb, 0, 1, 2) -__SYCL_ACCESS(N == 4, rga, 0, 1, 3) -__SYCL_ACCESS(N == 4, rbr, 0, 2, 0) -__SYCL_ACCESS(N == 4, rbg, 0, 2, 1) -__SYCL_ACCESS(N == 4, rbb, 0, 2, 2) -__SYCL_ACCESS(N == 4, rba, 0, 2, 3) -__SYCL_ACCESS(N == 4, rar, 0, 3, 0) -__SYCL_ACCESS(N == 4, rag, 0, 3, 1) -__SYCL_ACCESS(N == 4, rab, 0, 3, 2) -__SYCL_ACCESS(N == 4, raa, 0, 3, 3) -__SYCL_ACCESS(N == 4, grr, 1, 0, 0) -__SYCL_ACCESS(N == 4, grg, 1, 0, 1) -__SYCL_ACCESS(N == 4, grb, 1, 0, 2) -__SYCL_ACCESS(N == 4, gra, 1, 0, 3) -__SYCL_ACCESS(N == 4, ggr, 1, 1, 0) -__SYCL_ACCESS(N == 4, 
ggg, 1, 1, 1) -__SYCL_ACCESS(N == 4, ggb, 1, 1, 2) -__SYCL_ACCESS(N == 4, gga, 1, 1, 3) -__SYCL_ACCESS(N == 4, gbr, 1, 2, 0) -__SYCL_ACCESS(N == 4, gbg, 1, 2, 1) -__SYCL_ACCESS(N == 4, gbb, 1, 2, 2) -__SYCL_ACCESS(N == 4, gba, 1, 2, 3) -__SYCL_ACCESS(N == 4, gar, 1, 3, 0) -__SYCL_ACCESS(N == 4, gag, 1, 3, 1) -__SYCL_ACCESS(N == 4, gab, 1, 3, 2) -__SYCL_ACCESS(N == 4, gaa, 1, 3, 3) -__SYCL_ACCESS(N == 4, brr, 2, 0, 0) -__SYCL_ACCESS(N == 4, brg, 2, 0, 1) -__SYCL_ACCESS(N == 4, brb, 2, 0, 2) -__SYCL_ACCESS(N == 4, bra, 2, 0, 3) -__SYCL_ACCESS(N == 4, bgr, 2, 1, 0) -__SYCL_ACCESS(N == 4, bgg, 2, 1, 1) -__SYCL_ACCESS(N == 4, bgb, 2, 1, 2) -__SYCL_ACCESS(N == 4, bga, 2, 1, 3) -__SYCL_ACCESS(N == 4, bbr, 2, 2, 0) -__SYCL_ACCESS(N == 4, bbg, 2, 2, 1) -__SYCL_ACCESS(N == 4, bbb, 2, 2, 2) -__SYCL_ACCESS(N == 4, bba, 2, 2, 3) -__SYCL_ACCESS(N == 4, bar, 2, 3, 0) -__SYCL_ACCESS(N == 4, bag, 2, 3, 1) -__SYCL_ACCESS(N == 4, bab, 2, 3, 2) -__SYCL_ACCESS(N == 4, baa, 2, 3, 3) -__SYCL_ACCESS(N == 4, arr, 3, 0, 0) -__SYCL_ACCESS(N == 4, arg, 3, 0, 1) -__SYCL_ACCESS(N == 4, arb, 3, 0, 2) -__SYCL_ACCESS(N == 4, ara, 3, 0, 3) -__SYCL_ACCESS(N == 4, agr, 3, 1, 0) -__SYCL_ACCESS(N == 4, agg, 3, 1, 1) -__SYCL_ACCESS(N == 4, agb, 3, 1, 2) -__SYCL_ACCESS(N == 4, aga, 3, 1, 3) -__SYCL_ACCESS(N == 4, abr, 3, 2, 0) -__SYCL_ACCESS(N == 4, abg, 3, 2, 1) -__SYCL_ACCESS(N == 4, abb, 3, 2, 2) -__SYCL_ACCESS(N == 4, aba, 3, 2, 3) -__SYCL_ACCESS(N == 4, aar, 3, 3, 0) -__SYCL_ACCESS(N == 4, aag, 3, 3, 1) -__SYCL_ACCESS(N == 4, aab, 3, 3, 2) -__SYCL_ACCESS(N == 4, aaa, 3, 3, 3) -__SYCL_ACCESS(N == 4, rrrr, 0, 0, 0, 0) -__SYCL_ACCESS(N == 4, rrrg, 0, 0, 0, 1) -__SYCL_ACCESS(N == 4, rrrb, 0, 0, 0, 2) -__SYCL_ACCESS(N == 4, rrra, 0, 0, 0, 3) -__SYCL_ACCESS(N == 4, rrgr, 0, 0, 1, 0) -__SYCL_ACCESS(N == 4, rrgg, 0, 0, 1, 1) -__SYCL_ACCESS(N == 4, rrgb, 0, 0, 1, 2) -__SYCL_ACCESS(N == 4, rrga, 0, 0, 1, 3) -__SYCL_ACCESS(N == 4, rrbr, 0, 0, 2, 0) -__SYCL_ACCESS(N == 4, rrbg, 0, 0, 2, 1) -__SYCL_ACCESS(N == 
4, rrbb, 0, 0, 2, 2) -__SYCL_ACCESS(N == 4, rrba, 0, 0, 2, 3) -__SYCL_ACCESS(N == 4, rrar, 0, 0, 3, 0) -__SYCL_ACCESS(N == 4, rrag, 0, 0, 3, 1) -__SYCL_ACCESS(N == 4, rrab, 0, 0, 3, 2) -__SYCL_ACCESS(N == 4, rraa, 0, 0, 3, 3) -__SYCL_ACCESS(N == 4, rgrr, 0, 1, 0, 0) -__SYCL_ACCESS(N == 4, rgrg, 0, 1, 0, 1) -__SYCL_ACCESS(N == 4, rgrb, 0, 1, 0, 2) -__SYCL_ACCESS(N == 4, rgra, 0, 1, 0, 3) -__SYCL_ACCESS(N == 4, rggr, 0, 1, 1, 0) -__SYCL_ACCESS(N == 4, rggg, 0, 1, 1, 1) -__SYCL_ACCESS(N == 4, rggb, 0, 1, 1, 2) -__SYCL_ACCESS(N == 4, rgga, 0, 1, 1, 3) -__SYCL_ACCESS(N == 4, rgbr, 0, 1, 2, 0) -__SYCL_ACCESS(N == 4, rgbg, 0, 1, 2, 1) -__SYCL_ACCESS(N == 4, rgbb, 0, 1, 2, 2) -__SYCL_ACCESS(N == 4, rgba, 0, 1, 2, 3) -__SYCL_ACCESS(N == 4, rgar, 0, 1, 3, 0) -__SYCL_ACCESS(N == 4, rgag, 0, 1, 3, 1) -__SYCL_ACCESS(N == 4, rgab, 0, 1, 3, 2) -__SYCL_ACCESS(N == 4, rgaa, 0, 1, 3, 3) -__SYCL_ACCESS(N == 4, rbrr, 0, 2, 0, 0) -__SYCL_ACCESS(N == 4, rbrg, 0, 2, 0, 1) -__SYCL_ACCESS(N == 4, rbrb, 0, 2, 0, 2) -__SYCL_ACCESS(N == 4, rbra, 0, 2, 0, 3) -__SYCL_ACCESS(N == 4, rbgr, 0, 2, 1, 0) -__SYCL_ACCESS(N == 4, rbgg, 0, 2, 1, 1) -__SYCL_ACCESS(N == 4, rbgb, 0, 2, 1, 2) -__SYCL_ACCESS(N == 4, rbga, 0, 2, 1, 3) -__SYCL_ACCESS(N == 4, rbbr, 0, 2, 2, 0) -__SYCL_ACCESS(N == 4, rbbg, 0, 2, 2, 1) -__SYCL_ACCESS(N == 4, rbbb, 0, 2, 2, 2) -__SYCL_ACCESS(N == 4, rbba, 0, 2, 2, 3) -__SYCL_ACCESS(N == 4, rbar, 0, 2, 3, 0) -__SYCL_ACCESS(N == 4, rbag, 0, 2, 3, 1) -__SYCL_ACCESS(N == 4, rbab, 0, 2, 3, 2) -__SYCL_ACCESS(N == 4, rbaa, 0, 2, 3, 3) -__SYCL_ACCESS(N == 4, rarr, 0, 3, 0, 0) -__SYCL_ACCESS(N == 4, rarg, 0, 3, 0, 1) -__SYCL_ACCESS(N == 4, rarb, 0, 3, 0, 2) -__SYCL_ACCESS(N == 4, rara, 0, 3, 0, 3) -__SYCL_ACCESS(N == 4, ragr, 0, 3, 1, 0) -__SYCL_ACCESS(N == 4, ragg, 0, 3, 1, 1) -__SYCL_ACCESS(N == 4, ragb, 0, 3, 1, 2) -__SYCL_ACCESS(N == 4, raga, 0, 3, 1, 3) -__SYCL_ACCESS(N == 4, rabr, 0, 3, 2, 0) -__SYCL_ACCESS(N == 4, rabg, 0, 3, 2, 1) -__SYCL_ACCESS(N == 4, rabb, 0, 3, 2, 2) 
-__SYCL_ACCESS(N == 4, raba, 0, 3, 2, 3) -__SYCL_ACCESS(N == 4, raar, 0, 3, 3, 0) -__SYCL_ACCESS(N == 4, raag, 0, 3, 3, 1) -__SYCL_ACCESS(N == 4, raab, 0, 3, 3, 2) -__SYCL_ACCESS(N == 4, raaa, 0, 3, 3, 3) -__SYCL_ACCESS(N == 4, grrr, 1, 0, 0, 0) -__SYCL_ACCESS(N == 4, grrg, 1, 0, 0, 1) -__SYCL_ACCESS(N == 4, grrb, 1, 0, 0, 2) -__SYCL_ACCESS(N == 4, grra, 1, 0, 0, 3) -__SYCL_ACCESS(N == 4, grgr, 1, 0, 1, 0) -__SYCL_ACCESS(N == 4, grgg, 1, 0, 1, 1) -__SYCL_ACCESS(N == 4, grgb, 1, 0, 1, 2) -__SYCL_ACCESS(N == 4, grga, 1, 0, 1, 3) -__SYCL_ACCESS(N == 4, grbr, 1, 0, 2, 0) -__SYCL_ACCESS(N == 4, grbg, 1, 0, 2, 1) -__SYCL_ACCESS(N == 4, grbb, 1, 0, 2, 2) -__SYCL_ACCESS(N == 4, grba, 1, 0, 2, 3) -__SYCL_ACCESS(N == 4, grar, 1, 0, 3, 0) -__SYCL_ACCESS(N == 4, grag, 1, 0, 3, 1) -__SYCL_ACCESS(N == 4, grab, 1, 0, 3, 2) -__SYCL_ACCESS(N == 4, graa, 1, 0, 3, 3) -__SYCL_ACCESS(N == 4, ggrr, 1, 1, 0, 0) -__SYCL_ACCESS(N == 4, ggrg, 1, 1, 0, 1) -__SYCL_ACCESS(N == 4, ggrb, 1, 1, 0, 2) -__SYCL_ACCESS(N == 4, ggra, 1, 1, 0, 3) -__SYCL_ACCESS(N == 4, gggr, 1, 1, 1, 0) -__SYCL_ACCESS(N == 4, gggg, 1, 1, 1, 1) -__SYCL_ACCESS(N == 4, gggb, 1, 1, 1, 2) -__SYCL_ACCESS(N == 4, ggga, 1, 1, 1, 3) -__SYCL_ACCESS(N == 4, ggbr, 1, 1, 2, 0) -__SYCL_ACCESS(N == 4, ggbg, 1, 1, 2, 1) -__SYCL_ACCESS(N == 4, ggbb, 1, 1, 2, 2) -__SYCL_ACCESS(N == 4, ggba, 1, 1, 2, 3) -__SYCL_ACCESS(N == 4, ggar, 1, 1, 3, 0) -__SYCL_ACCESS(N == 4, ggag, 1, 1, 3, 1) -__SYCL_ACCESS(N == 4, ggab, 1, 1, 3, 2) -__SYCL_ACCESS(N == 4, ggaa, 1, 1, 3, 3) -__SYCL_ACCESS(N == 4, gbrr, 1, 2, 0, 0) -__SYCL_ACCESS(N == 4, gbrg, 1, 2, 0, 1) -__SYCL_ACCESS(N == 4, gbrb, 1, 2, 0, 2) -__SYCL_ACCESS(N == 4, gbra, 1, 2, 0, 3) -__SYCL_ACCESS(N == 4, gbgr, 1, 2, 1, 0) -__SYCL_ACCESS(N == 4, gbgg, 1, 2, 1, 1) -__SYCL_ACCESS(N == 4, gbgb, 1, 2, 1, 2) -__SYCL_ACCESS(N == 4, gbga, 1, 2, 1, 3) -__SYCL_ACCESS(N == 4, gbbr, 1, 2, 2, 0) -__SYCL_ACCESS(N == 4, gbbg, 1, 2, 2, 1) -__SYCL_ACCESS(N == 4, gbbb, 1, 2, 2, 2) -__SYCL_ACCESS(N == 4, gbba, 1, 
2, 2, 3) -__SYCL_ACCESS(N == 4, gbar, 1, 2, 3, 0) -__SYCL_ACCESS(N == 4, gbag, 1, 2, 3, 1) -__SYCL_ACCESS(N == 4, gbab, 1, 2, 3, 2) -__SYCL_ACCESS(N == 4, gbaa, 1, 2, 3, 3) -__SYCL_ACCESS(N == 4, garr, 1, 3, 0, 0) -__SYCL_ACCESS(N == 4, garg, 1, 3, 0, 1) -__SYCL_ACCESS(N == 4, garb, 1, 3, 0, 2) -__SYCL_ACCESS(N == 4, gara, 1, 3, 0, 3) -__SYCL_ACCESS(N == 4, gagr, 1, 3, 1, 0) -__SYCL_ACCESS(N == 4, gagg, 1, 3, 1, 1) -__SYCL_ACCESS(N == 4, gagb, 1, 3, 1, 2) -__SYCL_ACCESS(N == 4, gaga, 1, 3, 1, 3) -__SYCL_ACCESS(N == 4, gabr, 1, 3, 2, 0) -__SYCL_ACCESS(N == 4, gabg, 1, 3, 2, 1) -__SYCL_ACCESS(N == 4, gabb, 1, 3, 2, 2) -__SYCL_ACCESS(N == 4, gaba, 1, 3, 2, 3) -__SYCL_ACCESS(N == 4, gaar, 1, 3, 3, 0) -__SYCL_ACCESS(N == 4, gaag, 1, 3, 3, 1) -__SYCL_ACCESS(N == 4, gaab, 1, 3, 3, 2) -__SYCL_ACCESS(N == 4, gaaa, 1, 3, 3, 3) -__SYCL_ACCESS(N == 4, brrr, 2, 0, 0, 0) -__SYCL_ACCESS(N == 4, brrg, 2, 0, 0, 1) -__SYCL_ACCESS(N == 4, brrb, 2, 0, 0, 2) -__SYCL_ACCESS(N == 4, brra, 2, 0, 0, 3) -__SYCL_ACCESS(N == 4, brgr, 2, 0, 1, 0) -__SYCL_ACCESS(N == 4, brgg, 2, 0, 1, 1) -__SYCL_ACCESS(N == 4, brgb, 2, 0, 1, 2) -__SYCL_ACCESS(N == 4, brga, 2, 0, 1, 3) -__SYCL_ACCESS(N == 4, brbr, 2, 0, 2, 0) -__SYCL_ACCESS(N == 4, brbg, 2, 0, 2, 1) -__SYCL_ACCESS(N == 4, brbb, 2, 0, 2, 2) -__SYCL_ACCESS(N == 4, brba, 2, 0, 2, 3) -__SYCL_ACCESS(N == 4, brar, 2, 0, 3, 0) -__SYCL_ACCESS(N == 4, brag, 2, 0, 3, 1) -__SYCL_ACCESS(N == 4, brab, 2, 0, 3, 2) -__SYCL_ACCESS(N == 4, braa, 2, 0, 3, 3) -__SYCL_ACCESS(N == 4, bgrr, 2, 1, 0, 0) -__SYCL_ACCESS(N == 4, bgrg, 2, 1, 0, 1) -__SYCL_ACCESS(N == 4, bgrb, 2, 1, 0, 2) -__SYCL_ACCESS(N == 4, bgra, 2, 1, 0, 3) -__SYCL_ACCESS(N == 4, bggr, 2, 1, 1, 0) -__SYCL_ACCESS(N == 4, bggg, 2, 1, 1, 1) -__SYCL_ACCESS(N == 4, bggb, 2, 1, 1, 2) -__SYCL_ACCESS(N == 4, bgga, 2, 1, 1, 3) -__SYCL_ACCESS(N == 4, bgbr, 2, 1, 2, 0) -__SYCL_ACCESS(N == 4, bgbg, 2, 1, 2, 1) -__SYCL_ACCESS(N == 4, bgbb, 2, 1, 2, 2) -__SYCL_ACCESS(N == 4, bgba, 2, 1, 2, 3) -__SYCL_ACCESS(N == 4, 
bgar, 2, 1, 3, 0) -__SYCL_ACCESS(N == 4, bgag, 2, 1, 3, 1) -__SYCL_ACCESS(N == 4, bgab, 2, 1, 3, 2) -__SYCL_ACCESS(N == 4, bgaa, 2, 1, 3, 3) -__SYCL_ACCESS(N == 4, bbrr, 2, 2, 0, 0) -__SYCL_ACCESS(N == 4, bbrg, 2, 2, 0, 1) -__SYCL_ACCESS(N == 4, bbrb, 2, 2, 0, 2) -__SYCL_ACCESS(N == 4, bbra, 2, 2, 0, 3) -__SYCL_ACCESS(N == 4, bbgr, 2, 2, 1, 0) -__SYCL_ACCESS(N == 4, bbgg, 2, 2, 1, 1) -__SYCL_ACCESS(N == 4, bbgb, 2, 2, 1, 2) -__SYCL_ACCESS(N == 4, bbga, 2, 2, 1, 3) -__SYCL_ACCESS(N == 4, bbbr, 2, 2, 2, 0) -__SYCL_ACCESS(N == 4, bbbg, 2, 2, 2, 1) -__SYCL_ACCESS(N == 4, bbbb, 2, 2, 2, 2) -__SYCL_ACCESS(N == 4, bbba, 2, 2, 2, 3) -__SYCL_ACCESS(N == 4, bbar, 2, 2, 3, 0) -__SYCL_ACCESS(N == 4, bbag, 2, 2, 3, 1) -__SYCL_ACCESS(N == 4, bbab, 2, 2, 3, 2) -__SYCL_ACCESS(N == 4, bbaa, 2, 2, 3, 3) -__SYCL_ACCESS(N == 4, barr, 2, 3, 0, 0) -__SYCL_ACCESS(N == 4, barg, 2, 3, 0, 1) -__SYCL_ACCESS(N == 4, barb, 2, 3, 0, 2) -__SYCL_ACCESS(N == 4, bara, 2, 3, 0, 3) -__SYCL_ACCESS(N == 4, bagr, 2, 3, 1, 0) -__SYCL_ACCESS(N == 4, bagg, 2, 3, 1, 1) -__SYCL_ACCESS(N == 4, bagb, 2, 3, 1, 2) -__SYCL_ACCESS(N == 4, baga, 2, 3, 1, 3) -__SYCL_ACCESS(N == 4, babr, 2, 3, 2, 0) -__SYCL_ACCESS(N == 4, babg, 2, 3, 2, 1) -__SYCL_ACCESS(N == 4, babb, 2, 3, 2, 2) -__SYCL_ACCESS(N == 4, baba, 2, 3, 2, 3) -__SYCL_ACCESS(N == 4, baar, 2, 3, 3, 0) -__SYCL_ACCESS(N == 4, baag, 2, 3, 3, 1) -__SYCL_ACCESS(N == 4, baab, 2, 3, 3, 2) -__SYCL_ACCESS(N == 4, baaa, 2, 3, 3, 3) -__SYCL_ACCESS(N == 4, arrr, 3, 0, 0, 0) -__SYCL_ACCESS(N == 4, arrg, 3, 0, 0, 1) -__SYCL_ACCESS(N == 4, arrb, 3, 0, 0, 2) -__SYCL_ACCESS(N == 4, arra, 3, 0, 0, 3) -__SYCL_ACCESS(N == 4, argr, 3, 0, 1, 0) -__SYCL_ACCESS(N == 4, argg, 3, 0, 1, 1) -__SYCL_ACCESS(N == 4, argb, 3, 0, 1, 2) -__SYCL_ACCESS(N == 4, arga, 3, 0, 1, 3) -__SYCL_ACCESS(N == 4, arbr, 3, 0, 2, 0) -__SYCL_ACCESS(N == 4, arbg, 3, 0, 2, 1) -__SYCL_ACCESS(N == 4, arbb, 3, 0, 2, 2) -__SYCL_ACCESS(N == 4, arba, 3, 0, 2, 3) -__SYCL_ACCESS(N == 4, arar, 3, 0, 3, 0) 
-__SYCL_ACCESS(N == 4, arag, 3, 0, 3, 1) -__SYCL_ACCESS(N == 4, arab, 3, 0, 3, 2) -__SYCL_ACCESS(N == 4, araa, 3, 0, 3, 3) -__SYCL_ACCESS(N == 4, agrr, 3, 1, 0, 0) -__SYCL_ACCESS(N == 4, agrg, 3, 1, 0, 1) -__SYCL_ACCESS(N == 4, agrb, 3, 1, 0, 2) -__SYCL_ACCESS(N == 4, agra, 3, 1, 0, 3) -__SYCL_ACCESS(N == 4, aggr, 3, 1, 1, 0) -__SYCL_ACCESS(N == 4, aggg, 3, 1, 1, 1) -__SYCL_ACCESS(N == 4, aggb, 3, 1, 1, 2) -__SYCL_ACCESS(N == 4, agga, 3, 1, 1, 3) -__SYCL_ACCESS(N == 4, agbr, 3, 1, 2, 0) -__SYCL_ACCESS(N == 4, agbg, 3, 1, 2, 1) -__SYCL_ACCESS(N == 4, agbb, 3, 1, 2, 2) -__SYCL_ACCESS(N == 4, agba, 3, 1, 2, 3) -__SYCL_ACCESS(N == 4, agar, 3, 1, 3, 0) -__SYCL_ACCESS(N == 4, agag, 3, 1, 3, 1) -__SYCL_ACCESS(N == 4, agab, 3, 1, 3, 2) -__SYCL_ACCESS(N == 4, agaa, 3, 1, 3, 3) -__SYCL_ACCESS(N == 4, abrr, 3, 2, 0, 0) -__SYCL_ACCESS(N == 4, abrg, 3, 2, 0, 1) -__SYCL_ACCESS(N == 4, abrb, 3, 2, 0, 2) -__SYCL_ACCESS(N == 4, abra, 3, 2, 0, 3) -__SYCL_ACCESS(N == 4, abgr, 3, 2, 1, 0) -__SYCL_ACCESS(N == 4, abgg, 3, 2, 1, 1) -__SYCL_ACCESS(N == 4, abgb, 3, 2, 1, 2) -__SYCL_ACCESS(N == 4, abga, 3, 2, 1, 3) -__SYCL_ACCESS(N == 4, abbr, 3, 2, 2, 0) -__SYCL_ACCESS(N == 4, abbg, 3, 2, 2, 1) -__SYCL_ACCESS(N == 4, abbb, 3, 2, 2, 2) -__SYCL_ACCESS(N == 4, abba, 3, 2, 2, 3) -__SYCL_ACCESS(N == 4, abar, 3, 2, 3, 0) -__SYCL_ACCESS(N == 4, abag, 3, 2, 3, 1) -__SYCL_ACCESS(N == 4, abab, 3, 2, 3, 2) -__SYCL_ACCESS(N == 4, abaa, 3, 2, 3, 3) -__SYCL_ACCESS(N == 4, aarr, 3, 3, 0, 0) -__SYCL_ACCESS(N == 4, aarg, 3, 3, 0, 1) -__SYCL_ACCESS(N == 4, aarb, 3, 3, 0, 2) -__SYCL_ACCESS(N == 4, aara, 3, 3, 0, 3) -__SYCL_ACCESS(N == 4, aagr, 3, 3, 1, 0) -__SYCL_ACCESS(N == 4, aagg, 3, 3, 1, 1) -__SYCL_ACCESS(N == 4, aagb, 3, 3, 1, 2) -__SYCL_ACCESS(N == 4, aaga, 3, 3, 1, 3) -__SYCL_ACCESS(N == 4, aabr, 3, 3, 2, 0) -__SYCL_ACCESS(N == 4, aabg, 3, 3, 2, 1) -__SYCL_ACCESS(N == 4, aabb, 3, 3, 2, 2) -__SYCL_ACCESS(N == 4, aaba, 3, 3, 2, 3) -__SYCL_ACCESS(N == 4, aaar, 3, 3, 3, 0) -__SYCL_ACCESS(N == 4, aaag, 3, 
3, 3, 1) -__SYCL_ACCESS(N == 4, aaab, 3, 3, 3, 2) -__SYCL_ACCESS(N == 4, aaaa, 3, 3, 3, 3) - -#endif // #ifdef SYCL_SIMPLE_SWIZZLES - -//__swizzled_vec__ lo()/hi() const; -__SYCL_ACCESS(N == 2, lo, 0) -__SYCL_ACCESS(N == 3, lo, 0, 1) -__SYCL_ACCESS(N == 4, lo, 0, 1) -__SYCL_ACCESS(N == 8, lo, 0, 1, 2, 3) -__SYCL_ACCESS(N == 16, lo, 0, 1, 2, 3, 4, 5, 6, 7) -__SYCL_ACCESS(N == 2, hi, 1) -__SYCL_ACCESS(N == 3, hi, 2, 3) -__SYCL_ACCESS(N == 4, hi, 2, 3) -__SYCL_ACCESS(N == 8, hi, 4, 5, 6, 7) -__SYCL_ACCESS(N == 16, hi, 8, 9, 10, 11, 12, 13, 14, 15) -//__swizzled_vec__ odd()/even() const; -__SYCL_ACCESS(N == 2, odd, 1) -__SYCL_ACCESS(N == 3, odd, 1, 3) -__SYCL_ACCESS(N == 4, odd, 1, 3) -__SYCL_ACCESS(N == 8, odd, 1, 3, 5, 7) -__SYCL_ACCESS(N == 16, odd, 1, 3, 5, 7, 9, 11, 13, 15) -__SYCL_ACCESS(N == 2, even, 0) -__SYCL_ACCESS(N == 3, even, 0, 2) -__SYCL_ACCESS(N == 4, even, 0, 2) -__SYCL_ACCESS(N == 8, even, 0, 2, 4, 6) -__SYCL_ACCESS(N == 16, even, 0, 2, 4, 6, 8, 10, 12, 14) - -#undef __SYCL_E0 -#undef __SYCL_E1 -#undef __SYCL_E2 -#undef __SYCL_E3 -#undef __SYCL_E4 -#undef __SYCL_E5 -#undef __SYCL_E6 -#undef __SYCL_E7 -#undef __SYCL_E8 - -#undef __SYCL_NTH_ARG -#undef __SYCL_EXPAND -#undef __SYCL_INDEXER -#undef __SYCL_ACCESS -#undef __SYCL_SCALAR_ACCESS diff --git a/sycl/include/sycl/vector.hpp b/sycl/include/sycl/vector.hpp index 75eb80cac829c..3c5e7c3a305b6 100644 --- a/sycl/include/sycl/vector.hpp +++ b/sycl/include/sycl/vector.hpp @@ -32,9 +32,10 @@ #include // for __SYCL2020_DEPRECATED #include // for is_sigeninteger, is_s... #include // for memcpy -#include // for is_floating_point +#include +#include // for is_floating_point #include -#include // for StorageT, half, Vec16... +#include // for StorageT, half, Vec16... 
#include // bfloat16 @@ -134,7 +135,9 @@ template class __SYCL_EBO vec : public detail::vec_arith, public detail::ScalarConversionOperatorMixIn, - DataT, NumElements> { + DataT, NumElements>, + public detail::NamedSwizzlesMixinBoth, + NumElements> { static_assert(std::is_same_v>, "DataT must be cv-unqualified"); @@ -379,26 +382,6 @@ class __SYCL_EBO vec DataT &operator[](int i) { return m_Data[i]; } - // Begin hi/lo, even/odd, xyzw, and rgba swizzles. @{ -private: - // Indexer used in the swizzles.def - // Currently it is defined as a template struct. Replacing it with a constexpr - // function would activate a bug in MSVC that is fixed only in v19.20. - // Until then MSVC does not recognize such constexpr functions as const and - // thus does not let using them in template parameters inside swizzle.def. - template struct Indexer { - static constexpr int value = Index; - }; - -public: -#ifdef __SYCL_ACCESS_RETURN -#error "Undefine __SYCL_ACCESS_RETURN macro" -#endif -#define __SYCL_ACCESS_RETURN this -#include "swizzles.def" -#undef __SYCL_ACCESS_RETURN - // }@ End of hi/lo, even/odd, xyzw, and rgba swizzles. - template void load(size_t Offset, multi_ptr Ptr) { for (int I = 0; I < NumElements; I++) { @@ -560,7 +543,10 @@ template struct LShift { // Actual computation performed on conversion or assignment operators. template class OperationCurrentT, int... Indexes> -class SwizzleOp { +class SwizzleOp : public detail::NamedSwizzlesMixinBoth< + SwizzleOp, + sizeof...(Indexes)> { using DataT = typename VecT::element_type; // Certain operators return a vector with a different element type. Also, the // left and right operand types may differ. CommonDataT selects a result type @@ -1194,37 +1180,25 @@ class SwizzleOp { return NewRelOp(NULL, *this, Rhs); } - // Begin hi/lo, even/odd, xyzw, and rgba swizzles. private: - // Indexer used in the swizzles.def. - // Currently it is defined as a template struct. 
Replacing it with a constexpr - // function would activate a bug in MSVC that is fixed only in v19.20. - // Until then MSVC does not recognize such constexpr functions as const and - // thus does not let using them in template parameters inside swizzle.def. - template struct Indexer { - static constexpr int IDXs[sizeof...(Indexes)] = {Indexes...}; - static constexpr int value = IDXs[Index >= getNumElements() ? 0 : Index]; - }; + static constexpr int get_vec_idx(int idx) { + int counter = 0; + int result = -1; + ((result = counter++ == idx ? Indexes : result), ...); + return result; + } public: template - ConstSwizzle::value...> swizzle() const { + ConstSwizzle swizzle() const { return m_Vector; } template - Swizzle::value...> swizzle() { + Swizzle swizzle() { return m_Vector; } -#ifdef __SYCL_ACCESS_RETURN -#error "Undefine __SYCL_ACCESS_RETURN macro" -#endif -#define __SYCL_ACCESS_RETURN m_Vector -#include "swizzles.def" -#undef __SYCL_ACCESS_RETURN - // End of hi/lo, even/odd, xyzw, and rgba swizzles. - // Leave store() interface to automatic conversion to vec<>. // Load to vec_t and then assign to swizzle. template diff --git a/sycl/source/detail/config.hpp b/sycl/source/detail/config.hpp index ace69d0a9420e..3e0a591e27d14 100644 --- a/sycl/source/detail/config.hpp +++ b/sycl/source/detail/config.hpp @@ -806,6 +806,56 @@ template <> class SYCLConfig { } }; +// SYCL_CACHE_MAX_SIZE accepts an integer that specifies +// the maximum size of the on-disk Program cache. +// Cache eviction is performed when the cache size exceeds the threshold. +// The thresholds are specified in bytes. +// The default value is "0" which means that eviction is disabled. 
+template <> class SYCLConfig { + using BaseT = SYCLConfigBase; + +public: + static long long get() { return getCachedValue(); } + static void reset() { (void)getCachedValue(true); } + + static long long getProgramCacheSize() { return getCachedValue(); } + + static bool isPersistentCacheEvictionEnabled() { + return getProgramCacheSize() > 0; + } + +private: + static long long getCachedValue(bool ResetCache = false) { + const auto Parser = []() { + const char *ValStr = BaseT::getRawValue(); + + // Disable eviction by default. + if (!ValStr) + return (long long)0; + + long long CacheSize = 0; + try { + CacheSize = std::stoll(ValStr); + if (CacheSize < 0) + throw INVALID_CONFIG_EXCEPTION(BaseT, "Value must be non-negative"); + } catch (...) { + std::string Msg = + std::string{"Invalid input to SYCL_CACHE_MAX_SIZE. Please try " + "a positive integer."}; + throw exception(make_error_code(errc::runtime), Msg); + } + + return CacheSize; + }; + + static auto EvictionThresholds = Parser(); + if (ResetCache) + EvictionThresholds = Parser(); + + return EvictionThresholds; + } +}; + #undef INVALID_CONFIG_EXCEPTION } // namespace detail diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 4f8ce5e21d9de..67ccf0a1885e7 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -1011,10 +1011,8 @@ exec_graph_impl::enqueue(const std::shared_ptr &Queue, // If we have no requirements or dependent events for the command buffer, // enqueue it directly if (CGData.MRequirements.empty() && CGData.MEvents.empty()) { - if (NewEvent != nullptr) { - NewEvent->setSubmissionTime(); - NewEvent->setHostEnqueueTime(); - } + NewEvent->setSubmissionTime(); + NewEvent->setHostEnqueueTime(); ur_result_t Res = Queue->getAdapter() ->call_nocheck< @@ -1639,7 +1637,7 @@ node modifiable_command_graph::addImpl(dynamic_command_group &DynCGF, graph_impl::WriteLock Lock(impl->MMutex); std::shared_ptr NodeImpl = impl->add(DynCGFImpl, DepImpls); 
- return sycl::detail::createSyclObjFromImpl(NodeImpl); + return sycl::detail::createSyclObjFromImpl(std::move(NodeImpl)); } node modifiable_command_graph::addImpl(const std::vector &Deps) { @@ -1651,7 +1649,7 @@ node modifiable_command_graph::addImpl(const std::vector &Deps) { graph_impl::WriteLock Lock(impl->MMutex); std::shared_ptr NodeImpl = impl->add(DepImpls); - return sycl::detail::createSyclObjFromImpl(NodeImpl); + return sycl::detail::createSyclObjFromImpl(std::move(NodeImpl)); } node modifiable_command_graph::addImpl(std::function CGF, @@ -1664,7 +1662,7 @@ node modifiable_command_graph::addImpl(std::function CGF, graph_impl::WriteLock Lock(impl->MMutex); std::shared_ptr NodeImpl = impl->add(CGF, {}, DepImpls); - return sycl::detail::createSyclObjFromImpl(NodeImpl); + return sycl::detail::createSyclObjFromImpl(std::move(NodeImpl)); } void modifiable_command_graph::addGraphLeafDependencies(node Node) { @@ -1989,8 +1987,8 @@ void dynamic_command_group_impl::finalizeCGFList( // shared_ptr to store sycl::detail::CG *RawCGPtr = Handler.impl->MGraphNodeCG.release(); auto RawCGExecPtr = static_cast(RawCGPtr); - auto CGExecSP = std::shared_ptr(RawCGExecPtr); - MKernels.push_back(CGExecSP); + MKernels.push_back( + std::shared_ptr(RawCGExecPtr)); // Track dynamic_parameter usage in command-list auto &DynamicParams = Handler.impl->MDynamicParameters; diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 6144e3f51b9da..2d714bcc96d66 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -140,7 +140,8 @@ class node_impl : public std::enable_shared_from_this { /// @param NodeType Type of the command-group. /// @param CommandGroup The CG which stores the command information for this /// node. 
- node_impl(node_type NodeType, std::shared_ptr CommandGroup) + node_impl(node_type NodeType, + const std::shared_ptr &CommandGroup) : MCGType(CommandGroup->getType()), MNodeType(NodeType), MCommandGroup(CommandGroup) { if (NodeType == node_type::subgraph) { diff --git a/sycl/source/detail/kernel_impl.hpp b/sycl/source/detail/kernel_impl.hpp index 57f1f10392997..1b07d866dcc4c 100644 --- a/sycl/source/detail/kernel_impl.hpp +++ b/sycl/source/detail/kernel_impl.hpp @@ -320,6 +320,7 @@ kernel_impl::queryMaxNumWorkGroups(queue Queue, const auto &Adapter = getAdapter(); const auto &Handle = getHandleRef(); auto Device = Queue.get_device(); + auto DeviceHandleRef = sycl::detail::getSyclObjImpl(Device)->getHandleRef(); size_t WG[Dimensions]; WG[0] = WorkGroupSize[0]; @@ -331,7 +332,8 @@ kernel_impl::queryMaxNumWorkGroups(queue Queue, uint32_t GroupCount{0}; if (auto Result = Adapter->call_nocheck< UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>( - Handle, Dimensions, WG, DynamicLocalMemorySize, &GroupCount); + Handle, DeviceHandleRef, Dimensions, WG, DynamicLocalMemorySize, + &GroupCount); Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE && Result != UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE) { // The feature is supported and the group size is valid. Check for other diff --git a/sycl/source/detail/kernel_program_cache.hpp b/sycl/source/detail/kernel_program_cache.hpp index 9f06d0ebcde8d..968cb9b24b053 100644 --- a/sycl/source/detail/kernel_program_cache.hpp +++ b/sycl/source/detail/kernel_program_cache.hpp @@ -442,6 +442,7 @@ class KernelProgramCache { return; // Save reference between the program and the fast cache key. + std::unique_lock Lock(MKernelFastCacheMutex); MProgramToKernelFastCacheKeyMap[Program].emplace_back(CacheKey); } @@ -495,16 +496,18 @@ class KernelProgramCache { LockedCacheKP.get().erase(NativePrg); } - // Remove corresponding entries from KernelFastCache. 
- auto FastCacheKeyItr = - MProgramToKernelFastCacheKeyMap.find(NativePrg); - if (FastCacheKeyItr != MProgramToKernelFastCacheKeyMap.end()) { - for (const auto &FastCacheKey : FastCacheKeyItr->second) { - std::unique_lock Lock(MKernelFastCacheMutex); - MKernelFastCache.erase(FastCacheKey); - traceKernel("Kernel evicted.", std::get<2>(FastCacheKey), true); + { + // Remove corresponding entries from KernelFastCache. + std::unique_lock Lock(MKernelFastCacheMutex); + if (auto FastCacheKeyItr = + MProgramToKernelFastCacheKeyMap.find(NativePrg); + FastCacheKeyItr != MProgramToKernelFastCacheKeyMap.end()) { + for (const auto &FastCacheKey : FastCacheKeyItr->second) { + MKernelFastCache.erase(FastCacheKey); + traceKernel("Kernel evicted.", std::get<2>(FastCacheKey), true); + } + MProgramToKernelFastCacheKeyMap.erase(FastCacheKeyItr); } - MProgramToKernelFastCacheKeyMap.erase(FastCacheKeyItr); } // Remove entry from ProgramCache KeyMap. @@ -617,6 +620,7 @@ class KernelProgramCache { /// /// This member function should only be used in unit tests. void reset() { + std::lock_guard EvictionListLock(MProgramEvictionListMutex); std::lock_guard L1(MProgramCacheMutex); std::lock_guard L2(MKernelsPerProgramCacheMutex); std::lock_guard L3(MKernelFastCacheMutex); @@ -624,9 +628,7 @@ class KernelProgramCache { MKernelsPerProgramCache = KernelCacheT{}; MKernelFastCache = KernelFastCacheT{}; MProgramToKernelFastCacheKeyMap.clear(); - // Clear the eviction lists and its mutexes. 
- std::lock_guard EvictionListLock(MProgramEvictionListMutex); MEvictionList.clear(); } diff --git a/sycl/source/detail/online_compiler/online_compiler.cpp b/sycl/source/detail/online_compiler/online_compiler.cpp index f5e62f8a1d4d8..138fc880edb92 100644 --- a/sycl/source/detail/online_compiler/online_compiler.cpp +++ b/sycl/source/detail/online_compiler/online_compiler.cpp @@ -98,12 +98,18 @@ compileToSPIRV(string_view Src, sycl::info::device_type DeviceType, #else static const std::string OclocLibraryName = "libocloc.so"; #endif - void *OclocLibrary = sycl::detail::ur::loadOsLibrary(OclocLibraryName); + auto CustomDeleter = [](void *StoredPtr) { + if (!StoredPtr) + return; + std::ignore = sycl::detail::ur::unloadOsLibrary(StoredPtr); + }; + std::unique_ptr OclocLibrary( + sycl::detail::ur::loadOsLibrary(OclocLibraryName), CustomDeleter); if (!OclocLibrary) throw online_compile_error("Cannot load ocloc library: " + OclocLibraryName); - void *OclocVersionHandle = - sycl::detail::ur::getOsLibraryFuncAddress(OclocLibrary, "oclocVersion"); + void *OclocVersionHandle = sycl::detail::ur::getOsLibraryFuncAddress( + OclocLibrary.get(), "oclocVersion"); // The initial versions of ocloc library did not have the oclocVersion() // function. Those versions had the same API as the first version of ocloc // library having that oclocVersion() function. 
@@ -129,18 +135,21 @@ compileToSPIRV(string_view Src, sycl::info::device_type DeviceType, std::to_string(CurrentVersionMajor) + ".N), where (N >= " + std::to_string(CurrentVersionMinor) + ")."); - CompileToSPIRVHandle = - sycl::detail::ur::getOsLibraryFuncAddress(OclocLibrary, "oclocInvoke"); + CompileToSPIRVHandle = sycl::detail::ur::getOsLibraryFuncAddress( + OclocLibrary.get(), "oclocInvoke"); if (!CompileToSPIRVHandle) throw online_compile_error("Cannot load oclocInvoke() function"); FreeSPIRVOutputsHandle = sycl::detail::ur::getOsLibraryFuncAddress( - OclocLibrary, "oclocFreeOutput"); - if (!FreeSPIRVOutputsHandle) + OclocLibrary.get(), "oclocFreeOutput"); + if (!FreeSPIRVOutputsHandle) { + CompileToSPIRVHandle = NULL; throw online_compile_error("Cannot load oclocFreeOutput() function"); + } + OclocLibrary.release(); } std::string CombinedUserArgs; - for (auto UserArg : UserArgs) { + for (const auto &UserArg : UserArgs) { if (UserArg == "") continue; if (CombinedUserArgs != "") diff --git a/sycl/source/detail/os_util.cpp b/sycl/source/detail/os_util.cpp index 018ba97cff05c..8c4c0bff1293d 100644 --- a/sycl/source/detail/os_util.cpp +++ b/sycl/source/detail/os_util.cpp @@ -7,14 +7,20 @@ //===----------------------------------------------------------------------===// #include +#include #include #include -#if __GNUC__ && __GNUC__ < 8 -// Don't include for GCC versions less than 8 +// For GCC versions less than 8, use experimental/filesystem. +#if defined(__has_include) && __has_include() +#include +namespace fs = std::filesystem; +#elif defined(__has_include) && __has_include() +#include +namespace fs = std::experimental::filesystem; #else -#include // C++ 17 std::create_directories +#error "OSUtils requires C++ filesystem support" #endif #if defined(__SYCL_RT_OS_LINUX) @@ -277,6 +283,45 @@ int OSUtil::makeDir(const char *Dir) { return 0; } +// Get size of file in bytes. 
+size_t getFileSize(const std::string &Path) { + return static_cast(fs::file_size(Path)); +} + +// Function to recursively iterate over the directory and execute +// 'Func' on each regular file. +void fileTreeWalk(const std::string Path, + std::function Func) { + + std::error_code EC; + for (auto It = fs::recursive_directory_iterator(Path, EC); + It != fs::recursive_directory_iterator(); It.increment(EC)) { + + // Errors can happen if a file was removed/added during the iteration. + if (EC) + throw sycl::exception( + make_error_code(errc::runtime), + "Failed to do File Tree Walk. Ensure that the directory is not " + "getting updated while FileTreeWalk is in progress.: " + + Path + "\n" + EC.message()); + + if (fs::is_regular_file(It->path())) + Func(It->path().string()); + } +} + +// Get size of a directory in bytes. +size_t getDirectorySize(const std::string &Path) { + size_t DirSizeVar = 0; + + auto CollectFIleSize = [&DirSizeVar](const std::string Path) { + DirSizeVar += getFileSize(Path); + }; + fileTreeWalk(Path, CollectFIleSize); + + return DirSizeVar; +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/persistent_device_code_cache.cpp b/sycl/source/detail/persistent_device_code_cache.cpp index a86e727dcca3a..47adbf133e85c 100644 --- a/sycl/source/detail/persistent_device_code_cache.cpp +++ b/sycl/source/detail/persistent_device_code_cache.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -178,6 +179,246 @@ getProgramBinaryData(const ur_program_handle_t &NativePrg, return Result; } +// Save the current time in a file. +void PersistentDeviceCodeCache::saveCurrentTimeInAFile(std::string FileName) { + // Lock the file to prevent concurrent writes. 
+ LockCacheItem Lock{FileName}; + if (Lock.isOwned()) { + try { + std::ofstream FileStream{FileName, std::ios::trunc}; + FileStream << std::chrono::high_resolution_clock::now() + .time_since_epoch() + .count(); + FileStream.close(); + } catch (std::exception &e) { + throw sycl::exception(make_error_code(errc::runtime), + "Failed to save current time in a file: " + + FileName + "\n" + std::string(e.what())); + } + } +} + +// Check if cache_size.txt file is present in the cache root directory. +// If not, create it and populate it with the size of the cache directory. +void PersistentDeviceCodeCache::repopulateCacheSizeFile( + const std::string &CacheRoot) { + + // No need to store cache size if eviction is disabled. + if (!isEvictionEnabled()) + return; + + const std::string CacheSizeFileName = "cache_size.txt"; + const std::string CacheSizeFile = CacheRoot + "/" + CacheSizeFileName; + + // If the cache size file is not present, calculate the size of the cache + // directory and write it to the file. + if (!OSUtil::isPathPresent(CacheSizeFile)) { + PersistentDeviceCodeCache::trace( + "Cache size file not present. Creating one."); + + // Take the lock to write the cache size to the file. + { + LockCacheItem Lock{CacheSizeFile}; + if (!Lock.isOwned()) { + // If some other process is writing the cache size, do not write it. + PersistentDeviceCodeCache::trace("Didnot create the cache size file. " + "Some other process is creating one."); + + // Stall until the other process creates the file. Stalling is important + // to prevent race between one process that's calculating the directory + // size and another process that's trying to create a new cache entry. + while (!OSUtil::isPathPresent(CacheSizeFile)) + continue; + } else { + // Calculate the size of the cache directory. + // During directory size calculation, do not add anything + // in the cache. Otherwise, we'll get a std::fs_error.
+ size_t CacheSize = getDirectorySize(CacheRoot); + + std::ofstream FileStream{CacheSizeFile}; + FileStream << CacheSize; + FileStream.close(); + PersistentDeviceCodeCache::trace("Cache size file created."); + } + } + } +} + +void PersistentDeviceCodeCache::evictItemsFromCache( + const std::string &CacheRoot, size_t CacheSize, size_t MaxCacheSize) { + PersistentDeviceCodeCache::trace("Cache eviction triggered."); + + // Evict half of the cache. + constexpr float HowMuchCacheToEvict = 0.5; + + // Create a file eviction_in_progress.lock to indicate that eviction is in + // progress. This file is used to prevent two processes from evicting the + // cache at the same time. + LockCacheItem Lock{CacheRoot + EvictionInProgressFileSuffix}; + if (!Lock.isOwned()) { + // If some other process is evicting the cache, return. + PersistentDeviceCodeCache::trace( + "Another process is evicting the cache. Returning."); + return; + } + + // Get the list of all files in the cache directory along with their last + // access time. + std::vector> FilesWithAccessTime; + + auto CollectFileAccessTime = [&FilesWithAccessTime](const std::string File) { + if (File.find(CacheEntryAccessTimeSuffix) != std::string::npos) { + std::ifstream FileStream{File}; + uint64_t AccessTime; + FileStream >> AccessTime; + FilesWithAccessTime.push_back({AccessTime, File}); + } + }; + + // fileTreeWalk can throw if any new file is created or removed during the + // iteration. Retry in that case. When eviction is in progress, we don't + // insert any new item but processes can still read the cache. Reading from + // cache can create/remove .lock file which can cause the exception. + while (true) { + try { + fileTreeWalk(CacheRoot, CollectFileAccessTime); + break; + } catch (...) { + FilesWithAccessTime.clear(); + // If the cache directory is removed during the iteration, retry. + continue; + } + } + + // Sort the files in the cache directory based on their last access time.
+ std::sort(FilesWithAccessTime.begin(), FilesWithAccessTime.end(), + [](const std::pair &A, + const std::pair &B) { + return A.first < B.first; + }); + + // Evict files from the cache directory until the cache size is less than the + // threshold. + size_t CurrCacheSize = CacheSize; + for (const auto &File : FilesWithAccessTime) { + + int pos = File.second.find(CacheEntryAccessTimeSuffix); + const std::string FileNameWOExt = File.second.substr(0, pos); + const std::string BinFile = FileNameWOExt + ".bin"; + const std::string SrcFile = FileNameWOExt + ".src"; + + while (OSUtil::isPathPresent(BinFile) || OSUtil::isPathPresent(SrcFile)) { + + // Lock to prevent race between writer and eviction thread. + LockCacheItem Lock{FileNameWOExt}; + if (Lock.isOwned()) { + // Remove the file and subtract its size from the cache size. + auto RemoveFileAndSubtractSize = [&CurrCacheSize]( + const std::string &FileName) { + // If the file is not present, return. + if (!OSUtil::isPathPresent(FileName)) + return; + + auto FileSize = getFileSize(FileName); + if (std::remove(FileName.c_str())) { + throw sycl::exception(make_error_code(errc::runtime), + "Failed to evict cache entry: " + FileName); + } else { + PersistentDeviceCodeCache::trace("File removed: " + FileName); + CurrCacheSize -= FileSize; + } + }; + + // If removal fails due to a race, retry. + // Races are rare, but can happen if another process is reading the + // file. Locking down the entire cache and blocking all readers would be + // inefficient. + try { + RemoveFileAndSubtractSize(SrcFile); + RemoveFileAndSubtractSize(BinFile); + } catch (...) { + continue; + } + } + } + + // If the cache size is less than the threshold, break. + if (CurrCacheSize <= (size_t)(HowMuchCacheToEvict * MaxCacheSize)) + break; + } + + // Update the cache size file with the new cache size. 
+ { + const std::string CacheSizeFileName = "cache_size.txt"; + const std::string CacheSizeFile = CacheRoot + "/" + CacheSizeFileName; + while (true) { + LockCacheItem Lock{CacheSizeFile}; + if (!Lock.isOwned()) { + // If some other process is writing the cache size, spin lock. + continue; + } else { + std::fstream FileStream; + FileStream.open(CacheSizeFile, std::ios::out | std::ios::trunc); + FileStream << CurrCacheSize; + FileStream.close(); + + PersistentDeviceCodeCache::trace( + "Updating the cache size file after eviction. New size: " + + std::to_string(CurrCacheSize)); + break; + } + } + } +} + +// Update the cache size file and trigger cache eviction if needed. +void PersistentDeviceCodeCache::updateCacheFileSizeAndTriggerEviction( + const std::string &CacheRoot, size_t ItemSize) { + + // No need to store cache size if eviction is disabled. + if (!isEvictionEnabled()) + return; + + const std::string CacheSizeFileName = "cache_size.txt"; + const std::string CacheSizeFile = CacheRoot + "/" + CacheSizeFileName; + size_t CurrentCacheSize = 0; + // Read the cache size from the file. + while (true) { + LockCacheItem Lock{CacheSizeFile}; + if (!Lock.isOwned()) { + // If some other process is writing the cache size, spin lock. + continue; + } else { + PersistentDeviceCodeCache::trace("Updating the cache size file."); + std::fstream FileStream; + FileStream.open(CacheSizeFile, std::ios::in); + + // Read the cache size from the file; + std::string line; + if (std::getline(FileStream, line)) { + CurrentCacheSize = std::stoull(line); + } + FileStream.close(); + + CurrentCacheSize += ItemSize; + + // Write the updated cache size to the file. + FileStream.open(CacheSizeFile, std::ios::out | std::ios::trunc); + FileStream << CurrentCacheSize; + FileStream.close(); + break; + } + } + + // Check if the cache size exceeds the threshold and trigger cache eviction if + // needed. 
+ size_t MaxCacheSize = SYCLConfig::getProgramCacheSize(); + if (CurrentCacheSize > MaxCacheSize) { + // Trigger cache eviction. + evictItemsFromCache(CacheRoot, CurrentCacheSize, MaxCacheSize); + } +} + /* Stores built program in persistent cache. We will put the binary for each * device in the list to a separate file. */ @@ -190,8 +431,21 @@ void PersistentDeviceCodeCache::putItemToDisc( if (!areImagesCacheable(Imgs)) return; + repopulateCacheSizeFile(getRootDir()); + + // Do not insert any new item if eviction is in progress. + // Since evictions are rare, we can afford to spin lock here. + const std::string EvictionInProgressFile = + getRootDir() + EvictionInProgressFileSuffix; + // Stall until the other process finishes eviction. + while (OSUtil::isPathPresent(EvictionInProgressFile)) + continue; + std::vector SortedImgs = getSortedImages(Imgs); auto BinaryData = getProgramBinaryData(NativePrg, Devices); + + // Total size of the item that we just wrote to the cache. + size_t TotalSize = 0; for (size_t DeviceIndex = 0; DeviceIndex < Devices.size(); DeviceIndex++) { // If we don't have binary for the device, skip it. if (BinaryData[DeviceIndex].empty()) @@ -202,9 +456,10 @@ void PersistentDeviceCodeCache::putItemToDisc( if (DirName.empty()) return; + std::string FileName; try { OSUtil::makeDir(DirName.c_str()); - std::string FileName = getUniqueFilename(DirName); + FileName = getUniqueFilename(DirName); LockCacheItem Lock{FileName}; if (Lock.isOwned()) { std::string FullFileName = FileName + ".bin"; @@ -212,6 +467,12 @@ void PersistentDeviceCodeCache::putItemToDisc( trace("device binary has been cached: " + FullFileName); writeSourceItem(FileName + ".src", Devices[DeviceIndex], SortedImgs, SpecConsts, BuildOptionsString); + + // Update Total cache size after adding the new items. 
+ TotalSize += getFileSize(FileName + ".src"); + TotalSize += getFileSize(FileName + ".bin"); + + saveCurrentTimeInAFile(FileName + CacheEntryAccessTimeSuffix); } else { PersistentDeviceCodeCache::trace("cache lock not owned " + FileName); } @@ -225,6 +486,10 @@ void PersistentDeviceCodeCache::putItemToDisc( std::strerror(errno)); } } + + // Update the cache size file and trigger cache eviction if needed. + if (TotalSize) + updateCacheFileSizeAndTriggerEviction(getRootDir(), TotalSize); } void PersistentDeviceCodeCache::putCompiledKernelToDisc( @@ -298,6 +563,12 @@ std::vector> PersistentDeviceCodeCache::getItemFromDisc( try { std::string FullFileName = FileName + ".bin"; Binaries[DeviceIndex] = readBinaryDataFromFile(FullFileName); + + // Explicitly update the access time of the file. This is required for + // eviction. + if (isEvictionEnabled()) + saveCurrentTimeInAFile(FileName + CacheEntryAccessTimeSuffix); + FileNames += FullFileName + ";"; break; } catch (...) { @@ -367,11 +638,18 @@ std::string PersistentDeviceCodeCache::getDeviceIDString(const device &Device) { } /* Write built binary to persistent cache - * Format: BinarySize, Binary + * Format: NumBinaries(=1), BinarySize, Binary */ void PersistentDeviceCodeCache::writeBinaryDataToFile( const std::string &FileName, const std::vector &Data) { std::ofstream FileStream{FileName, std::ios::binary}; + // The reason why we need to write number of binaries (in current + // implementation always 1) is to keep compatibility with the old format of + // files in persistent cache, so that new runtime can use binaries from + // persistent cache generated by old compiler/runtime. + size_t NumBinaries = 1; + FileStream.write((char *)&NumBinaries, sizeof(NumBinaries)); + auto Size = Data.size(); FileStream.write((char *)&Size, sizeof(Size)); FileStream.write(Data.data(), Size); @@ -380,11 +658,26 @@ void PersistentDeviceCodeCache::writeBinaryDataToFile( } /* Read built binary from persistent cache. 
Each persistent cache file contains - * binary for a single device. Format: BinarySize, Binary + * binary for a single device. + * Format: NumBinaries(=1), BinarySize, Binary */ std::vector PersistentDeviceCodeCache::readBinaryDataFromFile(const std::string &FileName) { std::ifstream FileStream{FileName, std::ios::binary}; + // We ignore this number, we always read single device binary from a file and + // we need this just to keep compatibility with the old format of files in + // persistent cache, so that new runtime can use binaries from persistent + // cache generated by old compiler/runtime. + size_t NumBinaries = 0; + FileStream.read((char *)&NumBinaries, sizeof(NumBinaries)); + if (FileStream.fail()) { + trace("Failed to read number of binaries from " + FileName); + return {}; + } + // Even in the old implementation we could only put a single binary to the + // persistent cache in all scenarios, multi-device case wasn't supported. + assert(NumBinaries == 1); + size_t BinarySize = 0; FileStream.read((char *)&BinarySize, sizeof(BinarySize)); @@ -401,8 +694,8 @@ PersistentDeviceCodeCache::readBinaryDataFromFile(const std::string &FileName) { } /* Writing cache item key sources to be used for reliable identification - * Format: Four pairs of [size, value] for device, build options, specialization - * constant values, device code SPIR-V images. + * Format: Four pairs of [size, value] for device, build options, + * specialization constant values, device code SPIR-V images. 
*/ void PersistentDeviceCodeCache::writeSourceItem( const std::string &FileName, const device &Device, diff --git a/sycl/source/detail/persistent_device_code_cache.hpp b/sycl/source/detail/persistent_device_code_cache.hpp index 24cc0bfad83f1..c51e5e55bc22b 100644 --- a/sycl/source/detail/persistent_device_code_cache.hpp +++ b/sycl/source/detail/persistent_device_code_cache.hpp @@ -91,13 +91,19 @@ class PersistentDeviceCodeCache { */ private: /* Write built binary to persistent cache - * Format: BinarySize, Binary + * Format: NumBinaries(=1), BinarySize, Binary + * The reason why we need to write a number of binaries (always 1 in current + * implementation) is to keep compatibility with the old format of files in + * the persistent cache, so that new runtime can use binaries from the + * persistent cache generated by an old compiler/runtime. NumBinaries can be + * removed at next ABI breaking window. */ static void writeBinaryDataToFile(const std::string &FileName, const std::vector &Data); - /* Read built binary to persistent cache - * Format: BinarySize, Binary + /* Read built binary from persistent cache + * Format: NumBinaries(=1), BinarySize, Binary + * See comment above regarding the reason why we need NumBinaries. */ static std::vector readBinaryDataFromFile(const std::string &FileName); @@ -118,9 +124,6 @@ class PersistentDeviceCodeCache { const std::vector &SortedImgs, const SerializedObj &SpecConsts, const std::string &BuildOptionsString); - /* Returns the path to directory storing persistent device code cache.*/ - static std::string getRootDir(); - /* Form string representing device version */ static std::string getDeviceIDString(const device &Device); @@ -152,6 +155,9 @@ class PersistentDeviceCodeCache { 1024 * 1024 * 1024; public: + /* Returns the path to directory storing persistent device code cache.*/ + static std::string getRootDir(); + /* Check if on-disk cache enabled. 
*/ static bool isEnabled(); @@ -214,6 +220,34 @@ class PersistentDeviceCodeCache { if (traceEnabled) std::cerr << "[kernel_compiler Persistent Cache]: " << msg << std::endl; } + +private: + // Check if cache_size.txt file is present in the cache root directory. + // If not, create it and populate it with the size of the cache directory. + static void repopulateCacheSizeFile(const std::string &CacheRoot); + + // Update the cache size file and trigger cache eviction if needed. + static void + updateCacheFileSizeAndTriggerEviction(const std::string &CacheRoot, + size_t CacheSize); + + // Evict LRU items from the cache to make space for new items. + static void evictItemsFromCache(const std::string &CacheRoot, + size_t CacheSize, size_t MaxCacheSize); + + static void saveCurrentTimeInAFile(std::string FileName); + + // Check if eviction is enabled. + static bool isEvictionEnabled() { + return SYCLConfig::isPersistentCacheEvictionEnabled(); + } + + // Suffix for access time file. Every cache entry will have one. + static inline std::string CacheEntryAccessTimeSuffix = "_access_time.txt"; + // Suffix for eviction in progress file. It is created when eviction is + // triggered and removed when eviction is done. + static inline std::string EvictionInProgressFileSuffix = + "_eviction_in_progress"; }; } // namespace detail } // namespace _V1 diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index e23673b3ee341..ad9cf4d9b92f8 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -799,9 +799,9 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( RootDevImpl->getHandleRef(), UR_DEVICE_INFO_BUILD_ON_SUBDEVICE, sizeof(ur_bool_t), &MustBuildOnSubdevice, nullptr); - DeviceImplPtr Dev = (MustBuildOnSubdevice == true) ?
DeviceImpl : RootDevImpl; auto Context = createSyclObjFromImpl(ContextImpl); - auto Device = createSyclObjFromImpl(Dev); + auto Device = createSyclObjFromImpl( + MustBuildOnSubdevice == true ? DeviceImpl : RootDevImpl); const RTDeviceBinaryImage &Img = getDeviceImage(KernelName, Context, Device, JITCompilationIsRequired); @@ -822,7 +822,7 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( std::copy(DeviceImagesToLink.begin(), DeviceImagesToLink.end(), std::back_inserter(AllImages)); - return getBuiltURProgram(std::move(AllImages), Context, {Device}); + return getBuiltURProgram(std::move(AllImages), Context, {std::move(Device)}); } ur_program_handle_t ProgramManager::getBuiltURProgram( @@ -1008,7 +1008,7 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( } } // Change device in the cache key to reduce copying of spec const data. - CacheKey.second = Subset; + CacheKey.second = std::move(Subset); bool DidInsert = Cache.insertBuiltProgram(CacheKey, ResProgram); if (DidInsert) { // For every cached copy of the program, we need to increment its diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 7a20d8122dc36..31db161f88726 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2569,9 +2569,8 @@ getCGKernelInfo(const CGExecKernel &CommandGroup, ContextImplPtr ContextImpl, // they can simply be launched directly. if (auto KernelBundleImplPtr = CommandGroup.MKernelBundle; KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) { - auto KernelName = CommandGroup.MKernelName; - kernel_id KernelID = - detail::ProgramManager::getInstance().getSYCLKernelID(KernelName); + kernel_id KernelID = detail::ProgramManager::getInstance().getSYCLKernelID( + CommandGroup.MKernelName); kernel SyclKernel = KernelBundleImplPtr->get_kernel(KernelID, KernelBundleImplPtr); @@ -2775,8 +2774,8 @@ void enqueueImpKernel( // Initialize device globals associated with this. 
std::vector DeviceGlobalInitEvents = ContextImpl->initializeDeviceGlobals(Program, Queue); - std::vector EventsWithDeviceGlobalInits; if (!DeviceGlobalInitEvents.empty()) { + std::vector EventsWithDeviceGlobalInits; EventsWithDeviceGlobalInits.reserve(RawEvents.size() + DeviceGlobalInitEvents.size()); EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(), @@ -2784,7 +2783,7 @@ void enqueueImpKernel( EventsWithDeviceGlobalInits.insert(EventsWithDeviceGlobalInits.end(), DeviceGlobalInitEvents.begin(), DeviceGlobalInitEvents.end()); - EventsWaitList = EventsWithDeviceGlobalInits; + EventsWaitList = std::move(EventsWithDeviceGlobalInits); } ur_result_t Error = UR_RESULT_SUCCESS; @@ -3642,6 +3641,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { // we don't need to enqueue anything. return UR_RESULT_SUCCESS; } + assert(MQueue && "Empty node should have an associated queue"); const detail::AdapterPtr &Adapter = MQueue->getAdapter(); ur_event_handle_t Event; ur_result_t Result = Adapter->call_nocheck( diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index e7b32c0fe38ae..19d0fe95706cf 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -543,11 +543,10 @@ event handler::finalize() { // In-order queues create implicit linear dependencies between nodes. // Find the last node added to the graph from this queue, so our new // node can set it as a predecessor. - auto DependentNode = GraphImpl->getLastInorderNode(MQueue); std::vector> Deps; - if (DependentNode) { - Deps.push_back(DependentNode); + if (auto DependentNode = GraphImpl->getLastInorderNode(MQueue)) { + Deps.push_back(std::move(DependentNode)); } NodeImpl = GraphImpl->add(NodeType, std::move(CommandGroup), Deps); @@ -571,7 +570,7 @@ event handler::finalize() { } // Associate an event with this new node and return the event. 
- GraphImpl->addEventForNode(EventImpl, NodeImpl); + GraphImpl->addEventForNode(EventImpl, std::move(NodeImpl)); return detail::createSyclObjFromImpl(EventImpl); } diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 399d67af8afc8..831c3d0a87555 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -199,12 +199,12 @@ event queue::mem_advise(const void *Ptr, size_t Length, int Advice, /// TODO: Unused. Remove these when ABI-break window is open. event queue::submit_impl(std::function CGH, const detail::code_location &CodeLoc) { - return submit_with_event_impl(CGH, {}, CodeLoc, true); + return submit_with_event_impl(std::move(CGH), {}, CodeLoc, true); } event queue::submit_impl(std::function CGH, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - return submit_with_event_impl(CGH, {}, CodeLoc, IsTopCodeLoc); + return submit_with_event_impl(std::move(CGH), {}, CodeLoc, IsTopCodeLoc); } event queue::submit_impl(std::function CGH, queue SecondQueue, @@ -219,12 +219,12 @@ event queue::submit_impl(std::function CGH, queue SecondQueue, void queue::submit_without_event_impl(std::function CGH, const detail::code_location &CodeLoc) { - submit_without_event_impl(CGH, {}, CodeLoc, true); + submit_without_event_impl(std::move(CGH), {}, CodeLoc, true); } void queue::submit_without_event_impl(std::function CGH, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { - submit_without_event_impl(CGH, {}, CodeLoc, IsTopCodeLoc); + submit_without_event_impl(std::move(CGH), {}, CodeLoc, IsTopCodeLoc); } event queue::submit_impl_and_postprocess( @@ -232,14 +232,14 @@ event queue::submit_impl_and_postprocess( const detail::SubmitPostProcessF &PostProcess) { detail::SubmissionInfo SI{}; SI.PostProcessorFunc() = std::move(PostProcess); - return submit_with_event_impl(CGH, SI, CodeLoc, true); + return submit_with_event_impl(std::move(CGH), SI, CodeLoc, true); } event queue::submit_impl_and_postprocess( std::function CGH, const detail::code_location 
&CodeLoc, const detail::SubmitPostProcessF &PostProcess, bool IsTopCodeLoc) { detail::SubmissionInfo SI{}; SI.PostProcessorFunc() = std::move(PostProcess); - return submit_with_event_impl(CGH, SI, CodeLoc, IsTopCodeLoc); + return submit_with_event_impl(std::move(CGH), SI, CodeLoc, IsTopCodeLoc); } event queue::submit_impl_and_postprocess( diff --git a/sycl/test-e2e/AOT/cpu.cpp b/sycl/test-e2e/AOT/cpu.cpp index f0c71be2d481e..e7bedc04b07ba 100644 --- a/sycl/test-e2e/AOT/cpu.cpp +++ b/sycl/test-e2e/AOT/cpu.cpp @@ -7,6 +7,7 @@ //===---------------------------------------------------------------------===// // REQUIRES: opencl-aot, cpu +// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64 %S/Inputs/aot.cpp -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/AOT/double.cpp b/sycl/test-e2e/AOT/double.cpp index ead41bfc8c741..4d48f5c7b57ec 100644 --- a/sycl/test-e2e/AOT/double.cpp +++ b/sycl/test-e2e/AOT/double.cpp @@ -2,6 +2,7 @@ // using fp64 can be compiled AOT. // REQUIRES: ocloc, opencl-aot, any-device-is-cpu +// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_tgllp -o %t.tgllp.out %s // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64 -o %t.x86.out %s // RUN: %if cpu %{ %{run} %t.x86.out %} diff --git a/sycl/test-e2e/AOT/half.cpp b/sycl/test-e2e/AOT/half.cpp index e843ee65bbeef..5711b21735f72 100644 --- a/sycl/test-e2e/AOT/half.cpp +++ b/sycl/test-e2e/AOT/half.cpp @@ -2,6 +2,7 @@ // using fp16 can be compiled AOT. 
// REQUIRES: ocloc, opencl-aot, any-device-is-cpu +// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_tgllp -o %t.tgllp.out %s // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64 -o %t.x86.out %s // RUN: %if cpu %{ %{run} %t.x86.out %} diff --git a/sycl/test-e2e/Adapters/sycl-ls-gpu-default-any.cpp b/sycl/test-e2e/Adapters/sycl-ls-gpu-default-any.cpp index 046a41494370b..1c90ea9f1e32f 100644 --- a/sycl/test-e2e/Adapters/sycl-ls-gpu-default-any.cpp +++ b/sycl/test-e2e/Adapters/sycl-ls-gpu-default-any.cpp @@ -1,4 +1,5 @@ // REQUIRES: gpu +// REQUIRES: build-and-run-mode // TODO: Remove unsetting SYCL_DEVICE_FILTER when feature is dropped // RUN: env --unset=SYCL_DEVICE_FILTER --unset=ONEAPI_DEVICE_SELECTOR sycl-ls --verbose >%t.default.out diff --git a/sycl/test-e2e/Adapters/sycl-ls-gpu-default-level-zero.cpp b/sycl/test-e2e/Adapters/sycl-ls-gpu-default-level-zero.cpp index 6f144553fff7b..cd6ece996d9f8 100644 --- a/sycl/test-e2e/Adapters/sycl-ls-gpu-default-level-zero.cpp +++ b/sycl/test-e2e/Adapters/sycl-ls-gpu-default-level-zero.cpp @@ -1,4 +1,5 @@ // REQUIRES: gpu, level-zero +// REQUIRES: build-and-run-mode // TODO: Remove unsetting SYCL_DEVICE_FILTER when feature is dropped // RUN: env --unset=SYCL_DEVICE_FILTER --unset=ONEAPI_DEVICE_SELECTOR sycl-ls --verbose >%t.default.out diff --git a/sycl/test-e2e/Adapters/sycl-ls-gpu-level-zero.cpp b/sycl/test-e2e/Adapters/sycl-ls-gpu-level-zero.cpp index 36eb621e923a6..f20ea34182833 100644 --- a/sycl/test-e2e/Adapters/sycl-ls-gpu-level-zero.cpp +++ b/sycl/test-e2e/Adapters/sycl-ls-gpu-level-zero.cpp @@ -1,4 +1,5 @@ // REQUIRES: gpu, level_zero +// REQUIRES: build-and-run-mode // RUN: sycl-ls --verbose >%t.default.out // RUN: FileCheck %s --check-prefixes=CHECK-GPU-BUILTIN,CHECK-GPU-CUSTOM --input-file %t.default.out diff --git a/sycl/test-e2e/Adapters/sycl-ls-gpu-opencl.cpp b/sycl/test-e2e/Adapters/sycl-ls-gpu-opencl.cpp index fa764d0629a0f..fda84100fecd6 100644 --- 
a/sycl/test-e2e/Adapters/sycl-ls-gpu-opencl.cpp +++ b/sycl/test-e2e/Adapters/sycl-ls-gpu-opencl.cpp @@ -1,4 +1,5 @@ // REQUIRES: gpu, opencl +// REQUIRES: build-and-run-mode // RUN: env ONEAPI_DEVICE_SELECTOR="opencl:*" sycl-ls --verbose >%t.opencl.out // RUN: FileCheck %s --check-prefixes=CHECK-GPU-BUILTIN,CHECK-GPU-CUSTOM --input-file %t.opencl.out diff --git a/sycl/test-e2e/Adapters/sycl-ls-uuid-subdevs.cpp b/sycl/test-e2e/Adapters/sycl-ls-uuid-subdevs.cpp index b7f83f8f337cd..f5d155d15ee5b 100644 --- a/sycl/test-e2e/Adapters/sycl-ls-uuid-subdevs.cpp +++ b/sycl/test-e2e/Adapters/sycl-ls-uuid-subdevs.cpp @@ -1,4 +1,5 @@ /* Test to check that sycl-ls is outputting UUID and number of sub and sub-sub +// REQUIRES: build-and-run-mode * devices. */ // REQUIRES: gpu, level_zero diff --git a/sycl/test-e2e/AddressSanitizer/common/ignorelist.txt b/sycl/test-e2e/AddressSanitizer/common/ignorelist.txt index 7251d5e24767d..79d54df5465d3 100644 --- a/sycl/test-e2e/AddressSanitizer/common/ignorelist.txt +++ b/sycl/test-e2e/AddressSanitizer/common/ignorelist.txt @@ -1 +1 @@ -fun:*MyKernel* +fun:*NoSanitized* diff --git a/sycl/test-e2e/AddressSanitizer/common/kernel-filter.cpp b/sycl/test-e2e/AddressSanitizer/common/kernel-filter.cpp index 2830241eda489..b7dc64c3c3de8 100644 --- a/sycl/test-e2e/AddressSanitizer/common/kernel-filter.cpp +++ b/sycl/test-e2e/AddressSanitizer/common/kernel-filter.cpp @@ -1,27 +1,51 @@ -// REQUIRES: linux -// RUN: %{build} %device_asan_flags -O2 -fsanitize-ignorelist=%p/ignorelist.txt -o %t -// RUN: %{run} %t 2>&1 | FileCheck %s -// RUN: %{build} %device_asan_flags %if cpu %{ -fsycl-targets=spir64_x86_64 %} %if gpu %{ -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen %gpu_aot_target_opts %} -O2 -fsanitize-ignorelist=%p/ignorelist.txt -o %t2 +// REQUIRES: linux, cpu || (gpu && level_zero) +// RUN: %{build} %device_asan_flags -O2 -fsanitize-ignorelist=%p/ignorelist.txt -o %t1 +// RUN: %{run} %t1 2>&1 | FileCheck %s +// RUN: %{build} 
%device_asan_aot_flags -O2 -fsanitize-ignorelist=%p/ignorelist.txt -o %t2 // RUN: %{run} %t2 2>&1 | FileCheck %s #include #include int main() { + constexpr std::size_t N = 8; + constexpr std::size_t group_size = 4; + sycl::queue Q; - constexpr std::size_t N = 16; + auto *array = sycl::malloc_device(N, Q); + std::vector v(N); + sycl::buffer buf(v.data(), v.size()); + Q.submit([&](sycl::handler &h) { - h.parallel_for( - sycl::nd_range<1>(N + 1, 1), - [=](sycl::nd_item<1> item) { ++array[item.get_global_id(0)]; }); + auto buf_acc = buf.get_access(h); + auto loc_acc = sycl::local_accessor(group_size, h); + h.parallel_for( + sycl::nd_range<1>(N, group_size), [=](sycl::nd_item<1> item) { + auto gid = item.get_global_id(0); + auto lid = item.get_local_id(0); + array[gid] = buf_acc[gid] + loc_acc[lid]; + }); + }); + Q.wait(); + // CHECK-NOT: ERROR: DeviceSanitizer: out-of-bounds-access + + Q.submit([&](sycl::handler &h) { + auto buf_acc = buf.get_access(h); + auto loc_acc = sycl::local_accessor(group_size, h); + h.parallel_for(sycl::nd_range<1>(N, group_size), + [=](sycl::nd_item<1> item) { + auto gid = item.get_global_id(0); + auto lid = item.get_local_id(0); + array[gid] = buf_acc[gid] + loc_acc[lid]; + }); }); Q.wait(); sycl::free(array, Q); std::cout << "PASS" << std::endl; + // CHECK: PASS + return 0; } - -// CHECK: PASS diff --git a/sycl/test-e2e/AddressSanitizer/lit.local.cfg b/sycl/test-e2e/AddressSanitizer/lit.local.cfg index e7575dcca0e13..d768697d07f6d 100644 --- a/sycl/test-e2e/AddressSanitizer/lit.local.cfg +++ b/sycl/test-e2e/AddressSanitizer/lit.local.cfg @@ -1,14 +1,12 @@ -# This test assumes it can only run in CPU/PVC/DG2 devices, which support usm_device_allocations aspect - config.substitutions.append( ("%device_asan_flags", "-Xarch_device -fsanitize=address") ) - +config.substitutions.append( + ("%device_asan_aot_flags", "-Xarch_device -fsanitize=address %if cpu %{ -fsycl-targets=spir64_x86_64 %} %if gpu %{ -fsycl-targets=spir64_gen 
-Xsycl-target-backend=spir64_gen %gpu_aot_target_opts %}") +) config.substitutions.append( ("%force_device_asan_rt", "env UR_ENABLE_LAYERS=UR_LAYER_ASAN") ) -config.unsupported_features += ['cuda', 'hip'] - # https://github.com/intel/llvm/issues/15953 config.unsupported_features += ['gpu-intel-gen12'] diff --git a/sycl/test-e2e/AddressSanitizer/nullpointer/private_nullptr.cpp b/sycl/test-e2e/AddressSanitizer/nullpointer/private_nullptr.cpp index 27b4b5a6fab17..fd7cae43dd3ea 100644 --- a/sycl/test-e2e/AddressSanitizer/nullpointer/private_nullptr.cpp +++ b/sycl/test-e2e/AddressSanitizer/nullpointer/private_nullptr.cpp @@ -7,7 +7,7 @@ // RUN: %{run} not %t3.out 2>&1 | FileCheck %s // FIXME: There's an issue in gfx driver, so this test pending here. -// XFAIL: * +// XFAIL: run-mode #include #include diff --git a/sycl/test-e2e/AddressSanitizer/out-of-bounds/DeviceGlobal/device_global.cpp b/sycl/test-e2e/AddressSanitizer/out-of-bounds/DeviceGlobal/device_global.cpp index 36c66562c08f4..94325a52eb7fa 100644 --- a/sycl/test-e2e/AddressSanitizer/out-of-bounds/DeviceGlobal/device_global.cpp +++ b/sycl/test-e2e/AddressSanitizer/out-of-bounds/DeviceGlobal/device_global.cpp @@ -6,6 +6,10 @@ // RUN: %{build} %device_asan_flags -O2 -g -o %t3.out // RUN: %{run} not %t3.out 2>&1 | FileCheck %s +// Flakily timesout on PVC +// UNSUPPORTED: arch-intel_gpu_pvc +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16401 + #include #include diff --git a/sycl/test-e2e/Assert/assert_in_multiple_tus.cpp b/sycl/test-e2e/Assert/assert_in_multiple_tus.cpp index d54bcdf93df1d..bae87a0144c89 100644 --- a/sycl/test-e2e/Assert/assert_in_multiple_tus.cpp +++ b/sycl/test-e2e/Assert/assert_in_multiple_tus.cpp @@ -9,8 +9,8 @@ // XFAIL: (opencl && gpu) // XFAIL-TRACKER: https://github.com/intel/llvm/issues/11364 -// UNSUPPORTED: gpu-intel-dg2 -// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/15029 +// Test requires at least this version of the Intel GPU driver on Arc. 
+// REQUIRES-INTEL-DRIVER: lin: 31294 // RUN: %{build} -DSYCL_FALLBACK_ASSERT=1 -I %S/Inputs %S/Inputs/kernels_in_file2.cpp -o %t.out // RUN: %{run} %t.out &> %t.txt ; FileCheck %s --input-file %t.txt %if fpga %{ --check-prefix=CHECK-ACC %} diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_aot.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_aot.cpp index 8337716c3191e..32c6c3aae8f06 100644 --- a/sycl/test-e2e/BFloat16/bfloat16_example_aot.cpp +++ b/sycl/test-e2e/BFloat16/bfloat16_example_aot.cpp @@ -4,6 +4,7 @@ /// // REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, any-device-is-cpu +// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-targets=spir64 %s -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp b/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp index 2f6d893768c4e..9231a62428272 100644 --- a/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp +++ b/sycl/test-e2e/BFloat16/bfloat16_example_aot_cpu.cpp @@ -4,6 +4,7 @@ /// // REQUIRES: opencl-aot, ocloc, gpu-intel-gen12, any-device-is-cpu +// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-targets=spir64,spir64_gen -Xsycl-target-backend=spir64_gen "-device dg1" %s -o %t.out // RUN: %if cpu %{ %{run} %t.out %} diff --git a/sycl/test-e2e/Basic/accessor/accessor.cpp b/sycl/test-e2e/Basic/accessor/accessor.cpp index de944ba12754a..f51bf53173a82 100644 --- a/sycl/test-e2e/Basic/accessor/accessor.cpp +++ b/sycl/test-e2e/Basic/accessor/accessor.cpp @@ -1,6 +1,10 @@ // RUN: %{build} -DSYCL2020_DISABLE_DEPRECATION_WARNINGS -o %t.out // RUN: %{run} %t.out +// Test flakily fails on PVC. +// UNSUPPORTED: arch-intel_gpu_pvc +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16401 + //==----------------accessor.cpp - SYCL accessor basic test ----------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
diff --git a/sycl/test-e2e/Basic/build_log.cpp b/sycl/test-e2e/Basic/build_log.cpp index ae63b0bb5b287..69f039dde80db 100644 --- a/sycl/test-e2e/Basic/build_log.cpp +++ b/sycl/test-e2e/Basic/build_log.cpp @@ -1,5 +1,5 @@ // REQUIRES: opencl || level_zero, gpu, ocloc -// UNSUPPORTED: gpu-intel-dg1 +// UNSUPPORTED: arch-intel_gpu_dg1 // // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device dg1" %s -o %t.out // RUN: env SYCL_RT_WARNING_LEVEL=2 %{run} %t.out 2>&1 | FileCheck %s diff --git a/sycl/test-e2e/Basic/interop/interop_all_backends.cpp b/sycl/test-e2e/Basic/interop/interop_all_backends.cpp index fd6503772d356..902ab1f1d09b9 100644 --- a/sycl/test-e2e/Basic/interop/interop_all_backends.cpp +++ b/sycl/test-e2e/Basic/interop/interop_all_backends.cpp @@ -1,8 +1,7 @@ -// XFAIL: any-device-is-opencl, any-device-is-cuda, any-device-is-level_zero, gpu-intel-dg2, hip_amd -// XFAIL-TRACKER: https://github.com/intel/llvm/issues/15819 +// XFAIL: any-device-is-cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16070 // RUN: %if any-device-is-opencl %{ %{build} -o %t-opencl.out %} -// RUN: %if any-device-is-level_zero %{ %{build} -DBUILD_FOR_L0 -o %t-l0.out %} -// RUN: %if any-device-is-cuda %{ %{build} -DBUILD_FOR_CUDA -o %t-cuda.out %} +// RUN: %if any-device-is-cuda %{ %{build} -isystem %sycl_include -DBUILD_FOR_CUDA -o %t-cuda.out %} // RUN: %if any-device-is-hip %{ %{build} -DBUILD_FOR_HIP -o %t-hip.out %} #include @@ -20,17 +19,12 @@ using nativeEvent = CUevent; #elif defined(BUILD_FOR_HIP) #include constexpr auto BACKEND = backend::ext_oneapi_hip; -using nativeDevice = hipDevice_t; -using nativeQueue = hipStream_t; -using nativeEvent = hipEvent_t; -#elif defined(BUILD_FOR_L0) -constexpr auto BACKEND = backend::ext_oneapi_level_zero; -using nativeDevice = ze_device_handle_t; -using nativeQueue = ze_command_queue_handle_t; -using nativeEvent = ze_event_handle_t; +using nativeDevice = device; +using nativeQueue = 
ihipStream_t; +using nativeEvent = ihipEvent_t; #else constexpr auto BACKEND = backend::opencl; -using nativeDevice = cl_device; +using nativeDevice = cl_device_id; using nativeQueue = cl_command_queue; using nativeEvent = cl_event; #endif diff --git a/sycl/test-e2e/Basic/query.cpp b/sycl/test-e2e/Basic/query.cpp index 01576c78d2587..12a64a2e4452c 100644 --- a/sycl/test-e2e/Basic/query.cpp +++ b/sycl/test-e2e/Basic/query.cpp @@ -1,5 +1,5 @@ // RUN: %{build} -o %t.out // RUN: %{run-unfiltered-devices} env ONEAPI_DEVICE_SELECTOR="*:gpu" %t.out -// REQUIRES: gpu-intel-dg1 +// REQUIRES: arch-intel_gpu_dg1 #include "query.hpp" diff --git a/sycl/test-e2e/Compression/compression_aot.cpp b/sycl/test-e2e/Compression/compression_aot.cpp index b8293f1c79351..ba95974fde781 100644 --- a/sycl/test-e2e/Compression/compression_aot.cpp +++ b/sycl/test-e2e/Compression/compression_aot.cpp @@ -1,5 +1,6 @@ // End-to-End test for testing device image compression in AOT. // REQUIRES: zstd, opencl-aot, cpu +// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64 %O0 --offload-compress --offload-compression-level=3 %S/Inputs/single_kernel.cpp -o %t_compress.out // RUN: %{run} %t_compress.out diff --git a/sycl/test-e2e/Compression/compression_separate_compile.cpp b/sycl/test-e2e/Compression/compression_separate_compile.cpp index dab17e3506b4e..afbe442230e29 100644 --- a/sycl/test-e2e/Compression/compression_separate_compile.cpp +++ b/sycl/test-e2e/Compression/compression_separate_compile.cpp @@ -2,6 +2,7 @@ // seperatly compile and link device images. 
// REQUIRES: zstd, opencl-aot, cpu, linux +// REQUIRES: build-and-run-mode ////////////////////// Compile device images // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64 -fsycl-host-compiler=clang++ -fsycl-host-compiler-options='-std=c++17 -Wno-attributes -Wno-deprecated-declarations -fPIC -DENABLE_KERNEL1' -DENABLE_KERNEL1 -c %s -o %t_kernel1_aot.o diff --git a/sycl/test-e2e/DeviceArchitecture/device_architecture_comparison_on_device_aot.cpp b/sycl/test-e2e/DeviceArchitecture/device_architecture_comparison_on_device_aot.cpp index 10dcf2f5b528f..88f55d00aa903 100644 --- a/sycl/test-e2e/DeviceArchitecture/device_architecture_comparison_on_device_aot.cpp +++ b/sycl/test-e2e/DeviceArchitecture/device_architecture_comparison_on_device_aot.cpp @@ -1,5 +1,8 @@ // REQUIRES: arch-intel_gpu_pvc, ocloc +// XFAIL: arch-intel_gpu_pvc +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16401 + // RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_pvc %s -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/DeviceArchitecture/device_architecture_on_device_aot.cpp b/sycl/test-e2e/DeviceArchitecture/device_architecture_on_device_aot.cpp index a62ae965683ae..1d4c41f117136 100644 --- a/sycl/test-e2e/DeviceArchitecture/device_architecture_on_device_aot.cpp +++ b/sycl/test-e2e/DeviceArchitecture/device_architecture_on_device_aot.cpp @@ -1,4 +1,5 @@ // REQUIRES: opencl-aot, cpu +// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64 %s -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/DeviceCodeSplit/aot-cpu.cpp b/sycl/test-e2e/DeviceCodeSplit/aot-cpu.cpp index d2937b796fe63..319c69af0faef 100644 --- a/sycl/test-e2e/DeviceCodeSplit/aot-cpu.cpp +++ b/sycl/test-e2e/DeviceCodeSplit/aot-cpu.cpp @@ -1,4 +1,5 @@ // REQUIRES: opencl-aot, cpu +// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-device-code-split=per_source -fsycl-targets=spir64_x86_64 -I %S/Inputs -o %t.out %S/split-per-source-main.cpp 
%S/Inputs/split-per-source-second-file.cpp \ // RUN: -fsycl-dead-args-optimization diff --git a/sycl/test-e2e/DeviceCodeSplit/grf.cpp b/sycl/test-e2e/DeviceCodeSplit/grf.cpp index b3d02a7fea35c..1e5b085d207d6 100644 --- a/sycl/test-e2e/DeviceCodeSplit/grf.cpp +++ b/sycl/test-e2e/DeviceCodeSplit/grf.cpp @@ -14,6 +14,8 @@ // compiler option // REQUIRES: arch-intel_gpu_pvc +// XFAIL: arch-intel_gpu_pvc +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16401 // RUN: %{build} -Wno-error=deprecated-declarations -o %t1.out // RUN: env SYCL_UR_TRACE=2 %{run} %t1.out 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-NO-VAR diff --git a/sycl/test-e2e/DeviceLib/assert-aot.cpp b/sycl/test-e2e/DeviceLib/assert-aot.cpp index ccff887e80adf..f5c8c61f07d78 100644 --- a/sycl/test-e2e/DeviceLib/assert-aot.cpp +++ b/sycl/test-e2e/DeviceLib/assert-aot.cpp @@ -1,4 +1,5 @@ // REQUIRES: opencl-aot, cpu, linux +// REQUIRES: build-and-run-mode // RUN: %clangxx -DSYCL_FALLBACK_ASSERT=1 -fsycl -fsycl-targets=spir64_x86_64 %S/assert.cpp -o %t.aot.out // RUN: env EXPECTED_SIGNAL=SIGABRT SHOULD_CRASH=1 %{run} %t.aot.out 2>&1 | FileCheck %S/assert.cpp --check-prefixes=CHECK-MESSAGE diff --git a/sycl/test-e2e/DeviceLib/cmath-aot.cpp b/sycl/test-e2e/DeviceLib/cmath-aot.cpp index bb9e201de9282..c9897f652f034 100644 --- a/sycl/test-e2e/DeviceLib/cmath-aot.cpp +++ b/sycl/test-e2e/DeviceLib/cmath-aot.cpp @@ -1,4 +1,5 @@ // REQUIRES: opencl-aot, cpu +// REQUIRES: build-and-run-mode // UNSUPPORTED: windows // DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%} diff --git a/sycl/test-e2e/ESIMD/PerformanceTests/invoke_simd_smoke.cpp b/sycl/test-e2e/ESIMD/PerformanceTests/invoke_simd_smoke.cpp index c6d419f486a9b..8c148091356fa 100644 --- a/sycl/test-e2e/ESIMD/PerformanceTests/invoke_simd_smoke.cpp +++ b/sycl/test-e2e/ESIMD/PerformanceTests/invoke_simd_smoke.cpp @@ -5,7 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // 
//===----------------------------------------------------------------------===// -// XFAIL: igc-dev +// XFAIL: gpu-intel-dg2 // XFAIL-TRACKER: https://github.com/intel/llvm/issues/16388 // REQUIRES: gpu-intel-dg2 && level_zero // UNSUPPORTED: windows diff --git a/sycl/test-e2e/ESIMD/dp4a.cpp b/sycl/test-e2e/ESIMD/dp4a.cpp index 129e7bdb702d6..0b99016d06a23 100644 --- a/sycl/test-e2e/ESIMD/dp4a.cpp +++ b/sycl/test-e2e/ESIMD/dp4a.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // TODO enable on Windows -// REQUIRES: linux && gpu-intel-dg1 +// REQUIRES: linux && arch-intel_gpu_dg1 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Graph/Explicit/compile_time_local_memory.cpp b/sycl/test-e2e/Graph/Explicit/compile_time_local_memory.cpp new file mode 100644 index 0000000000000..759d3d852b061 --- /dev/null +++ b/sycl/test-e2e/Graph/Explicit/compile_time_local_memory.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_EXPLICIT + +#include "../Inputs/compile_time_local_memory.cpp" diff --git a/sycl/test-e2e/Graph/Inputs/compile_time_local_memory.cpp b/sycl/test-e2e/Graph/Inputs/compile_time_local_memory.cpp new file mode 100644 index 0000000000000..53401c073697b --- /dev/null +++ b/sycl/test-e2e/Graph/Inputs/compile_time_local_memory.cpp @@ -0,0 +1,73 @@ +// Tests adding kernel nodes with local memory that is allocated using +// the sycl_ext_oneapi_local_memory extension. 
+ +#include "../graph_common.hpp" +#include + +int main() { + queue Queue{}; + + using T = int; + constexpr size_t LocalSize = 128; + + std::vector HostData(Size); + std::iota(HostData.begin(), HostData.end(), 10); + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + + Queue.copy(HostData.data(), PtrA, Size); + Queue.wait_and_throw(); + + auto NodeA = add_node(Graph, Queue, [&](handler &CGH) { + CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { + multi_ptr + LocalMem = sycl::ext::oneapi::group_local_memory( + Item.get_group()); + *LocalMem[Item.get_local_linear_id()] = Item.get_global_linear_id() * 2; + PtrA[Item.get_global_linear_id()] += + *LocalMem[Item.get_local_linear_id()]; + }); + }); + + add_node( + Graph, Queue, + [&](handler &CGH) { + depends_on_helper(CGH, NodeA); + CGH.parallel_for(nd_range({Size}, {LocalSize}), [=](nd_item<1> Item) { + multi_ptr + LocalMem = sycl::ext::oneapi::group_local_memory_for_overwrite< + size_t[LocalSize]>(Item.get_group()); + *LocalMem[Item.get_local_linear_id()] = + Item.get_global_linear_id() + 4; + PtrA[Item.get_global_linear_id()] *= + *LocalMem[Item.get_local_linear_id()]; + }); + }, + NodeA); + + auto GraphExec = Graph.finalize(); + + for (unsigned n = 0; n < Iterations; n++) { + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.wait_and_throw(); + + Queue.copy(PtrA, HostData.data(), Size); + Queue.wait_and_throw(); + + free(PtrA, Queue); + + for (size_t i = 0; i < Size; i++) { + T Ref = 10 + i; + for (size_t iter = 0; iter < Iterations; ++iter) { + Ref += (i * 2); + Ref *= (i + 4); + } + assert(check_value(i, Ref, HostData[i], "PtrA")); + } + + return 0; +} diff --git a/sycl/test-e2e/Graph/Inputs/whole_update_raw_arg.cpp b/sycl/test-e2e/Graph/Inputs/whole_update_raw_arg.cpp index 0c9573b06c41d..5ddc4f8852364 100644 --- a/sycl/test-e2e/Graph/Inputs/whole_update_raw_arg.cpp +++ 
b/sycl/test-e2e/Graph/Inputs/whole_update_raw_arg.cpp @@ -4,7 +4,17 @@ void SubmitKernelNode( exp_ext::command_graph Graph, queue Queue, - int32_t *Ptr, exp_ext::raw_kernel_arg &RawArg) { + int32_t *Ptr, exp_ext::raw_kernel_arg &RawArg, kernel Kernel) { + + add_node(Graph, Queue, [&](handler &cgh) { + cgh.set_arg(0, RawArg); + cgh.set_arg(1, Ptr); + cgh.parallel_for(sycl::range<1>{Size}, Kernel); + }); +} + +int main() { + queue Queue{}; auto constexpr CLSource = R"===( __kernel void RawArgKernel(int scalar, __global int *out) { @@ -17,18 +27,9 @@ __kernel void RawArgKernel(int scalar, __global int *out) { sycl::ext::oneapi::experimental::create_kernel_bundle_from_source( Queue.get_context(), sycl::ext::oneapi::experimental::source_language::opencl, CLSource); - auto ExecKB = sycl::ext::oneapi::experimental::build(SourceKB); - - add_node(Graph, Queue, [&](handler &cgh) { - cgh.set_arg(0, RawArg); - cgh.set_arg(1, Ptr); - cgh.parallel_for(sycl::range<1>{Size}, - ExecKB.ext_oneapi_get_kernel("RawArgKernel")); - }); -} - -int main() { - queue Queue{}; + auto Kernel = + sycl::ext::oneapi::experimental::build(SourceKB).ext_oneapi_get_kernel( + "RawArgKernel"); exp_ext::command_graph GraphA{Queue}; @@ -40,7 +41,7 @@ int main() { sycl::ext::oneapi::experimental::raw_kernel_arg RawScalarA(&ScalarA, sizeof(int32_t)); - SubmitKernelNode(GraphA, Queue, PtrA, RawScalarA); + SubmitKernelNode(GraphA, Queue, PtrA, RawScalarA, Kernel); auto ExecGraphA = GraphA.finalize(exp_ext::property::graph::updatable{}); // PtrA should be filled with values based on ScalarA @@ -62,7 +63,7 @@ int main() { sizeof(int32_t)); // Swap ScalarB and PtrB to be the new inputs/outputs - SubmitKernelNode(GraphB, Queue, PtrB, RawScalarB); + SubmitKernelNode(GraphB, Queue, PtrB, RawScalarB, Kernel); ExecGraphA.update(GraphB); Queue.ext_oneapi_graph(ExecGraphA).wait(); diff --git a/sycl/test-e2e/Graph/RecordReplay/compile_time_local_memory.cpp b/sycl/test-e2e/Graph/RecordReplay/compile_time_local_memory.cpp 
new file mode 100644 index 0000000000000..50b14c69f8f62 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/compile_time_local_memory.cpp @@ -0,0 +1,10 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +#define GRAPH_E2E_RECORD_REPLAY + +#include "../Inputs/compile_time_local_memory.cpp" diff --git a/sycl/test-e2e/InvokeSimd/Regression/ImplicitSubgroup/debug_symbols.cpp b/sycl/test-e2e/InvokeSimd/Regression/ImplicitSubgroup/debug_symbols.cpp index edaa297b3efdd..7334e8ea7f4dd 100644 --- a/sycl/test-e2e/InvokeSimd/Regression/ImplicitSubgroup/debug_symbols.cpp +++ b/sycl/test-e2e/InvokeSimd/Regression/ImplicitSubgroup/debug_symbols.cpp @@ -1,5 +1,5 @@ // Check that full compilation works: -// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -g %S/../debug_symbols.cpp -o %t.out +// RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -g -O2 %S/../debug_symbols.cpp -o %t.out // RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out // // VISALTO enable run diff --git a/sycl/test-e2e/InvokeSimd/Regression/ImplicitSubgroup/dp4a.cpp b/sycl/test-e2e/InvokeSimd/Regression/ImplicitSubgroup/dp4a.cpp index 72bad9c621b8e..078a02eed1f9b 100644 --- a/sycl/test-e2e/InvokeSimd/Regression/ImplicitSubgroup/dp4a.cpp +++ b/sycl/test-e2e/InvokeSimd/Regression/ImplicitSubgroup/dp4a.cpp @@ -1,4 +1,4 @@ -// REQUIRES: gpu-intel-dg1 || gpu-intel-dg2 || arch-intel_gpu_pvc +// REQUIRES: arch-intel_gpu_dg1 || gpu-intel-dg2 || 
arch-intel_gpu_pvc // // Check that full compilation works: // RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../dp4a.cpp -o %t.out diff --git a/sycl/test-e2e/InvokeSimd/Regression/debug_symbols.cpp b/sycl/test-e2e/InvokeSimd/Regression/debug_symbols.cpp index 11c118dad7615..1fc6c44dbac91 100644 --- a/sycl/test-e2e/InvokeSimd/Regression/debug_symbols.cpp +++ b/sycl/test-e2e/InvokeSimd/Regression/debug_symbols.cpp @@ -1,5 +1,5 @@ // Check that full compilation works: -// RUN: %{build} -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -g -o %t.out +// RUN: %{build} -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -g -O2 -o %t.out // RUN: env IGC_VCSaveStackCallLinkage=1 IGC_VCDirectCallsOnly=1 %{run} %t.out // // VISALTO enable run diff --git a/sycl/test-e2e/InvokeSimd/Regression/dp4a.cpp b/sycl/test-e2e/InvokeSimd/Regression/dp4a.cpp index 15be55f7dde64..1b7a728a925a1 100644 --- a/sycl/test-e2e/InvokeSimd/Regression/dp4a.cpp +++ b/sycl/test-e2e/InvokeSimd/Regression/dp4a.cpp @@ -1,4 +1,4 @@ -// REQUIRES: gpu-intel-dg1 || gpu-intel-dg2 || arch-intel_gpu_pvc +// REQUIRES: arch-intel_gpu_dg1 || gpu-intel-dg2 || arch-intel_gpu_pvc // // Check that full compilation works: // RUN: %{build} -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr -o %t.out diff --git a/sycl/test-e2e/KernelAndProgram/kernel-bundle-merge-options.cpp b/sycl/test-e2e/KernelAndProgram/kernel-bundle-merge-options.cpp index ba09fa1ea16ea..10037fe75f6b4 100644 --- a/sycl/test-e2e/KernelAndProgram/kernel-bundle-merge-options.cpp +++ b/sycl/test-e2e/KernelAndProgram/kernel-bundle-merge-options.cpp @@ -3,19 +3,19 @@ // RUN: env SYCL_UR_TRACE=2 %{run} %t.out | FileCheck %s // UNSUPPORTED: hip -// Debug option -g is not passed to device code compiler when CL-style driver -// is used and /DEBUG options is passed. 
-// XFAIL: cl_options -// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16309 +// Note that the UR call might be urProgramBuild OR urProgramBuildExp . +// The same is true for Compile and Link. +// We want the first match. Don't put parentheses after. #include "kernel-bundle-merge-options.hpp" // CHECK: <--- urProgramBuild // CHECK-SAME: -g -// TODO: Uncomment when build options are properly passed to compile and link +// CHECK: <--- urProgramCompile +// CHECK-SAME: -g + +// TODO: Uncomment when build options are properly passed to link // commands for kernel_bundle -// xCHECK: <--- urProgramCompile( -// xCHECK-SAME: -g -// xCHECK: <--- urProgramLink( +// xCHECK: <--- urProgramLink // xCHECK-SAME: -g diff --git a/sycl/test-e2e/KernelAndProgram/persistent-cache-multi-device.cpp b/sycl/test-e2e/KernelAndProgram/persistent-cache-multi-device.cpp index 48a9bdf74d8cf..24cfb57341e12 100644 --- a/sycl/test-e2e/KernelAndProgram/persistent-cache-multi-device.cpp +++ b/sycl/test-e2e/KernelAndProgram/persistent-cache-multi-device.cpp @@ -5,6 +5,9 @@ // RUN: env NEOReadDebugKeys=1 CreateMultipleRootDevices=3 SYCL_CACHE_PERSISTENT=1 SYCL_CACHE_TRACE=1 SYCL_CACHE_DIR=%t/cache_dir env -u XDG_CACHE_HOME env -u HOME %{run} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK-BUILD // RUN: env NEOReadDebugKeys=1 CreateMultipleRootDevices=3 SYCL_CACHE_PERSISTENT=1 SYCL_CACHE_TRACE=1 SYCL_CACHE_DIR=%t/cache_dir env -u XDG_CACHE_HOME env -u HOME %{run} %t.out 2>&1 | FileCheck %s --check-prefixes=CHECK-CACHE +// XFAIL: arch-intel_gpu_pvc +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16401 + // Test checks that persistent cache works correctly with multiple devices. 
#include diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp deleted file mode 100644 index 9b4777faced20..0000000000000 --- a/sycl/test-e2e/Matrix/SG32/get_coord_float_matC.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//==----------- get_coord_float_matC.cpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 -// UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix -// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" - -#define SG_SZ 32 - -#include "../get_coord_float_matC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp deleted file mode 100644 index 09c3ccf9983ab..0000000000000 --- a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matA.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//==----------- get_coord_int8_matA.cpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 -// UNSUPPORTED: gpu-intel-dg2 -// REQUIRES: aspect-ext_intel_matrix -// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -#include "../common.hpp" - -#define SG_SZ 32 - -#include "../get_coord_int8_matA_impl.hpp" diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/SG32/get_coordinate_ops.cpp similarity index 71% rename from sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp rename to sycl/test-e2e/Matrix/SG32/get_coordinate_ops.cpp index 7d7e92e102780..0402a56e5c15d 100644 --- a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/SG32/get_coordinate_ops.cpp @@ -1,4 +1,4 @@ -//==----------- get_coord_int8_matB.cpp - DPC++ joint_matrix---------==// +//==----------- get_coordinate_ops.cpp - DPC++ joint_matrix---------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -10,6 +10,12 @@ // REQUIRES: aspect-ext_intel_matrix // REQUIRES-INTEL-DRIVER: lin: 30049, win: 101.4943 +// XFAIL: !igc-dev && run-mode +// XFAIL-TRACKER: GSD-6376 + +// XFAIL: arch-intel_gpu_pvc +// XFAIL-TRACKER: GSD-10524 + // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -17,4 +23,4 @@ #define SG_SZ 32 -#include "../get_coord_int8_matB_impl.hpp" +#include "../get_coordinate_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp b/sycl/test-e2e/Matrix/get_coord_float_matC.cpp deleted file mode 100644 index 044c8072bb3bc..0000000000000 --- a/sycl/test-e2e/Matrix/get_coord_float_matC.cpp +++ /dev/null @@ -1,14 +0,0 @@ -//==----------- get_coord_float_matC.cpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -#include "common.hpp" -#include "get_coord_float_matC_impl.hpp" diff --git a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp b/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp deleted file mode 100644 index a6c7c5646a548..0000000000000 --- a/sycl/test-e2e/Matrix/get_coord_float_matC_impl.hpp +++ /dev/null @@ -1,130 +0,0 @@ -//==----------- get_coord_float_matC_impl.hpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include -#include - -template class add_rows; - -// clang-format off -/* -Here's how the data is distributed for sub group size = 16 on PVC -W0 --> 0 1 2 3 4 5 6 7 -wi [0,0] -> i=0, [0, 0] wi [0,1] --> i=0, [0, 1] wi [0,15] --> i=0, [0, 15] - i=1, [1, 0] i=1, [1, 1] i=1, [1, 15] - i=2, [2, 0] i=2, [2, 1] ... - ... .... - i=7, [7, 0] i=7, [7, 1] -*/ -// clang-format on - -template -void matrix_sum_rows(big_matrix &C, T *sum_rows) { - buffer bufC((T *)C.get_data(), range<2>(Rows, Cols)); - buffer sum_rows_v(sum_rows, Rows); - - queue q; - size_t sg_size = get_sg_size>(q); - q.submit([&](handler &cgh) { - sycl::accessor accC{bufC, cgh, sycl::read_write}; - sycl::accessor v{sum_rows_v, cgh, sycl::read_write}; - - cgh.parallel_for>( - nd_range<2>({Rows / TileRows, Cols / TileCols * sg_size}, - {1, 1 * sg_size}), - [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[sycl::reqd_sub_group_size(SG_SZ)]] -#endif - { - // The submatrix API has to be accessed by all the workitems in a - // subgroup these functions will be called once by the subgroup no - // code divergence between the workitems - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_c; - - joint_matrix_load( - sg, sub_c, - accC.template get_multi_ptr() + - (sg_startx * TileRows) * Cols + - sg_starty / sg_size * TileCols, - Cols, layout::row_major); - - T sum_local_rows[Rows] = {0}; - - ext::intel::experimental::matrix::joint_matrix_apply( - sg, sub_c, [&](T &x, size_t row, size_t col) { - sum_local_rows[row + global_idx * TileRows] += x; - }); - for (int i = 0; i < Rows; i++) { - sum_local_rows[i] = - reduce_over_group(sg, 
sum_local_rows[i], sycl::plus<>()); - // only Groups leader perform the global reduction - if (global_idy % sg_size == 0) { - sycl::atomic_ref - aref(v[i]); - aref.fetch_add(sum_local_rows[i]); - } - } - }); // parallel for - }).wait(); -} - -template void test() { - constexpr size_t SCALE = 2; - static constexpr size_t Rows = TM * SCALE; - static constexpr size_t Cols = TN * SCALE; - - T sum_rows[Rows] = {0}; - T sum_rows_ref[Rows] = {0}; - T C[Rows][Cols]; - big_matrix MC((T *)&C); - - matrix_rand(Rows, Cols, (T *)&C, (T)100); - matrix_sum_rows(MC, sum_rows); - - for (int i = 0; i < Rows; i++) { - for (int j = 0; j < Cols; j++) { - sum_rows_ref[i] += C[i][j]; - } - assert(std::fabs(sum_rows_ref[i] - sum_rows[i]) <= FLOAT_EPSILON); - } -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - return 0; -} diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp deleted file mode 100644 index 8e953c8689305..0000000000000 --- a/sycl/test-e2e/Matrix/get_coord_int8_matA.cpp +++ /dev/null @@ -1,14 +0,0 @@ -//==----------- get_coord_int8_matA.cpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -#include "common.hpp" -#include "get_coord_int8_matA_impl.hpp" diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp b/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp deleted file mode 100644 index 0bf9281ab9f45..0000000000000 --- a/sycl/test-e2e/Matrix/get_coord_int8_matA_impl.hpp +++ /dev/null @@ -1,176 +0,0 @@ -//==----------- get_coord_int8_matA_impl.hpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include -#include - -template class add_rows; - -template -void sum_rows_ref( - host_accessor A, - host_accessor sum_rows) { - int sum_rows_ref[Rows] = {0}; - for (size_t i = 0; i < Rows; i++) { - for (size_t j = 0; j < Cols; j++) { - sum_rows_ref[i] += A[i][j]; - } - auto diff = sum_rows[i] - sum_rows_ref[i]; - assert(std::fabs(static_cast(diff)) <= - std::numeric_limits::epsilon()); - } -} - -// clang-format off -/* For sub group size = 16: -wi [0,0] -> i=0, [0, 0] wi [0,1] --> i=0, [0, 2] wi [0,15] --> i=0, [0, 30] - i=1, [0, 1] i=1, [0, 3] i=1, [0, 31] - i=2, [1, 0] i=2, [1, 2] i=2, [1, 30] - i=3, [1, 1] i=3, [1, 3] i=3, [1, 31] - i=4, [2, 0] i=4, [2, 2] ... - i=5, [2, 1] i=5, [2, 3] - ... .... 
- i=14,[7, 0] i=14, [7, 2] - i=15,[7, 1] i=15, [7, 3] i=15, [7, 31] - -Here's how the distribution of the A matrix looks like for this test case -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -<--------------------------------- SG1 ---------------------------------> -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x x -<0> <1> <2> <3> <4> <5> <6> <7> ..... WORK ITEMS -Each work item has 16 elements <8 rows and 2 cols of the original matrix> -the data_slice holds the matrix elements in the following order: -0 0 0 0 - / - / -1 1 1 1 - / - / -2 2 2 2 - / - / -3 3 3 3 -W0 --> 0 0 1 1 2 2 3 3 .... 
7 7 -*/ -// clang-format on - -template -void matrix_sum_rows(big_matrix &A) { - buffer bufA(A.get_data(), range<2>(Rows, Cols)); - - // size of vector is equal to number of rows in big matrix - TResult sum_rows[Rows] = {0}; - buffer sum_rows_v(sum_rows, Rows); - queue q; - size_t sg_size = get_sg_size>(q); - nd_range<2> r({Rows / TileRows, Cols / TileCols * sg_size}, {1, 1 * sg_size}); - q.submit([&](handler &cgh) { - sycl::accessor accA{bufA, cgh, sycl::read_write}; - sycl::accessor v{sum_rows_v, cgh, sycl::read_write}; - - cgh.parallel_for>( - r, [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[sycl::reqd_sub_group_size(SG_SZ)]] -#endif - { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sycl::sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - joint_matrix_load( - sg, sub_a, - accA.template get_multi_ptr() + - (sg_startx * TileRows * Cols) + - sg_starty / sg_size * TileCols, - Cols); - - TResult sum_local_rows[Rows] = {0}; - - ext::intel::experimental::matrix::joint_matrix_apply( - sg, sub_a, [&](T &x, size_t row, size_t col) { - sum_local_rows[row + global_idx * TileRows] += x; - }); - for (int i = 0; i < Rows; ++i) { - sum_local_rows[i] = - reduce_over_group(sg, sum_local_rows[i], sycl::plus<>()); - - // only Groups leader performs the global reduction - if (global_idy % sg_size == 0) { - sycl::atomic_ref - aref(v[i]); - aref.fetch_add(sum_local_rows[i]); - } - } - }); // parallel for - }).wait(); - sum_rows_ref(bufA.get_host_access(), - sum_rows_v.get_host_access()); -} - -template void test() { - static constexpr size_t Rows = TM * 2; - static constexpr size_t Cols = TK * 2; - T A[Rows][Cols]; - - big_matrix MA((T *)&A); - - for (int i = 0; i < Rows; i++) { - for (int j = 0; j < Cols; j++) { - A[i][j] = i + j; - } - } - - matrix_sum_rows(MA); -} - -int 
main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - return 0; -} diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp b/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp deleted file mode 100644 index 08cb616cc6cc4..0000000000000 --- a/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp +++ /dev/null @@ -1,214 +0,0 @@ -//==----------- get_coord_int8_matB_impl.hpp - DPC++ joint_matrix---------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include -#include - -template class add_cols; - -template -void sum_cols_ref( - host_accessor B, - host_accessor sum_cols) { - TResult sum_cols_ref[Cols] = {0}; - for (size_t j = 0; j < Cols; j++) { - for (size_t i = 0; i < Rows; i++) { - sum_cols_ref[j] += B[i][j]; - } - auto diff = sum_cols[j] - sum_cols_ref[j]; - assert(std::fabs(static_cast(diff)) <= - std::numeric_limits::epsilon()); - } -} - -// clang-format off -/* - Here is a demonstration of how matrix B will be divided across - work items for this test case for sub group size = 16 on PVC. - < --------------- 128 ----------------------------------> - x x x x x x x x x x x x x x x x .......... x x x x x x ^ - x x x x x x x x x x x x x x x x .......... x x x x x x 16 - x x x x x x x x x x x x x x x x .......... x x x x x x | - ..... | - x x x x x x x x x x x x x x x x .......... x x x x x x | - x x x x x x x x x x x x x x x x .......... 
x x x x x x v - - --------------- 64 ----------------> - x x x x x x .......... x x x x x x ^ - x x x x x x .......... x x x x x x 8 - x x x x x x .......... x x x x x x | <-- part of (VNNI-ed) - ..... | original matrix each SG - x x x x x x .......... x x x x x x | holds - x x x x x x .......... x x x x x x v - < WI0 > < WI15 > - <-------- 16 -------------> - x x x .......... x x x ^ - x x x .......... x x x | - x x x .......... x x x | <-- part of (non-VNNI-ed) original matrix - ..... | each SG holds - x x x .......... x x x | - x x x .......... x x x | - x x x .......... x x x 32 - x x x .......... x x x | - x x x .......... x x x | - x x x .......... x x x | - x x x .......... x x x | - x x x .......... x x x | - x x x .......... x x x v - If we divide the above matrix across 16 (SG_SZ) work items, - each WI will hold 32 elements. And these 32 elements will be - 8x4 chunks as shown in the VNNI-ed matrix figure. - -The total distribution among the WIs in ALL the sub-groups is as follows: -This is useful to figure out how the global index is to be calculated - -W0 --> 0 0 0 0 1 1 1 1 ... 7 7 7 7 --> total 32 elements -wi [0,0] --> i=0, [0, 0] wi [0,1] --> i=0, [0, 4] wi [0,15] --> i=0, [0, 60] | wi [0,16] --> i=0, [0, 64] - i=1, [0, 1] i=1, [0, 5] i=1, [0, 61] | i=1, [0, 65] - i=2, [0, 2] i=2, [0, 6] i=2, [0, 62] | i=2, [0, 66] - i=3, [0, 3] i=3, [0, 7] i=3, [0, 63] | i=3, [0, 67] - i=4, [1, 0] i=4, [1, 4] i=4, [1, 60] | .... - i=5, [1, 1] i=5, [1, 5] i=5, [1, 61] | - i=6, [1, 2] i=6, [1, 6] i=6, [1, 62] | - i=7, [1, 3] i=7, [1, 7] i=7, [1, 63] | - ... ... .... 
| - i=28,[7, 0] i=28,[7, 4] i=28,[7, 60] | i=28, [7, 124] - i=29,[7, 1] i=29,[7, 5] i=29,[7, 61] | i=29, [7, 125] - i=30,[7, 2] i=30,[7, 6] i=30,[7, 62] | i=30, [7, 126] - i=31,[7, 3] i=31,[7, 7] i=31,[7, 63] | i=31, [7, 127] ----------------------------------------------------------------------------------------- --------------------------- -wi [1,0] --> i=0, [8, 0] - i=1, [8, 1] - i=2, [8, 2] - i=3, [8, 2] - ... - i=28, [15, 0] - i=29, [15, 1] - i=30, [15, 2] - i=31, [15, 3] -*/ - -// clang-format on - -template -void matrix_sum_cols(big_matrix &B, - big_matrix &Bvnni) { - buffer bufB(B.get_data(), range<2>(Rows, Cols)); - buffer bufBvnni(Bvnni.get_data(), range<2>(Rows / VNNI, Cols * VNNI)); - - TResult sum_cols[Cols] = {0}; - buffer sum_cols_v(sum_cols, Cols); - - size_t NDRangeK = Rows / TileRows; - size_t NDRangeN = Cols / TileCols; - queue q; - size_t sg_size = get_sg_size>(q); - nd_range<2> r({NDRangeK, NDRangeN * sg_size}, {1, 1 * sg_size}); - - q.submit([&](handler &cgh) { - sycl::accessor accB{bufBvnni, cgh, sycl::read_write}; - sycl::accessor v{sum_cols_v, cgh, sycl::read_write}; - - cgh.parallel_for>( - r, [=](nd_item<2> spmd_item) -#ifdef SG_SZ - [[sycl::reqd_sub_group_size(SG_SZ)]] -#endif - { - const auto global_idx = spmd_item.get_global_id(0); - const auto global_idy = spmd_item.get_global_id(1); - const auto sg_startx = global_idx - spmd_item.get_local_id(0); - const auto sg_starty = global_idy - spmd_item.get_local_id(1); - - sycl::sub_group sg = spmd_item.get_sub_group(); - - joint_matrix - sub_b; - - joint_matrix_load( - sg, sub_b, - accB.template get_multi_ptr() + - (sg_startx * (TileRows / VNNI) * Cols * VNNI) + - sg_starty / sg_size * TileCols * VNNI, - Cols * VNNI); - - TResult sum_local_cols[Cols] = {0}; - ext::intel::experimental::matrix::joint_matrix_apply( - sg, sub_b, [&](T &x, size_t row, size_t col) { - // the coordinates returned are in the logical range - // [Rows,Cols] If users want to retrieve the VNNIed - // coordinates, they 
can be obtained using colVNNI = col/VNNI - // rowVNNI = row*VNNI - size_t global_index = col + global_idy / sg_size * TileCols; - sum_local_cols[global_index] += x; - }); - - for (int i = 0; i < Cols; i++) { - sum_local_cols[i] = - reduce_over_group(sg, sum_local_cols[i], sycl::plus<>()); - if (global_idy % sg_size == 0) { - sycl::atomic_ref - aref(v[i]); - aref.fetch_add(sum_local_cols[i]); - } - } - }); // parallel for - }).wait(); - sum_cols_ref(bufB.get_host_access(), - sum_cols_v.get_host_access()); -} - -template -void test() { - static constexpr size_t scale = 2; - static constexpr size_t MATRIX_K = TK * scale; - static constexpr size_t MATRIX_N = TN * scale; - - T B[MATRIX_K][MATRIX_N]; - big_matrix MB((T *)&B); - - T Bvnni[MATRIX_K / VNNI][MATRIX_N * VNNI]; - big_matrix MBvnni((T *)&Bvnni); - - for (int i = 0; i < MATRIX_K; i++) { - for (int j = 0; j < MATRIX_N; j++) { - B[i][j] = i + j; - } - } - matrix_vnni(MATRIX_K, MATRIX_N, *B, *Bvnni, VNNI); - // This test calculates sum of columns in the non VNNI B matrix - matrix_sum_cols(MB, MBvnni); -} - -int main() { - queue q; - std::vector combinations = - q.get_device() - .get_info(); - - for (unsigned int i = 0; i < combinations.size(); i++) { - if (combinations[i].nsize == 0) { // Intel AMX - test(); - break; - } - - if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc - test(); - break; - } - - if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* - test(); - break; - } - } - return 0; -} \ No newline at end of file diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/get_coordinate_ops.cpp similarity index 79% rename from sycl/test-e2e/Matrix/get_coord_int8_matB.cpp rename to sycl/test-e2e/Matrix/get_coordinate_ops.cpp index 255331dca89cb..ed404612a9ff3 100644 --- a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp +++ b/sycl/test-e2e/Matrix/get_coordinate_ops.cpp @@ -1,4 +1,4 @@ -//==----------- get_coord_int8_matB.cpp - DPC++ joint_matrix---------==// 
+//==----------- get_coordinate_ops.cpp - DPC++ joint_matrix---------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,10 +6,10 @@ // //===----------------------------------------------------------------------===// // REQUIRES: aspect-ext_intel_matrix +// REQUIRES-INTEL-DRIVER: lin: 30049 // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// REQUIRES-INTEL-DRIVER: lin: 30049 #include "common.hpp" -#include "get_coord_int8_matB_impl.hpp" +#include "get_coordinate_ops_impl.hpp" diff --git a/sycl/test-e2e/Matrix/get_coordinate_ops_impl.hpp b/sycl/test-e2e/Matrix/get_coordinate_ops_impl.hpp new file mode 100644 index 0000000000000..8cbd24b302171 --- /dev/null +++ b/sycl/test-e2e/Matrix/get_coordinate_ops_impl.hpp @@ -0,0 +1,221 @@ +//==----------- get_coordinate_ops_impl.hpp - DPC++ joint_matrix---------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include +#include + +template +class matrix_process; + +template +void reduce_and_accumulate(sub_group sg, size_t sg_size, size_t global_idy, + AccessorType &global_acc, TResult *local_sums, + size_t count) { + for (size_t i = 0; i < count; i++) { + local_sums[i] = reduce_over_group(sg, local_sums[i], sycl::plus<>()); + + // Only the subgroup leader performs the global accumulation + if (global_idy % sg_size == 0) { + sycl::atomic_ref + aref(global_acc[i]); + aref.fetch_add(local_sums[i]); + } + } +} + +template +void matrix_sum(big_matrix &M, + TResult *sum_rows, TResult *sum_cols) { + buffer buf((T *)M.get_data(), range<2>(NUM_ROWS / VF, NUM_COLS * VF)); + buffer sum_rows_v(sum_rows, NUM_ROWS); + buffer sum_cols_v(sum_cols, NUM_COLS); + + queue q; + size_t sg_size = + get_sg_size>(q); + q.submit([&](handler &cgh) { + sycl::accessor acc{buf, cgh, sycl::read_write}; + sycl::accessor v_rows{sum_rows_v, cgh, sycl::read_write}; + sycl::accessor v_cols{sum_cols_v, cgh, sycl::read_write}; + + cgh.parallel_for>( + nd_range<2>({NUM_ROWS / SROWS, NUM_COLS / SCOLS * sg_size}, + {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[sycl::reqd_sub_group_size(SG_SZ)]] +#endif + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + + TResult sum_local_rows[NUM_ROWS] = {0}; + TResult sum_local_cols[NUM_COLS] = {0}; + + if (Use == use::accumulator) { + joint_matrix + sub; + + joint_matrix_load( + sg, sub, + 
acc.template get_multi_ptr() + + (sg_startx * SROWS * NUM_COLS) + + sg_starty / sg_size * SCOLS, + NUM_COLS, Layout); + + ext::intel::experimental::matrix::joint_matrix_apply( + sg, sub, [&](T &x, size_t row, size_t col) { + sum_local_rows[row + global_idx * SROWS] += x; + sum_local_cols[col + global_idy / sg_size * SCOLS] += x; + }); + + } else { + joint_matrix sub; + + joint_matrix_load( + sg, sub, + acc.template get_multi_ptr() + + (sg_startx * (SROWS / VF) * NUM_COLS * VF) + + sg_starty / sg_size * SCOLS * VF, + NUM_COLS * VF); + + ext::intel::experimental::matrix::joint_matrix_apply( + sg, sub, [&](T &x, size_t row, size_t col) { + sum_local_rows[row + global_idx * SROWS] += x; + sum_local_cols[col + global_idy / sg_size * SCOLS] += x; + }); + } + + reduce_and_accumulate(sg, sg_size, global_idy, v_rows, + sum_local_rows, NUM_ROWS); + reduce_and_accumulate(sg, sg_size, global_idy, v_cols, + sum_local_cols, NUM_COLS); + }); // parallel for + }).wait(); +} + +template +void test_get_coord_op() { + constexpr size_t SCALE = 2; + static constexpr size_t Rows = SROWS * SCALE; + static constexpr size_t Cols = SCOLS * SCALE; + + T M[Rows][Cols]; + T Mvnni[Rows / VF][Cols * VF]; + TResult sum_rows[Rows] = {0}; + TResult sum_rows_ref[Rows] = {0}; + TResult sum_cols[Cols] = {0}; + TResult sum_cols_ref[Cols] = {0}; + + matrix_fill(Rows, Cols, (T *)M, [](int i, int j) { return T(i + j); }); + + matrix_vnni(Rows, Cols, *M, *Mvnni, VF); + big_matrix MM((T *)&Mvnni); + + matrix_sum( + MM, sum_rows, sum_cols); + + for (int i = 0; i < Rows; i++) { + for (int j = 0; j < Cols; j++) { + sum_rows_ref[i] += (int)M[i][j]; + } + assert(std::fabs(sum_rows_ref[i] - sum_rows[i]) <= FLOAT_EPSILON); + } + + for (int j = 0; j < Cols; j++) { + for (int i = 0; i < Rows; i++) { + sum_cols_ref[j] += (int)M[i][j]; + } + assert(std::fabs(sum_cols_ref[j] - sum_cols[j]) <= FLOAT_EPSILON); + } +} + +int main() { + queue q; + std::vector combinations = + q.get_device() + .get_info(); + + for (unsigned 
int i = 0; i < combinations.size(); i++) { + if (combinations[i].nsize == 0) { // Intel AMX + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + break; + } + + if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + // This combination is not currently supported for sub group size = 32 in + // IGC +#if (!defined(SG_SZ) || SG_SZ != 32) + test_get_coord_op(); + test_get_coord_op(); +#endif + break; + } + + if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + test_get_coord_op(); + break; + } + } + return 0; +} diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp index febb1230d91dd..bd30efe1b217c 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_arg_dim.cpp @@ -14,8 +14,8 @@ // Waiting for the commit in IGC to be pulled into the driver to resolve the // test. 
-// XFAIL: !igc-dev || gpu-intel-dg2 -// XFAIL-TRACKER: CMPLRLLVM-63710 +// XFAIL: (!igc-dev || gpu-intel-dg2) && run-mode +// XFAIL-TRACKER: GSD-10510 #include "common.hpp" #include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp index 291852eaa612d..ff30d4c40f6a7 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp @@ -14,8 +14,8 @@ // Waiting for the commit in IGC to be pulled into the driver to resolve the // test. -// XFAIL: !igc-dev || gpu-intel-dg2 -// XFAIL-TRACKER: CMPLRLLVM-63710 +// XFAIL: (!igc-dev || gpu-intel-dg2) && run-mode +// XFAIL-TRACKER: GSD-10510 #include "common.hpp" #include "joint_matrix_bf16_fill_k_cache_impl.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp index de2d1d89deaf5..d4abd258368b6 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp @@ -140,8 +140,8 @@ int main() { test(); test(); test(); - break; #endif + break; } if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2* diff --git a/sycl/test-e2e/MemorySanitizer/check_buffer.cpp b/sycl/test-e2e/MemorySanitizer/check_buffer.cpp index dfce88803a96e..a8da8d2e4d159 100644 --- a/sycl/test-e2e/MemorySanitizer/check_buffer.cpp +++ b/sycl/test-e2e/MemorySanitizer/check_buffer.cpp @@ -4,9 +4,6 @@ // RUN: %{build} %device_msan_flags -O2 -g -o %t3.out // RUN: %{run} not %t3.out 2>&1 | FileCheck %s -// XFAIL: gpu-intel-gen12 || gpu-intel-dg2 -// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16184 - #include __attribute__((noinline)) long long foo(int data1, long long data2) { diff --git a/sycl/test-e2e/MemorySanitizer/check_call.cpp 
b/sycl/test-e2e/MemorySanitizer/check_call.cpp index fc1d41ac4c1e8..985fa9080aae6 100644 --- a/sycl/test-e2e/MemorySanitizer/check_call.cpp +++ b/sycl/test-e2e/MemorySanitizer/check_call.cpp @@ -4,9 +4,6 @@ // RUN: %{build} %device_msan_flags -O2 -g -o %t3.out // RUN: %{run} not %t3.out 2>&1 | FileCheck %s -// XFAIL: gpu-intel-gen12 || gpu-intel-dg2 -// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16184 - #include #include diff --git a/sycl/test-e2e/MemorySanitizer/check_divide.cpp b/sycl/test-e2e/MemorySanitizer/check_divide.cpp index 4769e1a3b9d44..5513f2fef53a8 100644 --- a/sycl/test-e2e/MemorySanitizer/check_divide.cpp +++ b/sycl/test-e2e/MemorySanitizer/check_divide.cpp @@ -4,9 +4,6 @@ // RUN: %{build} %device_msan_flags -O2 -g -o %t3.out // RUN: %{run} not %t3.out 2>&1 | FileCheck %s -// XFAIL: gpu-intel-gen12 || gpu-intel-dg2 -// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16184 - #include #include diff --git a/sycl/test-e2e/MemorySanitizer/lit.local.cfg b/sycl/test-e2e/MemorySanitizer/lit.local.cfg index 8817570e9ae06..f9437ee4a9048 100644 --- a/sycl/test-e2e/MemorySanitizer/lit.local.cfg +++ b/sycl/test-e2e/MemorySanitizer/lit.local.cfg @@ -1,3 +1,9 @@ +# TRACKER: https://github.com/intel/llvm/issues/16184 +# TRACKER for PVC: https://github.com/intel/llvm/issues/16401 +#has_arch_gpu_intel_pvc = any('arch-intel_gpu_pvc' in T for T in config.sycl_dev_features.values()) +#if not has_arch_gpu_intel_pvc: +config.unsupported_features += ['gpu'] + config.substitutions.append( ("%device_msan_flags", "-Xarch_device -fsanitize=memory") ) diff --git a/sycl/test-e2e/NewOffloadDriver/aot-cpu.cpp b/sycl/test-e2e/NewOffloadDriver/aot-cpu.cpp index 585b368641d6e..0016e8498bd9d 100644 --- a/sycl/test-e2e/NewOffloadDriver/aot-cpu.cpp +++ b/sycl/test-e2e/NewOffloadDriver/aot-cpu.cpp @@ -1,4 +1,5 @@ // REQUIRES: opencl-aot, cpu +// REQUIRES: build-and-run-mode // Test with `--offload-new-driver` // RUN: %clangxx -fsycl -fsycl-device-code-split=per_source 
-fsycl-targets=spir64_x86_64 -I %S/Inputs -o %t.out %S/split-per-source-main.cpp %S/Inputs/split-per-source-second-file.cpp \ diff --git a/sycl/test-e2e/NewOffloadDriver/cpu.cpp b/sycl/test-e2e/NewOffloadDriver/cpu.cpp index ef93d69f8693b..943ea71c3386e 100644 --- a/sycl/test-e2e/NewOffloadDriver/cpu.cpp +++ b/sycl/test-e2e/NewOffloadDriver/cpu.cpp @@ -7,6 +7,7 @@ //===---------------------------------------------------------------------===// // REQUIRES: opencl-aot, cpu +// REQUIRES: build-and-run-mode // Test with `--offload-new-driver` // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64 --offload-new-driver %S/Inputs/aot.cpp -o %t.out diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp index f24bffb81526a..1532c54879ecc 100644 --- a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp @@ -4,6 +4,7 @@ // RUN: %if any-device-is-cpu && opencl-aot %{ %clangxx -fsycl -fsycl-targets=spir64_x86_64 -o %t.x86.out %s %} // RUN: %if cpu %{ %{run} %t.x86.out %} // +// REQUIRES: build-and-run-mode // REQUIRES: cpu || gpu // UNSUPPORTED: hip diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp index 8f6b6a8f17197..14e976edc8492 100644 --- a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp @@ -4,6 +4,7 @@ // RUN: %if any-device-is-cpu && opencl-aot %{ %clangxx -fsycl -fsycl-targets=spir64_x86_64 -o %t.x86.out %s %} // RUN: %if cpu %{ %{run} %t.x86.out %} // +// REQUIRES: build-and-run-mode // REQUIRES: cpu || gpu // REQUIRES: sg-32 // REQUIRES: aspect-ext_oneapi_ballot_group diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp b/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp index 939be57799dd4..5d2b98b58ddc2 100644 --- a/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp +++ 
b/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp @@ -4,6 +4,7 @@ // RUN: %if any-device-is-cpu && opencl-aot %{ %clangxx -fsycl -fsycl-targets=spir64_x86_64 -o %t.x86.out %s %} // RUN: %if cpu %{ %{run} %t.x86.out %} // +// REQUIRES: build-and-run-mode // REQUIRES: cpu || gpu // UNSUPPORTED: hip // REQUIRES: sg-32 diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp index c1c172c4189c3..dfed10e9f587e 100644 --- a/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/fixed_size_group_algorithms.cpp @@ -4,6 +4,7 @@ // RUN: %if any-device-is-cpu && opencl-aot %{ %clangxx -fsycl -fsycl-targets=spir64_x86_64 -fsycl-device-code-split=per_kernel -o %t.x86.out %s %} // RUN: %if cpu %{ %{run} %t.x86.out %} // +// REQUIRES: build-and-run-mode // REQUIRES: cpu || gpu // REQUIRES: sg-32 // REQUIRES: aspect-ext_oneapi_fixed_size_group diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp index 18d42487b7768..2069113c3f939 100644 --- a/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp @@ -4,6 +4,7 @@ // RUN: %if any-device-is-cpu && opencl-aot %{ %clangxx -fsycl -fsycl-targets=spir64_x86_64 -o %t.x86.out %s %} // RUN: %if cpu %{ %{run} %t.x86.out %} // +// REQUIRES: build-and-run-mode // REQUIRES: cpu || gpu // UNSUPPORTED: hip diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp index af4d45cee862f..8e7626650e01c 100644 --- a/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/opportunistic_group_algorithms.cpp @@ -4,6 +4,7 @@ // RUN: %if any-device-is-cpu && opencl-aot %{ %clangxx -fsycl -fsycl-targets=spir64_x86_64 -o %t.x86.out %s %} // RUN: %if cpu %{ %{run} 
%t.x86.out %} // +// REQUIRES: build-and-run-mode // REQUIRES: cpu || gpu // REQUIRES: sg-32 // REQUIRES: aspect-ext_oneapi_opportunistic_group diff --git a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp index 44191955048f3..686d1b34c6185 100644 --- a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp @@ -4,6 +4,7 @@ // RUN: %if any-device-is-cpu && opencl-aot %{ %clangxx -fsycl -fsycl-targets=spir64_x86_64 -fno-sycl-early-optimizations -o %t.x86.out %s %} // RUN: %if cpu %{ %{run} %t.x86.out %} // +// REQUIRES: build-and-run-mode // REQUIRES: cpu || gpu // UNSUPPORTED: cuda || hip diff --git a/sycl/test-e2e/NonUniformGroups/tangle_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/tangle_group_algorithms.cpp index 7033c4c9e4df5..51070ed1731e4 100644 --- a/sycl/test-e2e/NonUniformGroups/tangle_group_algorithms.cpp +++ b/sycl/test-e2e/NonUniformGroups/tangle_group_algorithms.cpp @@ -4,6 +4,7 @@ // RUN: %if any-device-is-cpu && opencl-aot %{ %clangxx -fsycl -fsycl-targets=spir64_x86_64 -fno-sycl-early-optimizations -o %t.x86.out %s %} // RUN: %if cpu %{ %{run} %t.x86.out %} // +// REQUIRES: build-and-run-mode // REQUIRES: cpu || gpu // REQUIRES: sg-32 // REQUIRES: aspect-ext_oneapi_tangle_group diff --git a/sycl/test-e2e/OneapiDeviceSelector/illegal_input.cpp b/sycl/test-e2e/OneapiDeviceSelector/illegal_input.cpp index 79675bbac23cf..35430c7b12ff6 100644 --- a/sycl/test-e2e/OneapiDeviceSelector/illegal_input.cpp +++ b/sycl/test-e2e/OneapiDeviceSelector/illegal_input.cpp @@ -1,15 +1,15 @@ // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %S/Inputs/trivial.cpp -o %t.out -// RUN: not env ONEAPI_DEVICE_SELECTOR="macaroni:*" %{run-unfiltered-devices} %t.out -// RUN: not env ONEAPI_DEVICE_SELECTOR=":" %{run-unfiltered-devices} %t.out -// RUN: not env ONEAPI_DEVICE_SELECTOR="level_zero:." 
%{run-unfiltered-devices} %t.out -// RUN: not env ONEAPI_DEVICE_SELECTOR="macaroni_level_zero:." %{run-unfiltered-devices} %t.out -// RUN: not env ONEAPI_DEVICE_SELECTOR="level_zero:macaroni_gpu" %{run-unfiltered-devices} %t.out -// RUN: not env ONEAPI_DEVICE_SELECTOR="level_zero:0..0" %{run-unfiltered-devices} %t.out -// RUN: not env ONEAPI_DEVICE_SELECTOR="level_zero:" %{run-unfiltered-devices} %t.out -// RUN: not env ONEAPI_DEVICE_SELECTOR="level_zero:::gpu" %{run-unfiltered-devices} %t.out -// RUN: not env ONEAPI_DEVICE_SELECTOR="level_zero:.1" %{run-unfiltered-devices} %t.out -// RUN: not env ONEAPI_DEVICE_SELECTOR="" %{run-unfiltered-devices} %t.out +// RUN: not --crash env ONEAPI_DEVICE_SELECTOR="macaroni:*" %{run-unfiltered-devices} %t.out +// RUN: not --crash env ONEAPI_DEVICE_SELECTOR=":" %{run-unfiltered-devices} %t.out +// RUN: not --crash env ONEAPI_DEVICE_SELECTOR="level_zero:." %{run-unfiltered-devices} %t.out +// RUN: not --crash env ONEAPI_DEVICE_SELECTOR="macaroni_level_zero:." %{run-unfiltered-devices} %t.out +// RUN: not --crash env ONEAPI_DEVICE_SELECTOR="level_zero:macaroni_gpu" %{run-unfiltered-devices} %t.out +// RUN: not --crash env ONEAPI_DEVICE_SELECTOR="level_zero:0..0" %{run-unfiltered-devices} %t.out +// RUN: not --crash env ONEAPI_DEVICE_SELECTOR="level_zero:" %{run-unfiltered-devices} %t.out +// RUN: not --crash env ONEAPI_DEVICE_SELECTOR="level_zero:::gpu" %{run-unfiltered-devices} %t.out +// RUN: not --crash env ONEAPI_DEVICE_SELECTOR="level_zero:.1" %{run-unfiltered-devices} %t.out +// RUN: not --crash env ONEAPI_DEVICE_SELECTOR="" %{run-unfiltered-devices} %t.out // Calling ONEAPI_DEVICE_SELECTOR with an illegal input should result in an // error. 
diff --git a/sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp b/sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp index 64a902bcbb1a4..0d80e37e7d9fc 100644 --- a/sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp +++ b/sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp @@ -1,5 +1,6 @@ // REQUIRES: level_zero, level_zero_dev_kit, cm-compiler - +// XFAIL: gpu && !(arch-intel_gpu_pvc && igc-dev) +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16406 // RUN: %{build} -Wno-error=deprecated-declarations -DRUN_KERNELS %level_zero_options -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp b/sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp index 161eb65478653..360592289a969 100644 --- a/sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp +++ b/sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp @@ -1,5 +1,6 @@ // REQUIRES: opencl, opencl_icd, cm-compiler - +// XFAIL: (gpu && !(arch-intel_gpu_pvc && igc-dev)) || cpu || accelerator +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16406 // RUN: %{build} -Wno-error=deprecated-declarations -DRUN_KERNELS %opencl_lib -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/OptionalKernelFeatures/fp64-conv-emu-1.cpp b/sycl/test-e2e/OptionalKernelFeatures/fp64-conv-emu-1.cpp index 988f014c2d8b4..1f61e1b4bec0f 100644 --- a/sycl/test-e2e/OptionalKernelFeatures/fp64-conv-emu-1.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/fp64-conv-emu-1.cpp @@ -17,7 +17,7 @@ // UNSUPPORTED: cuda, hip // UNSUPPORTED-REASON: FP64 emulation is an Intel specific feature. 
-// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_dg2_g10,intel_gpu_dg2_g11,intel_gpu_dg2_g12,intel_gpu_pvc,intel_gpu_mtl_h,intel_gpu_mtl_u -fsycl-fp64-conv-emu -O0 %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_dg2_g10,intel_gpu_dg2_g11,intel_gpu_dg2_g12,intel_gpu_pvc,intel_gpu_mtl_h,intel_gpu_mtl_u -fsycl-fp64-conv-emu %O0 %s -o %t.out // RUN: %{run} %t.out // Tests that aspect::fp64 is not emitted correctly when -fsycl-fp64-conv-emu diff --git a/sycl/test-e2e/OptionalKernelFeatures/fp64-conv-emu-2.cpp b/sycl/test-e2e/OptionalKernelFeatures/fp64-conv-emu-2.cpp index 4dccf2f0f5a1b..eeb69159dd006 100644 --- a/sycl/test-e2e/OptionalKernelFeatures/fp64-conv-emu-2.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/fp64-conv-emu-2.cpp @@ -20,7 +20,7 @@ // UNSUPPORTED: cuda, hip // UNSUPPORTED-REASON: FP64 emulation is an Intel specific feature. -// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_dg2_g10,intel_gpu_dg2_g11,intel_gpu_dg2_g12,intel_gpu_pvc,intel_gpu_mtl_h,intel_gpu_mtl_u -fsycl-fp64-conv-emu -O0 %s -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_dg2_g10,intel_gpu_dg2_g11,intel_gpu_dg2_g12,intel_gpu_pvc,intel_gpu_mtl_h,intel_gpu_mtl_u -fsycl-fp64-conv-emu %O0 %s -o %t.out // RUN: %{run} %t.out #include diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/Inputs/is_compatible_with_env.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/Inputs/is_compatible_with_env.cpp index 365614f780337..fe956704868a0 100644 --- a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/Inputs/is_compatible_with_env.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/Inputs/is_compatible_with_env.cpp @@ -1,4 +1,5 @@ #include +#include int main() { sycl::device dev; diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_several_targets.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_several_targets.cpp index 55b9f7037e4f2..3e9124954b774 100644 --- 
a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_several_targets.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_several_targets.cpp @@ -1,9 +1,8 @@ -// REQUIRES: ocloc, level_zero, gpu, cpu +// REQUIRES: ocloc, any-device-is-level_zero, any-device-is-gpu, any-device-is-cpu // REQUIRES: build-and-run-mode -// RUN: %clangxx -fsycl -fsycl-targets=spir64_fpga,spir64_gen -Xsycl-target-backend "-device *" %S/Inputs/is_compatible_with_env.cpp -o %t.out +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device *" %S/Inputs/is_compatible_with_env.cpp -o %t.out -// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run} not %t.out -// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:fpga %{run} %t.out -// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:gpu %{run} %t.out -// RUN: env ONEAPI_DEVICE_SELECTOR=level_zero:gpu %{run} %t.out +// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t.out +// RUN: env ONEAPI_DEVICE_SELECTOR=opencl:gpu %{run-unfiltered-devices} %t.out +// RUN: env ONEAPI_DEVICE_SELECTOR=level_zero:gpu %{run-unfiltered-devices} %t.out \ No newline at end of file diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64.cpp index 465a79056906a..fcf6affb809fb 100644 --- a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64.cpp @@ -1,4 +1,5 @@ // REQUIRES: cuda, opencl, gpu, cpu +// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-targets=spir64 %S/Inputs/is_compatible_with_env.cpp -o %t.out diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_fpga.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_fpga.cpp index 57366482e7082..1372c352c09ea 100644 --- 
a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_fpga.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_fpga.cpp @@ -1,4 +1,5 @@ // REQUIRES: opencl-aot, accelerator, gpu, cpu +// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-targets=spir64_fpga %S/Inputs/is_compatible_with_env.cpp -o %t.out diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_gen.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_gen.cpp index 5adb27e0ae697..c6a01b3a6dc18 100644 --- a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_gen.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_gen.cpp @@ -1,4 +1,5 @@ // REQUIRES: ocloc, gpu, level_zero, cpu +// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %S/Inputs/is_compatible_with_env.cpp -o %t.out diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_x86_64.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_x86_64.cpp index 0a6f2c39df8af..5de21b8984d71 100644 --- a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_x86_64.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_x86_64.cpp @@ -1,4 +1,5 @@ // REQUIRES: opencl-aot, cpu, gpu, level_zero +// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64 %S/Inputs/is_compatible_with_env.cpp -o %t.out diff --git a/sycl/test-e2e/ProgramManager/multi_device_bundle/build_twice.cpp b/sycl/test-e2e/ProgramManager/multi_device_bundle/build_twice.cpp index d4aaea78c7173..a0069d81fe2d6 100644 --- a/sycl/test-e2e/ProgramManager/multi_device_bundle/build_twice.cpp +++ b/sycl/test-e2e/ProgramManager/multi_device_bundle/build_twice.cpp @@ -6,6 +6,9 @@ // RUN: %{build} -o %t.out // RUN: env NEOReadDebugKeys=1 
CreateMultipleRootDevices=3 SYCL_UR_TRACE=2 %{run} %t.out | FileCheck %s +// XFAIL: arch-intel_gpu_pvc +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16401 + #include #include diff --git a/sycl/test-e2e/ProgramManager/multi_device_bundle/device_libs_and_caching.cpp b/sycl/test-e2e/ProgramManager/multi_device_bundle/device_libs_and_caching.cpp index d596518165762..9b8a070a31bcf 100644 --- a/sycl/test-e2e/ProgramManager/multi_device_bundle/device_libs_and_caching.cpp +++ b/sycl/test-e2e/ProgramManager/multi_device_bundle/device_libs_and_caching.cpp @@ -1,4 +1,5 @@ // REQUIRES: ocloc && gpu && linux && (opencl || level_zero) +// REQUIRES: build-and-run-mode // Test to check several use cases for multi-device kernel bundles. // Test covers AOT and JIT cases. Kernel is using some math functions to enforce @@ -28,6 +29,9 @@ // Check the case when in-memory caching of the programs is disabled. // RUN: env SYCL_CACHE_IN_MEM=0 NEOReadDebugKeys=1 CreateMultipleRootDevices=4 %{run} %t.out +// XFAIL: arch-intel_gpu_pvc +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16401 + #include #include #include diff --git a/sycl/test-e2e/README.md b/sycl/test-e2e/README.md index 1df9e685968f9..09e763a29e792 100644 --- a/sycl/test-e2e/README.md +++ b/sycl/test-e2e/README.md @@ -242,7 +242,6 @@ place. No new tests should use these features: * **gpu-intel-gen11** - Intel GPU Gen11 availability; * **gpu-intel-gen12** - Intel GPU Gen12 availability; - * **gpu-intel-dg1** - Intel GPU DG1 availability; * **gpu-intel-dg2** - Intel GPU DG2 availability; * **gpu-intel-pvc** - Intel GPU PVC availability; * **gpu-intel-pvc-vg** - Intel GPU PVC-VG availability; @@ -268,12 +267,6 @@ configure specific single test execution in the command line: * **gpu_aot_target_opts** - defines additional options which are passed to AOT compilation command line for GPU device. It can be also set by CMake variable GPU_AOT_TARGET_OPTS. If not specified "-device *" value is used. 
- * **gpu-intel-dg1** - tells LIT infra that Intel GPU DG1 is present in the - system. It is developer / CI infra responsibility to make sure that the - device is available in the system. Tests requiring DG1 to run must use proper - device selector to ensure that. Use SYCL_DEVICE_ALLOWLIST or - ONEAPI_DEVICE_SELECTOR to get proper configuration (see - [EnvironmentVariables.md](https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md)); * **gpu-intel-dg2** - tells LIT infra that Intel GPU DG2 is present in the system. It is developer / CI infra responsibility to make sure that the device is available in the system. @@ -379,3 +372,112 @@ Note: please avoid using `REQUIRES: TEMPORARY_DISABLED` for this purpose, it's a non-standard mechanism. Use `UNSUPPORTED: true` instead, we track `UNSUPPORTED` tests using the mechanism described above. Otherwise the test risks remaining untraceable. + +## Compiling and executing tests on separate systems + +The execution of e2e tests can be separated into compilation and execution +stages via the `test-mode` lit parameter. This allows us to reduce testing time +by compiling tests on more powerful systems and reusing the binaries on other +machines. By default the `test-mode` parameter is set to `full`, indicating +that both stages will run. This parameter can be set to `build-only`, or +`run-only`, to only run the compilation stage, or the execution stage +respectively. + +**NOTE:** This feature is a work-in-progress and current limitations are expected +to be addressed in the near future. + +The modes work as follows: +### `--param test-mode=full` +This is the default mode tests run in. Tests are marked as unsupported if no +device on the machine can fulfill the `REQUIRES`/`UNSUPPORTED` statements. In +this mode all `RUN:` lines are executed normally, and two extra features are +added: the `build-and-run-mode` and `run-mode`. + +To make a test only run in `full` mode add a `REQUIRES: build-and-run-mode` line.
+ +### `--param test-mode=build-only` +This mode can be used to compile all test binaries. To do this all `UNSUPPORTED` +and `REQUIRES` statements are ignored unless they contain `UNSUPPORTED: true` or +`REQUIRES: build-and-run-mode`. All `RUN:` lines within a test are run in this +mode unless they contain the following expansions: `%{run}`, +`%{run-unfiltered-devices}`, or `%if run-mode`. + +Currently, the only triple supported for `build-only` mode is `spir64`. + +#### `build-only` future work +Note, the fact that `build-only` ignores general `UNSUPPORTED`/`REQUIRES` +statements is a current limitation. The logic for taking into account the +features that affect compilation, and ignoring those that are only relevant to +the execution of the program is currently being worked on. + +### `--param test-mode=run-only` +In this mode, tests will not be compiled, they will only run. To do this only the +`RUN:` lines that contain `%{run}`, `%{run-unfiltered-devices}` or `%if run-mode` +are executed. Tests are marked as unsupported in the same manner as `full` mode. +Since tests are not compiled in this mode, for any test to pass the test +binaries should already be in the `test_exec_root` directory, either by having +run `full` or `build-only` modes previously on the system, or having +transferred the test binaries into that directory. The `run-mode` feature is +added when in this mode. + +### Resolving common Issues with separate compilation and execution: +A number of extra considerations need to be taken to write tests that are able +to be compiled and executed on separate machines. + +- Tests that build and execute multiple binaries need to be written such that +the output of each compilation has a different name. This way no files are +overwritten, and all the necessary binaries can be transferred to the running +system.
+ +- Two scenarios need to be considered for tests that expectedly fail: + - Tests that are expected to fail on compilation, and thus also during + execution, need to be marked as `XFAIL` with a feature that is device + agnostic, or with `XFAIL: *`. Device agnostic features are those which are + added through a method other than processing the output of sycl-ls, for + example the OS, or the presence of a library. This needs to be done because + sycl-ls is not run in `build-only` mode. + - If the expected failure occurs during run-time we will need to mark the test + with `XFAIL` on a device specific feature (a feature that we add through + processing sycl-ls output), or if it's expected to always fail on run-time we + can use `XFAIL: run-mode`. This is because otherwise the test would compile + and pass on `build-only` mode and be reported as an `XPASS`. + +- To separate compilation and execution of tests, we classify `RUN:` directives +as being either build or run lines. If a line contains `%{run}`, +`%{run-unfiltered-devices}` or `%if run-mode` it is classified as a run line, +otherwise it is classified as a build line. + - All `RUN:` lines that execute test binaries should be marked with either + `%{run}` or `%{run-unfiltered-devices}`. Otherwise they will be incorrectly + marked as a build line, likely causing a failure at the `build-only` stage as + we try to execute the program without having the appropriate devices. + - The vast majority of `RUN:` lines that do not execute the test binaries are + needed to either set up files prior to compilation, or to compile the binary, + as such `RUN:` lines are by default considered as build lines. In the case + that we need to run a line on the `run-only` system, and it does not make + sense to mark them with `%{run}` or `%{run-unfiltered-devices}`, we can mark + a line with `%if run-mode` to specifically make the line a run line.
This + situation usually appears when we need to run a command in response to the + execution of the test binary. + +- Currently the `build-only` mode does not support logic to properly assess the +features in `REQUIRES`/`UNSUPPORTED` to know if a test can be built in the +system environment, or for `spir64`. Only tests that are marked with +`REQUIRES: build-and-run-mode` or `UNSUPPORTED: true` are skipped. Thus if a +test will fail building for the build environment we have on CI or for `spir64` +we will need to mark this as `REQUIRES: build-and-run-mode`. This is only a +temporary solution, until further work is done to properly mark tests as +unsupported on `build-only` based on features. + +- CPU and FPGA AOT tests are currently expected to fail when compiling and +executing on separate machines. These failures occur on the `run-only` side, +because during compilation the host machine's CPU architecture is targeted, +which may be different than that of the running machine. These tests are marked +as `REQUIRES: build-and-run-mode` as a result, until they can be refactored to +compile for the architectures that will be used on the run side. + +### Falling back to `full` testing mode on `run-only` +To not lose coverage of tests marked as `REQUIRES: build-and-run-mode` when +using `run-only` mode, lit can be called using +`--param fallback-to-build-if-requires-build-and-run=True`. When this option is +enabled in `run-only` mode, tests marked as requiring `build-and-run-mode` will +fall back to running on `full` mode, instead of being reported as unsupported.
diff --git a/sycl/test-e2e/Regression/multithread_write_accessor.cpp b/sycl/test-e2e/Regression/multithread_write_accessor.cpp index c8ef2c534d57a..87299ed3e4d5c 100644 --- a/sycl/test-e2e/Regression/multithread_write_accessor.cpp +++ b/sycl/test-e2e/Regression/multithread_write_accessor.cpp @@ -1,5 +1,9 @@ // RUN: %{build} -o %t.out %threads_lib // RUN: %{run} %t.out + +// XFAIL: arch-intel_gpu_pvc +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16401 + #include #include diff --git a/sycl/test-e2e/SpecConstants/2020/native_specialization_constant.cpp b/sycl/test-e2e/SpecConstants/2020/native_specialization_constant.cpp index 170e0aad6b33f..c15c4ba892ed7 100644 --- a/sycl/test-e2e/SpecConstants/2020/native_specialization_constant.cpp +++ b/sycl/test-e2e/SpecConstants/2020/native_specialization_constant.cpp @@ -4,6 +4,7 @@ // FIXME: This set is never satisfied all at once in our infrastructure. // REQUIRES: opencl, level-zero, cpu, gpu, opencl-aot, ocloc +// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -DJIT %s -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/SpecConstants/2020/non_native/SpecConstBuffer.cpp b/sycl/test-e2e/SpecConstants/2020/non_native/SpecConstBuffer.cpp index 3ee1d4bd2490b..bc7145049ddcb 100644 --- a/sycl/test-e2e/SpecConstants/2020/non_native/SpecConstBuffer.cpp +++ b/sycl/test-e2e/SpecConstants/2020/non_native/SpecConstBuffer.cpp @@ -1,4 +1,5 @@ // REQUIRES: opencl-aot, cpu +// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64 %S/Inputs/common.cpp -o %t.out \ // RUN: -fsycl-dead-args-optimization diff --git a/sycl/test-e2e/SpecConstants/2020/non_native/cpu.cpp b/sycl/test-e2e/SpecConstants/2020/non_native/cpu.cpp index 8453a6556ffec..f6b7b836e873c 100644 --- a/sycl/test-e2e/SpecConstants/2020/non_native/cpu.cpp +++ b/sycl/test-e2e/SpecConstants/2020/non_native/cpu.cpp @@ -1,4 +1,5 @@ // REQUIRES: opencl-aot, cpu +// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl 
-fsycl-targets=spir64_x86_64 %S/Inputs/common.cpp -o %t.out \ // RUN: -fsycl-dead-args-optimization diff --git a/sycl/test-e2e/Tracing/usm/queue_copy_released_pointer.cpp b/sycl/test-e2e/Tracing/usm/queue_copy_released_pointer.cpp index 9585a9dc485ed..a37e5b8f4238c 100644 --- a/sycl/test-e2e/Tracing/usm/queue_copy_released_pointer.cpp +++ b/sycl/test-e2e/Tracing/usm/queue_copy_released_pointer.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: windows || hip_amd // RUN: %{build} -o %t.out -// RUN: not env SYCL_TRACE_TERMINATE_ON_WARNING=1 %{run} sycl-trace --verify %t.out | FileCheck %s +// RUN: not --crash env SYCL_TRACE_TERMINATE_ON_WARNING=1 %{run} sycl-trace --verify %t.out | FileCheck %s // Test parameter analysis of USM function diff --git a/sycl/test-e2e/Tracing/usm/queue_single_task_nullptr.cpp b/sycl/test-e2e/Tracing/usm/queue_single_task_nullptr.cpp index 6d19f88dd5487..4c4299dd93d8e 100644 --- a/sycl/test-e2e/Tracing/usm/queue_single_task_nullptr.cpp +++ b/sycl/test-e2e/Tracing/usm/queue_single_task_nullptr.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: windows || hip_amd // RUN: %{build} -o %t.out -// RUN: not env SYCL_TRACE_TERMINATE_ON_WARNING=1 %{run} sycl-trace --verify %t.out | FileCheck %s +// RUN: not --crash env SYCL_TRACE_TERMINATE_ON_WARNING=1 %{run} sycl-trace --verify %t.out | FileCheck %s // Test parameter analysis of USM usage diff --git a/sycl/test-e2e/Tracing/usm/queue_single_task_released_pointer.cpp b/sycl/test-e2e/Tracing/usm/queue_single_task_released_pointer.cpp index f87717c8efc28..61e27b7927f7b 100644 --- a/sycl/test-e2e/Tracing/usm/queue_single_task_released_pointer.cpp +++ b/sycl/test-e2e/Tracing/usm/queue_single_task_released_pointer.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: windows || hip_amd // RUN: %{build} -o %t.out -// RUN: not env SYCL_TRACE_TERMINATE_ON_WARNING=1 %{run} sycl-trace --verify %t.out | FileCheck %s +// RUN: not --crash env SYCL_TRACE_TERMINATE_ON_WARNING=1 %{run} sycl-trace --verify %t.out | FileCheck %s // Test parameter analysis of USM 
usage diff --git a/sycl/test-e2e/USM/fill_any_size.cpp b/sycl/test-e2e/USM/fill_any_size.cpp deleted file mode 100644 index 91f215d517633..0000000000000 --- a/sycl/test-e2e/USM/fill_any_size.cpp +++ /dev/null @@ -1,81 +0,0 @@ -// RUN: %{build} -o %t1.out -// RUN: %{run} %t1.out -// XFAIL: (opencl && cpu) -// XFAIL-TRACKER: https://github.com/oneapi-src/unified-runtime/issues/2440 - -/** - * Test of the queue::fill interface with a range of pattern sizes and values. - * - * Loops over pattern sizes from 1 to MaxPatternSize bytes and calls queue::fill - * with std::array for the pattern. Two pattern values are tested, - * all zeros and value=index+42. The output is copied back to host and - * validated. - */ - -#include -#include -#include -#include - -constexpr size_t MaxPatternSize{32}; // Bytes. -constexpr size_t NumElements{10}; -constexpr size_t NumRepeats{1}; -constexpr bool verbose{false}; - -template -int test(sycl::queue &q, uint8_t firstValue = 0) { - using T = std::array; - T value{}; - for (size_t i{0}; i < PatternSize; ++i) { - if constexpr (SameValue) { - value[i] = firstValue; - } else { - value[i] = firstValue + i; - } - } - - T *dptr{sycl::malloc_device(NumElements, q)}; - for (size_t repeat{0}; repeat < NumRepeats; ++repeat) { - q.fill(dptr, value, NumElements).wait(); - } - - std::array host{}; - q.copy(dptr, host.data(), NumElements).wait(); - bool pass{true}; - for (size_t i{0}; i < NumElements; ++i) { - for (size_t j{0}; j < PatternSize; ++j) { - if (host[i][j] != value[j]) { - pass = false; - } - } - } - sycl::free(dptr, q); - - if (!pass || verbose) { - printf("Pattern size %3zu bytes, %s values (initial %3u) %s\n", PatternSize, - (SameValue ? " equal" : "varied"), firstValue, - (pass ? 
"== PASS ==" : "== FAIL ==")); - } - - return !pass; -} - -template int testOneSize(sycl::queue &q) { - return test(q, 0) + test(q, 42); -} - -template -int testSizes(sycl::queue &q, std::index_sequence) { - return (testOneSize<1u + Sizes>(q) + ...); -} - -int main() { - sycl::queue q{}; - int failures = testSizes(q, std::make_index_sequence{}); - if (failures > 0) { - printf("%d / %zu tests failed\n", failures, 2u * MaxPatternSize); - } else { - printf("All %zu tests passed\n", 2u * MaxPatternSize); - } - return failures; -} diff --git a/sycl/test-e2e/bindless_images/dx12_interop/read_write_unsampled.cpp b/sycl/test-e2e/bindless_images/dx12_interop/read_write_unsampled.cpp index aac2c1f23d788..1d5a59a40b5ab 100644 --- a/sycl/test-e2e/bindless_images/dx12_interop/read_write_unsampled.cpp +++ b/sycl/test-e2e/bindless_images/dx12_interop/read_write_unsampled.cpp @@ -3,7 +3,7 @@ // REQUIRES: build-and-run-mode // DEFINE: %{link-flags}=%if cl_options %{ /clang:-ld3d12 /clang:-ldxgi /clang:-ldxguid %} %else %{ -ld3d12 -ldxgi -ldxguid %} -// RUN: %{build} %{link-flags} -o %t.out +// RUN: %{build} %{link-flags} -o %t.out %if any-device-is-level_zero %{ -DDISABLE_UNORM_TESTS %} // RUN: %{run-unfiltered-devices} env NEOReadDebugKeys=1 UseBindlessMode=1 UseExternalAllocatorForSshAndDsh=1 %t.out #pragma clang diagnostic ignored "-Waddress-of-temporary" @@ -733,8 +733,10 @@ int main() { validated &= runTest<1, uint32_t, 1>(device, sycl::image_channel_type::unsigned_int32, globalSize1, localSize1); +#ifndef DISABLE_UNORM_TESTS validated &= runTest<1, uint8_t, 4>( device, sycl::image_channel_type::unorm_int8, globalSize1, localSize1); +#endif validated &= runTest<1, float, 1>(device, sycl::image_channel_type::fp32, globalSize1, localSize1); validated &= runTest<1, sycl::half, 2>(device, sycl::image_channel_type::fp16, @@ -752,8 +754,10 @@ int main() { validated &= runTest<2, uint32_t, 1>(device, sycl::image_channel_type::unsigned_int32, globalSize2[0], {16, 16}); +#ifndef 
DISABLE_UNORM_TESTS validated &= runTest<2, uint8_t, 4>( device, sycl::image_channel_type::unorm_int8, globalSize2[1], {16, 8}); +#endif validated &= runTest<2, float, 1>(device, sycl::image_channel_type::fp32, globalSize2[2], {16, 8}); validated &= runTest<2, sycl::half, 2>(device, sycl::image_channel_type::fp16, @@ -774,8 +778,10 @@ int main() { validated &= runTest<3, uint32_t, 1>(device, sycl::image_channel_type::unsigned_int32, globalSize3[0], {16, 16, 1}); +#ifndef DISABLE_UNORM_TESTS validated &= runTest<3, uint8_t, 4>( device, sycl::image_channel_type::unorm_int8, globalSize3[1], {16, 8, 2}); +#endif validated &= runTest<3, float, 1>(device, sycl::image_channel_type::fp32, globalSize3[2], {16, 8, 1}); validated &= runTest<3, sycl::half, 2>(device, sycl::image_channel_type::fp16, diff --git a/sycl/test-e2e/bindless_images/helpers/common.hpp b/sycl/test-e2e/bindless_images/helpers/common.hpp index 2950adb83f972..2d32a315a8c72 100644 --- a/sycl/test-e2e/bindless_images/helpers/common.hpp +++ b/sycl/test-e2e/bindless_images/helpers/common.hpp @@ -82,6 +82,8 @@ constexpr sycl::vec init_vector(DType val) { return sycl::vec{val}; } else if constexpr (NChannel == 2) { return sycl::vec{val, val}; + } else if constexpr (NChannel == 3) { + return sycl::vec{val, val, val}; } else if constexpr (NChannel == 4) { return sycl::vec{val, val, val, val}; } else { diff --git a/sycl/test-e2e/bindless_images/vulkan_interop/sampled_images.cpp b/sycl/test-e2e/bindless_images/vulkan_interop/sampled_images.cpp index 328c01355599d..12e5cfe5cad87 100644 --- a/sycl/test-e2e/bindless_images/vulkan_interop/sampled_images.cpp +++ b/sycl/test-e2e/bindless_images/vulkan_interop/sampled_images.cpp @@ -483,6 +483,12 @@ bool run_tests() { valid &= run_test<2, sycl::half, 2, sycl::image_channel_type::fp16, sycl::image_channel_order::rg, class fp16_2d_c2>( {1920, 1080}, {16, 8}, 0); + valid &= run_test<2, sycl::half, 3, sycl::image_channel_type::fp16, + sycl::image_channel_order::rgb, class 
fp16_2d_c3>( + {2048, 2048}, {16, 16}, 0); + valid &= run_test<2, uint8_t, 3, sycl::image_channel_type::unorm_int8, + sycl::image_channel_order::rgb, class unorm_int8_2d_c3>( + {2048, 2048}, {16, 16}, 0); valid &= run_test<2, sycl::half, 4, sycl::image_channel_type::fp16, sycl::image_channel_order::rgba, class fp16_2d_c4>( {2048, 2048}, {16, 16}, 0); diff --git a/sycl/test-e2e/bindless_images/vulkan_interop/unsampled_images.cpp b/sycl/test-e2e/bindless_images/vulkan_interop/unsampled_images.cpp index b93ac2f55c47e..b6d046904eea9 100644 --- a/sycl/test-e2e/bindless_images/vulkan_interop/unsampled_images.cpp +++ b/sycl/test-e2e/bindless_images/vulkan_interop/unsampled_images.cpp @@ -590,11 +590,6 @@ bool run_all() { sycl::image_channel_order::rgba, class fp16_3d_c4>( {2048, 2048, 4}, {16, 16, 1}, seed); - printString("Running 3D unorm_int8_c4\n"); - valid &= run_test<3, uint8_t, 4, sycl::image_channel_type::unorm_int8, - sycl::image_channel_order::rgba, class unorm_int8_3d_c4>( - {2048, 2048, 2}, {16, 16, 1}, seed); - printString("Running 2D float\n"); valid &= run_test<2, float, 1, sycl::image_channel_type::fp32, sycl::image_channel_order::r, class fp32_2d_c1>( @@ -608,10 +603,12 @@ bool run_all() { sycl::image_channel_order::rgba, class fp16_2d_c4>( {2048, 2048}, {16, 16}, seed); - printString("Running 2D unorm_int8_c4\n"); - valid &= run_test<2, uint8_t, 4, sycl::image_channel_type::unorm_int8, - sycl::image_channel_order::rgba, class unorm_int8_2d_c4>( + // 3-channels + printString("Running 2D half3\n"); + valid &= run_test<2, sycl::half, 3, sycl::image_channel_type::fp16, + sycl::image_channel_order::rgb, class fp16_2d_c3>( {2048, 2048}, {2, 2}, seed); + #else printString("Running 3D uint4\n"); valid &= run_test<3, uint32_t, 4, sycl::image_channel_type::signed_int32, diff --git a/sycl/test-e2e/bindless_images/vulkan_interop/vulkan_common.hpp b/sycl/test-e2e/bindless_images/vulkan_interop/vulkan_common.hpp index a1698c5bf8b10..50221dd2d9f6e 100644 --- 
a/sycl/test-e2e/bindless_images/vulkan_interop/vulkan_common.hpp +++ b/sycl/test-e2e/bindless_images/vulkan_interop/vulkan_common.hpp @@ -827,6 +827,8 @@ VkFormat to_vulkan_format(sycl::image_channel_order order, return VK_FORMAT_R8_UNORM; case sycl::image_channel_order::rg: return VK_FORMAT_R8G8_UNORM; + case sycl::image_channel_order::rgb: + return VK_FORMAT_R8G8B8_UNORM; case sycl::image_channel_order::rgba: return VK_FORMAT_R8G8B8A8_UNORM; default: { @@ -894,6 +896,8 @@ VkFormat to_vulkan_format(sycl::image_channel_order order, return VK_FORMAT_R16_SFLOAT; case sycl::image_channel_order::rg: return VK_FORMAT_R16G16_SFLOAT; + case sycl::image_channel_order::rgb: + return VK_FORMAT_R16G16B16_SFLOAT; case sycl::image_channel_order::rgba: return VK_FORMAT_R16G16B16A16_SFLOAT; default: { diff --git a/sycl/test-e2e/format.py b/sycl/test-e2e/format.py index cb4ac86008db2..a69810145507d 100644 --- a/sycl/test-e2e/format.py +++ b/sycl/test-e2e/format.py @@ -234,16 +234,21 @@ def get_extra_env(sycl_devices): new_script.append(directive) continue - # Filter commands based on split-mode + # Filter commands based on testing mode is_run_line = any( i in directive.command for i in ["%{run}", "%{run-unfiltered-devices}", "%if run-mode"] ) - if (is_run_line and test.config.test_mode == "build-only") or ( - not is_run_line and test.config.test_mode == "run-only" + ignore_line_filtering = ( + "build-and-run-mode" in test.requires + and test.config.fallback_build_run_only + ) + if not ignore_line_filtering and ( + (is_run_line and test.config.test_mode == "build-only") + or (not is_run_line and test.config.test_mode == "run-only") ): - directive.command = "" + continue if "%{run}" not in directive.command: new_script.append(directive) @@ -295,7 +300,13 @@ def get_extra_env(sycl_devices): conditions, recursion_limit=test.config.recursiveExpansionLimit, ) - useExternalSh = False + + # TODO: workaround for lit hanging when executing non-existent binary + # inside our containers + if 
len(script) == 0: + return lit.Test.Result(lit.Test.UNSUPPORTED, "Lit script is empty") + useExternalSh = test.config.test_mode == "run-only" + result = lit.TestRunner._runShTest( test, litConfig, useExternalSh, script, tmpBase ) diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py index acd681718facd..a1da5ff46b5ff 100644 --- a/sycl/test-e2e/lit.cfg.py +++ b/sycl/test-e2e/lit.cfg.py @@ -39,12 +39,16 @@ # test-mode: Set if tests should run normally or only build/run config.test_mode = lit_config.params.get("test-mode", "full") +config.fallback_build_run_only = False if config.test_mode == "full": config.available_features.add("run-mode") config.available_features.add("build-and-run-mode") elif config.test_mode == "run-only": lit_config.note("run-only test mode enabled, only executing tests") config.available_features.add("run-mode") + if lit_config.params.get("fallback-to-build-if-requires-build-and-run", False): + config.available_features.add("build-and-run-mode") + config.fallback_build_run_only = True elif config.test_mode == "build-only": lit_config.note("build-only test mode enabled, only compiling tests") config.sycl_devices = [] @@ -164,8 +168,6 @@ config.available_features.add("gpu-intel-gen12") # Intel GPU DEVICE availability -if lit_config.params.get("gpu-intel-dg1", False): - config.available_features.add("gpu-intel-dg1") if lit_config.params.get("gpu-intel-dg2", False): config.available_features.add("gpu-intel-dg2") if lit_config.params.get("gpu-intel-pvc-vg", False): @@ -381,7 +383,7 @@ def open_check_file(file_name): ) ) config.substitutions.append(("%include_option", "/FI")) - config.substitutions.append(("%debug_option", "/DEBUG")) + config.substitutions.append(("%debug_option", "/Zi /DEBUG")) config.substitutions.append(("%cxx_std_option", "/std:")) config.substitutions.append(("%fPIC", "")) config.substitutions.append(("%shared_lib", "/LD")) diff --git a/sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp 
b/sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp index b5c82af8938df..033f5c99d74e1 100644 --- a/sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp +++ b/sycl/test-e2e/syclcompat/launch/launch_policy_lmem.cpp @@ -26,6 +26,9 @@ // UNSUPPORTED: linux && opencl && (gpu-intel-gen12 || gpu-intel-dg2) // UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/15275 +// XFAIL: arch-intel_gpu_pvc +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16401 + #include #include #include diff --git a/sycl/test/abi/layout_vec.cpp b/sycl/test/abi/layout_vec.cpp index ade053f068f30..7824f84e7bdd2 100644 --- a/sycl/test/abi/layout_vec.cpp +++ b/sycl/test/abi/layout_vec.cpp @@ -13,7 +13,7 @@ SYCL_EXTERNAL void foo(sycl::vec) {} // CHECK: 0 | class sycl::vec // ignore empty base classes // CHECK: 0 | struct std::array m_Data -// CHECK-NEXT: 0 | typename _AT_Type::_Type _M_elems +// CHECK-NEXT: 0 | typename {{.+}}::_Type _M_elems // CHECK-NEXT: | [sizeof=16, dsize=16, align=16, // CHECK-NEXT: | nvsize=16, nvalign=16] @@ -24,6 +24,6 @@ SYCL_EXTERNAL void foo(sycl::vec) {} // CHECK: 0 | class sycl::vec<_Bool, 16> // ignore empty base classes // CHECK: 0 | struct std::array<_Bool, 16> m_Data -// CHECK-NEXT: 0 | typename _AT_Type::_Type _M_elems +// CHECK-NEXT: 0 | typename {{.+}}::_Type _M_elems // CHECK-NEXT: | [sizeof=16, dsize=16, align=16, // CHECK-NEXT: | nvsize=16, nvalign=16] diff --git a/sycl/test/check_device_code/group_load.cpp b/sycl/test/check_device_code/group_load.cpp index 54380a1a2be0d..adc240172a357 100644 --- a/sycl/test/check_device_code/group_load.cpp +++ b/sycl/test/check_device_code/group_load.cpp @@ -1,5 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --functions "group_load" --include-generated-funcs --version 4 -// NOTE: and manually adjusted to follow the related explicit instantiation. 
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clangxx -O3 -fsycl -fsycl-device-only -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -o - %s | FileCheck %s // Windows/linux have some slight differences in IR generation (function @@ -36,613 +35,644 @@ using opt_striped = template using plain_global_ptr = typename sycl::detail::DecoratedType< T, access::address_space::global_space>::type *; - -// Ensure `detail::naive` always results in no block loads/stores. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< - sycl::sub_group, plain_global_ptr, int, naive_blocked>( - sycl::sub_group, plain_global_ptr, int &, naive_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESN_SL_RSM_T2_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] comdat !srcloc [[META6:![0-9]+]] !sycl_fixed_targets [[META7:![0-9]+]] { +namespace blocked { +// CHECK-LABEL: @_ZN7blocked10test_naiveERN4sycl3_V19sub_groupEPU3AS1iRi( // CHECK-NEXT: entry: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4:[0-9]+]] // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = 
getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8:![0-9]+]] -// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[OUT]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8:![0-9]+]] +// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[OUT:%.*]], align 4, !tbaa [[TBAA8]] // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_naive(sycl::sub_group &sg, plain_global_ptr p, + int &out) { + // Ensure `detail::naive` always results in no block loads/stores. + group_load(sg, p, out, naive_blocked{}); +} -// Check that optimized implementation is selected. 
-template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< - sycl::sub_group, plain_global_ptr, int, opt_blocked>( - sycl::sub_group, plain_global_ptr, int &, opt_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_RSO_T2_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.4") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META6]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7blocked14test_optimizedERN4sycl3_V19sub_groupEPU3AS1iRi( // CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[IN_PTR]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) -// CHECK-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[IN_PTR]]) #[[ATTR4]] -// CHECK-NEXT: store i32 [[CALL4_I]], ptr addrspace(4) [[OUT]], align 4 +// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-NEXT: [[CALL4_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] +// CHECK-NEXT: store i32 [[CALL4_I_I]], ptr addrspace(4) [[OUT:%.*]], align 4 // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_optimized(sycl::sub_group &sg, plain_global_ptr p, + int 
&out) { + // Check that optimized implementation is selected. + group_load(sg, p, out, opt_blocked{}); +} -// Check that contiguous_memory can be auto-detected. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< - sycl::sub_group, plain_global_ptr, int, full_group_blocked>( - sycl::sub_group, plain_global_ptr, int &, full_group_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESN_SL_RSM_T2_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.12") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META6]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7blocked27test_contiguous_auto_detectERN4sycl3_V19sub_groupEPU3AS1iRi( // CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[IN_PTR]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) -// CHECK-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[IN_PTR]]) #[[ATTR4]] -// CHECK-NEXT: store i32 [[CALL4_I]], ptr addrspace(4) [[OUT]], align 4 +// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-NEXT: [[CALL4_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] +// CHECK-NEXT: store i32 
[[CALL4_I_I]], ptr addrspace(4) [[OUT:%.*]], align 4 // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_contiguous_auto_detect(sycl::sub_group &sg, + plain_global_ptr p, + int &out) { + // Check that optimized implementation is selected. + group_load(sg, p, out, full_group_blocked{}); +} // SYCL 2020's accessor can't be statically known to be contiguous. using accessor_iter_t = accessor::iterator; -// Can't be optimized. -template SYCL_EXTERNAL void -sycl::ext::oneapi::experimental::group_load( - sycl::sub_group, accessor_iter_t, int &, full_group_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSD_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_RSO_T2_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::detail::accessor_iterator") align 8 [[IN_PTR:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.12") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META6]] !sycl_fixed_targets [[META7]] { + +// CHECK-LABEL: @_ZN7blocked18test_accessor_iterERN4sycl3_V19sub_groupERNS1_6detail17accessor_iteratorIKiLi1EEERi( // CHECK-NEXT: entry: -// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr [[IN_PTR]], align 8, !tbaa [[TBAA12:![0-9]+]] -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_IN_PTR_ASCAST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[IN_PTR]], i64 8 -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr [[AGG_TMP1_SROA_2_0_IN_PTR_ASCAST_SROA_IDX]], align 8, !tbaa [[TBAA15:![0-9]+]] +// CHECK-NEXT: 
[[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15:![0-9]+]] +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18:![0-9]+]] // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// CHECK-NEXT: [[CONV3_I_I:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[CONV3_I_I_I:%.*]] = sext i32 [[TMP0]] to i64 // CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] -// CHECK-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP1]], i64 [[CONV3_I_I]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(4) [[OUT]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ADD_PTR_I_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP1]], i64 [[CONV3_I_I_I]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(4) [[OUT:%.*]], align 4, !tbaa [[TBAA8]] // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_accessor_iter(sycl::sub_group &sg, + accessor_iter_t &iter, int &out) { + // Can't be optimized. + group_load(sg, iter, out, full_group_blocked{}); +} -// Explicit property - optimize. 
-template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< - sycl::sub_group, accessor_iter_t, int, opt_blocked>(sycl::sub_group, - accessor_iter_t, int &, - opt_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSD_INS3_21contiguous_memory_keyEJEEENSD_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_RSQ_T2_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::detail::accessor_iterator") align 8 [[IN_PTR:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.4") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META6]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7blocked34test_accessor_iter_force_optimizedERN4sycl3_V19sub_groupERNS1_6detail17accessor_iteratorIKiLi1EEERi( // CHECK-NEXT: entry: -// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr [[IN_PTR]], align 8, !tbaa [[TBAA12]] -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_IN_PTR_ASCAST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[IN_PTR]], i64 8 -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr [[AGG_TMP1_SROA_2_0_IN_PTR_ASCAST_SROA_IDX]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[ADD_PTR_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] -// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(4) [[ADD_PTR_I_I_I]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) -// CHECK-NEXT: [[CALL_I_I_I_I:%.*]] = tail call spir_func noundef 
ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvi(ptr addrspace(4) noundef nonnull [[ADD_PTR_I_I_I]], i32 noundef 5) #[[ATTR5:[0-9]+]] -// CHECK-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq ptr addrspace(1) [[CALL_I_I_I_I]], null -// CHECK-NEXT: br i1 [[TOBOOL_NOT_I]], label [[IF_THEN_I:%.*]], label [[IF_END_I:%.*]] -// CHECK: if.then.i: +// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18]] +// CHECK-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] +// CHECK-NEXT: [[CMP_I_I_I_I:%.*]] = icmp ne ptr addrspace(4) [[ADD_PTR_I_I_I_I]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I_I]]) +// CHECK-NEXT: [[CALL_I_I_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvi(ptr addrspace(4) noundef nonnull [[ADD_PTR_I_I_I_I]], i32 noundef 5) #[[ATTR5:[0-9]+]] +// CHECK-NEXT: [[TOBOOL_NOT_I_I:%.*]] = icmp eq ptr addrspace(1) [[CALL_I_I_I_I_I]], null +// CHECK-NEXT: br i1 [[TOBOOL_NOT_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_END_I_I:%.*]] +// CHECK: if.then.i.i: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// CHECK-NEXT: [[CONV3_I_I:%.*]] = sext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[ADD_PTR_I_I_I]], i64 [[CONV3_I_I]] -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I_I]], align 4, !tbaa 
[[TBAA8]] -// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[OUT]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[CONV3_I_I_I:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[ADD_PTR_I_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[ADD_PTR_I_I_I_I]], i64 [[CONV3_I_I_I]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[OUT:%.*]], align 4, !tbaa [[TBAA8]] // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEILM1ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSD_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSD_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] -// CHECK: if.end.i: -// CHECK-NEXT: [[CALL6_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[CALL_I_I_I_I]]) #[[ATTR4]] -// CHECK-NEXT: store i32 [[CALL6_I]], ptr addrspace(4) [[OUT]], align 4 -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEILM1ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSD_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSD_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm1ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSD_INS3_21contiguous_memory_keyEJEEENSD_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEINS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSD_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSD_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_RSQ_T2__EXIT:%.*]] +// CHECK: if.end.i.i: +// CHECK-NEXT: [[CALL6_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[CALL_I_I_I_I_I]]) #[[ATTR4]] +// CHECK-NEXT: store i32 [[CALL6_I_I]], ptr addrspace(4) [[OUT]], align 4 +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEINS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSD_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSD_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_RSQ_T2__EXIT]] +// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSD_INS3_21contiguous_memory_keyEJEEENSD_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_RSQ_T2_.exit: // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_accessor_iter_force_optimized(sycl::sub_group &sg, + accessor_iter_t &iter, + int &out) { + // Explicit property - optimize. + group_load(sg, iter, out, opt_blocked{}); +} -// Run-time alignment check is needed if type's alignment is less than BlockRead -// requirements. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< - sycl::sub_group, plain_global_ptr, char, opt_blocked>( - sycl::sub_group, plain_global_ptr, char &, opt_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ccNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_RSO_T2_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr addrspace(4) noundef align 1 dereferenceable(1) [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.4") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META6]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7blocked24test_runtime_align_checkERN4sycl3_V19sub_groupEPU3AS1cRc( // CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[IN_PTR]], null +// CHECK-NEXT: [[CMP_I_I_I:%.*]] = 
icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-NEXT: [[REM_I_I_I:%.*]] = and i64 [[TMP0]], 3 +// CHECK-NEXT: [[CMP1_I_NOT_I_I:%.*]] = icmp eq i64 [[REM_I_I_I]], 0 +// CHECK-NEXT: br i1 [[CMP1_I_NOT_I_I]], label [[IF_END_I_I:%.*]], label [[IF_THEN_I_I:%.*]] +// CHECK: if.then.i.i: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-NEXT: [[IDXPROM_I_I_I:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I_I]] +// CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(1) [[ARRAYIDX_I_I_I]], align 1, !tbaa [[TBAA22:![0-9]+]] +// CHECK-NEXT: store i8 [[TMP2]], ptr addrspace(4) [[OUT:%.*]], align 1, !tbaa [[TBAA22]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_SN_RSO_T2__EXIT:%.*]] +// CHECK: if.end.i.i: +// CHECK-NEXT: [[CALL4_I_I:%.*]] = tail call spir_func noundef zeroext i8 @_Z30__spirv_SubgroupBlockReadINTELIhET_PU3AS1Kh(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] +// CHECK-NEXT: store i8 [[CALL4_I_I]], ptr addrspace(4) [[OUT]], align 1 +// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_SN_RSO_T2__EXIT]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ccNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_RSO_T2_.exit: +// CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_runtime_align_check(sycl::sub_group &sg, + plain_global_ptr p, + char &out) { + // Run-time alignment check is needed if type's alignment is less than + // BlockRead requirements. 
+ group_load(sg, p, out, opt_blocked{}); +} + +// CHECK-LABEL: @_ZN7blocked16test_four_shortsERN4sycl3_V19sub_groupEPU3AS1sNS1_4spanIsLm4EEE( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA24:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null // CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) -// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[IN_PTR]] to i64 -// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP0]], 3 +// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 3 // CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 // CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] // CHECK: if.then.i: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[TMP1]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(1) [[ARRAYIDX_I_I]], align 1, !tbaa [[TBAA17:![0-9]+]] -// CHECK-NEXT: store i8 [[TMP2]], ptr addrspace(4) [[OUT]], align 1, !tbaa [[TBAA17]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META26:![0-9]+]] +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP3]], 2 +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I14_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-NEXT: br i1 [[CMP_I14_I]], 
label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT_I:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA29:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: store i16 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA29]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP31:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit.i: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCLM1ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEET3__EXIT:%.*]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEET3__EXIT:%.*]] // CHECK: if.end.i: -// CHECK-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef zeroext i8 @_Z30__spirv_SubgroupBlockReadINTELIhET_PU3AS1Kh(ptr addrspace(1) noundef nonnull [[IN_PTR]]) #[[ATTR4]] -// CHECK-NEXT: store i8 [[CALL4_I]], ptr addrspace(4) [[OUT]], align 1 -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCLM1ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEET3__EXIT]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ccLm1ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_.exit: +// CHECK-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef i64 @_Z30__spirv_SubgroupBlockReadINTELImET_PU3AS1Km(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] +// CHECK-NEXT: store i64 [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 2 +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEET3__EXIT]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_.exit: // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_four_shorts(sycl::sub_group &sg, + plain_global_ptr p, + span out) { + // Four shorts in blocked data layout could be loaded as a single 64-bit + // integer. + group_load(sg, p, out, opt_blocked{}); +} -// Four shorts in blocked data layout could be loaded as a single 64-bit -// integer. 
-template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< - sycl::sub_group, plain_global_ptr, short, 4, opt_blocked>( - sycl::sub_group, plain_global_ptr, span, opt_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.15") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.4") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META18:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7blocked21test_non_power_of_twoERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm3EEE( // CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I:%.*]] = icmp ne ptr addrspace(1) [[IN_PTR]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I]]) -// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[IN_PTR]] to i64 -// CHECK-NEXT: [[REM_I:%.*]] = and i64 [[TMP0]], 3 -// CHECK-NEXT: [[CMP1_I_NOT:%.*]] = icmp eq i64 [[REM_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] -// CHECK: if.then: -// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA19:![0-9]+]] -// CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(4) +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 
3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META21:![0-9]+]] -// CHECK-NEXT: [[MUL_I_I:%.*]] = shl i32 [[TMP3]], 2 -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_THEN]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I14:%.*]] = icmp samesign ult i32 [[I_0_I]], 4 -// CHECK-NEXT: br i1 [[CMP_I14]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ADD_I_I:%.*]] = or disjoint i32 [[MUL_I_I]], [[I_0_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr addrspace(1) [[ARRAYIDX_I]], align 2, !tbaa [[TBAA24:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP2]], i64 [[CONV_I]] -// CHECK-NEXT: store i16 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA24]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP26:![0-9]+]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META34:![0-9]+]] +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP2]], 3 +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 3 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM3ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: store i32 [[TMP3]], ptr 
addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP37:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm3ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_.exit: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: br label [[CLEANUP:%.*]] -// CHECK: if.end: -// CHECK-NEXT: [[CALL4:%.*]] = tail call spir_func noundef i64 @_Z30__spirv_SubgroupBlockReadINTELImET_PU3AS1Km(ptr addrspace(1) noundef nonnull [[IN_PTR]]) #[[ATTR4]] -// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8, !tbaa [[TBAA28:![0-9]+]] -// CHECK-NEXT: store i64 [[CALL4]], ptr addrspace(4) [[TMP5]], align 2 -// CHECK-NEXT: br label [[CLEANUP]] -// CHECK: cleanup: // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_non_power_of_two(sycl::sub_group &sg, + plain_global_ptr p, + span out) { + // Check for non-power-of-two size. + group_load(sg, p, out, opt_blocked{}); +} -// Check for non-power-of-two size. 
-template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< - sycl::sub_group, plain_global_ptr, int, 3, opt_blocked>( - sycl::sub_group, plain_global_ptr, span, opt_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm3ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.16") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.4") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META18]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7blocked14test_four_intsERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm4EEE( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA12]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META30:![0-9]+]] -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP2]], 3 -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 3 -// CHECK-NEXT: br i1 [[CMP_I]], 
label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM3ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[MUL_I_I]], [[I_0_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP33:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm3ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META39:![0-9]+]] +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP2]], 2 +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] 
= phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP42:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_.exit: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] // 
CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_four_ints(sycl::sub_group &sg, plain_global_ptr p, + span out) { + // Four int elements in blocked data layout don't map directly to any + // BlockRead API. + group_load(sg, p, out, opt_blocked{}); +} -// Four int elements in blocked data layout don't map directly to any BlockRead -// API. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< - sycl::sub_group, plain_global_ptr, int, 4, opt_blocked>( - sycl::sub_group, plain_global_ptr, span, opt_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.17") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.4") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META18]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7blocked15test_seven_intsERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm7EEE( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA12]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], 
!noalias [[META34:![0-9]+]] -// CHECK-NEXT: [[MUL_I_I:%.*]] = shl i32 [[TMP2]], 2 -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 4 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ADD_I_I:%.*]] = or disjoint i32 [[MUL_I_I]], [[I_0_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP37:![0-9]+]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META44:![0-9]+]] +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP2]], 7 +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 7 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM7ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: store i32 [[TMP3]], ptr 
addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP47:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm7ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_.exit: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] // CHECK-NEXT: ret void -// Similar to four elements case but more complex to optimize. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< - sycl::sub_group, plain_global_ptr, int, 7, opt_blocked>( - sycl::sub_group, plain_global_ptr, span, opt_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm7ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.18") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.4") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META18]] !sycl_fixed_targets [[META7]] { +// +SYCL_EXTERNAL void test_seven_ints(sycl::sub_group &sg, plain_global_ptr p, + span out) { + // Similar to four 
elements case but more complex to optimize. + group_load(sg, p, out, opt_blocked{}); +} +} // namespace blocked + +namespace striped { +// Striped data layout with one element per work item isn't different from +// blocked data layout, so use span version only in the checks below. + +// CHECK-LABEL: @_ZN7striped10test_naiveERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm2EEE( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA12]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META38:![0-9]+]] -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP2]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META49:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META52:![0-9]+]] // CHECK-NEXT: br label [[FOR_COND_I:%.*]] // CHECK: for.cond.i: // CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 7 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM7ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESN_SL_NS0_4SPANISM_XT2_EEET3__EXIT:%.*]] // CHECK: for.body.i: // CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[MUL_I_I]], [[I_0_I]] +// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] +// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] // CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] // CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], 
align 4, !tbaa [[TBAA8]] // CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP41:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm7ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP55:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESN_SL_NS0_4spanISM_XT2_EEET3_.exit: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_naive(sycl::sub_group &sg, plain_global_ptr p, + span out) { + // Ensure `detail::naive` always results in no block loads/stores. + group_load(sg, p, out, naive_striped{}); +} -// Striped data layout with one element per work item isn't different from -// blocked data layout, so use span version only in the checks below. - -// Ensure `detail::naive` always results in no block loads/stores. 
-template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< - sycl::sub_group, plain_global_ptr, int, 2, naive_striped>( - sycl::sub_group, plain_global_ptr, span, naive_striped); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESN_SL_NS0_4spanISM_XT2_EEET3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.19") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.20") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META18]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7striped14test_optimizedERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm2EEE( // CHECK-NEXT: entry: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META42:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META45:![0-9]+]] -// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8, !tbaa [[TBAA48:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND:%.*]] -// CHECK: for.cond: -// CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -// CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[I_0]], 2 -// CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -// CHECK: for.cond.cleanup: -// CHECK-NEXT: tail 
call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: ret void -// CHECK: for.body: -// CHECK-NEXT: [[CONV:%.*]] = zext nneg i32 [[I_0]] to i64 -// CHECK-NEXT: [[MUL_I:%.*]] = mul nuw nsw i32 [[TMP1]], [[I_0]] -// CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[TMP0]], [[MUL_I]] -// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[ADD_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP2]], i64 [[CONV]] -// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_0]], 1 -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP50:![0-9]+]] - -// Check that optimized implementation is selected. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< - sycl::sub_group, plain_global_ptr, int, 2, opt_striped>( - sycl::sub_group, plain_global_ptr, span, opt_striped); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.19") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.25") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META18]] 
!sycl_fixed_targets [[META7]] { -// CHECK-NEXT: cleanup: -// CHECK-NEXT: [[CMP_I:%.*]] = icmp ne ptr addrspace(1) [[IN_PTR]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I]]) -// CHECK-NEXT: [[CALL4:%.*]] = tail call spir_func noundef <2 x i32> @_Z30__spirv_SubgroupBlockReadINTELIDv2_jET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[IN_PTR]]) #[[ATTR4]] -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8, !tbaa [[TBAA48]] -// CHECK-NEXT: store <2 x i32> [[CALL4]], ptr addrspace(4) [[TMP0]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef <2 x i32> @_Z30__spirv_SubgroupBlockReadINTELIDv2_jET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] +// CHECK-NEXT: store <2 x i32> [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 4 // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_optimized(sycl::sub_group &sg, plain_global_ptr p, + span out) { + // Check that optimized implementation is selected. + group_load(sg, p, out, opt_striped{}); +} -// Check that contiguous_memory can be auto-detected. 
-template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< - sycl::sub_group, plain_global_ptr, int, 2, full_group_striped>( - sycl::sub_group, plain_global_ptr, span, full_group_striped); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESN_SL_NS0_4spanISM_XT2_EEET3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.19") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.27") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META18]] !sycl_fixed_targets [[META7]] { -// CHECK-NEXT: cleanup: -// CHECK-NEXT: [[CMP_I:%.*]] = icmp ne ptr addrspace(1) [[IN_PTR]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I]]) -// CHECK-NEXT: [[CALL4:%.*]] = tail call spir_func noundef <2 x i32> @_Z30__spirv_SubgroupBlockReadINTELIDv2_jET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[IN_PTR]]) #[[ATTR4]] -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8, !tbaa [[TBAA48]] -// CHECK-NEXT: store <2 x i32> [[CALL4]], ptr addrspace(4) [[TMP0]], align 4 +// CHECK-LABEL: @_ZN7striped27test_contiguous_auto_detectERN4sycl3_V19sub_groupEPU3AS1iRi( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-NEXT: [[CALL4_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] +// CHECK-NEXT: store i32 [[CALL4_I_I]], ptr 
addrspace(4) [[OUT:%.*]], align 4 // CHECK-NEXT: ret void - +// +SYCL_EXTERNAL void test_contiguous_auto_detect(sycl::sub_group &sg, + plain_global_ptr p, + int &out) { + // Check that optimized implementation is selected. + group_load(sg, p, out, full_group_striped{}); +} // SYCL 2020's accessor can't be statically known to be contiguous. using accessor_iter_t = accessor::iterator; -// Can't be optimized. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< - sycl::sub_group, accessor_iter_t, int, 2, full_group_striped>( - sycl::sub_group, accessor_iter_t, span, full_group_striped); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSD_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::detail::accessor_iterator") align 8 [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.19") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.27") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META18]] !sycl_fixed_targets [[META7]] { + +// CHECK-LABEL: @_ZN7striped18test_accessor_iterERN4sycl3_V19sub_groupERNS1_6detail17accessor_iteratorIKiLi1EEENS1_4spanIiLm2EEE( // CHECK-NEXT: entry: -// CHECK-NEXT: [[AGG_TMP3_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr [[IN_PTR]], align 8, !tbaa [[TBAA12]] -// CHECK-NEXT: [[AGG_TMP3_SROA_2_0_IN_PTR_ASCAST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[IN_PTR]], i64 8 -// CHECK-NEXT: [[AGG_TMP3_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr 
[[AGG_TMP3_SROA_2_0_IN_PTR_ASCAST_SROA_IDX]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA12]] +// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META51:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META54:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP3_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP3_SROA_2_0_COPYLOAD]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEILM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSD_INS3_14FULL_GROUP_KEYEJEEENSD_INSB_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] -// CHECK-NEXT: [[CONV3_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ADD_PTR_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP4]], i64 [[CONV3_I]] -// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP57:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSD_INS3_14full_group_keyEJEEENSD_INSB_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META59:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias 
[[META62:![0-9]+]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEILM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSD_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I_I]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-NEXT: [[CONV3_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP4]], i64 [[CONV3_I_I]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP65:![0-9]+]] +// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSD_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_.exit: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_accessor_iter(sycl::sub_group &sg, + accessor_iter_t &iter, span out) { + // Can't be optimized. + group_load(sg, iter, out, full_group_striped{}); +} -// Explicit property - optimize. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< - sycl::sub_group, accessor_iter_t, int, 2, opt_striped>(sycl::sub_group, - accessor_iter_t, - span, - opt_striped); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSD_INS3_21contiguous_memory_keyEJEEENSD_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::detail::accessor_iterator") align 8 [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.19") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.25") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META18]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: 
@_ZN7striped34test_accessor_iter_force_optimizedERN4sycl3_V19sub_groupERNS1_6detail17accessor_iteratorIKiLi1EEENS1_4spanIiLm2EEE( // CHECK-NEXT: entry: -// CHECK-NEXT: [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr [[IN_PTR]], align 8, !tbaa [[TBAA12]] -// CHECK-NEXT: [[AGG_TMP_SROA_2_0_IN_PTR_ASCAST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[IN_PTR]], i64 8 -// CHECK-NEXT: [[AGG_TMP_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr [[AGG_TMP_SROA_2_0_IN_PTR_ASCAST_SROA_IDX]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[ADD_PTR_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[AGG_TMP_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(4) [[ADD_PTR_I_I]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) -// CHECK-NEXT: [[CALL_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvi(ptr addrspace(4) noundef nonnull [[ADD_PTR_I_I]], i32 noundef 5) #[[ATTR5]] -// CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq ptr addrspace(1) [[CALL_I_I_I]], null -// CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] -// CHECK: if.then: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA12]] +// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: [[ADD_PTR_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] 
+// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(4) [[ADD_PTR_I_I_I]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-NEXT: [[CALL_I_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvi(ptr addrspace(4) noundef nonnull [[ADD_PTR_I_I_I]], i32 noundef 5) #[[ATTR5]] +// CHECK-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq ptr addrspace(1) [[CALL_I_I_I_I]], null +// CHECK-NEXT: br i1 [[TOBOOL_NOT_I]], label [[IF_THEN_I:%.*]], label [[IF_END_I:%.*]] +// CHECK: if.then.i: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META58:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META61:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_THEN]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEILM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSD_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSD_INS3_14FULL_GROUP_KEYEJEEENSD_INSB_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEEST_SR_NS0_4SPANISS_XT2_EEET3__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] -// CHECK-NEXT: 
[[CONV3_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ADD_PTR_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[ADD_PTR_I_I]], i64 [[CONV3_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP64:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSD_INS3_21contiguous_memory_keyEJEEENSD_INS3_14full_group_keyEJEEENSD_INSB_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeEST_SR_NS0_4spanISS_XT2_EEET3_.exit: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META67:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META70:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEILM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSD_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSD_INS3_14FULL_GROUP_KEYEJEEENSD_INSB_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEEST_SR_NS0_4SPANISS_XT2_EEET3__EXIT_I:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I_I]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-NEXT: [[CONV3_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[ADD_PTR_I_I_I]], i64 [[CONV3_I_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP73:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSD_INS3_21contiguous_memory_keyEJEEENSD_INS3_14full_group_keyEJEEENSD_INSB_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeEST_SR_NS0_4spanISS_XT2_EEET3_.exit.i: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: br label 
[[CLEANUP:%.*]] -// CHECK: if.end: -// CHECK-NEXT: [[CALL6:%.*]] = tail call spir_func noundef <2 x i32> @_Z30__spirv_SubgroupBlockReadINTELIDv2_jET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[CALL_I_I_I]]) #[[ATTR4]] -// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8, !tbaa [[TBAA48]] -// CHECK-NEXT: store <2 x i32> [[CALL6]], ptr addrspace(4) [[TMP5]], align 4 -// CHECK-NEXT: br label [[CLEANUP]] -// CHECK: cleanup: +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEILM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSD_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSD_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] +// CHECK: if.end.i: +// CHECK-NEXT: [[CALL6_I:%.*]] = tail call spir_func noundef <2 x i32> @_Z30__spirv_SubgroupBlockReadINTELIDv2_jET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[CALL_I_I_I_I]]) #[[ATTR4]] +// CHECK-NEXT: store <2 x i32> [[CALL6_I]], ptr addrspace(4) [[TMP1]], align 4 +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEILM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSD_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSD_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT]] +// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSD_INS3_21contiguous_memory_keyEJEEENSD_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_accessor_iter_force_optimized(sycl::sub_group &sg, + accessor_iter_t &iter, + span out) { + // Explicit property - optimize. + group_load(sg, iter, out, opt_striped{}); +} -// Run-time alignment check is needed if type's alignment is less than BlockRead -// requirements. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< - sycl::sub_group, plain_global_ptr, char, 2, opt_striped>( - sycl::sub_group, plain_global_ptr, span, opt_striped); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ccLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.29") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.25") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META18]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7striped24test_runtime_align_checkERN4sycl3_V19sub_groupEPU3AS1cNS1_4spanIcLm2EEE( // CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I:%.*]] = icmp ne ptr addrspace(1) 
[[IN_PTR]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I]]) -// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[IN_PTR]] to i64 -// CHECK-NEXT: [[REM_I:%.*]] = and i64 [[TMP0]], 3 -// CHECK-NEXT: [[CMP1_I_NOT:%.*]] = icmp eq i64 [[REM_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] -// CHECK: if.then: -// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA65:![0-9]+]] -// CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(4) +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA75:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 3 +// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK: if.then.i: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META67:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META70:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_THEN]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I14:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I14]], label [[FOR_BODY_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCLM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP4]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr addrspace(1) [[ARRAYIDX_I]], align 1, !tbaa [[TBAA17]] -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP2]], i64 [[CONV_I]] -// CHECK-NEXT: store i8 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 1, !tbaa [[TBAA17]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP73:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ccLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META77:![0-9]+]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], 
!noalias [[META80:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I14_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 +// CHECK-NEXT: br i1 [[CMP_I14_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCLM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT_I:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP4]], [[I_0_I_I]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr addrspace(1) [[ARRAYIDX_I_I]], align 1, !tbaa [[TBAA22]] +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: store i8 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 1, !tbaa [[TBAA22]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP83:![0-9]+]] +// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ccLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit.i: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: br label [[CLEANUP:%.*]] -// CHECK: if.end: -// CHECK-NEXT: [[CALL4:%.*]] = tail call spir_func noundef <2 x i8> @_Z30__spirv_SubgroupBlockReadINTELIDv2_hET_PU3AS1Kh(ptr addrspace(1) noundef nonnull [[IN_PTR]]) #[[ATTR4]] -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8, !tbaa [[TBAA74:![0-9]+]] -// CHECK-NEXT: store <2 x i8> [[CALL4]], ptr addrspace(4) [[TMP6]], align 1 -// CHECK-NEXT: br label [[CLEANUP]] -// CHECK: cleanup: +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCLM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEET3__EXIT:%.*]] +// CHECK: if.end.i: +// CHECK-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef <2 x i8> @_Z30__spirv_SubgroupBlockReadINTELIDv2_hET_PU3AS1Kh(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] +// CHECK-NEXT: store <2 x i8> [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 1 +// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCLM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEET3__EXIT]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ccLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_.exit: // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_runtime_align_check(sycl::sub_group &sg, + plain_global_ptr p, + span out) { + // Run-time alignment check is needed if type's alignment is less than + // BlockRead requirements. + group_load(sg, p, out, opt_striped{}); +} -// Just because there is a blocked data layout testcase, nothing inherently -// useful here. 
-template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< - sycl::sub_group, plain_global_ptr, short, 4, opt_striped>( - sycl::sub_group, plain_global_ptr, span, opt_striped); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.15") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.25") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META18]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7striped16test_four_shortsERN4sycl3_V19sub_groupEPU3AS1sNS1_4spanIsLm4EEE( // CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I:%.*]] = icmp ne ptr addrspace(1) [[IN_PTR]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I]]) -// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[IN_PTR]] to i64 -// CHECK-NEXT: [[REM_I:%.*]] = and i64 [[TMP0]], 3 -// CHECK-NEXT: [[CMP1_I_NOT:%.*]] = icmp eq i64 [[REM_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] -// CHECK: if.then: -// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA19]] -// CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(4) +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA24]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-NEXT: tail call 
void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 3 +// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK: if.then.i: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META76:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META79:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_THEN]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I14:%.*]] = icmp samesign ult i32 [[I_0_I]], 4 -// CHECK-NEXT: br i1 [[CMP_I14]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP4]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(1) [[ARRAYIDX_I]], align 2, !tbaa [[TBAA24]] -// 
CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP2]], i64 [[CONV_I]] -// CHECK-NEXT: store i16 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA24]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP82:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META85:![0-9]+]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META88:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I14_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-NEXT: br i1 [[CMP_I14_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT_I:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP4]], 
[[I_0_I_I]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA29]] +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: store i16 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA29]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP91:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit.i: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: br label [[CLEANUP:%.*]] -// CHECK: if.end: -// CHECK-NEXT: [[CALL4:%.*]] = tail call spir_func noundef <4 x i16> @_Z30__spirv_SubgroupBlockReadINTELIDv4_tET_PU3AS1Kt(ptr addrspace(1) noundef nonnull [[IN_PTR]]) #[[ATTR4]] -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(4), ptr [[OUT]], align 8, !tbaa [[TBAA28]] -// CHECK-NEXT: store <4 x i16> [[CALL4]], ptr addrspace(4) [[TMP6]], align 2 -// CHECK-NEXT: br label [[CLEANUP]] -// CHECK: cleanup: +// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEET3__EXIT:%.*]] +// CHECK: if.end.i: +// CHECK-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef <4 x i16> @_Z30__spirv_SubgroupBlockReadINTELIDv4_tET_PU3AS1Kt(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] +// CHECK-NEXT: store <4 x i16> [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 2 +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEET3__EXIT]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_.exit: // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_four_shorts(sycl::sub_group &sg, + plain_global_ptr p, + span out) { + // Just because there is a blocked data layout testcase, nothing inherently + // useful here. + group_load(sg, p, out, opt_striped{}); +} -// Check for non-power-of-two size. 
-template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< - sycl::sub_group, plain_global_ptr, int, 3, opt_striped>( - sycl::sub_group, plain_global_ptr, span, opt_striped); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm3ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.16") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.25") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META18]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7striped21test_non_power_of_twoERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm3EEE( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA12]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META83:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META86:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// 
CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 3 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM3ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP89:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm3ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META93:![0-9]+]] 
+// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META96:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 3 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM3ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP99:![0-9]+]] +// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm3ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_.exit: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_non_power_of_two(sycl::sub_group &sg, + plain_global_ptr p, + span out) { + // Check for non-power-of-two size. + group_load(sg, p, out, opt_striped{}); +} -// Even though power of two, still too many to map directly onto BloadRead API. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< - sycl::sub_group, plain_global_ptr, int, 16, opt_striped>( - sycl::sub_group, plain_global_ptr, span, opt_striped); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm16ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.30") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.25") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META18]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7striped17test_sixteen_intsERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm16EEE( // CHECK-NEXT: entry: 
-// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA12]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META90:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META93:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 16 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM16ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 
[[CONV_I]] -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP96:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm16ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META101:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META104:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 16 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM16ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 
[[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP107:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm16ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_.exit: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_sixteen_ints(sycl::sub_group &sg, + plain_global_ptr p, + span out) { + // Even though power of two, still too many to map directly onto BlockRead + // API. + group_load(sg, p, out, opt_striped{}); +} -// Non-power of two case bigger than max natively supported power of two case.
-template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_load< - sycl::sub_group, plain_global_ptr, int, 11, opt_striped>( - sycl::sub_group, plain_global_ptr, span, opt_striped); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm11ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(1) noundef [[IN_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::span.31") align 8 [[OUT:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.25") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META18]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7striped16test_eleven_intsERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm11EEE( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT]], align 8, !tbaa [[TBAA12]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META97:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META100:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// 
CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 11 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM11ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEET3__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[IN_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP103:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm11ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEET3_.exit: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias 
[[META109:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META112:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 11 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM11ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEET3__EXIT:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP115:![0-9]+]] +// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm11ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEET3_.exit: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_eleven_ints(sycl::sub_group &sg, + plain_global_ptr p, + span out) { + // Non-power of two case bigger than max natively supported power of two case. + group_load(sg, p, out, opt_striped{}); +} +} // namespace striped diff --git a/sycl/test/check_device_code/group_store.cpp b/sycl/test/check_device_code/group_store.cpp index ac42b0de24ab5..21f3f10f5f0c7 100644 --- a/sycl/test/check_device_code/group_store.cpp +++ b/sycl/test/check_device_code/group_store.cpp @@ -1,5 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --functions "group_store" --check-globals none --include-generated-funcs --version 4 -// NOTE: and manually adjusted to follow the related explicit instantiation. +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clangxx -O3 -fsycl -fsycl-device-only -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -o - %s | FileCheck %s // Windows/linux have some slight differences in IR generation (function @@ -37,790 +36,803 @@ template using plain_global_ptr = typename sycl::detail::DecoratedType< T, access::address_space::global_space>::type *; -// Ensure `detail::naive` always results in no block loads/stores. 
-template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_store< - sycl::sub_group, int, plain_global_ptr, naive_blocked>( - sycl::sub_group, const int &, plain_global_ptr, naive_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESN_RKSL_SM_T2_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[IN:%.*]], ptr addrspace(1) noundef [[OUT_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] comdat !srcloc [[META6:![0-9]+]] !sycl_fixed_targets [[META7:![0-9]+]] { +namespace blocked { +// CHECK-LABEL: @_ZN7blocked10test_naiveERN4sycl3_V19sub_groupEiPU3AS1i( // CHECK-NEXT: entry: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5:[0-9]+]] // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[IN]], align 4, !tbaa [[TBAA8:![0-9]+]] -// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: store i32 [[V:%.*]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, 
!tbaa [[TBAA8:![0-9]+]] // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_naive(sycl::sub_group &sg, int v, + plain_global_ptr p) { + // Ensure `detail::naive` always results in no block loads/stores. + group_store(sg, v, p, naive_blocked{}); +} -// Check that optimized implementation is selected. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_store< - sycl::sub_group, int, plain_global_ptr, opt_blocked>( - sycl::sub_group, const int &, plain_global_ptr, opt_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_RKSN_SO_T2_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[IN:%.*]], ptr addrspace(1) noundef [[OUT_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.4") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META6]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7blocked14test_optimizedERN4sycl3_V19sub_groupEiPU3AS1i( // CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[OUT_PTR]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) -// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[OUT_PTR]] to i64 -// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP0]], 15 -// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] -// CHECK: 
if.then.i: +// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-NEXT: [[REM_I_I_I:%.*]] = and i64 [[TMP0]], 15 +// CHECK-NEXT: [[CMP1_I_NOT_I_I:%.*]] = icmp eq i64 [[REM_I_I_I]], 0 +// CHECK-NEXT: br i1 [[CMP1_I_NOT_I_I]], label [[IF_END_I_I:%.*]], label [[IF_THEN_I_I:%.*]] +// CHECK: if.then.i.i: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[TMP1]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT_PTR]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[IN]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKILM1EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESQ_NS0_4SPANISO_XT1_EEESP_T3__EXIT:%.*]] -// CHECK: if.end.i: -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[IN]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[OUT_PTR]], i32 noundef [[TMP3]]) #[[ATTR5]] -// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKILM1EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESQ_NS0_4SPANISO_XT1_EEESP_T3__EXIT]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEKiLm1EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESQ_NS0_4spanISO_XT1_EEESP_T3_.exit: +// CHECK-NEXT: [[IDXPROM_I_I_I:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I_I]] +// CHECK-NEXT: store i32 [[V:%.*]], ptr addrspace(1) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_RKSN_SO_T2__EXIT:%.*]] +// CHECK: if.end.i.i: +// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR5]] +// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_RKSN_SO_T2__EXIT]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_RKSN_SO_T2_.exit: // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_optimized(sycl::sub_group &sg, int v, + plain_global_ptr p) { + // Check that optimized implementation is selected. + group_store(sg, v, p, opt_blocked{}); +} -// Check that contiguous_memory can be auto-detected. 
-template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_store< - sycl::sub_group, int, plain_global_ptr, full_group_blocked>( - sycl::sub_group, const int &, plain_global_ptr, full_group_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESN_RKSL_SM_T2_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[IN:%.*]], ptr addrspace(1) noundef [[OUT_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.12") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META6]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7blocked27test_contiguous_auto_detectERN4sycl3_V19sub_groupEiPU3AS1i( // CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[OUT_PTR]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) -// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[OUT_PTR]] to i64 -// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP0]], 15 -// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] -// CHECK: if.then.i: +// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-NEXT: [[REM_I_I_I:%.*]] = and i64 [[TMP0]], 15 +// CHECK-NEXT: [[CMP1_I_NOT_I_I:%.*]] = icmp eq i64 [[REM_I_I_I]], 0 +// CHECK-NEXT: br i1 [[CMP1_I_NOT_I_I]], label [[IF_END_I_I:%.*]], label [[IF_THEN_I_I:%.*]] +// 
CHECK: if.then.i.i: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[TMP1]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT_PTR]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[IN]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKILM1EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESO_NS0_4SPANISM_XT1_EEESN_T3__EXIT:%.*]] -// CHECK: if.end.i: -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[IN]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[OUT_PTR]], i32 noundef [[TMP3]]) #[[ATTR5]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKILM1EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESO_NS0_4SPANISM_XT1_EEESN_T3__EXIT]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEKiLm1EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSC_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_NS0_4spanISM_XT1_EEESN_T3_.exit: +// CHECK-NEXT: [[IDXPROM_I_I_I:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I_I]] +// CHECK-NEXT: store i32 [[V:%.*]], ptr addrspace(1) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESN_RKSL_SM_T2__EXIT:%.*]] +// CHECK: if.end.i.i: +// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR5]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESN_RKSL_SM_T2__EXIT]] +// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESN_RKSL_SM_T2_.exit: // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_contiguous_auto_detect(sycl::sub_group &sg, int v, + plain_global_ptr p) { + // Check that contiguous_memory can be auto-detected. + group_store(sg, v, p, full_group_blocked{}); +} // SYCL 2020's accessor can't be statically known to be contiguous. using accessor_iter_t = accessor::iterator; -// Can't be optimized. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_store< - sycl::sub_group, int, accessor_iter_t, full_group_blocked>( - sycl::sub_group, const int &, accessor_iter_t, full_group_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiNS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSC_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_RKSM_SN_T2_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[IN:%.*]], ptr noundef byval(%"class.sycl::_V1::detail::accessor_iterator") align 8 [[OUT_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.12") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META6]] !sycl_fixed_targets [[META7]] { + +// +// CHECK-LABEL: @_ZN7blocked18test_accessor_iterERN4sycl3_V19sub_groupEiRNS1_6detail17accessor_iteratorIiLi1EEE( // CHECK-NEXT: entry: -// CHECK-NEXT: 
[[AGG_TMP2_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr [[OUT_PTR]], align 8, !tbaa [[TBAA12:![0-9]+]] -// CHECK-NEXT: [[AGG_TMP2_SROA_2_0_OUT_PTR_ASCAST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OUT_PTR]], i64 8 -// CHECK-NEXT: [[AGG_TMP2_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr [[AGG_TMP2_SROA_2_0_OUT_PTR_ASCAST_SROA_IDX]], align 8, !tbaa [[TBAA15:![0-9]+]] +// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15:![0-9]+]] +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18:![0-9]+]] // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// CHECK-NEXT: [[CONV5_I_I:%.*]] = sext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP2_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP2_SROA_2_0_COPYLOAD]] -// CHECK-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP1]], i64 [[CONV5_I_I]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[IN]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(4) [[ADD_PTR_I_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[CONV5_I_I_I:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] +// CHECK-NEXT: [[ADD_PTR_I_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP1]], i64 [[CONV5_I_I_I]] +// CHECK-NEXT: store i32 [[V:%.*]], ptr addrspace(4) [[ADD_PTR_I_I_I_I_I]], align 4, !tbaa [[TBAA8]] // CHECK-NEXT: tail call spir_func void 
@_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_accessor_iter(sycl::sub_group &sg, int v, + accessor_iter_t &iter) { + // Can't be optimized. + group_store(sg, v, iter, full_group_blocked{}); +} -// Explicit property - optimize. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_store< - sycl::sub_group, int, accessor_iter_t, opt_blocked>(sycl::sub_group, - const int &, - accessor_iter_t, - opt_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiNS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESQ_RKSO_SP_T2_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr addrspace(4) noundef align 4 dereferenceable(4) [[IN:%.*]], ptr noundef byval(%"class.sycl::_V1::detail::accessor_iterator") align 8 [[OUT_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.4") align 1 [[PROPERTIES:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META6]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7blocked34test_accessor_iter_force_optimizedERN4sycl3_V19sub_groupEiRNS1_6detail17accessor_iteratorIiLi1EEE( // CHECK-NEXT: entry: -// CHECK-NEXT: [[AGG_TMP2_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr [[OUT_PTR]], align 8, !tbaa [[TBAA12]] -// CHECK-NEXT: [[AGG_TMP2_SROA_2_0_OUT_PTR_ASCAST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OUT_PTR]], i64 8 -// CHECK-NEXT: [[AGG_TMP2_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr [[AGG_TMP2_SROA_2_0_OUT_PTR_ASCAST_SROA_IDX]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: 
[[ADD_PTR_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[AGG_TMP2_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP2_SROA_2_0_COPYLOAD]] -// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(4) [[ADD_PTR_I_I_I]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) -// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(4) [[ADD_PTR_I_I_I]] to i64 -// CHECK-NEXT: [[REM_I_I_I:%.*]] = and i64 [[TMP0]], 15 -// CHECK-NEXT: [[CMP1_I_I_I:%.*]] = icmp eq i64 [[REM_I_I_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL6DETAIL16GET_BLOCK_OP_PTRILI16ELM1ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS4_20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEEDAT1_T2__EXIT_I:%.*]], label [[IF_THEN_I:%.*]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental6detail16get_block_op_ptrILi16ELm1ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS4_20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEEDaT1_T2_.exit.i: -// CHECK-NEXT: [[CALL_I_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef nonnull [[ADD_PTR_I_I_I]], i32 noundef 5) #[[ATTR6:[0-9]+]] -// CHECK-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq ptr addrspace(1) [[CALL_I_I_I_I]], null -// CHECK-NEXT: br i1 [[TOBOOL_NOT_I]], label [[IF_THEN_I]], label [[IF_END_I:%.*]] -// CHECK: if.then.i: +// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 +// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) 
[[AGG_TMP1_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18]] +// CHECK-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] +// CHECK-NEXT: [[CMP_I_I_I_I:%.*]] = icmp ne ptr addrspace(4) [[ADD_PTR_I_I_I_I]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I_I]]) +// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(4) [[ADD_PTR_I_I_I_I]] to i64 +// CHECK-NEXT: [[REM_I_I_I_I:%.*]] = and i64 [[TMP0]], 15 +// CHECK-NEXT: [[CMP1_I_I_I_I:%.*]] = icmp eq i64 [[REM_I_I_I_I]], 0 +// CHECK-NEXT: br i1 [[CMP1_I_I_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL6DETAIL16GET_BLOCK_OP_PTRILI16ELM1ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS4_20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEEDAT1_T2__EXIT_I_I:%.*]], label [[IF_THEN_I_I:%.*]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental6detail16get_block_op_ptrILi16ELm1ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS4_20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEEDaT1_T2_.exit.i.i: +// CHECK-NEXT: [[CALL_I_I_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef nonnull [[ADD_PTR_I_I_I_I]], i32 noundef 5) #[[ATTR6:[0-9]+]] +// CHECK-NEXT: [[TOBOOL_NOT_I_I:%.*]] = icmp eq ptr addrspace(1) [[CALL_I_I_I_I_I]], null +// CHECK-NEXT: br i1 [[TOBOOL_NOT_I_I]], label [[IF_THEN_I_I]], label [[IF_END_I_I:%.*]] +// CHECK: if.then.i.i: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// 
CHECK-NEXT: [[CONV5_I_I:%.*]] = sext i32 [[TMP1]] to i64 -// CHECK-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[ADD_PTR_I_I_I]], i64 [[CONV5_I_I]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[IN]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(4) [[ADD_PTR_I_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKILM1ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSD_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSD_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_T3__EXIT:%.*]] -// CHECK: if.end.i: -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[IN]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[CALL_I_I_I_I]], i32 noundef [[TMP3]]) #[[ATTR5]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKILM1ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSD_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSD_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_T3__EXIT]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEKiLm1ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSD_INS3_21contiguous_memory_keyEJEEENSD_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_T3_.exit: +// CHECK-NEXT: [[CONV5_I_I_I:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[ADD_PTR_I_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[ADD_PTR_I_I_I_I]], i64 [[CONV5_I_I_I]] +// CHECK-NEXT: store i32 [[V:%.*]], ptr addrspace(4) [[ADD_PTR_I_I_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEINS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESQ_RKSO_SP_T2__EXIT:%.*]] +// CHECK: if.end.i.i: +// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[CALL_I_I_I_I_I]], i32 noundef [[V]]) #[[ATTR5]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEINS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESQ_RKSO_SP_T2__EXIT]] +// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiNS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_EEvE4typeESQ_RKSO_SP_T2_.exit: // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_accessor_iter_force_optimized(sycl::sub_group &sg, + int v, + accessor_iter_t &iter) { + // Explicit property - optimize. + group_store(sg, v, iter, opt_blocked{}); +} -// Four shorts in blocked data layout could be stored as a single 64-bit -// integer. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_store< - sycl::sub_group, short, 4, plain_global_ptr, opt_blocked>( - sycl::sub_group, span, plain_global_ptr, opt_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::span.14") align 8 [[IN:%.*]], ptr addrspace(1) noundef [[OUT_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.4") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META17:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7blocked16test_four_shortsERN4sycl3_V19sub_groupENS1_4spanIsLm4EEEPU3AS1s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VALUES:%.*]] = alloca [4 x i16], align 2 -// CHECK-NEXT: [[CMP_I:%.*]] = 
icmp ne ptr addrspace(1) [[OUT_PTR]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I]]) -// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[OUT_PTR]] to i64 -// CHECK-NEXT: [[REM_I:%.*]] = and i64 [[TMP0]], 15 -// CHECK-NEXT: [[CMP1_I_NOT:%.*]] = icmp eq i64 [[REM_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] -// CHECK: if.then: -// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[IN]], align 8, !tbaa [[TBAA18:![0-9]+]] -// CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META20:![0-9]+]] -// CHECK-NEXT: [[MUL_I_I:%.*]] = shl i32 [[TMP3]], 2 +// CHECK-NEXT: [[VALUES_I:%.*]] = alloca [4 x i16], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA22:![0-9]+]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 +// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK: if.then.i: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META24:![0-9]+]] +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP3]], 2 +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = 
phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_T3__EXIT_I:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA27:![0-9]+]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: store i16 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA27]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP29:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_T3_.exit.i: +// CHECK-NEXT: tail call spir_func void 
@_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_T3__EXIT:%.*]] +// CHECK: if.end.i: +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7:[0-9]+]] // CHECK-NEXT: br label [[FOR_COND_I:%.*]] // CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_THEN]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I19:%.*]] = icmp samesign ult i32 [[I_0_I]], 4 -// CHECK-NEXT: br i1 [[CMP_I19]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_T3__EXIT:%.*]] +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 4 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] +// CHECK: for.cond.cleanup.i: +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VALUES_I]], align 2, !tbaa [[TBAA31:![0-9]+]] +// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELImEvPU3AS1mT_(ptr addrspace(1) noundef nonnull [[P]], i64 noundef [[TMP5]]) #[[ATTR5]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, 
ptr nonnull [[VALUES_I]]) #[[ATTR7]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_T3__EXIT]] // CHECK: for.body.i: // CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP2]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA23:![0-9]+]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = or disjoint i32 [[MUL_I_I]], [[I_0_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[OUT_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: store i16 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I]], align 2, !tbaa [[TBAA23]] +// CHECK-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 2, !tbaa [[TBAA27]] +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x i16], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] +// CHECK-NEXT: store i16 [[TMP6]], ptr [[ARRAYIDX_I]], align 2, !tbaa [[TBAA27]] // CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP25:![0-9]+]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_T3_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: br label [[CLEANUP:%.*]] -// CHECK: if.end: -// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES]]) #[[ATTR7:[0-9]+]] -// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr [[IN]], align 8, !tbaa [[TBAA27:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND:%.*]] -// CHECK: for.cond: -// CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[IF_END]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -// CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[I_0]], 4 -// CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -// CHECK: for.cond.cleanup: -// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[VALUES]], align 2, !tbaa [[TBAA29:![0-9]+]] -// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELImEvPU3AS1mT_(ptr addrspace(1) noundef nonnull [[OUT_PTR]], i64 noundef [[TMP6]]) #[[ATTR5]] -// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES]]) #[[ATTR7]] -// CHECK-NEXT: br label [[CLEANUP]] -// CHECK: for.body: -// CHECK-NEXT: [[CONV:%.*]] = zext nneg i32 [[I_0]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I20:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP5]], i64 [[CONV]] -// CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I20]], align 2, !tbaa [[TBAA23]] -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i16], ptr [[VALUES]], i64 0, i64 [[CONV]] -// CHECK-NEXT: store i16 [[TMP7]], ptr [[ARRAYIDX]], 
align 2, !tbaa [[TBAA23]] -// CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_0]], 1 -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP30:![0-9]+]] -// CHECK: cleanup: +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP32:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_.exit: // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_four_shorts(sycl::sub_group &sg, span v, + plain_global_ptr p) { + // Four shorts in blocked data layout could be stored as a single 64-bit + // integer. + group_store(sg, v, p, opt_blocked{}); +} -// Same, but make it `const short`. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_store< - sycl::sub_group, const short, 4, plain_global_ptr, opt_blocked>( - sycl::sub_group, span, plain_global_ptr, - opt_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEKsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESQ_NS0_4spanISO_XT1_EEESP_T3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::span.15") align 8 [[IN:%.*]], ptr addrspace(1) noundef [[OUT_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.4") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META17]] !sycl_fixed_targets 
[[META7]] { +// CHECK-LABEL: @_ZN7blocked22test_four_const_shortsERN4sycl3_V19sub_groupENS1_4spanIKsLm4EEEPU3AS1s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VALUES:%.*]] = alloca [4 x i16], align 2 -// CHECK-NEXT: [[CMP_I:%.*]] = icmp ne ptr addrspace(1) [[OUT_PTR]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I]]) -// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[OUT_PTR]] to i64 -// CHECK-NEXT: [[REM_I:%.*]] = and i64 [[TMP0]], 15 -// CHECK-NEXT: [[CMP1_I_NOT:%.*]] = icmp eq i64 [[REM_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] -// CHECK: if.then: -// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[IN]], align 8, !tbaa [[TBAA18]] -// CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META31:![0-9]+]] -// CHECK-NEXT: [[MUL_I_I:%.*]] = shl i32 [[TMP3]], 2 +// CHECK-NEXT: [[VALUES_I:%.*]] = alloca [4 x i16], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA22]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 +// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK: if.then.i: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, 
align 4, !tbaa [[TBAA8]], !noalias [[META34:![0-9]+]] +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP3]], 2 +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKSLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INSA_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESS_NS0_4SPANISQ_XT1_EEESR_T3__EXIT_I:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA27]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: store i16 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA27]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP37:![0-9]+]] +// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEKsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEENSC_INSA_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESS_NS0_4spanISQ_XT1_EEESR_T3_.exit.i: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKSLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESQ_NS0_4SPANISO_XT1_EEESP_T3__EXIT:%.*]] +// CHECK: if.end.i: +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] // CHECK-NEXT: br label [[FOR_COND_I:%.*]] // CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_THEN]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I19:%.*]] = icmp samesign ult i32 [[I_0_I]], 4 -// CHECK-NEXT: br i1 [[CMP_I19]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKSLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INSA_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESS_NS0_4SPANISQ_XT1_EEESR_T3__EXIT:%.*]] +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: 
[[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 4 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] +// CHECK: for.cond.cleanup.i: +// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VALUES_I]], align 2, !tbaa [[TBAA31]] +// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELImEvPU3AS1mT_(ptr addrspace(1) noundef nonnull [[P]], i64 noundef [[TMP5]]) #[[ATTR5]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKSLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESQ_NS0_4SPANISO_XT1_EEESP_T3__EXIT]] // CHECK: for.body.i: // CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP2]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA23]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = or disjoint i32 [[MUL_I_I]], [[I_0_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[OUT_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: store i16 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I]], align 2, !tbaa [[TBAA23]] +// CHECK-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 2, !tbaa [[TBAA27]] +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x i16], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] +// CHECK-NEXT: store i16 [[TMP6]], ptr [[ARRAYIDX_I]], align 2, 
!tbaa [[TBAA27]] // CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP34:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEKsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEENSC_INSA_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESS_NS0_4spanISQ_XT1_EEESR_T3_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: br label [[CLEANUP:%.*]] -// CHECK: if.end: -// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES]]) #[[ATTR7]] -// CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(4), ptr [[IN]], align 8, !tbaa [[TBAA35:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND:%.*]] -// CHECK: for.cond: -// CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[IF_END]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -// CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[I_0]], 4 -// CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -// CHECK: for.cond.cleanup: -// CHECK-NEXT: [[TMP6:%.*]] = load i64, ptr [[VALUES]], align 2, !tbaa [[TBAA29]] -// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELImEvPU3AS1mT_(ptr addrspace(1) noundef nonnull [[OUT_PTR]], i64 noundef [[TMP6]]) #[[ATTR5]] -// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES]]) #[[ATTR7]] -// CHECK-NEXT: br label [[CLEANUP]] -// CHECK: for.body: -// CHECK-NEXT: [[CONV:%.*]] = zext nneg i32 [[I_0]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I20:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP5]], i64 [[CONV]] -// CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I20]], align 2, !tbaa [[TBAA23]] -// CHECK-NEXT: 
[[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i16], ptr [[VALUES]], i64 0, i64 [[CONV]] -// CHECK-NEXT: store i16 [[TMP7]], ptr [[ARRAYIDX]], align 2, !tbaa [[TBAA23]] -// CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_0]], 1 -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP37:![0-9]+]] -// CHECK: cleanup: +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP38:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEKsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESQ_NS0_4spanISO_XT1_EEESP_T3_.exit: // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_four_const_shorts(sycl::sub_group &sg, + span v, + plain_global_ptr p) { + // Same, but make it `const short`. + group_store(sg, v, p, opt_blocked{}); +} -// Check for non-power-of-two size. 
-template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_store< - sycl::sub_group, int, 3, plain_global_ptr, opt_blocked>( - sycl::sub_group, span, plain_global_ptr, opt_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm3EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::span.16") align 8 [[IN:%.*]], ptr addrspace(1) noundef [[OUT_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.4") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META17]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7blocked21test_non_power_of_twoERN4sycl3_V19sub_groupENS1_4spanIiLm3EEEPU3AS1i( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[IN]], align 8, !tbaa [[TBAA12]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META38:![0-9]+]] -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP2]], 3 -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 3 -// CHECK-NEXT: br i1 
[[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM3EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_T3__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[MUL_I_I]], [[I_0_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP41:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm3EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_T3_.exit: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META40:![0-9]+]] +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP2]], 3 +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: 
[[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 3 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM3EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_T3__EXIT:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP43:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm3EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_.exit: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) 
#[[ATTR5]] // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_non_power_of_two(sycl::sub_group &sg, span v, + plain_global_ptr p) { + // Check for non-power-of-two size. + group_store(sg, v, p, opt_blocked{}); +} -// Four int elements in blocked data layout don't map directly to any BlockWrite -// API. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_store< - sycl::sub_group, int, 4, plain_global_ptr, opt_blocked>( - sycl::sub_group, span, plain_global_ptr, opt_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm4EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::span.17") align 8 [[IN:%.*]], ptr addrspace(1) noundef [[OUT_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.4") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META17]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7blocked14test_four_intsERN4sycl3_V19sub_groupENS1_4spanIiLm4EEEPU3AS1i( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[IN]], align 8, !tbaa [[TBAA12]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META42:![0-9]+]] -// CHECK-NEXT: 
[[MUL_I_I:%.*]] = shl i32 [[TMP2]], 2 -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 4 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM4EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_T3__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = or disjoint i32 [[MUL_I_I]], [[I_0_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP45:![0-9]+]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm4EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_T3_.exit: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META45:![0-9]+]] +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP2]], 2 +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM4EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_T3__EXIT:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: store i32 [[TMP3]], 
ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP48:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm4EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_.exit: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_four_ints(sycl::sub_group &sg, span v, + plain_global_ptr p) { + // Four int elements in blocked data layout don't map directly to any + // BlockWrite API. + group_store(sg, v, p, opt_blocked{}); +} + +// CHECK-LABEL: @_ZN7blocked15test_seven_intsERN4sycl3_V19sub_groupENS1_4spanIiLm7EEEPU3AS1i( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META50:![0-9]+]] +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP2]], 7 +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 7 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM7EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_T3__EXIT:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP53:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm7EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_.exit: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_seven_ints(sycl::sub_group &sg, span v, + plain_global_ptr p) { + // Similar to four elements case but more complex to optimize. 
+ group_store(sg, v, p, opt_blocked{}); +} +} // namespace blocked -// Similar to four elements case but more complex to optimize. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_store< - sycl::sub_group, int, 7, plain_global_ptr, opt_blocked>( - sycl::sub_group, span, plain_global_ptr, opt_blocked); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm7EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::span.18") align 8 [[IN:%.*]], ptr addrspace(1) noundef [[OUT_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.4") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META17]] !sycl_fixed_targets [[META7]] { +namespace striped { +// Striped data layout with one element per work item isn't different from +// blocked data layout, so use span version only in the checks below. 
+ +// CHECK-LABEL: @_ZN7striped10test_naiveERN4sycl3_V19sub_groupENS1_4spanIiLm2EEEPU3AS1i( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[IN]], align 8, !tbaa [[TBAA12]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META46:![0-9]+]] -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP2]], 7 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META55:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META58:![0-9]+]] // CHECK-NEXT: br label [[FOR_COND_I:%.*]] // CHECK: for.cond.i: // CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 7 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM7EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_T3__EXIT:%.*]] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESN_NS0_4SPANISL_XT1_EEESM_T3__EXIT:%.*]] // CHECK: for.body.i: // CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 // CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[MUL_I_I]], [[I_0_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] +// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] // CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I]] +// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] // CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP49:![0-9]+]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm7EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_T3_.exit: +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP61:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESN_NS0_4spanISL_XT1_EEESM_T3_.exit: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_naive(sycl::sub_group &sg, span v, + plain_global_ptr p) { + // Ensure `detail::naive` always results in no block loads/stores. + group_store(sg, v, p, naive_striped{}); +} -// Striped data layout with one element per work item isn't different from -// blocked data layout, so use span version only in the checks below. - -// Ensure `detail::naive` always results in no block loads/stores. 
-template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_store< - sycl::sub_group, int, 2, plain_global_ptr, naive_striped>( - sycl::sub_group, span, plain_global_ptr, naive_striped); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESN_NS0_4spanISL_XT1_EEESM_T3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::span.19") align 8 [[IN:%.*]], ptr addrspace(1) noundef [[OUT_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.20") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META17]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7striped14test_optimizedERN4sycl3_V19sub_groupENS1_4spanIiLm2EEEPU3AS1i( // CHECK-NEXT: entry: +// CHECK-NEXT: [[VALUES_I:%.*]] = alloca [2 x i32], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 +// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK: if.then.i: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr [[IN]], align 8, !tbaa 
[[TBAA50:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META52:![0-9]+]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META55:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND:%.*]] -// CHECK: for.cond: -// CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -// CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[I_0]], 2 -// CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -// CHECK: for.cond.cleanup: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: ret void -// CHECK: for.body: -// CHECK-NEXT: [[CONV:%.*]] = zext nneg i32 [[I_0]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP0]], i64 [[CONV]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[MUL_I:%.*]] = mul nuw nsw i32 [[TMP2]], [[I_0]] -// CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[TMP1]], [[MUL_I]] -// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[ADD_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT_PTR]], i64 [[IDXPROM]] -// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[ARRAYIDX]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_0]], 1 -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP58:![0-9]+]] - -// Check that optimized implementation is selected. 
-template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_store< - sycl::sub_group, int, 2, plain_global_ptr, opt_striped>( - sycl::sub_group, span, plain_global_ptr, opt_striped); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::span.19") align 8 [[IN:%.*]], ptr addrspace(1) noundef [[OUT_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.25") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META17]] !sycl_fixed_targets [[META7]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VALUES:%.*]] = alloca [2 x i32], align 4 -// CHECK-NEXT: [[CMP_I:%.*]] = icmp ne ptr addrspace(1) [[OUT_PTR]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I]]) -// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[OUT_PTR]] to i64 -// CHECK-NEXT: [[REM_I:%.*]] = and i64 [[TMP0]], 15 -// CHECK-NEXT: [[CMP1_I_NOT:%.*]] = icmp eq i64 [[REM_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] -// CHECK: if.then: -// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[IN]], align 8, !tbaa [[TBAA12]] -// CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META59:![0-9]+]] -// 
CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META62:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META63:![0-9]+]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META66:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 +// CHECK-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_T3__EXIT_I:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP4]], [[I_0_I_I]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// 
CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP69:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_T3_.exit.i: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_T3__EXIT:%.*]] +// CHECK: if.end.i: +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] // CHECK-NEXT: br label [[FOR_COND_I:%.*]] // CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_THEN]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I19:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I19]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_T3__EXIT:%.*]] +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 
0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] +// CHECK: for.cond.cleanup.i: +// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[VALUES_I]], align 4, !tbaa [[TBAA31]] +// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv2_jEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], <2 x i32> noundef [[TMP6]]) #[[ATTR5]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] +// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_T3__EXIT]] // CHECK: for.body.i: // CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP2]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP4]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: 
[[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x i32], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] +// CHECK-NEXT: store i32 [[TMP7]], ptr [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] // CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP65:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_T3_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: br label [[CLEANUP:%.*]] -// CHECK: if.end: -// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES]]) #[[ATTR7]] -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(4), ptr [[IN]], align 8, !tbaa [[TBAA50]] -// CHECK-NEXT: br label [[FOR_COND:%.*]] -// CHECK: for.cond: -// CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[IF_END]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -// CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[I_0]], 2 -// CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -// CHECK: for.cond.cleanup: -// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[VALUES]], align 4, !tbaa [[TBAA29]] -// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv2_jEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[OUT_PTR]], <2 x i32> noundef [[TMP7]]) #[[ATTR5]] -// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES]]) #[[ATTR7]] -// CHECK-NEXT: br label [[CLEANUP]] -// CHECK: for.body: -// CHECK-NEXT: [[CONV:%.*]] = zext nneg i32 [[I_0]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I20:%.*]] = getelementptr inbounds nuw 
i32, ptr addrspace(4) [[TMP6]], i64 [[CONV]] -// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I20]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VALUES]], i64 0, i64 [[CONV]] -// CHECK-NEXT: store i32 [[TMP8]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_0]], 1 -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP66:![0-9]+]] -// CHECK: cleanup: +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP70:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_.exit: // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_optimized(sycl::sub_group &sg, span v, + plain_global_ptr p) { + // Check that optimized implementation is selected. + group_store(sg, v, p, opt_striped{}); +} -// Check that contiguous_memory can be auto-detected. 
-template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_store< - sycl::sub_group, int, 2, plain_global_ptr, full_group_striped>( - sycl::sub_group, span, plain_global_ptr, full_group_striped); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESN_NS0_4spanISL_XT1_EEESM_T3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::span.19") align 8 [[IN:%.*]], ptr addrspace(1) noundef [[OUT_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.27") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META17]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7striped27test_contiguous_auto_detectERN4sycl3_V19sub_groupENS1_4spanIiLm2EEEPU3AS1i( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VALUES:%.*]] = alloca [2 x i32], align 4 -// CHECK-NEXT: [[CMP_I:%.*]] = icmp ne ptr addrspace(1) [[OUT_PTR]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I]]) -// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[OUT_PTR]] to i64 -// CHECK-NEXT: [[REM_I:%.*]] = and i64 [[TMP0]], 15 -// CHECK-NEXT: [[CMP1_I_NOT:%.*]] = icmp eq i64 [[REM_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] -// CHECK: if.then: -// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[IN]], align 8, !tbaa [[TBAA12]] -// CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) 
@__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META67:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META70:![0-9]+]] +// CHECK-NEXT: [[VALUES_I:%.*]] = alloca [2 x i32], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 +// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK: if.then.i: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META72:![0-9]+]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META75:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 +// CHECK-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_T3__EXIT_I:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP4]], [[I_0_I_I]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP78:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_.exit.i: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESN_NS0_4SPANISL_XT1_EEESM_T3__EXIT:%.*]] +// CHECK: if.end.i: +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] // CHECK-NEXT: br label [[FOR_COND_I:%.*]] // CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_THEN]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I19:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I19]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_T3__EXIT:%.*]] +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] +// CHECK: for.cond.cleanup.i: +// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[VALUES_I]], align 4, !tbaa [[TBAA31]] +// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv2_jEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], <2 x i32> noundef [[TMP6]]) #[[ATTR5]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] +// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESN_NS0_4SPANISL_XT1_EEESM_T3__EXIT]] // CHECK: for.body.i: // CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP2]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP4]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x i32], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] +// CHECK-NEXT: store i32 [[TMP7]], ptr [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] // CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP73:![0-9]+]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: br label [[CLEANUP:%.*]] -// CHECK: if.end: -// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES]]) #[[ATTR7]] -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(4), ptr [[IN]], align 8, !tbaa [[TBAA50]] -// CHECK-NEXT: br label [[FOR_COND:%.*]] -// CHECK: for.cond: -// CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[IF_END]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -// CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[I_0]], 2 -// CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -// CHECK: for.cond.cleanup: -// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[VALUES]], align 4, !tbaa [[TBAA29]] -// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv2_jEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[OUT_PTR]], <2 x i32> noundef [[TMP7]]) #[[ATTR5]] -// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES]]) #[[ATTR7]] -// CHECK-NEXT: br label [[CLEANUP]] -// CHECK: for.body: -// CHECK-NEXT: [[CONV:%.*]] = zext nneg i32 [[I_0]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I20:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP6]], i64 [[CONV]] -// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I20]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VALUES]], i64 0, i64 [[CONV]] -// CHECK-NEXT: store i32 [[TMP8]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: 
[[INC]] = add nuw nsw i32 [[I_0]], 1 -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP74:![0-9]+]] -// CHECK: cleanup: +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP79:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESN_NS0_4spanISL_XT1_EEESM_T3_.exit: // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_contiguous_auto_detect(sycl::sub_group &sg, + span v, + plain_global_ptr p) { + // Check that contiguous_memory can be auto-detected. + group_store(sg, v, p, full_group_striped{}); +} // SYCL 2020's accessor can't be statically known to be contiguous. using accessor_iter_t = accessor::iterator; -// Can't be optimized. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_store< - sycl::sub_group, int, 2, accessor_iter_t, full_group_striped>( - sycl::sub_group, span, accessor_iter_t, full_group_striped); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_NS0_4spanISM_XT1_EEESN_T3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::span.19") align 8 [[IN:%.*]], ptr noundef byval(%"class.sycl::_V1::detail::accessor_iterator") align 8 [[OUT_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.27") align 1 [[PROPS:%.*]]) local_unnamed_addr 
#[[ATTR0]] comdat !srcloc [[META17]] !sycl_fixed_targets [[META7]] { + +// CHECK-LABEL: @_ZN7striped18test_accessor_iterERN4sycl3_V19sub_groupENS1_4spanIiLm2EEERNS1_6detail17accessor_iteratorIiLi1EEE( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[IN]], align 8, !tbaa [[TBAA12]] -// CHECK-NEXT: [[AGG_TMP4_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr [[OUT_PTR]], align 8, !tbaa [[TBAA12]] -// CHECK-NEXT: [[AGG_TMP4_SROA_2_0_OUT_PTR_ASCAST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OUT_PTR]], i64 8 -// CHECK-NEXT: [[AGG_TMP4_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr [[AGG_TMP4_SROA_2_0_OUT_PTR_ASCAST_SROA_IDX]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: [[AGG_TMP2_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: [[AGG_TMP2_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 +// CHECK-NEXT: [[AGG_TMP2_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP2_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META75:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META78:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP4_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP4_SROA_2_0_COPYLOAD]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp 
samesign ult i32 [[I_0_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INSA_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESQ_NS0_4SPANISO_XT1_EEESP_T3__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] -// CHECK-NEXT: [[CONV5_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ADD_PTR_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP4]], i64 [[CONV5_I]] -// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(4) [[ADD_PTR_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP81:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_14full_group_keyEJEEENSC_INSA_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESQ_NS0_4spanISO_XT1_EEESP_T3_.exit: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META81:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, 
ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META84:![0-9]+]] +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP2_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP2_SROA_2_0_COPYLOAD]] +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSC_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESO_NS0_4SPANISM_XT1_EEESN_T3__EXIT:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I_I]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-NEXT: [[CONV5_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP4]], i64 [[CONV5_I_I]] +// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(4) [[ADD_PTR_I_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP87:![0-9]+]] +// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESO_NS0_4spanISM_XT1_EEESN_T3_.exit: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_accessor_iter(sycl::sub_group &sg, span v, + accessor_iter_t &iter) { + // Can't be optimized. + group_store(sg, v, iter, full_group_striped{}); +} -// Explicit property - optimize. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_store< - sycl::sub_group, int, 2, accessor_iter_t, opt_striped>(sycl::sub_group, - span, - accessor_iter_t, - opt_striped); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESQ_NS0_4spanISO_XT1_EEESP_T3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::span.19") align 8 [[IN:%.*]], ptr noundef byval(%"class.sycl::_V1::detail::accessor_iterator") align 8 [[OUT_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.25") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META17]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: 
@_ZN7striped34test_accessor_iter_force_optimizedERN4sycl3_V19sub_groupENS1_4spanIiLm2EEERNS1_6detail17accessor_iteratorIiLi1EEE( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VALUES:%.*]] = alloca [2 x i32], align 4 -// CHECK-NEXT: [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr [[OUT_PTR]], align 8, !tbaa [[TBAA12]] -// CHECK-NEXT: [[AGG_TMP_SROA_2_0_OUT_PTR_ASCAST_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OUT_PTR]], i64 8 -// CHECK-NEXT: [[AGG_TMP_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr [[AGG_TMP_SROA_2_0_OUT_PTR_ASCAST_SROA_IDX]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[ADD_PTR_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[AGG_TMP_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP_SROA_2_0_COPYLOAD]] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(4) [[ADD_PTR_I_I]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) -// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(4) [[ADD_PTR_I_I]] to i64 -// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP0]], 15 -// CHECK-NEXT: [[CMP1_I_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL6DETAIL16GET_BLOCK_OP_PTRILI16ELM2ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS4_20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEEDAT1_T2__EXIT:%.*]], label [[IF_THEN:%.*]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental6detail16get_block_op_ptrILi16ELm2ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS4_20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEEDaT1_T2_.exit: -// CHECK-NEXT: [[CALL_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef nonnull [[ADD_PTR_I_I]], i32 noundef 5) 
#[[ATTR6]] -// CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq ptr addrspace(1) [[CALL_I_I_I]], null -// CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_THEN]], label [[IF_END:%.*]] -// CHECK: if.then: -// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[IN]], align 8, !tbaa [[TBAA12]] -// CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META82:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META85:![0-9]+]] +// CHECK-NEXT: [[VALUES_I:%.*]] = alloca [2 x i32], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: [[AGG_TMP2_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-NEXT: [[AGG_TMP2_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 +// CHECK-NEXT: [[AGG_TMP2_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP2_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: [[ADD_PTR_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[AGG_TMP2_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP2_SROA_2_0_COPYLOAD]] +// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(4) [[ADD_PTR_I_I_I]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(4) [[ADD_PTR_I_I_I]] to i64 +// CHECK-NEXT: [[REM_I_I_I:%.*]] = and i64 [[TMP2]], 15 +// CHECK-NEXT: [[CMP1_I_I_I:%.*]] = icmp eq i64 [[REM_I_I_I]], 0 +// CHECK-NEXT: br i1 [[CMP1_I_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL6DETAIL16GET_BLOCK_OP_PTRILI16ELM2ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS4_20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEEDAT1_T2__EXIT_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental6detail16get_block_op_ptrILi16ELm2ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS4_20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEEDaT1_T2_.exit.i: +// CHECK-NEXT: [[CALL_I_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef nonnull [[ADD_PTR_I_I_I]], i32 noundef 5) #[[ATTR6]] +// CHECK-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq ptr addrspace(1) [[CALL_I_I_I_I]], null +// CHECK-NEXT: br i1 [[TOBOOL_NOT_I]], label [[IF_THEN_I]], label [[IF_END_I:%.*]] +// CHECK: if.then.i: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META89:![0-9]+]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META92:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INSA_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESS_NS0_4SPANISQ_XT1_EEESR_T3__EXIT_I:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP4]], [[I_0_I_I]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-NEXT: [[CONV5_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[ADD_PTR_I_I_I]], i64 [[CONV5_I_I]] +// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(4) [[ADD_PTR_I_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP95:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEENSC_INSA_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESS_NS0_4spanISQ_XT1_EEESR_T3_.exit.i: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESQ_NS0_4SPANISO_XT1_EEESP_T3__EXIT:%.*]] +// CHECK: if.end.i: +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] // CHECK-NEXT: br label [[FOR_COND_I:%.*]] // CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_THEN]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] // CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INSA_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESS_NS0_4SPANISQ_XT1_EEESR_T3__EXIT:%.*]] +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] +// CHECK: for.cond.cleanup.i: +// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[VALUES_I]], align 4, !tbaa [[TBAA31]] +// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv2_jEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[CALL_I_I_I_I]], <2 x i32> noundef [[TMP6]]) #[[ATTR5]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] +// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESQ_NS0_4SPANISO_XT1_EEESP_T3__EXIT]] // CHECK: for.body.i: // CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP2]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP4]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I]] -// CHECK-NEXT: [[CONV5_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ADD_PTR_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[ADD_PTR_I_I]], i64 [[CONV5_I]] -// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(4) [[ADD_PTR_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x i32], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] +// CHECK-NEXT: store i32 [[TMP7]], ptr [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] // CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP88:![0-9]+]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEENSC_INSA_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESS_NS0_4spanISQ_XT1_EEESR_T3_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: br label [[CLEANUP:%.*]] -// CHECK: if.end: -// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES]]) #[[ATTR7]] -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(4), ptr [[IN]], align 8, !tbaa [[TBAA50]] -// CHECK-NEXT: br label [[FOR_COND:%.*]] -// CHECK: for.cond: -// CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[IF_END]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -// CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[I_0]], 2 -// CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -// CHECK: for.cond.cleanup: -// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[VALUES]], align 4, !tbaa [[TBAA29]] -// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv2_jEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[CALL_I_I_I]], <2 x i32> noundef [[TMP7]]) #[[ATTR5]] -// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES]]) #[[ATTR7]] -// CHECK-NEXT: br label [[CLEANUP]] -// CHECK: for.body: -// CHECK-NEXT: [[CONV:%.*]] = zext nneg i32 [[I_0]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP6]], i64 [[CONV]] -// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i32], ptr [[VALUES]], i64 0, i64 [[CONV]] -// CHECK-NEXT: store i32 
[[TMP8]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_0]], 1 -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP89:![0-9]+]] -// CHECK: cleanup: +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP96:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESQ_NS0_4spanISO_XT1_EEESP_T3_.exit: // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_accessor_iter_force_optimized(sycl::sub_group &sg, + span v, + accessor_iter_t &iter) { + // Explicit property - optimize. + group_store(sg, v, iter, opt_striped{}); +} -// Just because there is a blocked data layout testcase, nothing inherently -// useful here. 
-template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_store< - sycl::sub_group, short, 4, plain_global_ptr, opt_striped>( - sycl::sub_group, span, plain_global_ptr, opt_striped); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::span.14") align 8 [[IN:%.*]], ptr addrspace(1) noundef [[OUT_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.25") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META17]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7striped16test_four_shortsERN4sycl3_V19sub_groupENS1_4spanIsLm4EEEPU3AS1s( // CHECK-NEXT: entry: -// CHECK-NEXT: [[VALUES:%.*]] = alloca [4 x i16], align 2 -// CHECK-NEXT: [[CMP_I:%.*]] = icmp ne ptr addrspace(1) [[OUT_PTR]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I]]) -// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[OUT_PTR]] to i64 -// CHECK-NEXT: [[REM_I:%.*]] = and i64 [[TMP0]], 15 -// CHECK-NEXT: [[CMP1_I_NOT:%.*]] = icmp eq i64 [[REM_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] -// CHECK: if.then: -// CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr [[IN]], align 8, !tbaa [[TBAA18]] -// CHECK-NEXT: [[TMP2:%.*]] = inttoptr i64 [[TMP1]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) 
@__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META90:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META93:![0-9]+]] +// CHECK-NEXT: [[VALUES_I:%.*]] = alloca [4 x i16], align 2 +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA22]] +// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 +// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK: if.then.i: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META98:![0-9]+]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META101:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_T3__EXIT_I:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA27]] +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP4]], [[I_0_I_I]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA27]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP104:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_T3_.exit.i: +// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_T3__EXIT:%.*]] +// CHECK: if.end.i: +// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] // CHECK-NEXT: br label [[FOR_COND_I:%.*]] // CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_THEN]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I19:%.*]] = icmp samesign ult i32 [[I_0_I]], 4 -// CHECK-NEXT: br i1 [[CMP_I19]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_T3__EXIT:%.*]] +// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 4 +// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] +// CHECK: for.cond.cleanup.i: +// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[VALUES_I]], align 2, !tbaa [[TBAA31]] +// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv4_tEvPU3AS1tT_(ptr addrspace(1) noundef nonnull [[P]], <4 x i16> noundef [[TMP6]]) #[[ATTR5]] +// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] +// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_T3__EXIT]] // CHECK: for.body.i: // CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP2]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA23]] -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP4]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[OUT_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[ARRAYIDX_I]], align 2, !tbaa [[TBAA23]] +// CHECK-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 2, !tbaa [[TBAA27]] +// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x i16], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] +// CHECK-NEXT: store i16 [[TMP7]], ptr [[ARRAYIDX_I]], align 2, !tbaa [[TBAA27]] // CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP96:![0-9]+]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_T3_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: br label [[CLEANUP:%.*]] -// CHECK: if.end: -// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES]]) #[[ATTR7]] -// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(4), ptr [[IN]], align 8, !tbaa [[TBAA27]] -// CHECK-NEXT: br label [[FOR_COND:%.*]] -// CHECK: for.cond: -// CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[IF_END]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -// CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[I_0]], 4 -// CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] -// CHECK: for.cond.cleanup: -// CHECK-NEXT: [[TMP7:%.*]] = load <4 x i16>, ptr [[VALUES]], align 2, !tbaa [[TBAA29]] -// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv4_tEvPU3AS1tT_(ptr addrspace(1) noundef nonnull [[OUT_PTR]], <4 x i16> noundef [[TMP7]]) #[[ATTR5]] -// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES]]) #[[ATTR7]] -// CHECK-NEXT: br label [[CLEANUP]] -// CHECK: for.body: -// CHECK-NEXT: [[CONV:%.*]] = zext nneg i32 [[I_0]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I20:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP6]], i64 [[CONV]] -// CHECK-NEXT: [[TMP8:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I20]], align 2, !tbaa [[TBAA23]] -// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i16], ptr [[VALUES]], i64 0, i64 [[CONV]] -// CHECK-NEXT: store i16 [[TMP8]], ptr [[ARRAYIDX]], align 2, 
!tbaa [[TBAA23]] -// CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_0]], 1 -// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP97:![0-9]+]] -// CHECK: cleanup: +// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP105:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_.exit: // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_four_shorts(sycl::sub_group &sg, span v, + plain_global_ptr p) { + // Just because there is a blocked data layout testcase, nothing inherently + // useful here. + group_store(sg, v, p, opt_striped{}); +} -// Check for non-power-of-two size. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_store< - sycl::sub_group, int, 3, plain_global_ptr, opt_striped>( - sycl::sub_group, span, plain_global_ptr, opt_striped); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm3EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::span.16") align 8 [[IN:%.*]], ptr addrspace(1) noundef [[OUT_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.25") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META17]] !sycl_fixed_targets [[META7]] { 
+// CHECK-LABEL: @_ZN7striped21test_non_power_of_twoERN4sycl3_V19sub_groupENS1_4spanIiLm3EEEPU3AS1i( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[IN]], align 8, !tbaa [[TBAA12]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META98:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META101:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 3 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM3EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_T3__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 
[[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP104:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm3EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_T3_.exit: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META107:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META110:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 3 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM3EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_T3__EXIT:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds 
nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP113:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm3EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_.exit: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_non_power_of_two(sycl::sub_group &sg, span v, + plain_global_ptr p) { + // Check for non-power-of-two size. + group_store(sg, v, p, opt_striped{}); +} -// Even though power of two, still too many to map directly onto BloadRead API. 
-template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_store< - sycl::sub_group, int, 16, plain_global_ptr, opt_striped>( - sycl::sub_group, span, plain_global_ptr, opt_striped); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm16EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::span.29") align 8 [[IN:%.*]], ptr addrspace(1) noundef [[OUT_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.25") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META17]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL: @_ZN7striped17test_sixteen_intsERN4sycl3_V19sub_groupENS1_4spanIiLm16EEEPU3AS1i( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[IN]], align 8, !tbaa [[TBAA12]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META105:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META108:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// 
CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 16 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM16EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_T3__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP111:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm16EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_T3_.exit: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias 
[[META115:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META118:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 16 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM16EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_T3__EXIT:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP121:![0-9]+]] +// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm16EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_.exit: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_sixteen_ints(sycl::sub_group &sg, span v, + plain_global_ptr p) { + // Even though power of two, still too many to map directly onto BlockWrite + // API. + group_store(sg, v, p, opt_striped{}); +} -// Non-power of two case bigger than max natively supported power of two case. -template SYCL_EXTERNAL void sycl::ext::oneapi::experimental::group_store< - sycl::sub_group, int, 11, plain_global_ptr, opt_striped>( - sycl::sub_group, span, plain_global_ptr, opt_striped); -// CHECK-LABEL: define weak_odr dso_local spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm11EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_( -// CHECK-SAME: ptr noundef byval(%"struct.sycl::_V1::sub_group") align 1 [[G:%.*]], ptr noundef byval(%"class.sycl::_V1::span.30") align 8 [[IN:%.*]], ptr addrspace(1) noundef [[OUT_PTR:%.*]], ptr noundef byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.25") align 1 [[PROPS:%.*]]) local_unnamed_addr #[[ATTR0]] comdat !srcloc [[META17]] !sycl_fixed_targets [[META7]] { +// CHECK-LABEL:
@_ZN7striped16test_eleven_intsERN4sycl3_V19sub_groupENS1_4spanIiLm11EEEPU3AS1i( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[IN]], align 8, !tbaa [[TBAA12]] +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] // CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META112:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META115:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 11 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM11EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_T3__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 
-// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT_PTR]], i64 [[IDXPROM_I]] -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP118:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm11EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_T3_.exit: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META123:![0-9]+]] +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META126:![0-9]+]] +// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK: for.cond.i.i: +// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 11 +// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM11EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_T3__EXIT:%.*]] +// CHECK: for.body.i.i: +// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr 
addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] +// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP129:![0-9]+]] +// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm11EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_T3_.exit: // CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] // CHECK-NEXT: ret void +// +SYCL_EXTERNAL void test_eleven_ints(sycl::sub_group &sg, span v, + plain_global_ptr p) { + // Non-power of two case bigger than max natively supported power of two case. 
+ group_store(sg, v, p, opt_striped{}); +} +} // namespace striped diff --git a/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp b/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp index ee3716e4389c1..a8c916b5fe3c4 100644 --- a/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp +++ b/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp @@ -69,7 +69,7 @@ SYCL_EXTERNAL auto TestFMin(vec a, vec b) { } // CHECK-LABEL: define dso_local spir_func void @_Z8TestFMaxN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEES5_( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.5") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.5") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.5") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META24:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.6") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.6") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.6") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META24:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VEC_ADDR_I_I_I_I12_I:%.*]] = alloca <3 x float>, align 16 // CHECK-NEXT: [[DST_I_I_I_I13_I:%.*]] = alloca [4 x %"class.sycl::_V1::ext::oneapi::bfloat16"], align 2 @@ -123,7 +123,7 @@ SYCL_EXTERNAL auto TestFMax(vec a, vec b) { } // CHECK-LABEL: define dso_local spir_func void @_Z9TestIsNanN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi4EEE( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.15") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.20") 
align 8 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META38:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.18") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.24") align 8 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META38:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VEC_ADDR_I_I_I_I_I:%.*]] = alloca <4 x i16>, align 8 // CHECK-NEXT: [[DST_I_I_I_I_I:%.*]] = alloca [4 x float], align 4 @@ -149,7 +149,7 @@ SYCL_EXTERNAL auto TestIsNan(vec a) { } // CHECK-LABEL: define dso_local spir_func void @_Z8TestFabsN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEE( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.38") align 16 initializes((0, 16)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.38") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META48:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.46") align 16 initializes((0, 16)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.46") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META48:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VEC_ADDR_I_I_I_I2_I:%.*]] = alloca <8 x float>, align 32 // CHECK-NEXT: [[DST_I_I_I_I3_I:%.*]] = alloca [8 x %"class.sycl::_V1::ext::oneapi::bfloat16"], align 2 @@ -185,7 +185,7 @@ SYCL_EXTERNAL auto TestFabs(vec a) { } // CHECK-LABEL: define dso_local spir_func void @_Z8TestCeilN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEE( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.38") align 16 initializes((0, 16)) [[AGG_RESULT:%.*]], ptr 
nocapture noundef readonly byval(%"class.sycl::_V1::vec.38") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META59:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.46") align 16 initializes((0, 16)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.46") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META59:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VEC_ADDR_I_I_I_I2_I:%.*]] = alloca <8 x float>, align 32 // CHECK-NEXT: [[DST_I_I_I_I3_I:%.*]] = alloca [8 x %"class.sycl::_V1::ext::oneapi::bfloat16"], align 2 @@ -221,7 +221,7 @@ SYCL_EXTERNAL auto TestCeil(vec a) { } // CHECK-LABEL: define dso_local spir_func void @_Z7TestFMAN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEES5_S5_( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.48") align 32 initializes((0, 32)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.48") align 32 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.48") align 32 [[B:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.48") align 32 [[C:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META70:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.58") align 32 initializes((0, 32)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.58") align 32 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.58") align 32 [[B:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.58") align 32 [[C:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META70:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[VEC_ADDR_I_I_I_I14_I:%.*]] = alloca <16 x float>, 
align 64 // CHECK-NEXT: [[DST_I_I_I_I15_I:%.*]] = alloca [16 x %"class.sycl::_V1::ext::oneapi::bfloat16"], align 2 diff --git a/sycl/test/check_device_code/vector/vector_convert_bfloat.cpp b/sycl/test/check_device_code/vector/vector_convert_bfloat.cpp index d5b6963c5033c..a63ace9885745 100644 --- a/sycl/test/check_device_code/vector/vector_convert_bfloat.cpp +++ b/sycl/test/check_device_code/vector/vector_convert_bfloat.cpp @@ -63,7 +63,7 @@ SYCL_EXTERNAL auto TestBFtoFDeviceRZ(vec &inp) { } // CHECK-LABEL: define dso_local spir_func void @_Z19TestBFtointDeviceRZRN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEE( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.5") align 16 [[AGG_RESULT:%.*]], ptr addrspace(4) nocapture noundef readonly align 8 dereferenceable(8) [[INP:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META18:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.6") align 16 [[AGG_RESULT:%.*]], ptr addrspace(4) nocapture noundef readonly align 8 dereferenceable(8) [[INP:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META18:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META19:![0-9]+]]) // CHECK-NEXT: [[LOADVEC4_I_I:%.*]] = load <4 x i16>, ptr addrspace(4) [[INP]], align 8, !noalias [[META19]] @@ -93,7 +93,7 @@ SYCL_EXTERNAL auto TestBFtointDeviceRZ(vec &inp) { } // CHECK-LABEL: define dso_local spir_func void @_Z20TestBFtointDeviceRNERN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi1EEE( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.10") align 4 initializes((0, 4)) [[AGG_RESULT:%.*]], ptr addrspace(4) nocapture noundef readonly align 2 dereferenceable(2) [[INP:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META24:![0-9]+]] 
!sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.12") align 4 initializes((0, 4)) [[AGG_RESULT:%.*]], ptr addrspace(4) nocapture noundef readonly align 2 dereferenceable(2) [[INP:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META24:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META25:![0-9]+]]) // CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr addrspace(4) [[INP]], align 2, !tbaa [[TBAA11]], !noalias [[META25]] @@ -191,7 +191,7 @@ SYCL_EXTERNAL auto TestInttoBFDeviceRZ(vec &inp) { } // CHECK-LABEL: define dso_local spir_func void @_Z19TestLLtoBFDeviceRTPRN4sycl3_V13vecIxLi1EEE( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.15") align 2 initializes((0, 2)) [[AGG_RESULT:%.*]], ptr addrspace(4) nocapture noundef readonly align 8 dereferenceable(8) [[INP:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META42:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.18") align 2 initializes((0, 2)) [[AGG_RESULT:%.*]], ptr addrspace(4) nocapture noundef readonly align 8 dereferenceable(8) [[INP:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META42:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META43:![0-9]+]]) // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr addrspace(4) [[INP]], align 8, !tbaa [[TBAA46:![0-9]+]], !noalias [[META43]] @@ -204,7 +204,7 @@ SYCL_EXTERNAL auto TestLLtoBFDeviceRTP(vec &inp) { } // CHECK-LABEL: define dso_local spir_func void @_Z22TestShorttoBFDeviceRTNRN4sycl3_V13vecIsLi2EEE( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.25") align 4 
[[AGG_RESULT:%.*]], ptr addrspace(4) nocapture noundef readonly align 4 dereferenceable(4) [[INP:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META48:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.30") align 4 [[AGG_RESULT:%.*]], ptr addrspace(4) nocapture noundef readonly align 4 dereferenceable(4) [[INP:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META48:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META49:![0-9]+]]) // CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(4) [[INP]], align 4, !tbaa [[TBAA11]], !noalias [[META49]] diff --git a/sycl/test/check_device_code/vector/vector_math_ops.cpp b/sycl/test/check_device_code/vector/vector_math_ops.cpp index 4d51005e3921c..89e2af23db1c4 100644 --- a/sycl/test/check_device_code/vector/vector_math_ops.cpp +++ b/sycl/test/check_device_code/vector/vector_math_ops.cpp @@ -46,7 +46,7 @@ SYCL_EXTERNAL auto TestAdd(vec a, vec b) { return a + b; } SYCL_EXTERNAL auto TestAdd(vec a, vec b) { return a + b; } // CHECK-LABEL: define dso_local spir_func void @_Z7TestAddN4sycl3_V13vecIcLi16EEES2_( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.8") align 16 initializes((0, 16)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.8") align 16 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.8") align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META26:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.9") align 16 initializes((0, 16)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.9") align 16 [[A:%.*]], ptr nocapture noundef readonly 
byval(%"class.sycl::_V1::vec.9") align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META26:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META27:![0-9]+]]) // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META30:![0-9]+]]) @@ -60,7 +60,7 @@ SYCL_EXTERNAL auto TestAdd(vec a, vec b) { return a + b; } // std::byte does not support '+'. Therefore, using bitwise XOR as a substitute. // CHECK-LABEL: define dso_local spir_func void @_Z7TestXorN4sycl3_V13vecISt4byteLi8EEES3_( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.15") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.15") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.15") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META34:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.17") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.17") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.17") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META34:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META35:![0-9]+]]) // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META38:![0-9]+]]) @@ -75,7 +75,7 @@ SYCL_EXTERNAL auto TestXor(vec a, vec b) { } // CHECK-LABEL: define dso_local spir_func void @_Z7TestAddN4sycl3_V13vecIbLi4EEES2_( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.22") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly 
byval(%"class.sycl::_V1::vec.22") align 4 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.22") align 4 [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !srcloc [[META42:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.25") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.25") align 4 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.25") align 4 [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !srcloc [[META42:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META43:![0-9]+]]) // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META46:![0-9]+]]) @@ -102,7 +102,7 @@ SYCL_EXTERNAL auto TestXor(vec a, vec b) { SYCL_EXTERNAL auto TestAdd(vec a, vec b) { return a + b; } // CHECK-LABEL: define dso_local spir_func void @_Z7TestAddN4sycl3_V13vecINS0_6detail9half_impl4halfELi3EEES5_( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.29") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.29") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.29") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META52:![0-9]+]] !sycl_used_aspects [[META53:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.33") align 8 initializes((0, 8)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.33") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.33") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META52:![0-9]+]] !sycl_used_aspects [[META53:![0-9]+]] 
!sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META55:![0-9]+]]) // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META58:![0-9]+]]) @@ -116,10 +116,10 @@ SYCL_EXTERNAL auto TestAdd(vec a, vec b) { return a + b; } SYCL_EXTERNAL auto TestAdd(vec a, vec b) { return a + b; } // CHECK-LABEL: define dso_local spir_func void @_Z7TestAddN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEES5_( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.36") align 8 [[AGG_RESULT:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.36") align 8 [[A:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.36") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] !srcloc [[META62:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.41") align 8 [[AGG_RESULT:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.41") align 8 [[A:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.41") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] !srcloc [[META62:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[REF_TMP_I_I_I_I:%.*]] = alloca float, align 4 -// CHECK-NEXT: [[RES_I_I:%.*]] = alloca %"class.sycl::_V1::vec.36", align 8 +// CHECK-NEXT: [[RES_I_I:%.*]] = alloca %"class.sycl::_V1::vec.41", align 8 // CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4) // CHECK-NEXT: [[B_ASCAST:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(4) // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META63:![0-9]+]]) @@ -160,7 +160,7 @@ SYCL_EXTERNAL auto TestAdd(vec a, /***************** Binary Logical Ops *******************/ // CHECK-LABEL: define dso_local spir_func void @_Z15TestGreaterThanN4sycl3_V13vecIiLi16EEES2_( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias 
nocapture writable writeonly sret(%"class.sycl::_V1::vec.41") align 64 initializes((0, 64)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.41") align 64 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.41") align 64 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META81:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.47") align 64 initializes((0, 64)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.47") align 64 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.47") align 64 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META81:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META82:![0-9]+]]) // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META85:![0-9]+]]) @@ -176,7 +176,7 @@ SYCL_EXTERNAL auto TestGreaterThan(vec a, vec b) { } // CHECK-LABEL: define dso_local spir_func noundef range(i8 -1, 1) <3 x i8> @_Z15TestGreaterThanN4sycl3_V13vecISt4byteLi3EEES3_( -// CHECK-SAME: ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.49") align 4 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.49") align 4 [[B:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] !srcloc [[META89:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.56") align 4 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.56") align 4 [[B:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] !srcloc [[META89:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[LOADVEC4_I_I:%.*]] = load <4 x i8>, ptr [[A]], align 1 // CHECK-NEXT: [[LOADVEC4_I_I2:%.*]] = load <4 x i8>, ptr [[B]], align 1 @@ -190,7 +190,7 @@ SYCL_EXTERNAL auto 
TestGreaterThan(vec a, vec b) { } // CHECK-LABEL: define dso_local spir_func void @_Z15TestGreaterThanN4sycl3_V13vecIbLi2EEES2_( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.54") align 2 initializes((0, 2)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.59") align 2 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.59") align 2 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META90:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.62") align 2 initializes((0, 2)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.68") align 2 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.68") align 2 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META90:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META91:![0-9]+]]) // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META94:![0-9]+]]) @@ -206,7 +206,7 @@ SYCL_EXTERNAL auto TestGreaterThan(vec a, vec b) { } // CHECK-LABEL: define dso_local spir_func void @_Z15TestGreaterThanN4sycl3_V13vecINS0_6detail9half_impl4halfELi8EEES5_( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.68") align 16 initializes((0, 16)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.73") align 16 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.73") align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META98:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.78") align 16 initializes((0, 16)) [[AGG_RESULT:%.*]], ptr nocapture 
noundef readonly byval(%"class.sycl::_V1::vec.84") align 16 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.84") align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META98:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META99:![0-9]+]]) // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META102:![0-9]+]]) @@ -222,9 +222,9 @@ SYCL_EXTERNAL auto TestGreaterThan(vec a, vec b) { } // CHECK-LABEL: define dso_local spir_func void @_Z15TestGreaterThanN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi4EEES5_( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.82") align 8 [[AGG_RESULT:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.87") align 8 [[A:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.87") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR2]] !srcloc [[META106:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.94") align 8 [[AGG_RESULT:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.100") align 8 [[A:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.100") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR2]] !srcloc [[META106:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RES_I_I:%.*]] = alloca %"class.sycl::_V1::vec.82", align 8 +// CHECK-NEXT: [[RES_I_I:%.*]] = alloca %"class.sycl::_V1::vec.94", align 8 // CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4) // CHECK-NEXT: [[B_ASCAST:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(4) // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META107:![0-9]+]]) @@ -261,7 +261,7 @@ SYCL_EXTERNAL auto TestGreaterThan(vec a, /********************** Unary Ops **********************/ // CHECK-LABEL: define dso_local spir_func void 
@_Z12TestNegationN4sycl3_V13vecIiLi3EEE( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.91") align 16 initializes((0, 16)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.91") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META115:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.105") align 16 initializes((0, 16)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.105") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META115:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META116:![0-9]+]]) // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META119:![0-9]+]]) @@ -276,7 +276,7 @@ SYCL_EXTERNAL auto TestGreaterThan(vec a, SYCL_EXTERNAL auto TestNegation(vec a) { return !a; } // CHECK-LABEL: define dso_local spir_func void @_Z9TestMinusN4sycl3_V13vecIiLi4EEE( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.97") align 16 initializes((0, 16)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.97") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META123:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.112") align 16 initializes((0, 16)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.112") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META123:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META124:![0-9]+]]) // CHECK-NEXT: tail call void 
@llvm.experimental.noalias.scope.decl(metadata [[META127:![0-9]+]]) @@ -289,7 +289,7 @@ SYCL_EXTERNAL auto TestMinus(vec a) { return -a; } // Negation is not valid for std::byte. Therefore, using bitwise negation. // CHECK-LABEL: define dso_local spir_func void @_Z19TestBitwiseNegationN4sycl3_V13vecISt4byteLi16EEE( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.102") align 16 initializes((0, 16)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.102") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META131:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.118") align 16 initializes((0, 16)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.118") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META131:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META132:![0-9]+]]) // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META135:![0-9]+]]) @@ -301,7 +301,7 @@ SYCL_EXTERNAL auto TestMinus(vec a) { return -a; } SYCL_EXTERNAL auto TestBitwiseNegation(vec a) { return ~a; } // CHECK-LABEL: define dso_local spir_func void @_Z12TestNegationN4sycl3_V13vecIbLi4EEE( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.108") align 4 initializes((0, 4)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.22") align 4 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META139:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.125") align 4 initializes((0, 4)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly 
byval(%"class.sycl::_V1::vec.25") align 4 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META139:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META140:![0-9]+]]) // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META143:![0-9]+]]) @@ -314,7 +314,7 @@ SYCL_EXTERNAL auto TestBitwiseNegation(vec a) { return ~a; } SYCL_EXTERNAL auto TestNegation(vec a) { return !a; } // CHECK-LABEL: define dso_local spir_func void @_Z12TestNegationN4sycl3_V13vecINS0_6detail9half_impl4halfELi2EEE( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.114") align 4 initializes((0, 4)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.119") align 4 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META147:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.132") align 4 initializes((0, 4)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.138") align 4 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META147:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META148:![0-9]+]]) // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META151:![0-9]+]]) @@ -327,7 +327,7 @@ SYCL_EXTERNAL auto TestNegation(vec a) { return !a; } SYCL_EXTERNAL auto TestNegation(vec a) { return !a; } // CHECK-LABEL: define dso_local spir_func void @_Z9TestMinusN4sycl3_V13vecINS0_6detail9half_impl4halfELi8EEE( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.73") align 16 initializes((0, 16)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.73") align 16 
[[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META155:![0-9]+]] !sycl_used_aspects [[META53]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.84") align 16 initializes((0, 16)) [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.84") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META155:![0-9]+]] !sycl_used_aspects [[META53]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META156:![0-9]+]]) // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META159:![0-9]+]]) @@ -339,9 +339,9 @@ SYCL_EXTERNAL auto TestNegation(vec a) { return !a; } SYCL_EXTERNAL auto TestMinus(vec a) { return -a; } // CHECK-LABEL: define dso_local spir_func void @_Z12TestNegationN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEE( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.126") align 8 [[AGG_RESULT:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.36") align 8 [[A:%.*]]) local_unnamed_addr #[[ATTR2]] !srcloc [[META163:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.146") align 8 [[AGG_RESULT:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.41") align 8 [[A:%.*]]) local_unnamed_addr #[[ATTR2]] !srcloc [[META163:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[RES_I_I:%.*]] = alloca %"class.sycl::_V1::vec.126", align 8 +// CHECK-NEXT: [[RES_I_I:%.*]] = alloca %"class.sycl::_V1::vec.146", align 8 // CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4) // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META164:![0-9]+]]) // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata 
[[META167:![0-9]+]]) @@ -370,10 +370,10 @@ SYCL_EXTERNAL auto TestMinus(vec a) { return -a; } SYCL_EXTERNAL auto TestNegation(vec a) { return !a; } // CHECK-LABEL: define dso_local spir_func void @_Z9TestMinusN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEE( -// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.130") align 32 [[AGG_RESULT:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.130") align 32 [[A:%.*]]) local_unnamed_addr #[[ATTR2]] !srcloc [[META172:![0-9]+]] !sycl_fixed_targets [[META7]] { +// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.151") align 32 [[AGG_RESULT:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.151") align 32 [[A:%.*]]) local_unnamed_addr #[[ATTR2]] !srcloc [[META172:![0-9]+]] !sycl_fixed_targets [[META7]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[REF_TMP_I_I_I_I:%.*]] = alloca float, align 4 -// CHECK-NEXT: [[RES_I_I:%.*]] = alloca %"class.sycl::_V1::vec.130", align 32 +// CHECK-NEXT: [[RES_I_I:%.*]] = alloca %"class.sycl::_V1::vec.151", align 32 // CHECK-NEXT: [[A_ASCAST:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4) // CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[RES_I_I]]), !noalias [[META173:![0-9]+]] // CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 32 [[RES_I_I]], i8 0, i64 32, i1 false), !noalias [[META176:![0-9]+]] diff --git a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp index ec44b32f182f7..33a61de5bc8cd 100644 --- a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp +++ b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp @@ -54,7 +54,7 @@ // tests to match the required format and in that case you should just update // (i.e. reduce) the number and the list below. 
// -// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 381 +// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 379 // // List of improperly UNSUPPORTED tests. // Remove the CHECK once the test has been properly UNSUPPORTED. @@ -264,9 +264,7 @@ // CHECK-NEXT: Matrix/SG32/element_wise_all_ops_int8_packed.cpp // CHECK-NEXT: Matrix/SG32/element_wise_all_sizes.cpp // CHECK-NEXT: Matrix/SG32/element_wise_ops.cpp -// CHECK-NEXT: Matrix/SG32/get_coord_float_matC.cpp -// CHECK-NEXT: Matrix/SG32/get_coord_int8_matA.cpp -// CHECK-NEXT: Matrix/SG32/get_coord_int8_matB.cpp +// CHECK-NEXT: Matrix/SG32/get_coordinate_ops.cpp // CHECK-NEXT: Matrix/SG32/joint_matrix_all_sizes.cpp // CHECK-NEXT: Matrix/SG32/joint_matrix_apply_bf16.cpp // CHECK-NEXT: Matrix/SG32/joint_matrix_apply_two_matrices.cpp diff --git a/sycl/test/extensions/group_load_store_negative.cpp b/sycl/test/extensions/group_load_store_negative.cpp new file mode 100644 index 0000000000000..41540020c2d18 --- /dev/null +++ b/sycl/test/extensions/group_load_store_negative.cpp @@ -0,0 +1,15 @@ +// RUN: %clangxx -fsycl -fsycl-device-only -fsyntax-only -Xclang -verify -Xclang -verify-ignore-unexpected=warning,note %s + +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental; + +void foo(sub_group sg, int *p, vec &v) { + // expected-error@+2 {{no matching function for call to 'group_load'}} + // expected-note-re@*:* {{candidate template ignored: requirement 'is_property_list_v<{{.*data_placement.*}}>' was not satisfied {{.*}}}} + group_load(sg, p, v, data_placement_blocked); + + // This is ok: + group_load(sg, p, v, properties{data_placement_blocked}); +} diff --git a/sycl/unittests/config/ConfigTests.cpp b/sycl/unittests/config/ConfigTests.cpp index 756a340c8f82d..0f990bc3c9847 100644 --- a/sycl/unittests/config/ConfigTests.cpp +++ b/sycl/unittests/config/ConfigTests.cpp @@ -387,3 +387,65 @@ TEST(ConfigTests, CheckSyclCacheEvictionThresholdTest) { InMemEvicType::reset(); TestConfig(0); } + +// SYCL_CACHE_MAX_SIZE accepts 
an integer that specifies +// the maximum size of the persistent Program cache. +// Cache eviction is performed when the cache size exceeds the threshold. +// The threshold is specified in bytes. +// The default value is "0" which means that eviction is disabled. +TEST(ConfigTests, CheckPersistentCacheEvictionThresholdTest) { + + using OnDiskEvicType = sycl::detail::SYCLConfig; + + // Lambda to test parsing of SYCL_CACHE_MAX_SIZE. + auto TestConfig = [](int expectedProgramCacheSize) { + EXPECT_EQ(expectedProgramCacheSize, OnDiskEvicType::getProgramCacheSize()); + EXPECT_EQ(expectedProgramCacheSize > 0, + OnDiskEvicType::isPersistentCacheEvictionEnabled()); + }; + + // Lambda to set SYCL_CACHE_MAX_SIZE. + auto SetSyclDiskCacheEvictionThresholdEnv = [](const char *value) { +#ifdef _WIN32 + _putenv_s("SYCL_CACHE_MAX_SIZE", value); +#else + setenv("SYCL_CACHE_MAX_SIZE", value, 1); +#endif + }; + + // Lambda to test invalid inputs. An exception should be thrown + // when parsing invalid values. + auto TestInvalidValues = [&](const char *value, const char *errMsg) { + SetSyclDiskCacheEvictionThresholdEnv(value); + try { + OnDiskEvicType::reset(); + TestConfig(0); + FAIL() << errMsg; + } catch (...) { + } + }; + + // Test eviction threshold with zero. + SetSyclDiskCacheEvictionThresholdEnv("0"); + sycl::detail::readConfig(true); + TestConfig(0); + + // Test invalid values. + TestInvalidValues("-1", "Should throw exception for negative value"); + TestInvalidValues("a", "Should throw exception for non-integer value"); + + // Test valid values. + SetSyclDiskCacheEvictionThresholdEnv("1024"); + OnDiskEvicType::reset(); + TestConfig(1024); + + // When SYCL_CACHE_MAX_SIZE is not set, it should default to + // 0.
+#ifdef _WIN32 + _putenv_s("SYCL_CACHE_MAX_SIZE", ""); +#else + unsetenv("SYCL_CACHE_MAX_SIZE"); +#endif + OnDiskEvicType::reset(); + TestConfig(0); +} diff --git a/sycl/unittests/kernel-and-program/PersistentDeviceCodeCache.cpp b/sycl/unittests/kernel-and-program/PersistentDeviceCodeCache.cpp index 1cd0fcee45dc7..ba72facd2b45a 100644 --- a/sycl/unittests/kernel-and-program/PersistentDeviceCodeCache.cpp +++ b/sycl/unittests/kernel-and-program/PersistentDeviceCodeCache.cpp @@ -135,6 +135,12 @@ class PersistentDeviceCodeCache SYCLCachePersistentChanged = true; } + // Set SYCL_CACHE_MAX_SIZE. + void SetDiskCacheEvictionEnv(const char *NewValue) { + set_env("SYCL_CACHE_MAX_SIZE", NewValue); + sycl::detail::SYCLConfig::reset(); + } + void AppendToSYCLCacheDirEnv(const char *SubDir) { std::string NewSYCLCacheDirPath{RootSYCLCacheDir}; if (NewSYCLCacheDirPath.back() != '\\' && NewSYCLCacheDirPath.back() != '/') @@ -144,6 +150,24 @@ class PersistentDeviceCodeCache sycl::detail::SYCLConfig::reset(); } + // Get the list of binary files in the cache directory. + std::vector getBinaryFileNames(std::string CachePath) { + + std::vector FileNames; + std::error_code EC; + for (llvm::sys::fs::directory_iterator DirIt(CachePath, EC); + DirIt != llvm::sys::fs::directory_iterator(); DirIt.increment(EC)) { + // Check if the file is a binary file. + std::string filename = DirIt->path(); + if (filename.find(".bin") != std::string::npos) { + // Just return the file name without the path. + FileNames.push_back(filename.substr(filename.find_last_of("/\\") + 1)); + } + } + + return FileNames; + } + void ResetSYCLCacheDirEnv() { set_env("SYCL_CACHE_DIR", RootSYCLCacheDir.c_str()); sycl::detail::SYCLConfig::reset(); @@ -169,6 +193,9 @@ class PersistentDeviceCodeCache SetSYCLCachePersistentEnv(SYCLCachePersistentBefore ? SYCLCachePersistentBefore->c_str() : nullptr); + + // Reset SYCL_CACHE_MAX_SIZE. 
+ SetDiskCacheEvictionEnv(nullptr); ResetSYCLCacheDirEnv(); }
@@ -394,14 +421,14 @@ TEST_P(PersistentDeviceCodeCache, CorruptedCacheFiles) {
   // Binary file is corrupted
   detail::PersistentDeviceCodeCache::putItemToDisc({Dev}, {&Img}, {},
                                                    BuildOptions, NativeProg);
-  std::ofstream FileStream(ItemDir + "/0.bin",
-                           std::ofstream::out | std::ofstream::trunc);
-  /* Emulate binary built for 2 devices: first is OK, second is trancated
-   * from 23 bytes to 4
-   */
-  FileStream << 2 << 12 << "123456789012" << 23 << "1234";
-  FileStream.close();
-  EXPECT_FALSE(FileStream.fail()) << "Failed to create trancated binary file";
+  {
+    std::ofstream FileStream(ItemDir + "/0.bin",
+                             std::ofstream::out | std::ofstream::trunc);
+    // Emulate binary which is truncated from 23 bytes to 4.
+    FileStream << 1 << 23 << "1234";
+    FileStream.close();
+    EXPECT_FALSE(FileStream.fail()) << "Failed to create trancated binary file";
+  }
   Res = detail::PersistentDeviceCodeCache::getItemFromDisc({Dev}, {&Img}, {},
                                                            BuildOptions);
   EXPECT_EQ(Res.size(), static_cast<size_t>(0))
@@ -421,6 +448,27 @@ TEST_P(PersistentDeviceCodeCache, CorruptedCacheFiles) {
   EXPECT_EQ(Res.size(), static_cast<size_t>(0))
       << "Item with corrupted binary file was read";
   ASSERT_NO_ERROR(llvm::sys::fs::remove_directories(ItemDir));
+
+// Death tests (ASSERT_DEATH) rely on assert which is not available in release
+// mode.
+#ifndef NDEBUG
+  // Unexpected 2 binaries in a single file.
+  detail::PersistentDeviceCodeCache::putItemToDisc({Dev}, {&Img}, {},
+                                                   BuildOptions, NativeProg);
+  {
+    std::ofstream FileStream(ItemDir + "/0.bin",
+                             std::ofstream::out | std::ofstream::trunc);
+    // Emulate binaries for 2 devices in a single file.
+    FileStream << 2 << 12 << "123456789012" << 4 << "1234";
+    FileStream.close();
+    EXPECT_FALSE(FileStream.fail())
+        << "Failed to create a file containing 2 binaries";
+  }
+  ASSERT_DEATH(detail::PersistentDeviceCodeCache::getItemFromDisc(
+                   {Dev}, {&Img}, {}, BuildOptions),
+               "NumBinaries == 1");
+  ASSERT_NO_ERROR(llvm::sys::fs::remove_directories(ItemDir));
+#endif
 }
 
 /* Checks that lock file affects cache operations as expected:
@@ -519,6 +567,92 @@ TEST_P(PersistentDeviceCodeCache, AccessDeniedForCacheDir) {
 }
 #endif //_WIN32
 
+// Unit tests for testing eviction in persistent cache.
+TEST_P(PersistentDeviceCodeCache, BasicEviction) {
+
+  // Cleanup the cache directory.
+  std::string CacheRoot = detail::PersistentDeviceCodeCache::getRootDir();
+  ASSERT_NO_ERROR(llvm::sys::fs::remove_directories(CacheRoot));
+  ASSERT_NO_ERROR(llvm::sys::fs::create_directories(CacheRoot));
+
+  // Disable eviction for the time being.
+  SetDiskCacheEvictionEnv("9000000");
+
+  std::string BuildOptions{"--eviction"};
+  // Put 3 items to the cache.
+  detail::PersistentDeviceCodeCache::putItemToDisc({Dev}, {&Img}, {},
+                                                   BuildOptions, NativeProg);
+
+  std::string ItemDir = detail::PersistentDeviceCodeCache::getCacheItemPath(
+      Dev, {&Img}, {}, BuildOptions);
+  size_t SizeOfOneEntry = (size_t)(detail::getDirectorySize(ItemDir));
+
+  detail::PersistentDeviceCodeCache::putItemToDisc({Dev}, {&Img}, {},
+                                                   BuildOptions, NativeProg);
+
+  detail::PersistentDeviceCodeCache::putItemToDisc({Dev}, {&Img}, {},
+                                                   BuildOptions, NativeProg);
+
+  // Retrieve 0.bin from the cache.
+  auto Res = detail::PersistentDeviceCodeCache::getItemFromDisc(
+      {Dev}, {&Img}, {}, BuildOptions);
+
+  // Get the number of binary files in the cached item folder.
+  auto BinFiles = getBinaryFileNames(ItemDir);
+  EXPECT_EQ(BinFiles.size(), static_cast<size_t>(3))
+      << "Missing binary files. Eviction should not have happened.";
+
+  // Set SYCL_CACHE_MAX_SIZE.
+  SetDiskCacheEvictionEnv(std::to_string(3 * SizeOfOneEntry).c_str());
+
+  // Put 4th item to the cache. This should trigger eviction. Three of the
+  // items should be evicted as we evict till the size of cache is less than
+  // the half of cache size.
+  detail::PersistentDeviceCodeCache::putItemToDisc({Dev}, {&Img}, {},
+                                                   BuildOptions, NativeProg);
+
+  // Only one binary file should be left: 3.bin (0.bin, 1.bin, 2.bin evicted).
+  BinFiles = getBinaryFileNames(ItemDir);
+  EXPECT_EQ(BinFiles.size(), static_cast<size_t>(1))
+      << "Eviction failed. Wrong number of binary files in the cache.";
+
+  // Check that 1.bin, 2.bin, and 0.bin were evicted.
+  for (const auto &File : BinFiles) {
+    EXPECT_NE(File, "1.bin")
+        << "Eviction failed. 1.bin should have been evicted.";
+    EXPECT_NE(File, "2.bin")
+        << "Eviction failed. 2.bin should have been evicted.";
+    EXPECT_NE(File, "0.bin")
+        << "Eviction failed. 0.bin should have been evicted.";
+  }
+
+  ASSERT_NO_ERROR(llvm::sys::fs::remove_directories(ItemDir));
+}
+
+// Unit test for testing size file creation and update, concurrently.
+TEST_P(PersistentDeviceCodeCache, ConcurentReadWriteCacheFileSize) {
+  // Cleanup the cache directory.
+  std::string CacheRoot = detail::PersistentDeviceCodeCache::getRootDir();
+  ASSERT_NO_ERROR(llvm::sys::fs::remove_directories(CacheRoot));
+  ASSERT_NO_ERROR(llvm::sys::fs::create_directories(CacheRoot));
+
+  // Insanely large value (1GB) to not trigger eviction. This test just
+  // checks for deadlocks/crashes when updating the size file concurrently.
+  SetDiskCacheEvictionEnv("1000000000");
+  ConcurentReadWriteCache(1, 100);
+}
+
+// Unit test for adding and evicting cache, concurrently.
+TEST_P(PersistentDeviceCodeCache, ConcurentReadWriteCacheEviction) {
+  // Cleanup the cache directory.
+  std::string CacheRoot = detail::PersistentDeviceCodeCache::getRootDir();
+  ASSERT_NO_ERROR(llvm::sys::fs::remove_directories(CacheRoot));
+  ASSERT_NO_ERROR(llvm::sys::fs::create_directories(CacheRoot));
+
+  SetDiskCacheEvictionEnv("1000");
+  ConcurentReadWriteCache(2, 100);
+}
+
 INSTANTIATE_TEST_SUITE_P(PersistentDeviceCodeCacheImpl,
                          PersistentDeviceCodeCache,
                          ::testing::Values(SYCL_DEVICE_BINARY_TYPE_SPIRV,