diff --git a/.github/workflows/sycl-nightly.yml b/.github/workflows/sycl-nightly.yml index 0b464c75d33b6..3a741cdb221d7 100644 --- a/.github/workflows/sycl-nightly.yml +++ b/.github/workflows/sycl-nightly.yml @@ -57,7 +57,7 @@ jobs: - name: OCL CPU (Intel/GEN12) runner: '["Linux", "gen12"]' image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest - image_options: -u 1001 + image_options: -u 1001 --privileged --cap-add SYS_ADMIN target_devices: opencl:cpu - name: OCL CPU (Intel/Arc) diff --git a/sycl/include/sycl/detail/builtins/relational_functions.inc b/sycl/include/sycl/detail/builtins/relational_functions.inc index 64a7d67934da1..d23f144676cdb 100644 --- a/sycl/include/sycl/detail/builtins/relational_functions.inc +++ b/sycl/include/sycl/detail/builtins/relational_functions.inc @@ -118,8 +118,12 @@ BUILTIN_REL(ONE_ARG, signbit, __spirv_SignBitSet) #undef BUILTIN_REL #ifdef __SYCL_DEVICE_ONLY__ -DEVICE_IMPL_TEMPLATE(THREE_ARGS, bitselect, builtin_enable_generic_t, - __spirv_ocl_bitselect) +DEVICE_IMPL_TEMPLATE( + THREE_ARGS, bitselect, builtin_enable_generic_t, [](auto... xs) { + using ret_ty = detail::builtin_enable_generic_t; + using detail::builtins::convert_result; + return convert_result(__spirv_ocl_bitselect(xs...)); + }) #else HOST_IMPL_TEMPLATE(THREE_ARGS, bitselect, builtin_enable_generic_t, rel, default_ret_type) diff --git a/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp b/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp index ae6137c01fe12..5efd958789f72 100644 --- a/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp +++ b/sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp @@ -124,13 +124,23 @@ constexpr vector_type_t make_vector(const T (&&Arr)[N]) { } template -constexpr vector_type_t make_vector_impl(T Base, T Stride, - std::index_sequence) { - return vector_type_t{(T)(Base + ((T)Is) * Stride)...}; +constexpr auto make_vector_impl(T Base, T Stride, std::index_sequence) { + if constexpr (std::is_integral_v && N <= 3) { + // This sequence is a bit more efficient for integral types and N <= 3. + return vector_type_t{(T)(Base + ((T)Is) * Stride)...}; + } else { + using CppT = typename element_type_traits::EnclosingCppT; + CppT BaseCpp = Base; + CppT StrideCpp = Stride; + vector_type_t VBase = BaseCpp; + vector_type_t VStride = StrideCpp; + vector_type_t VStrideCoef{(CppT)(Is)...}; + vector_type_t Result{VBase + VStride * VStrideCoef}; + return wrapper_type_converter::template to_vector(Result); + } } -template -constexpr vector_type_t make_vector(T Base, T Stride) { +template constexpr auto make_vector(T Base, T Stride) { return make_vector_impl(Base, Stride, std::make_index_sequence{}); } @@ -265,18 +275,13 @@ class [[__sycl_detail__::__uses_aspects__( /// are initialized with the arithmetic progression defined by the arguments. /// For example, simd x(1, 3) will initialize x to the /// {1, 4, 7, 10} sequence. - /// @param Val The start of the progression. + /// If Ty is a floating-point type and \p Base or \p Step is +/-inf or nan, + /// then this constructor has undefined behavior. + /// @param Base The start of the progression. /// @param Step The step of the progression. - simd_obj_impl(Ty Val, Ty Step) noexcept { - __esimd_dbg_print(simd_obj_impl(Ty Val, Ty Step)); - if constexpr (is_wrapper_elem_type_v || !std::is_integral_v) { - for (int i = 0; i < N; ++i) { - M_data[i] = bitcast_to_raw_type(Val); - Val = binary_op(Val, Step); - } - } else { - M_data = make_vector(Val, Step); - } + simd_obj_impl(Ty Base, Ty Step) noexcept { + __esimd_dbg_print(simd_obj_impl(Ty Base, Ty Step)); + M_data = make_vector(Base, Step); } /// Broadcast constructor. Given value is type-converted to the diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 2ecc96a122317..d923821027339 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -3043,7 +3043,7 @@ gather(AccessorT acc, simd byte_offsets, /// simd gather(AccessorT acc, simd byte_offsets, /// simd_mask mask, /// PropertyListT props = {}); // (acc-ga-2) -/// Supported platforms: DG2, PVC in most cases. The DG2/PVC is not required if +/// Supported platforms: DG2, PVC in most cases. DG2/PVC is not required if /// VS == 1 and no L1/L2 cache hints used and sizeof(T) <= 4 and N = {1,8,16,32} /// /// Loads ("gathers") elements of the type 'T' from memory locations addressed @@ -3111,7 +3111,7 @@ gather(AccessorT acc, simd byte_offsets, /// typename PropertyListT = empty_properties_t> /// simd gather(AccessorT acc, simd byte_offsets, /// PropertyListT props = {}); // (acc-ga-3) -/// Supported platforms: DG2, PVC in most cases. The DG2/PVC is not required if +/// Supported platforms: DG2, PVC in most cases. DG2/PVC is not required if /// VS == 1 and no L1/L2 cache hints used and sizeof(T) <= 4 and N = {1,8,16,32} /// /// Loads ("gathers") elements of the type 'T' from memory locations addressed @@ -7389,6 +7389,317 @@ __ESIMD_API flags); } +/// Variant of gather that uses local accessor as a parameter +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, simd pass_thru, +/// PropertyListT props = {}); // (lacc-ga-1) +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-ga-2) +/// simd gather(AccessorT acc, simd byte_offsets, +/// PropertyListT props = {}); // (lacc-ga-3) +/// +/// The next 3 functions are similar to (lacc-ga-1,2,3), but they don't have +/// the template parameter 'VS'. These functions are added for convenience and +/// to make it possible for the user to omit the template parameters T and N, +/// e.g. 'auto res = gather(acc, byte_offsets); +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, simd pass_thru, +/// PropertyListT props = {}); // (lacc-ga-4) +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, PropertyListT props = {});//(lacc-ga-5) +/// simd gather(AccessorT acc, simd byte_offsets, +/// PropertyListT props = {}); // (lacc-ga-6) +/// +/// The next 3 functions are similar to (lacc-ga-1,2,3), but accept the +/// \p byte_offsets as a \c simd_view argument: +/// template +/// simd gather(AccessorT acc, OffsetSimdViewT byte_offsets, +/// simd_mask mask, simd pass_thru, +/// PropertyListT props = {}); // (lacc-ga-7) +/// simd gather(AccessorT acc, OffsetSimdViewT byte_offsets, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-ga-8) +/// simd gather(AccessorT acc, OffsetSimdViewT byte_offsets, +/// PropertyListT props = {}); // (lacc-ga-9) + +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, simd pass_thru, +/// PropertyListT props = {}); // (lacc-ga-1) +/// Supported platforms: DG2, PVC only - Temporary restriction for the variant +/// with pass_thru operand. The only exception: DG2/PVC is not required if +/// the __ESIMD_GATHER_SCATTER_LLVM_IR macro is used. +/// +/// Loads ("gathers") elements of the type 'T' from memory locations addressed +/// by the local accessor \p acc and byte offsets \p byte_offsets, and returns +/// the loaded elements. +/// Access to any element's memory location can be disabled via the input vector +/// of predicates \p mask. If mask[i] is unset, then the load from +/// (acc + byte_offsets[i]) is skipped and the corresponding i-th element from +/// \p pass_thru operand is returned. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of reads per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param acc Accessor referencing the data to load. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param mask The access mask. +/// @param pass_thru The vector pass through values. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +/// @return A vector of elements read. +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, simd byte_offsets, + simd_mask mask, simd pass_thru, PropertyListT props = {}) { + return slm_gather(byte_offsets + + __ESIMD_DNS::localAccessorToOffset(acc), + mask, pass_thru, props); +} + +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-ga-2) +/// Supported platforms: DG2, PVC in most cases. DG2/PVC is not required if +/// VS == 1 and the __ESIMD_GATHER_SCATTER_LLVM_IR macro is used or sizeof(T) <= +/// 4 and N = {1,2,4,8,16,32} +/// +/// Loads ("gathers") elements of the type 'T' from memory locations addressed +/// by the local accessor \p acc and byte offsets \p byte_offsets, and returns +/// the loaded elements. +/// Access to any element's memory location can be disabled via the input vector +/// of predicates \p mask. If mask[i] is unset, then the load from +/// (acc + byte_offsets[i]) is skipped and the corresponding i-th element of +/// the returned vector is undefined. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of reads per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param acc Accessor referencing the data to load. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param mask The access mask. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +/// @return A vector of elements read. +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, simd byte_offsets, + simd_mask mask, PropertyListT props = {}) { + return slm_gather( + byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), mask, props); +} + +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// PropertyListT props = {}); // (lacc-ga-3) +/// Supported platforms: DG2, PVC in most cases. DG2/PVC is not required if +/// VS == 1 and the __ESIMD_GATHER_SCATTER_LLVM_IR macro is used or sizeof(T) <= +/// 4 and N = {1,2,4,8,16,32} +/// +/// Loads ("gathers") elements of the type 'T' from memory locations addressed +/// by the local accessor \p acc and byte offsets \p byte_offsets, and returns +/// the loaded elements. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of reads per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param acc Accessor referencing the data to load. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param props The optional compile-time properties. Only 'alignment' +/// and cache hint properties are used. +/// @return A vector of elements read. +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, simd byte_offsets, + PropertyListT props = {}) { + return slm_gather( + byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), props); +} + +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, simd pass_thru, +/// PropertyListT props = {}); // (lacc-ga-4) +/// This function is identical to (lacc-ga-1) except that vector size is fixed +/// to 1. This variant is added for convenience and lets the user omit the +/// template arguments and call the function as 'gather(acc, byte_offsets, mask, +/// pass_thru);'. +// Dev note: the mask type was turned into template parameter `MaskT` to +// avoid the conflicts of this prototype with the old gather() function +// accepting a 'global_offset' parameter and avoid 'ambiguous call' errors +// for calls like this: gather(acc, byte_offsets_simd, 0, mask); +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + std::is_same_v> && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, simd byte_offsets, MaskT mask, + simd pass_thru, PropertyListT props = {}) { + return slm_gather(byte_offsets + + __ESIMD_DNS::localAccessorToOffset(acc), + mask, pass_thru, props); +} + +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, PropertyListT props // (lacc-ga-5) +/// This function is identical to (lacc-ga-2) except that vector size is fixed +/// to 1. This variant is added for convenience and let user omit the template +/// arguments and call the function as 'gather(acc, byte_offsets, mask);'. +// Dev note: the mask type was turned into template parameter `MaskT` to +// avoid the conflicts of this prototype with the old gather() function +// accepting a 'global_offset' parameter and avoid 'ambiguous call' errors +// for calls like this: gather(acc, byte_offsets_simd, 0); +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + std::is_same_v> && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, simd byte_offsets, MaskT mask, + PropertyListT props = {}) { + return slm_gather( + byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), mask, props); +} + +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// PropertyListT props = {}); // (lacc-ga-6) +/// This function is identical to (lacc-ga-3) except that vector size is fixed +/// to 1. This variant is added for convenience and let user omit the template +/// arguments and call the function as 'gather(acc, byte_offsets);'. +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, simd byte_offsets, + PropertyListT props = {}) { + return slm_gather( + byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), props); +} + +/// template +/// simd gather(AccessorT acc, OffsetSimdViewT byte_offsets, +/// simd_mask mask, simd pass_thru, +/// PropertyListT props = {}); // (lacc-ga-7) +/// This function is identical to (lacc-ga-1) except that the \p byte_offsets +/// is represented as \c simd_view. +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask mask, + simd pass_thru, PropertyListT props = {}) { + return gather(acc, byte_offsets.read(), mask, pass_thru, props); +} + +/// template +/// simd gather(AccessorT acc, OffsetSimdViewT byte_offsets, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-ga-8) +/// This function is identical to (lacc-ga-2) except that the \p byte_offsets +/// is represented as \c simd_view. +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask mask, + PropertyListT props = {}) { + return gather(acc, byte_offsets.read(), mask, props); +} + +/// template +/// simd gather(AccessorT acc, OffsetSimdViewT byte_offsets, +/// PropertyListT props = {}); // (lacc-ga-9) +/// This function is identical to (lacc-ga-3) except that the \p byte_offsets +/// is represented as \c simd_view. +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, OffsetSimdViewT byte_offsets, PropertyListT props = {}) { + return gather(acc, byte_offsets.read(), props); +} + /// Variant of gather that uses local accessor as a parameter /// /// Collects elements located at given offsets in an accessor and returns them @@ -7411,7 +7722,7 @@ __ESIMD_API std::enable_if_t, simd> - gather(AccessorTy acc, simd offsets, uint32_t glob_offset = 0, + gather(AccessorTy acc, simd offsets, uint32_t glob_offset, simd_mask mask = 1) { return slm_gather( offsets + glob_offset + __ESIMD_DNS::localAccessorToOffset(acc), mask); diff --git a/sycl/source/detail/pi.cpp b/sycl/source/detail/pi.cpp index 4dbeaccc9baf2..bbee0c1131feb 100644 --- a/sycl/source/detail/pi.cpp +++ b/sycl/source/detail/pi.cpp @@ -718,6 +718,14 @@ getBinaryImageFormat(const unsigned char *ImgData, size_t ImgSize) { return PI_DEVICE_BINARY_TYPE_NATIVE; } + if (MatchMagicNumber(std::array{'!', '<', 'a', 'r', 'c', 'h', '>', '\n'})) + // "ar" format is used to pack binaries for multiple devices, e.g. via + // + // -Xsycl-target-backend=spir64_gen "-device acm-g10,acm-g11" + // + // option. + return PI_DEVICE_BINARY_TYPE_NATIVE; + return PI_DEVICE_BINARY_TYPE_NONE; } diff --git a/sycl/test-e2e/Basic/built-ins/marray_relational.cpp b/sycl/test-e2e/Basic/built-ins/marray_relational.cpp index 3b2f7ccaf990d..36021efa81475 100644 --- a/sycl/test-e2e/Basic/built-ins/marray_relational.cpp +++ b/sycl/test-e2e/Basic/built-ins/marray_relational.cpp @@ -111,6 +111,64 @@ int main() { TEST2(sycl::any, int, EXPECTED(bool, false), 3, ma7); TEST(sycl::bitselect, float, EXPECTED(float, 1.0, 1.0), 2, ma8, ma9, ma10); TEST(sycl::select, float, EXPECTED(float, 1.0, 2.0, 8.0), 3, ma5, ma6, c); + { + // Extra tests for select/bitselect due to special handling required for + // integer return types. + + auto Test = [&](auto F, auto Expected, auto... Args) { + std::tuple ArgsTuple{Args...}; + auto Result = std::apply(F, ArgsTuple); + static_assert(std::is_same_v); + + auto Equal = [](auto x, auto y) { + for (size_t i = 0; i < x.size(); ++i) + if (x[i] != y[i]) + return false; + + return true; + }; + + assert(Equal(Result, Expected)); + + sycl::buffer ResultBuf{1}; + deviceQueue.submit([&](sycl::handler &cgh) { + sycl::accessor Result{ResultBuf, cgh}; + cgh.single_task([=]() { + auto R = std::apply(F, ArgsTuple); + static_assert(std::is_same_v); + Result[0] = Equal(R, Expected); + }); + }); + assert(sycl::host_accessor{ResultBuf}[0]); + }; + + sycl::marray a{0b1100, 0b0011}; + sycl::marray b{0b0011, 0b1100}; + sycl::marray c{0b1010, 0b1010}; + sycl::marray r{0b0110, 0b1001}; + + auto BitSelect = [](auto... xs) { return sycl::bitselect(xs...); }; + Test(BitSelect, r, a, b, c); + // Input values/results above are positive, so use the same values for + // signed/unsigned char tests. + [&](auto... xs) { + Test(BitSelect, sycl::marray{xs}...); + }(r, a, b, c); + [&](auto... xs) { + Test(BitSelect, sycl::marray{xs}...); + }(r, a, b, c); + + auto Select = [](auto... xs) { return sycl::select(xs...); }; + sycl::marray c2{false, true}; + sycl::marray r2{a[0], b[1]}; + Test(Select, r2, a, b, c2); + [&](auto... xs) { + Test(Select, sycl::marray{xs}..., c2); + }(r2, a, b); + [&](auto... xs) { + Test(Select, sycl::marray{xs}..., c2); + }(r2, a, b); + } return 0; } diff --git a/sycl/test-e2e/Basic/built-ins/scalar_relational.cpp b/sycl/test-e2e/Basic/built-ins/scalar_relational.cpp new file mode 100644 index 0000000000000..4c35dd9af6af4 --- /dev/null +++ b/sycl/test-e2e/Basic/built-ins/scalar_relational.cpp @@ -0,0 +1,59 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// RUN: %if preview-breaking-changes-supported %{ %{build} -fpreview-breaking-changes -o %t_preview.out %} +// RUN: %if preview-breaking-changes-supported %{ %{run} %t_preview.out%} + +#include + +template void TestTypes(FuncTy F) { + (F(Ts{}), ...); +} + +int main() { + sycl::queue q; + + auto Test = [&](auto F, auto Expected, auto... Args) { +#if defined(__GNUC__) || defined(__clang__) + std::cout << __PRETTY_FUNCTION__ << std::endl; +#endif + std::tuple ArgsTuple{Args...}; + auto Result = std::apply(F, ArgsTuple); + static_assert(std::is_same_v); + assert(Expected == Result); + + sycl::buffer ResultBuf{1}; + q.submit([&](sycl::handler &cgh) { + sycl::accessor Result{ResultBuf, cgh}; + cgh.single_task([=]() { + auto R = std::apply(F, ArgsTuple); + static_assert(std::is_same_v); + Result[0] = Expected == R; + }); + }); + assert(sycl::host_accessor{ResultBuf}[0]); + }; + + auto TestBitSelect = [&](auto type_val) { + using T = decltype(type_val); + auto BitSelect = [](auto... xs) { return sycl::bitselect(xs...); }; + + static_assert(std::is_integral_v, + "Only integer test is implemented here!"); + Test(BitSelect, T{0b0110}, T{0b1100}, T{0b0011}, T{0b1010}); + }; + + TestTypes(TestBitSelect); + + auto TestSelect = [&](auto type_val) { + using T = decltype(type_val); + auto Select = [](auto... xs) { return sycl::select(xs...); }; + + Test(Select, T{0}, T{1}, T{0}, true); + Test(Select, T{1}, T{1}, T{0}, false); + }; + + TestTypes(TestSelect); + + return 0; +} diff --git a/sycl/test-e2e/Basic/built-ins/vec_relational.cpp b/sycl/test-e2e/Basic/built-ins/vec_relational.cpp index 91ffcc354a275..45f12f22952fa 100644 --- a/sycl/test-e2e/Basic/built-ins/vec_relational.cpp +++ b/sycl/test-e2e/Basic/built-ins/vec_relational.cpp @@ -88,6 +88,70 @@ int main() { TEST2(sycl::any, int, EXPECTED(int32_t, 0), 3, va7); TEST(sycl::bitselect, float, EXPECTED(float, 1.0, 1.0), 2, va8, va9, va10); TEST(sycl::select, float, EXPECTED(float, 1.0, 2.0, 8.0), 3, va5, va6, c1); + { + // Extra tests for select/bitselect due to special handling required for + // integer return types. + + auto Test = [&](auto F, auto Expected, auto... Args) { + std::tuple ArgsTuple{Args...}; + auto Result = std::apply(F, ArgsTuple); + static_assert(std::is_same_v); + + // Note: operator==(vec, vec) return vec. + auto Equal = [](auto x, auto y) { + for (size_t i = 0; i < x.size(); ++i) + if (x[i] != y[i]) + return false; + + return true; + }; + + assert(Equal(Result, Expected)); + + sycl::buffer ResultBuf{1}; + deviceQueue.submit([&](sycl::handler &cgh) { + sycl::accessor Result{ResultBuf, cgh}; + cgh.single_task([=]() { + auto R = std::apply(F, ArgsTuple); + static_assert(std::is_same_v); + Result[0] = Equal(R, Expected); + }); + }); + assert(sycl::host_accessor{ResultBuf}[0]); + }; + + // Note that only int8_t/uint8_t are supported by the bitselect/select + // builtins and not all three char data types. Also, use positive numbers + // for the values below so that we could use the same for both + // signed/unsigned tests. + sycl::vec a{0b1100, 0b0011}; + sycl::vec b{0b0011, 0b1100}; + sycl::vec c{0b1010, 0b1010}; + sycl::vec r{0b0110, 0b1001}; + + auto BitSelect = [](auto... xs) { return sycl::bitselect(xs...); }; + Test(BitSelect, r, a, b, c); + [&](auto... xs) { + Test(BitSelect, xs.template as>()...); + }(r, a, b, c); + + auto Select = [](auto... xs) { return sycl::select(xs...); }; + sycl::vec c2{0x7F, 0xFF}; + sycl::vec r2{a[0], b[1]}; + + Test(Select, r2, a, b, c2); + [&](auto... xs) { + Test(Select, xs.template as>()..., c2); + }(r2, a, b); + + // Assume that MSB of a signed data type is the leftmost bit (signbit). + auto c3 = c2.template as>(); + + Test(Select, r2, a, b, c3); + [&](auto... xs) { + Test(Select, xs.template as>()..., c3); + }(r2, a, b); + } #ifdef __INTEL_PREVIEW_BREAKING_CHANGES TEST(sycl::isequal, int32_t, EXPECTED(int32_t, 1, 1), 2, va11.swizzle<0, 1>(), diff --git a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill.hpp b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill.hpp index 8ae03e7a9a577..b3c81ec144530 100644 --- a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill.hpp +++ b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill.hpp @@ -246,18 +246,8 @@ class run_test { }); queue.wait_and_throw(); - // Verify the base value was passed as-is - if (!are_bitwise_equal(result[0], base_value)) { - passed = false; - log::fail(TestDescriptionT(data_type, BaseVal, Step), - "Unexpected value at index 0, retrieved: ", result[0], - ", expected: ", base_value); - } - - // Verify the step value works as expected being passed to the fill - // constructor. - DataT expected_value = base_value; - for (size_t i = 1; i < result.size(); ++i) { + // Verify the the fill constructor. + for (size_t i = 0; i < result.size(); ++i) { if constexpr (BaseVal == init_val::nan || Step == init_val::nan) { if (!std::isnan(result[i])) { @@ -268,7 +258,7 @@ class run_test { } } else { - expected_value += step_value; + DataT expected_value = base_value + (DataT)i * step_value; if (!are_bitwise_equal(result[i], expected_value)) { passed = false; log::fail(TestDescriptionT(data_type, BaseVal, Step), diff --git a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_accuracy_fp.cpp b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_accuracy_fp.cpp index 9fcac2c603c30..9d34849e106f8 100644 --- a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_accuracy_fp.cpp +++ b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_accuracy_fp.cpp @@ -15,8 +15,7 @@ // The test verifies that simd fill constructor has no precision differences. // The test do the following actions: // - call simd with predefined base and step values -// - bitwise comparing that output[0] value is equal to base value and -// output[i] is equal to output[i -1] + step_value +// - bitwise comparing that output[i] is equal to base + i * step_value. #include "ctor_fill.hpp" diff --git a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp index 7d2fc4f7592d8..73f35354d322f 100644 --- a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp +++ b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_core.cpp @@ -5,10 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// TODO: remove fno-fast-math option once the issue is investigated and the test -// is fixed. -// DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%} -// RUN: %{build} -fsycl-device-code-split=per_kernel %{mathflags} -o %t.out +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out // RUN: %{run} %t.out // // Test for simd fill constructor for core types. @@ -112,10 +109,22 @@ int main(int, char **) { } { const auto types = get_tested_types(); + { + const auto base_values = + ctors::get_init_values_pack(); + const auto step_values = + ctors::get_init_values_pack(); + passed &= for_all_combinations( + types, sizes, contexts, base_values, step_values, queue); + } + // The test cases below have never been guaranteed to work some certain + // way with base and step values set to inf or non. They may or may not + // work as expected by the checks in this test. { const auto base_values = ctors::get_init_values_pack(); - const auto step_values = ctors::get_init_values_pack(); + const auto step_values = + ctors::get_init_values_pack(); passed &= for_all_combinations( types, sizes, contexts, base_values, step_values, queue); } diff --git a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_fp_extra.cpp b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_fp_extra.cpp index c6056458f5f00..525f9d6176f71 100644 --- a/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_fp_extra.cpp +++ b/sycl/test-e2e/ESIMD/api/functional/ctors/ctor_fill_fp_extra.cpp @@ -5,10 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// TODO: remove fno-fast-math option once the issue is investigated and the test -// is fixed. -// DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%} -// RUN: %{build} -fsycl-device-code-split=per_kernel %{mathflags} -o %t.out +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out // RUN: %{run} %t.out // // Test for simd fill constructor for extra fp types. diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_slm_load.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_slm_load.hpp index 3edd3f5c4557f..50881390a6a7b 100644 --- a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_slm_load.hpp +++ b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_slm_load.hpp @@ -120,11 +120,11 @@ bool test(queue Q, uint32_t PMask = ~0) { uint32_t LID = I % (LocalRange * VL * NChannels); uint32_t GID = I / VL; bool Pred = (GID & 0x1) == 0; - T ExpectedVal = GroupId * 1000000 + LID; + Tuint ExpectedVal = GroupId * 1000000 + LID; if (TestMergeOperand && !Pred) - ExpectedVal = GID + (I % VL); + ExpectedVal = sycl::bit_cast((T)(GID + (I % VL))); - if (Out[I] != ExpectedVal && NErrors++ < 32) { + if (sycl::bit_cast(Out[I]) != ExpectedVal && NErrors++ < 32) { std::cout << "Error: " << I << ": Value = " << Out[I] << ", Expected value = " << ExpectedVal << std::endl; } diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_slm_block_load.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_slm_block_load.cpp index a340c90b2a66b..635b4db519e30 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_slm_block_load.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_slm_block_load.cpp @@ -18,6 +18,13 @@ template bool test_load(queue Q) { Passed &= test(Q); Passed &= test(Q); Passed &= test(Q); + + Passed &= test(Q); + Passed &= test(Q); + Passed &= test(Q); + Passed &= test(Q); + Passed &= test(Q); + Passed &= test(Q); return Passed; } diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp index 03ef9bc46c483..e34f259c093ec 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp @@ -508,7 +508,7 @@ bool testSLM(queue Q, uint32_t MaskStride, PropertiesT) { uint32_t GlobalElemOffset = GlobalID * N; uint32_t LocalElemOffset = LocalID * N; - // Allocate a bit more to safely initialize it with 4-element chunks. + // Allocate a bit more to safely initialize it with 8-element chunks. constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T); slm_init(); @@ -761,3 +761,254 @@ template bool testACC(queue Q) { } return Passed; } + +template +bool testLACC(queue Q, uint32_t MaskStride, PropertiesT) { + + static_assert(VS > 0 && N % VS == 0, + "Incorrect VS parameter. N must be divisible by VS."); + constexpr int NOffsets = N / VS; + static_assert(!UsePassThru || UseMask, + "PassThru cannot be used without using mask"); + + constexpr uint32_t Groups = 8; + constexpr uint32_t Threads = 16; + + std::cout << "Running case: T=" << esimd_test::type_name() << ", N=" << N + << ", VS=" << VS << ", MaskStride=" << MaskStride + << ", Groups=" << Groups << ", Threads=" << Threads + << ", use_mask=" << UseMask << ", use_pass_thru=" << UsePassThru + << ", use_properties=" << UseProperties << std::endl; + + uint16_t Size = Groups * Threads * N; + using Tuint = esimd_test::uint_type_t; + + sycl::range<1> GlobalRange{Groups}; + sycl::range<1> LocalRange{Threads}; + sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange}; + + T *Out = sycl::malloc_shared(Size, Q); + std::memset(Out, 0, Size * sizeof(T)); + + T *In = sycl::malloc_shared(Size * 2, Q); + for (int I = 0; I < Size; I++) + In[I] = esimd_test::getRandomValue(); + + try { + buffer InBuf(In, Size * 2); + Q.submit([&](handler &CGH) { + // Allocate a bit more to safely initialize it with 8-element chunks. + constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T); + + auto InAcc = local_accessor(SLMSize, CGH); + + CGH.parallel_for(Range, [=](sycl::nd_item<1> NDI) SYCL_ESIMD_KERNEL { + uint16_t GlobalID = NDI.get_global_id(0); + uint16_t LocalID = NDI.get_local_id(0); + uint32_t GlobalElemOffset = GlobalID * N; + uint32_t LocalElemOffset = LocalID * N; + + if (LocalID == 0) { + for (int I = 0; I < Threads * N; I += 8) { + simd InVec(In + GlobalElemOffset + I); + simd Offsets(I * sizeof(T), sizeof(T)); + scatter(InAcc, Offsets, InVec); + } + } + barrier(); + PropertiesT Props{}; + + simd ByteOffsets(LocalElemOffset * sizeof(T), + VS * sizeof(T)); + simd_view ByteOffsetsView = ByteOffsets.template select(); + + simd_mask Pred; + for (int I = 0; I < NOffsets; I++) + Pred[I] = (I % MaskStride == 0) ? 1 : 0; + + using Tuint = esimd_test::uint_type_t; + simd PassThruInt(GlobalElemOffset, 1); + simd PassThru = PassThruInt.template bit_cast_view(); + auto PassThruView = PassThru.template select(0); + + simd Vals; + if constexpr (VS > 1) { // VS > 1 requires specifying + if constexpr (UsePassThru) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd + Vals = gather(InAcc, ByteOffsets, Pred, PassThru, + Props); + else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view + Vals = gather(InAcc, ByteOffsets, Pred, PassThruView, + Props); + else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd + Vals = gather(InAcc, ByteOffsetsView, Pred, PassThru, + Props); + else // ByteOffset - view, PassThru - view + Vals = gather(InAcc, ByteOffsetsView, Pred, + PassThruView, Props); + } else { // UseProperties is false + if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd + Vals = gather(InAcc, ByteOffsets, Pred, PassThru); + else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view + Vals = + gather(InAcc, ByteOffsets, Pred, PassThruView); + else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd + Vals = + gather(InAcc, ByteOffsetsView, Pred, PassThru); + else // ByteOffset - view, PassThru - view + Vals = gather(InAcc, ByteOffsetsView, Pred, + PassThruView); + } + } else if constexpr (UseMask) { // UsePassThru is false + if constexpr (UseProperties) { + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets, Pred, Props); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView, Pred, Props); + } else { // UseProperties is false + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets, Pred); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView, Pred); + } + } else { // UseMask is false, UsePassThru is false + if constexpr (UseProperties) { + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets, Props); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView, Props); + } else { // UseProperties is false + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView); + } + } + } else { + // if (VS == 1) then can often be omitted - test it here. + // The variants accepting simd_view for 'PassThru' operand though + // still require to be specified explicitly to help + // C++ FE do simd to simd_view matching. + if constexpr (UsePassThru) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd + Vals = gather(InAcc, ByteOffsets, Pred, PassThru, Props); + else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view + Vals = gather(InAcc, ByteOffsets, Pred, PassThruView, + Props); + else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd + Vals = gather(InAcc, ByteOffsetsView, Pred, PassThru, Props); + else // ByteOffset - view, PassThru - view + Vals = gather(InAcc, ByteOffsetsView, Pred, PassThruView, + Props); + } else { // UseProperties is false + if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd + Vals = gather(InAcc, ByteOffsets, Pred, PassThru); + else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view + Vals = gather(InAcc, ByteOffsets, Pred, PassThruView); + else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd + Vals = gather(InAcc, ByteOffsetsView, Pred, PassThru); + else // ByteOffset - view, PassThru - view + Vals = + gather(InAcc, ByteOffsetsView, Pred, PassThruView); + } + } else if constexpr (UseMask) { // UsePassThru is false + if constexpr (UseProperties) { + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets, Pred, Props); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView, Pred, Props); + } else { // UseProperties is false + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets, Pred); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView, Pred); + } + } else { // UsePassThru is false, UseMask is false + if constexpr (UseProperties) { + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets, Props); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView, Props); + } else { + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView); + } + } + } // end if (VS == 1) + Vals.copy_to(Out + GlobalID * N); + }); + }).wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + sycl::free(In, Q); + sycl::free(Out, Q); + return false; + } + + bool Passed = verify(In, Out, N, Size, VS, MaskStride, UseMask, UsePassThru); + if (!Passed) + std::cout << "Case FAILED" << std::endl; + + sycl::free(In, Q); + sycl::free(Out, Q); + return Passed; +} + +template bool testLACC(queue Q) { + constexpr bool UseMask = true; + constexpr bool UsePassThru = true; + constexpr bool UseProperties = true; + + properties AlignElemProps{alignment}; + + bool Passed = true; + Passed &= testLACC( + Q, 2, AlignElemProps); + Passed &= testLACC( + Q, 2, AlignElemProps); + Passed &= testLACC( + Q, 2, AlignElemProps); + Passed &= testLACC( + Q, 3, AlignElemProps); + Passed &= testLACC( + Q, 2, AlignElemProps); + Passed &= testLACC( + Q, 3, AlignElemProps); + + if constexpr (Features == TestFeatures::PVC || + Features == TestFeatures::DG2) { + properties LSCProps{alignment}; + Passed &= testLACC( + Q, 2, LSCProps); + Passed &= + testLACC(Q, 2, LSCProps); + Passed &= + testLACC(Q, 2, LSCProps); + Passed &= + testLACC(Q, 3, LSCProps); + + Passed &= + testLACC(Q, 2, LSCProps); + + // Check VS > 1. GPU supports only dwords and qwords in this mode. + if constexpr (sizeof(T) >= 4) { + // TODO: This test case causes flaky fail. Enable it after the issue + // in GPU driver is fixed. + // Passed &= testACC( + // Q, 3, AlignElemProps); + + Passed &= testLACC( + Q, 3, AlignElemProps); + Passed &= testLACC( + Q, 3, AlignElemProps); + Passed &= testLACC( + Q, 3, AlignElemProps); + } + } + return Passed; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc.cpp new file mode 100644 index 0000000000000..329c3aa977286 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc.cpp @@ -0,0 +1,37 @@ +//==------- gather_lacc.cpp - DPC++ ESIMD on-device test ------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES-INTEL-DRIVER: lin: 26690, win: 101.4576 +// Use per-kernel compilation to have more information about failing cases. +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::gather() functions accepting Local ACCESSOR +// and optional compile-time esimd::properties. +// The gather() calls in this test do not use VS > 1 (number of loads per +// offset) to not impose using DG2/PVC features. + +#include "Inputs/gather.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::Generic; + bool Passed = true; + + Passed &= testLACC(Q); + Passed &= testLACC(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testLACC(Q); + Passed &= testLACC(Q); + Passed &= testLACC(Q); + Passed &= + testLACC(Q); + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc_dg2_pvc.cpp new file mode 100644 index 0000000000000..ce13ee56f367e --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc_dg2_pvc.cpp @@ -0,0 +1,40 @@ +//==------- gather_lacc_dg2_pvc.cpp - DPC++ ESIMD on-device test ----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: gpu-intel-dg2 || gpu-intel-pvc +// REQUIRES-INTEL-DRIVER: lin: 26690, win: 101.4576 + +// Use per-kernel compilation to have more information about failing cases. +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::gather() functions accepting Local ACCESSOR +// and optional compile-time esimd::properties. +// The gather() calls in this test can use VS > 1 (number of loads per offset). + +#include "Inputs/gather.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::DG2; + bool Passed = true; + + Passed &= testLACC(Q); + Passed &= testLACC(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testLACC(Q); + Passed &= testLACC(Q); + Passed &= testLACC(Q); + Passed &= + testLACC(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test-e2e/KernelAndProgram/test_cache_jit_aot.cpp b/sycl/test-e2e/KernelAndProgram/test_cache_jit_aot.cpp new file mode 100644 index 0000000000000..856d1510edfc5 --- /dev/null +++ b/sycl/test-e2e/KernelAndProgram/test_cache_jit_aot.cpp @@ -0,0 +1,89 @@ +// Don't use normal %{run} as we need to control cache directory removal and +// cannot do that reliably when number of devices is unknown. +// +// REQUIRES: level_zero, ocloc +// +// DEFINE: %{cache_vars} = env SYCL_CACHE_PERSISTENT=1 SYCL_CACHE_TRACE=1 SYCL_CACHE_DIR=%t/cache_dir +// DEFINE: %{build_cmd} = %{build} +// RUN: mkdir -p %t/cache_dir +// +// The following block of code should be copy-pasted as-is to verify different +// JIT/AOT options. Don't know how to avoid code duplication. +// ****************************** +// Check the logs first. +// RUN: %{build_cmd} -DVALUE=1 -o %t.out +// RUN: rm -rf %t/cache_dir/* +// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s %if windows %{ --check-prefixes=CHECK,CHECK-WIN %} +// +// Now try to substitute the cached image and verify it is actually taken and +// the code/binary there is executed. +// RUN: mv %t/cache_dir/*/*/*/*/*.bin %t.value1.bin +// RUN: rm -rf %t/cache_dir/* +// RUN: %{build_cmd} -DVALUE=2 -o %t.out +// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes RESULT2 +// RUN: mv %t.value1.bin %t/cache_dir/*/*/*/*/*.bin +// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes RESULT1 +// ****************************** +// +// REDEFINE: %{build_cmd} = %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device acm-g10" %s +// ****************************** +// Check the logs first. +// RUN: %{build_cmd} -DVALUE=1 -o %t.out +// RUN: rm -rf %t/cache_dir/* +// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s %if windows %{ --check-prefixes=CHECK,CHECK-WIN %} +// +// Now try to substitute the cached image and verify it is actually taken and +// the code/binary there is executed. +// RUN: mv %t/cache_dir/*/*/*/*/*.bin %t.value1.bin +// RUN: rm -rf %t/cache_dir/* +// RUN: %{build_cmd} -DVALUE=2 -o %t.out +// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes RESULT2 +// RUN: mv %t.value1.bin %t/cache_dir/*/*/*/*/*.bin +// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes RESULT1 +// ****************************** +// +// REDEFINE: %{build_cmd} = %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device acm-g10,acm-g11" %s +// ****************************** +// Check the logs first. +// RUN: %{build_cmd} -DVALUE=1 -o %t.out +// RUN: rm -rf %t/cache_dir/* +// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s %if windows %{ --check-prefixes=CHECK,CHECK-WIN %} +// +// Now try to substitute the cached image and verify it is actually taken and +// the code/binary there is executed. +// RUN: mv %t/cache_dir/*/*/*/*/*.bin %t.value1.bin +// RUN: rm -rf %t/cache_dir/* +// RUN: %{build_cmd} -DVALUE=2 -o %t.out +// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes RESULT2 +// RUN: mv %t.value1.bin %t/cache_dir/*/*/*/*/*.bin +// RUN: %{cache_vars} %{run-unfiltered-devices} %t.out 2>&1 | FileCheck %s --check-prefixes RESULT1 +// ****************************** + +// CHECK: Code caching: device binary has been cached: [[BIN_FILE:.*]] +// CHECK-WIN: Code caching: using cached device binary: [[BIN_FILE]] +// CHECK-WIN: Code caching: using cached device binary: [[BIN_FILE]] + +// RESULT1: Result (0): 1 +// RESULT1: Result (1): 1 +// RESULT1: Result (2): 1 + +// RESULT2: Result (0): 2 +// RESULT2: Result (1): 2 +// RESULT2: Result (2): 2 + +#include + +int main() { + for (int i = 0; i < 3; ++i) { + sycl::buffer b{1}; + sycl::queue{} + .submit([&](sycl::handler &cgh) { + sycl::accessor acc{b, cgh}; + cgh.single_task([=]() { acc[0] = VALUE; }); + }) + .wait(); + std::cout << "Result (" << i << "): " << sycl::host_accessor{b}[0] + << std::endl; + } + return 0; +} diff --git a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp b/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp index 3e2a3932224ed..120679246c64c 100644 --- a/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp +++ b/sycl/test-e2e/KernelFusion/Reduction/reduction.cpp @@ -1,6 +1,9 @@ // RUN: %{build} -fsycl-embed-ir -o %t.out // RUN: %{run} %t.out -// XFAIL: cpu +// +// The test fails on opencl:cpu when running on AMD runner and passes when +// running on Intel Arc GPU runner. +// UNSUPPORTED: cpu // Test fusion works with reductions. diff --git a/sycl/test/esimd/ctor_codegen.cpp b/sycl/test/esimd/ctor_codegen.cpp index cf86da2a5aad5..9e75e76fdf972 100644 --- a/sycl/test/esimd/ctor_codegen.cpp +++ b/sycl/test/esimd/ctor_codegen.cpp @@ -24,22 +24,62 @@ SYCL_EXTERNAL auto foo(double i) SYCL_ESIMD_FUNCTION { // CHECK-NEXT: } } -// Base + step constructor, FP element type, loops exected - don't check. -SYCL_EXTERNAL auto bar() SYCL_ESIMD_FUNCTION { - simd val(17, 3); - return val; +// Const base + step constructor, FP element type. +SYCL_EXTERNAL auto double_base_step_const() SYCL_ESIMD_FUNCTION { + // CHECK: define dso_local spir_func void @_Z22double_base_step_constv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { + return simd{1.0, 3.0}; + // CHECK: store <64 x double> , ptr addrspace(4) %[[RES]] + // CHECK-NEXT: ret void +} + +// Variable base + step constructor, FP element type. +SYCL_EXTERNAL auto double_base_step_var(double base, double step) SYCL_ESIMD_FUNCTION { + // CHECK: define dso_local spir_func void @_Z20double_base_step_vardd({{.*}} %[[RES:[a-zA-Z0-9_\.]+]], double noundef %[[BASE:[a-zA-Z0-9_\.]+]], double noundef %[[STEP:[a-zA-Z0-9_\.]+]]){{.*}} { + return simd{base, step}; + // CHECK: %[[BASE_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x double> poison, double %[[BASE]], i64 0 + // CHECK: %[[BASE_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x double> %[[BASE_VEC_TMP]], <32 x double> poison, <32 x i32> zeroinitializer + // CHECK: %[[STEP_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x double> poison, double %[[STEP]], i64 0 + // CHECK: %[[STEP_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x double> %[[STEP_VEC_TMP]], <32 x double> poison, <32 x i32> zeroinitializer + // CHECK: %[[FMA_VEC:[a-zA-Z0-9_\.]+]] = tail call noundef <32 x double> @llvm.fmuladd.v32f64(<32 x double> %[[STEP_VEC]], <32 x double> , <32 x double> %[[BASE_VEC]]) + // CHECK: store <32 x double> %[[FMA_VEC]], ptr addrspace(4) %[[RES]] + // CHECK-NEXT: ret void } -// Base + step constructor, integer element type, no loops exected - check. -SYCL_EXTERNAL auto baz() SYCL_ESIMD_FUNCTION { - // CHECK: define dso_local spir_func void @_Z3bazv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { - simd val(17, 3); +// Const base + step constructor, integer element type. +SYCL_EXTERNAL auto int_base_step_const() SYCL_ESIMD_FUNCTION { + // CHECK: define dso_local spir_func void @_Z19int_base_step_constv({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { + simd val(17, 3); return val; - // CHECK: store <2 x i32> , ptr addrspace(4) %[[RES]] + // CHECK: store <16 x i32> , ptr addrspace(4) %[[RES]] // CHECK-NEXT: ret void // CHECK-NEXT: } } +// Variable base + step constructor, integer element type. +SYCL_EXTERNAL auto int_base_step_var(int base, int step) SYCL_ESIMD_FUNCTION { + // CHECK: define dso_local spir_func void @_Z17int_base_step_varii({{.*}} %[[RES:[a-zA-Z0-9_\.]+]], i32 noundef %[[BASE:[a-zA-Z0-9_\.]+]], i32 noundef %[[STEP:[a-zA-Z0-9_\.]+]]){{.*}} { + return simd{base, step}; + // CHECK: %[[BASE_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x i32> poison, i32 %[[BASE]], i64 0 + // CHECK: %[[BASE_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x i32> %[[BASE_VEC_TMP]], <32 x i32> poison, <32 x i32> zeroinitializer + // CHECK: %[[STEP_VEC_TMP:[a-zA-Z0-9_\.]+]] = insertelement <32 x i32> poison, i32 %[[STEP]], i64 0 + // CHECK: %[[STEP_VEC:[a-zA-Z0-9_\.]+]] = shufflevector <32 x i32> %[[STEP_VEC_TMP]], <32 x i32> poison, <32 x i32> zeroinitializer + // CHECK: %[[MUL_VEC:[a-zA-Z0-9_\.]+]] = mul <32 x i32> %[[STEP_VEC]], + // CHECK: %[[ADD_VEC:[a-zA-Z0-9_\.]+]] = add <32 x i32> %[[BASE_VEC]], %[[MUL_VEC]] + // CHECK: store <32 x i32> %[[ADD_VEC]], ptr addrspace(4) %[[RES]] + // CHECK-NEXT: ret void +} + +// Variable base + step constructor, integer element type. +SYCL_EXTERNAL auto int_base_step_var_n2(int base, int step) SYCL_ESIMD_FUNCTION { + // CHECK: define dso_local spir_func void @_Z20int_base_step_var_n2ii({{.*}} %[[RES:[a-zA-Z0-9_\.]+]], i32 noundef %[[BASE:[a-zA-Z0-9_\.]+]], i32 noundef %[[STEP:[a-zA-Z0-9_\.]+]]){{.*}} { + return simd{base, step}; + // CHECK: %[[BASE_VEC_TMP1:[a-zA-Z0-9_\.]+]] = insertelement <2 x i32> poison, i32 %[[BASE]], i64 0 + // CHECK: %[[BASE_INC:[a-zA-Z0-9_\.]+]] = add nsw i32 %[[BASE]], %[[STEP]] + // CHECK: %[[RESULT_VEC:[a-zA-Z0-9_\.]+]] = insertelement <2 x i32> %[[BASE_VEC_TMP1]], i32 %[[BASE_INC]], i64 1 + // CHECK: store <2 x i32> %[[RESULT_VEC]], ptr addrspace(4) %[[RES]] + // CHECK-NEXT: ret void +} + // Broadcast constructor, FP element type, no loops exected - check. SYCL_EXTERNAL auto gee() SYCL_ESIMD_FUNCTION { // CHECK: define dso_local spir_func void @_Z3geev({{.*}} %[[RES:[a-zA-Z0-9_\.]+]]){{.*}} { diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index 31dbc3e889f4b..3f653adb5e476 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -31,7 +31,7 @@ test_block_store(AccType &, LocalAccType &local_acc, float *, int byte_offset32, size_t byte_offset64); SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void -test_gather_scatter(AccType &, float *, int byte_offset32, +test_gather_scatter(AccType &, LocalAccType &, float *, int byte_offset32, size_t byte_offset64); SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void test_slm_gather_scatter(int byte_offset32); @@ -48,7 +48,7 @@ class EsimdFunctor { test_block_load(acc, local_acc, ptr, byte_offset32, byte_offset64); test_atomic_update(acc, local_acc_int, ptr, byte_offset32, byte_offset64); test_block_store(acc, local_acc, ptr, byte_offset32, byte_offset64); - test_gather_scatter(acc, ptr, byte_offset32, byte_offset64); + test_gather_scatter(acc, local_acc, ptr, byte_offset32, byte_offset64); test_slm_gather_scatter(byte_offset32); } }; @@ -939,8 +939,8 @@ test_block_store(AccType &acc, LocalAccType &local_acc, float *ptrf, // CHECK-LABEL: define {{.*}} @_Z19test_gather_scatter{{.*}} SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void -test_gather_scatter(AccType &acc, float *ptrf, int byte_offset32, - size_t byte_offset64) { +test_gather_scatter(AccType &acc, LocalAccType &local_acc, float *ptrf, + int byte_offset32, size_t byte_offset64) { properties props_cache_load{cache_hint_L1, cache_hint_L2, alignment<8>}; @@ -980,6 +980,10 @@ test_gather_scatter(AccType &acc, float *ptrf, int byte_offset32, // 6) gather(acc, offsets, mask): offsets is simd or simd_view // 7) gather(acc, offsets, mask, pass_thru) // 8) gather(acc, ...): same as (5), (6), (7) above, but with VS > 1. + // 9) gather(lacc, offsets): offsets is simd or simd_view + // 10) gather(lacc, offsets, mask): offsets is simd or simd_view + // 11) gather(lacc, offsets, mask, pass_thru) + // 12) gather(lacc, ...): same as (9), (10), (11) above, but with VS > 1. // 1) gather(usm, offsets): offsets is simd or simd_view // CHECK-COUNT-4: call <32 x float> @llvm.masked.gather.v32f32.v32p4(<32 x ptr addrspace(4)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}, <32 x float> {{[^)]+}}) @@ -1154,6 +1158,72 @@ test_gather_scatter(AccType &acc, float *ptrf, int byte_offset32, acc_res = gather(acc, ioffset_n16_view, mask_n16, pass_thru_view, props_align4); + // 9) gather(lacc, offsets): offsets is simd or simd_view + // CHECK-COUNT-16: call <32 x float> @llvm.masked.gather.v32f32.v32p3(<32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}, <32 x float> {{[^)]+}}) + acc_res = gather(local_acc, ioffset_n32); + acc_res = gather(local_acc, ioffset_n32_view); + acc_res = gather(local_acc, ioffset_n32, props_align4); + acc_res = gather(local_acc, ioffset_n32_view, props_align4); + + // 10) gather(lacc, offsets, mask): offsets is simd or simd_view + acc_res = gather(local_acc, ioffset_n32, mask_n32); + acc_res = gather(local_acc, ioffset_n32_view, mask_n32); + acc_res = gather(local_acc, ioffset_n32, mask_n32, props_align4); + acc_res = + gather(local_acc, ioffset_n32_view, mask_n32, props_align4); + + // 11) gather(lacc, offsets, mask, pass_thru) + acc_res = gather(local_acc, ioffset_n32, mask_n32, pass_thru); + acc_res = gather(local_acc, ioffset_n32_view, mask_n32, pass_thru); + acc_res = + gather(local_acc, ioffset_n32, mask_n32, pass_thru, props_align4); + acc_res = gather(local_acc, ioffset_n32_view, mask_n32, pass_thru, + props_align4); + + acc_res = gather(local_acc, ioffset_n32, mask_n32, pass_thru_view); + acc_res = + gather(local_acc, ioffset_n32_view, mask_n32, pass_thru_view); + acc_res = gather(local_acc, ioffset_n32, mask_n32, pass_thru_view, + props_align4); + acc_res = gather(local_acc, ioffset_n32_view, mask_n32, + pass_thru_view, props_align4); + + // 12) gather(lacc, ...): same as (9), (10), (11) above, but with VS > 1. + // CHECK-COUNT-16: call <32 x i32> @llvm.genx.lsc.load.merge.slm.v32i32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, i32 0, <32 x i32> {{[^)]+}}) + acc_res = gather(local_acc, ioffset_n16); + acc_res = gather(local_acc, ioffset_n16_view); + acc_res = gather(local_acc, ioffset_n16, props_align4); + acc_res = gather(local_acc, ioffset_n16_view, props_align4); + + acc_res = gather(local_acc, ioffset_n16, mask_n16); + acc_res = gather(local_acc, ioffset_n16_view, mask_n16); + acc_res = + gather(local_acc, ioffset_n16, mask_n16, props_align4); + acc_res = + gather(local_acc, ioffset_n16_view, mask_n16, props_align4); + + acc_res = gather(local_acc, ioffset_n16, mask_n16, pass_thru); + acc_res = + gather(local_acc, ioffset_n16_view, mask_n16, pass_thru); + acc_res = gather(local_acc, ioffset_n16, mask_n16, pass_thru, + props_align4); + acc_res = gather(local_acc, ioffset_n16_view, mask_n16, + pass_thru, props_align4); + + acc_res = + gather(local_acc, ioffset_n16, mask_n16, pass_thru_view); + acc_res = gather(local_acc, ioffset_n16_view, mask_n16, + pass_thru_view); + acc_res = gather(local_acc, ioffset_n16, mask_n16, + pass_thru_view, props_align4); + acc_res = gather(local_acc, ioffset_n16_view, mask_n16, + pass_thru_view, props_align4); + + // Validate that a new API doesn't conflict with the old API. + // CHECK-COUNT-2: call <32 x float> @llvm.masked.gather.v32f32.v32p3(<32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}, <32 x float> {{[^)]+}}) + acc_res = gather(local_acc, ioffset_n32, 0); + acc_res = gather(local_acc, ioffset_n32, 0, mask_n32); + // CHECK-COUNT-4: call void @llvm.genx.svm.scatter.v32i1.v32i64.v32f32(<32 x i1> {{[^)]+}}, i32 0, <32 x i64> {{[^)]+}}, <32 x float> {{[^)]+}}) scatter(ptrf, ioffset_n32, usm, mask_n32);