diff --git a/sycl/include/sycl/builtins.hpp b/sycl/include/sycl/builtins.hpp index f35e03a5e2f00..4edeeb8a73878 100644 --- a/sycl/include/sycl/builtins.hpp +++ b/sycl/include/sycl/builtins.hpp @@ -12,10 +12,7 @@ #ifdef __INTEL_PREVIEW_BREAKING_CHANGES -// Include the generated builtins. -#include -#include -#include +#include #else // __INTEL_PREVIEW_BREAKING_CHANGES diff --git a/sycl/include/sycl/builtins_preview.hpp b/sycl/include/sycl/builtins_preview.hpp new file mode 100644 index 0000000000000..bc497c540693b --- /dev/null +++ b/sycl/include/sycl/builtins_preview.hpp @@ -0,0 +1,270 @@ +//==------------------- builtins_preview.hpp -------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Implement SYCL builtin functions. This implementation is mainly driven by the +// requirement of not including anywhere in the SYCL headers (i.e. from +// within ), because it pollutes global namespace. Note that we +// can avoid that using MSVC's STL as the pollution happens even from +// / and other headers that have to be included per the SYCL +// specification. As such, an alternative approach might be to use math +// intrinsics with GCC/clang-based compilers and use when using MSVC as +// a host compiler. That hasn't been tried/investigated. +// +// Current implementation splits builtins into several files following the SYCL +// 2020 (revision 8) split into common/math/geometric/relational/etc. functions. +// For each set, the implementation is split into a user-visible +// include/sycl/detail/builtins/*_functions.hpp providing full device-side +// implementation as well as defining user-visible APIs and defining ABI +// implemented under source/builtins/*_functions.cpp for the host side. We +// provide both scalar/vector overloads through symbols in the SYCL runtime +// library due to the limitation above (for scalars) and due to +// performance reasons for vector overloads (to be able to benefit from +// vectorization). +// +// Providing declaration for the host side symbols contained in the library +// comes with its own challenges. One is compilation time - blindly providing +// all those declarations takes significant time (about 10% slowdown for +// "clang++ -fsycl" when compiling just "#include "). Another +// issue is that return type for templates is part of the mangling (and as such +// SFINAE requirements too). To overcome that we structure host side +// implementation roughly like this (in most cases): +// +// math_function.cpp exports: +// float sycl::__sin_impl(float); +// float1 sycl::__sin_impl(float1); +// float2 sycl::__sin_impl(float2); +// ... +// /* same for other types */ +// +// math_functions.hpp provide an implementation based on the following idea (in +// ::sycl namespace): +// float sin(float x) { +// extern __sin_impl(float); +// return __sin_impl(x); +// } +// template +// enable_if_valid_type sin(T x) { +// if constexpr (marray_or_swizzle) { +// ... +// call sycl::sin(vector_or_scalar) +// } else { +// extern T __sin_impl(T); +// return __sin_impl(x); +// } +// } +// That way we avoid having the full set of explicit declaration for the symbols +// in the library and instead only pay with compile time when those template +// instantiations actually happen. + +#pragma once + +#include + +namespace sycl { +inline namespace _V1 { +namespace detail { +template +inline constexpr bool builtin_same_shape_v = + ((... && is_scalar_arithmetic_v) || (... && is_marray_v) || + (... && is_vec_or_swizzle_v)) && + (... && (num_elements::value == + num_elements::type>::value)); + +template +inline constexpr bool builtin_same_or_swizzle_v = + // Use builtin_same_shape_v to filter out types unrelated to builtins. + builtin_same_shape_v && all_same_v...>; + +namespace builtins { +#ifdef __SYCL_DEVICE_ONLY__ +template auto convert_arg(T &&x) { + using no_cv_ref = std::remove_cv_t>; + if constexpr (is_vec_v) { + using elem_type = get_elem_type_t; + using converted_elem_type = + decltype(convert_arg(std::declval())); + + constexpr auto N = no_cv_ref::size(); + using result_type = std::conditional_t; + // TODO: We should have this bit_cast impl inside vec::convert. + return bit_cast(static_cast(x)); + } else if constexpr (std::is_same_v) + return static_cast(x); + else if constexpr (is_multi_ptr_v) { + return convert_arg(x.get_decorated()); + } else if constexpr (is_scalar_arithmetic_v) { + // E.g. on linux: long long -> int64_t (long), or char -> int8_t (signed + // char) and same for unsigned; Windows has long/long long reversed. + // TODO: Inline this scalar impl. + return static_cast>(x); + } else if constexpr (std::is_pointer_v) { + using elem_type = remove_decoration_t>; + using converted_elem_type = + decltype(convert_arg(std::declval())); + using result_type = + typename DecoratedType::value>::type *; + return reinterpret_cast(x); + } else if constexpr (is_swizzle_v) { + return convert_arg(simplify_if_swizzle_t{x}); + } else { + // TODO: should it be unreachable? What can it be? + return std::forward(x); + } +} + +template auto convert_result(T &&x) { + if constexpr (is_vec_v) { + return bit_cast(x); + } else { + return std::forward(x); + } +} +#endif +} // namespace builtins + +template +auto builtin_marray_impl(FuncTy F, const Ts &...x) { + using ret_elem_type = decltype(F(x[0]...)); + using T = typename first_type::type; + marray Res; + constexpr auto N = T::size(); + for (size_t I = 0; I < N / 2; ++I) { + auto PartialRes = F(to_vec2(x, I * 2)...); + std::memcpy(&Res[I * 2], &PartialRes, sizeof(decltype(PartialRes))); + } + if (N % 2) + Res[N - 1] = F(x[N - 1]...); + return Res; +} + +template +auto builtin_default_host_impl(FuncTy F, const Ts &...x) { + // We implement support for marray/swizzle in the headers and export symbols + // for scalars/vector from the library binary. The reason is that scalar + // implementations mostly depend on which pollutes global namespace, + // so we can't unconditionally include it from the SYCL headers. Vector + // overloads have to be implemented in the library next to scalar overloads in + // order to be vectorizable. + if constexpr ((... || is_marray_v)) { + return builtin_marray_impl(F, x...); + } else { + return F(simplify_if_swizzle_t{x}...); + } +} + +template +auto builtin_delegate_to_scalar(FuncTy F, const Ts &...x) { + using T = typename first_type::type; + if constexpr (is_vec_or_swizzle_v) { + using ret_elem_type = decltype(F(x[0]...)); + // TODO: using r{} to avoid Werror. Not sure if ok. + vec r{}; + loop([&](auto idx) { r[idx] = F(x[idx]...); }); + return r; + } else { + static_assert(is_marray_v); + return builtin_marray_impl(F, x...); + } +} + +template +struct any_elem_type + : std::bool_constant, float, double, half, char, signed char, short, + int, long, long long, unsigned char, unsigned short, unsigned int, + unsigned long, unsigned long long>> {}; +template +struct fp_elem_type + : std::bool_constant< + check_type_in_v, float, double, half>> {}; +template +struct float_elem_type + : std::bool_constant, float>> {}; +template +struct integer_elem_type + : std::bool_constant< + check_type_in_v, char, signed char, short, int, + long, long long, unsigned char, unsigned short, + unsigned int, unsigned long, unsigned long long>> {}; +template +struct suint32_elem_type + : std::bool_constant< + check_type_in_v, int32_t, uint32_t>> {}; + +template +struct same_basic_shape : std::bool_constant> {}; + +template +struct same_elem_type : std::bool_constant::value && + all_same_v...>> { +}; + +template struct any_shape : std::true_type {}; + +template +struct scalar_only : std::bool_constant> {}; + +template +struct non_scalar_only : std::bool_constant> {}; + +template struct default_ret_type { + using type = T; +}; + +template struct scalar_ret_type { + using type = get_elem_type_t; +}; + +template