diff --git a/sycl/include/sycl/builtins.hpp b/sycl/include/sycl/builtins.hpp
index f35e03a5e2f00..4edeeb8a73878 100644
--- a/sycl/include/sycl/builtins.hpp
+++ b/sycl/include/sycl/builtins.hpp
@@ -12,10 +12,7 @@
 
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
 
-// Include the generated builtins.
-#include <sycl/builtins_marray_gen.hpp>
-#include <sycl/builtins_scalar_gen.hpp>
-#include <sycl/builtins_vector_gen.hpp>
+#include <sycl/builtins_preview.hpp>
 
 #else // __INTEL_PREVIEW_BREAKING_CHANGES
 
diff --git a/sycl/include/sycl/builtins_preview.hpp b/sycl/include/sycl/builtins_preview.hpp
new file mode 100644
index 0000000000000..bc497c540693b
--- /dev/null
+++ b/sycl/include/sycl/builtins_preview.hpp
@@ -0,0 +1,270 @@
+//==------------------- builtins_preview.hpp -------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Implement SYCL builtin functions. This implementation is mainly driven by the
+// requirement of not including <cmath> anywhere in the SYCL headers (i.e. from
+// within <sycl/sycl.hpp>), because it pollutes global namespace. Note that we
+// can't avoid that using MSVC's STL as the pollution happens even from
+// <vector>/<string> and other headers that have to be included per the SYCL
+// specification. As such, an alternative approach might be to use math
+// intrinsics with GCC/clang-based compilers and use <cmath> when using MSVC as
+// a host compiler. That hasn't been tried/investigated.
+//
+// Current implementation splits builtins into several files following the SYCL
+// 2020 (revision 8) split into common/math/geometric/relational/etc. functions.
+// For each set, the implementation is split into a user-visible
+// include/sycl/detail/builtins/*_functions.inc providing full device-side
+// implementation as well as defining user-visible APIs and defining ABI
+// implemented under source/builtins/*_functions.cpp for the host side. We
+// provide both scalar/vector overloads through symbols in the SYCL runtime
+// library due to the <cmath> limitation above (for scalars) and due to
+// performance reasons for vector overloads (to be able to benefit from
+// vectorization).
+//
+// Providing declaration for the host side symbols contained in the library
+// comes with its own challenges. One is compilation time - blindly providing
+// all those declarations takes significant time (about 10% slowdown for
+// "clang++ -fsycl" when compiling just "#include <sycl/sycl.hpp>"). Another
+// issue is that return type for templates is part of the mangling (and as such
+// SFINAE requirements too). To overcome that we structure host side
+// implementation roughly like this (in most cases):
+//
+// math_functions.cpp exports:
+//   float sycl::__sin_impl(float);
+//   float1 sycl::__sin_impl(float1);
+//   float2 sycl::__sin_impl(float2);
+//   ...
+//   /* same for other types */
+//
+// math_functions.inc provides an implementation based on the following idea
+// (in ::sycl namespace):
+//   float sin(float x) {
+//     extern float __sin_impl(float);
+//     return __sin_impl(x);
+//   }
+//   template <typename T>
+//   enable_if_valid_type<T> sin(T x) {
+//     if constexpr (marray_or_swizzle) {
+//       ...
+//       call sycl::sin(vector_or_scalar)
+//     } else {
+//       extern T __sin_impl(T);
+//       return __sin_impl(x);
+//     }
+//   }
+// That way we avoid having the full set of explicit declaration for the symbols
+// in the library and instead only pay with compile time when those template
+// instantiations actually happen.
+
+#pragma once
+
+#include <sycl/builtins_utils_vec.hpp>
+
+namespace sycl {
+inline namespace _V1 {
+namespace detail {
+template <typename... Ts>
+inline constexpr bool builtin_same_shape_v =
+    ((... && is_scalar_arithmetic_v<Ts>) || (... && is_marray_v<Ts>) ||
+     (... && is_vec_or_swizzle_v<Ts>)) &&
+    (... && (num_elements<Ts>::value ==
+             num_elements<typename first_type<Ts...>::type>::value));
+
+template <typename... Ts>
+inline constexpr bool builtin_same_or_swizzle_v =
+    // Use builtin_same_shape_v to filter out types unrelated to builtins.
+    builtin_same_shape_v<Ts...> && all_same_v<simplify_if_swizzle_t<Ts>...>;
+
+namespace builtins {
+#ifdef __SYCL_DEVICE_ONLY__
+template <typename T> auto convert_arg(T &&x) { // Dispatches on T sans cv/ref.
+  using no_cv_ref = std::remove_cv_t<std::remove_reference_t<T>>;
+  if constexpr (is_vec_v<no_cv_ref>) {
+    using elem_type = get_elem_type_t<no_cv_ref>;
+    using converted_elem_type =
+        decltype(convert_arg(std::declval<elem_type>()));
+
+    constexpr auto N = no_cv_ref::size(); // N == 1 yields a plain scalar type.
+    using result_type = std::conditional_t<N == 1, converted_elem_type,
+                                           converted_elem_type
+                                           __attribute__((ext_vector_type(N)))>;
+    // TODO: We should have this bit_cast impl inside vec::convert.
+    return bit_cast<result_type>(static_cast<typename no_cv_ref::vector_t>(x));
+  } else if constexpr (std::is_same_v<no_cv_ref, half>)
+    return static_cast<half_impl::BIsRepresentationT>(x);
+  else if constexpr (is_multi_ptr_v<no_cv_ref>) {
+    return convert_arg(x.get_decorated()); // Recurse on x.get_decorated().
+  } else if constexpr (is_scalar_arithmetic_v<no_cv_ref>) {
+    // E.g. on linux: long long -> int64_t (long), or char -> int8_t (signed
+    // char) and same for unsigned; Windows has long/long long reversed.
+    // TODO: Inline this scalar impl.
+    return static_cast<ConvertToOpenCLType_t<no_cv_ref>>(x);
+  } else if constexpr (std::is_pointer_v<no_cv_ref>) {
+    using elem_type = remove_decoration_t<std::remove_pointer_t<no_cv_ref>>;
+    using converted_elem_type =
+        decltype(convert_arg(std::declval<elem_type>()));
+    using result_type = // Converted pointee, decorated with the deduced AS.
+        typename DecoratedType<converted_elem_type,
+                               deduce_AS<no_cv_ref>::value>::type *;
+    return reinterpret_cast<result_type>(x);
+  } else if constexpr (is_swizzle_v<no_cv_ref>) {
+    return convert_arg(simplify_if_swizzle_t<no_cv_ref>{x}); // Then recurse.
+  } else {
+    // TODO: should it be unreachable? What can it be?
+    return std::forward<T>(x);
+  }
+}
+
+template <typename RetTy, typename T> auto convert_result(T &&x) {
+  if constexpr (!is_vec_v<RetTy>) {
+    return std::forward<T>(x); // Non-vec results are forwarded unchanged.
+  } else {
+    return bit_cast<typename RetTy::vector_t>(x); // Reinterpret as vector_t.
+  }
+}
+#endif
+} // namespace builtins
+
+template <typename FuncTy, typename... Ts>
+auto builtin_marray_impl(FuncTy F, const Ts &...x) {
+  // F handles element pairs (via to_vec2); an odd trailing element goes alone.
+  using FirstArgT = typename first_type<Ts...>::type;
+  constexpr auto Size = FirstArgT::size();
+  marray<decltype(F(x[0]...)), Size> Result;
+  for (size_t Off = 0; Off + 1 < Size; Off += 2) {
+    auto Pair = F(to_vec2(x, Off)...);
+    std::memcpy(&Result[Off], &Pair, sizeof(Pair));
+  }
+  if (Size % 2)
+    Result[Size - 1] = F(x[Size - 1]...);
+  return Result;
+}
+
+template <typename FuncTy, typename... Ts>
+auto builtin_default_host_impl(FuncTy F, const Ts &...x) {
+  // Only scalar and vector overloads are exported from the library binary:
+  // scalar implementations mostly rely on <cmath>, which pollutes the global
+  // namespace and so cannot be included unconditionally from the SYCL headers,
+  // and vector overloads must sit next to them in the library to remain
+  // vectorizable. marray and swizzle arguments are therefore handled here, in
+  // the headers, before F is invoked.
+  if constexpr (!(... || is_marray_v<Ts>)) {
+    return F(simplify_if_swizzle_t<Ts>{x}...);
+  } else {
+    return builtin_marray_impl(F, x...);
+  }
+}
+
+template <typename FuncTy, typename... Ts>
+auto builtin_delegate_to_scalar(FuncTy F, const Ts &...x) {
+  using FirstArgT = typename first_type<Ts...>::type;
+  if constexpr (!is_vec_or_swizzle_v<FirstArgT>) {
+    static_assert(is_marray_v<FirstArgT>);
+    return builtin_marray_impl(F, x...);
+  } else {
+    constexpr auto Size = FirstArgT::size();
+    // TODO: using Result{} to avoid Werror. Not sure if ok.
+    vec<decltype(F(x[0]...)), Size> Result{};
+    loop<Size>([&](auto Idx) { Result[Idx] = F(x[Idx]...); });
+    return Result;
+  }
+}
+
+template <typename T> // Element type is one of the FP/integer types listed.
+struct any_elem_type
+    : std::bool_constant<check_type_in_v<
+          get_elem_type_t<T>, float, double, half, char, signed char, short,
+          int, long, long long, unsigned char, unsigned short, unsigned int,
+          unsigned long, unsigned long long>> {};
+template <typename T> // Element type is float, double or half.
+struct fp_elem_type
+    : std::bool_constant<
+          check_type_in_v<get_elem_type_t<T>, float, double, half>> {};
+template <typename T> // Element type is float only.
+struct float_elem_type
+    : std::bool_constant<check_type_in_v<get_elem_type_t<T>, float>> {};
+template <typename T> // Element type is one of the integer types listed.
+struct integer_elem_type
+    : std::bool_constant<
+          check_type_in_v<get_elem_type_t<T>, char, signed char, short, int,
+                          long, long long, unsigned char, unsigned short,
+                          unsigned int, unsigned long, unsigned long long>> {};
+template <typename T> // Element type is int32_t or uint32_t.
+struct suint32_elem_type
+    : std::bool_constant<
+          check_type_in_v<get_elem_type_t<T>, int32_t, uint32_t>> {};
+
+template <typename... Ts> // Type-trait wrapper around builtin_same_shape_v.
+struct same_basic_shape : std::bool_constant<builtin_same_shape_v<Ts...>> {};
+
+template <typename... Ts> // Same basic shape and identical element types.
+struct same_elem_type : std::bool_constant<same_basic_shape<Ts...>::value &&
+                                           all_same_v<get_elem_type_t<Ts>...>> {
+};
+
+template <typename> struct any_shape : std::true_type {}; // Accepts every type.
+
+template <typename T> // Accepts only scalar arithmetic types.
+struct scalar_only : std::bool_constant<is_scalar_arithmetic_v<T>> {};
+
+template <typename T> // Accepts everything except scalar arithmetic types.
+struct non_scalar_only : std::bool_constant<!is_scalar_arithmetic_v<T>> {};
+
+template <typename T> struct default_ret_type { // Return type is T itself.
+  using type = T;
+};
+
+template <typename T> struct scalar_ret_type { // Return type is T's element.
+  using type = get_elem_type_t<T>;
+};
+
+template <template <typename> typename RetTypeTrait, // Yields the return type.
+          template <typename> typename ElemTypeChecker, // Checks the 1st type.
+          template <typename> typename ShapeChecker,    // Checks the 1st type.
+          template <typename...> typename ExtraConditions, typename... Ts>
+struct builtin_enable // enable_if: ::type exists only if all checks pass.
+    : std::enable_if<
+          ElemTypeChecker<typename first_type<Ts...>::type>::value &&
+              ShapeChecker<typename first_type<Ts...>::type>::value &&
+              ExtraConditions<Ts...>::value, // Sees the whole pack.
+          typename RetTypeTrait< // Applied to the swizzle-simplified 1st type.
+              simplify_if_swizzle_t<typename first_type<Ts...>::type>>::type> {
+};
+#define BUILTIN_CREATE_ENABLER(NAME, RET_TYPE_TRAIT, ELEM_TYPE_CHECKER,        \
+                               SHAPE_CHECKER, EXTRA_CONDITIONS)                \
+  namespace detail {                                                           \
+  template <typename... Ts>                                                    \
+  using NAME##_t =                                                             \
+      typename builtin_enable<RET_TYPE_TRAIT, ELEM_TYPE_CHECKER,               \
+                              SHAPE_CHECKER, EXTRA_CONDITIONS, Ts...>::type;   \
+  }
+} // namespace detail
+
+BUILTIN_CREATE_ENABLER(builtin_enable_generic, default_ret_type, any_elem_type,
+                       any_shape, same_elem_type)
+BUILTIN_CREATE_ENABLER(builtin_enable_generic_scalar, default_ret_type,
+                       any_elem_type, scalar_only, same_elem_type)
+BUILTIN_CREATE_ENABLER(builtin_enable_generic_non_scalar, default_ret_type,
+                       any_elem_type, non_scalar_only, same_elem_type)
+} // namespace _V1
+} // namespace sycl
+
+// The headers below are specifically implemented without including all the
+// necessary headers to allow preprocessing them on their own and providing
+// human-friendly result. One can use a command like this to achieve that:
+// clang++ -[DU]__SYCL_DEVICE_ONLY__ -x c++ math_functions.inc  \
+//         -I <..>/llvm/sycl/include -E -o - \
+//     | grep -v '^#' | clang-format > math_functions.{host|device}.ii
+
+#include <sycl/detail/builtins/common_functions.inc>
+#include <sycl/detail/builtins/geometric_functions.inc>
+#include <sycl/detail/builtins/half_precision_math_functions.inc>
+#include <sycl/detail/builtins/integer_functions.inc>
+#include <sycl/detail/builtins/math_functions.inc>
+#include <sycl/detail/builtins/native_math_functions.inc>
+#include <sycl/detail/builtins/relational_functions.inc>
diff --git a/sycl/include/sycl/detail/builtins/common_functions.inc b/sycl/include/sycl/detail/builtins/common_functions.inc
new file mode 100644
index 0000000000000..fb10964934cf0
--- /dev/null
+++ b/sycl/include/sycl/detail/builtins/common_functions.inc
@@ -0,0 +1,103 @@
+//==------------------- common_functions.inc -------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Intentionally insufficient set of includes and no "#pragma once".
+
+#include <sycl/detail/builtins/helper_macros.hpp>
+
+namespace sycl {
+inline namespace _V1 {
+BUILTIN_CREATE_ENABLER(builtin_enable_common, default_ret_type, fp_elem_type,
+                       any_shape, same_elem_type)
+BUILTIN_CREATE_ENABLER(builtin_enable_common_non_scalar, default_ret_type,
+                       fp_elem_type, non_scalar_only, same_elem_type)
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define BUILTIN_COMMON(NUM_ARGS, NAME, SPIRV_IMPL)                             \
+  DEVICE_IMPL_TEMPLATE(NUM_ARGS, NAME, builtin_enable_common_t, SPIRV_IMPL)
+#else
+#define BUILTIN_COMMON(NUM_ARGS, NAME, SPIRV_IMPL)                             \
+  HOST_IMPL_TEMPLATE(NUM_ARGS, NAME, builtin_enable_common_t, common,          \
+                     default_ret_type)
+#endif
+
+BUILTIN_COMMON(ONE_ARG, degrees, __spirv_ocl_degrees)
+BUILTIN_COMMON(ONE_ARG, radians, __spirv_ocl_radians)
+BUILTIN_COMMON(ONE_ARG, sign, __spirv_ocl_sign)
+
+BUILTIN_COMMON(THREE_ARGS, mix, __spirv_ocl_mix)
+template <typename T0, typename T1>
+detail::builtin_enable_common_non_scalar_t<T0, T1>
+mix(T0 x, T1 y, detail::get_elem_type_t<T0> z) {
+  // Scalar z overload: every argument is converted to T0's simplified type.
+  using arg_t = detail::simplify_if_swizzle_t<T0>;
+  return mix(arg_t{x}, arg_t{y}, arg_t{z});
+}
+
+BUILTIN_COMMON(TWO_ARGS, step, __spirv_ocl_step)
+template <typename T>
+detail::builtin_enable_common_non_scalar_t<T> step(detail::get_elem_type_t<T> x,
+                                                   T y) {
+  using arg_t = detail::simplify_if_swizzle_t<T>; // Scalar x becomes arg_t too.
+  return step(arg_t{x}, arg_t{y});
+}
+
+BUILTIN_COMMON(THREE_ARGS, smoothstep, __spirv_ocl_smoothstep)
+template <typename T>
+detail::builtin_enable_common_non_scalar_t<T>
+smoothstep(detail::get_elem_type_t<T> x, detail::get_elem_type_t<T> y, T z) {
+  // Scalar x/y overload: both are converted to the simplified type of z.
+  using arg_t = detail::simplify_if_swizzle_t<T>;
+  return smoothstep(arg_t{x}, arg_t{y}, arg_t{z});
+}
+
+BUILTIN_COMMON(TWO_ARGS, max, __spirv_ocl_fmax_common)
+template <typename T>
+detail::builtin_enable_common_non_scalar_t<T>
+max(T x, detail::get_elem_type_t<T> y) {
+  using arg_t = detail::simplify_if_swizzle_t<T>; // Scalar y becomes arg_t too.
+  return max(arg_t{x}, arg_t{y});
+}
+
+BUILTIN_COMMON(TWO_ARGS, min, __spirv_ocl_fmin_common)
+template <typename T>
+detail::builtin_enable_common_non_scalar_t<T>
+min(T x, detail::get_elem_type_t<T> y) {
+  using arg_t = detail::simplify_if_swizzle_t<T>; // Scalar y becomes arg_t too.
+  return min(arg_t{x}, arg_t{y});
+}
+
+#undef BUILTIN_COMMON
+
+#ifdef __SYCL_DEVICE_ONLY__
+DEVICE_IMPL_TEMPLATE(THREE_ARGS, clamp, builtin_enable_generic_t,
+                     [](auto... xs) {
+                       using ElemTy = detail::get_elem_type_t<T0>;
+                       if constexpr (std::is_integral_v<ElemTy>) {
+                         if constexpr (std::is_signed_v<ElemTy>) {
+                           return __spirv_ocl_s_clamp(xs...);
+                         } else {
+                           return __spirv_ocl_u_clamp(xs...);
+                         }
+                       } else {
+                         return __spirv_ocl_fclamp(xs...);
+                       }
+                     })
+#else
+HOST_IMPL_TEMPLATE(THREE_ARGS, clamp, builtin_enable_generic_t, common,
+                   default_ret_type)
+#endif
+template <typename T>
+detail::builtin_enable_generic_non_scalar_t<T>
+clamp(T x, detail::get_elem_type_t<T> y, detail::get_elem_type_t<T> z) {
+  // Scalar y/z overload: both are converted to the simplified type of x.
+  using arg_t = detail::simplify_if_swizzle_t<T>;
+  return clamp(arg_t{x}, arg_t{y}, arg_t{z});
+}
+} // namespace _V1
+} // namespace sycl
diff --git a/sycl/include/sycl/detail/builtins/geometric_functions.inc b/sycl/include/sycl/detail/builtins/geometric_functions.inc
new file mode 100644
index 0000000000000..1d0dd385f9651
--- /dev/null
+++ b/sycl/include/sycl/detail/builtins/geometric_functions.inc
@@ -0,0 +1,97 @@
+//==------------------- geometric_functions.inc ----------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Intentionally insufficient set of includes and no "#pragma once".
+
+#include <sycl/detail/builtins/helper_macros.hpp>
+
+namespace sycl {
+inline namespace _V1 {
+namespace detail {
+template <typename T> // Scalars, or types passing is_valid_size_v<T, 2, 3, 4>.
+struct shape_geo : std::bool_constant<is_valid_size_v<T, 2, 3, 4> ||
+                                      is_scalar_arithmetic_v<T>> {};
+template <typename T> // Only types passing is_valid_size_v<T, 3, 4>.
+struct shape_geo3or4 : std::bool_constant<is_valid_size_v<T, 3, 4>> {};
+} // namespace detail
+
+BUILTIN_CREATE_ENABLER(builtin_enable_geo, default_ret_type, fp_elem_type,
+                       shape_geo, same_elem_type)
+BUILTIN_CREATE_ENABLER(builtin_enable_geo_fast, default_ret_type,
+                       float_elem_type, shape_geo, same_elem_type)
+BUILTIN_CREATE_ENABLER(builtin_enable_geo_scalar_ret, scalar_ret_type,
+                       fp_elem_type, shape_geo, same_elem_type)
+BUILTIN_CREATE_ENABLER(builtin_enable_geo_fast_scalar_ret, scalar_ret_type,
+                       float_elem_type, shape_geo, same_elem_type)
+BUILTIN_CREATE_ENABLER(builtin_enable_geo3or4, default_ret_type, fp_elem_type,
+                       shape_geo3or4, same_elem_type)
+
+namespace detail {
+template <typename FuncTy, typename... Ts>
+auto builtin_delegate_geo_impl(FuncTy F, const Ts &...x) {
+  using FirstArgT = typename first_type<Ts...>::type;
+  if constexpr (!is_marray_v<FirstArgT>) {
+    return F(simplify_if_swizzle_t<FirstArgT>{x}...);
+  } else {
+    auto VecRes = F(to_vec(x)...); // marray arguments go through to_vec first.
+    if constexpr (is_vec_v<decltype(VecRes)>)
+      return to_marray(VecRes); // vec results are turned back into marray.
+    else
+      return VecRes;
+  }
+}
+} // namespace detail
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define BUILTIN_GEO(NUM_ARGS, NAME, ENABLER, RET_TYPE_TRAITS)                  \
+  DEVICE_IMPL_TEMPLATE_CUSTOM_DELEGATE(NUM_ARGS, NAME, ENABLER,                \
+                                       builtin_delegate_geo_impl, sycl,        \
+                                       __spirv_ocl_##NAME)
+#else
+#define BUILTIN_GEO(NUM_ARGS, NAME, ENABLER, RET_TYPE_TRAITS)                  \
+  HOST_IMPL_TEMPLATE_CUSTOM_DELEGATOR(NUM_ARGS, NAME, ENABLER, geo,            \
+                                      RET_TYPE_TRAITS,                         \
+                                      builtin_delegate_geo_impl)
+#endif
+
+// In this file cross is the only builtin restricted to 3- and 4-element
+// shapes, so it is given the dedicated builtin_enable_geo3or4_t enabler.
+// BUILTIN_GEO already picks the implementation for the current compilation
+// pass: on device it expands to DEVICE_IMPL_TEMPLATE_CUSTOM_DELEGATE with
+// __spirv_ocl_cross, and on host to HOST_IMPL_TEMPLATE_CUSTOM_DELEGATOR, so no
+// separate #ifdef __SYCL_DEVICE_ONLY__ branch is needed here.
+BUILTIN_GEO(TWO_ARGS, cross, builtin_enable_geo3or4_t, default_ret_type)
+
+#ifdef __SYCL_DEVICE_ONLY__
+DEVICE_IMPL_TEMPLATE_CUSTOM_DELEGATE(
+    TWO_ARGS, dot, builtin_enable_geo_scalar_ret_t, builtin_delegate_geo_impl,
+    sycl, [](auto x, auto y) {
+      if constexpr (detail::is_scalar_arithmetic_v<decltype(x)>)
+        return x * y;
+      else {
+        return __spirv_Dot(x, y);
+      }
+    })
+#else
+BUILTIN_GEO(TWO_ARGS, dot, builtin_enable_geo_scalar_ret_t, scalar_ret_type)
+#endif
+
+// FIXME: fast_* should use the float-only builtin_enable_geo_fast*_t enablers.
+BUILTIN_GEO(ONE_ARG, length, builtin_enable_geo_scalar_ret_t, scalar_ret_type)
+BUILTIN_GEO(ONE_ARG, fast_length, builtin_enable_geo_scalar_ret_t,
+            scalar_ret_type)
+BUILTIN_GEO(TWO_ARGS, distance, builtin_enable_geo_scalar_ret_t,
+            scalar_ret_type)
+BUILTIN_GEO(TWO_ARGS, fast_distance, builtin_enable_geo_scalar_ret_t,
+            scalar_ret_type)
+BUILTIN_GEO(ONE_ARG, normalize, builtin_enable_geo_t, default_ret_type)
+BUILTIN_GEO(ONE_ARG, fast_normalize, builtin_enable_geo_t, default_ret_type)
+
+#undef BUILTIN_GEO
+} // namespace _V1
+} // namespace sycl
diff --git a/sycl/include/sycl/detail/builtins/half_precision_math_functions.inc b/sycl/include/sycl/detail/builtins/half_precision_math_functions.inc
new file mode 100644
index 0000000000000..2717f6f72071b
--- /dev/null
+++ b/sycl/include/sycl/detail/builtins/half_precision_math_functions.inc
@@ -0,0 +1,52 @@
+//==------------------- half_precision_math_functions.inc ------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Intentionally insufficient set of includes and no "#pragma once".
+
+#include <sycl/detail/builtins/helper_macros.hpp>
+
+namespace sycl {
+inline namespace _V1 {
+BUILTIN_CREATE_ENABLER(builtin_enable_half_precision_math, default_ret_type,
+                       float_elem_type, non_scalar_only, same_elem_type)
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define BUILTIN_HALF(NUM_ARGS, NAME)                                           \
+  inline float NAME(NUM_ARGS##_TYPE_ARG(float)) {                              \
+    return __spirv_ocl_half_##NAME(NUM_ARGS##_ARG);                            \
+  }                                                                            \
+  DEVICE_IMPL_TEMPLATE_CUSTOM_DELEGATE(                                        \
+      NUM_ARGS, NAME, builtin_enable_half_precision_math_t,                    \
+      builtin_marray_impl, half_precision, __spirv_ocl_half_##NAME)
+#else
+#define BUILTIN_HALF(NUM_ARGS, NAME)                                           \
+  HOST_IMPL_SCALAR(NUM_ARGS, NAME, float)                                      \
+  HOST_IMPL_TEMPLATE(NUM_ARGS, NAME, builtin_enable_half_precision_math_t,     \
+                     half_precision, default_ret_type)
+#endif
+
+namespace half_precision {
+BUILTIN_HALF(ONE_ARG, cos)
+BUILTIN_HALF(TWO_ARGS, divide)
+BUILTIN_HALF(ONE_ARG, exp)
+BUILTIN_HALF(ONE_ARG, exp2)
+BUILTIN_HALF(ONE_ARG, exp10)
+BUILTIN_HALF(ONE_ARG, log)
+BUILTIN_HALF(ONE_ARG, log2)
+BUILTIN_HALF(ONE_ARG, log10)
+BUILTIN_HALF(TWO_ARGS, powr)
+BUILTIN_HALF(ONE_ARG, recip)
+BUILTIN_HALF(ONE_ARG, rsqrt)
+BUILTIN_HALF(ONE_ARG, sin)
+BUILTIN_HALF(ONE_ARG, sqrt)
+BUILTIN_HALF(ONE_ARG, tan)
+} // namespace half_precision
+
+#undef BUILTIN_HALF
+} // namespace _V1
+} // namespace sycl
diff --git a/sycl/include/sycl/detail/builtins/helper_macros.hpp b/sycl/include/sycl/detail/builtins/helper_macros.hpp
new file mode 100644
index 0000000000000..8ad4b7fd79a10
--- /dev/null
+++ b/sycl/include/sycl/detail/builtins/helper_macros.hpp
@@ -0,0 +1,231 @@
+//==-- helper_macros.hpp -- Utility macros to implement sycl builtins ------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+// Usage:
+//   #define HANDLE_TYPE(INVARIANT_ARG1, INVARIANT_ARG2, TYPE) ...
+//   FOR_EACH2(HANDLE_TYPE, A1, A2, TYPE1, TYPE2, ...)
+// it will expand into
+//   HANDLE_TYPE(A1, A2, TYPE1)
+//   HANDLE_TYPE(A1, A2, TYPE2)
+//   ...
+// The number of "invariant" arguments determines the numeric suffix of
+// FOR_EACH<N> (0-4). Only type lists of 1-7, 11 or 14 entries are handled.
+#define GET_MACRO(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, \
+                  _15, NAME, ...)                                              \
+  NAME
+#define FOR_EACH4_A1(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1)          \
+  BASE_CASE(FIXED1, FIXED2, FIXED3, FIXED4, ARG1)
+#define FOR_EACH4_A2(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2)    \
+  BASE_CASE(FIXED1, FIXED2, FIXED3, FIXED4, ARG1)                              \
+  BASE_CASE(FIXED1, FIXED2, FIXED3, FIXED4, ARG2)
+#define FOR_EACH4_A3(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2,    \
+                     ARG3)                                                     \
+  FOR_EACH4_A2(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2)          \
+  BASE_CASE(FIXED1, FIXED2, FIXED3, FIXED4, ARG3)
+#define FOR_EACH4_A4(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2,    \
+                     ARG3, ARG4)                                               \
+  FOR_EACH4_A3(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, ARG3)    \
+  BASE_CASE(FIXED1, FIXED2, FIXED3, FIXED4, ARG4)
+#define FOR_EACH4_A5(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2,    \
+                     ARG3, ARG4, ARG5)                                         \
+  FOR_EACH4_A4(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, ARG3,    \
+               ARG4)                                                           \
+  BASE_CASE(FIXED1, FIXED2, FIXED3, FIXED4, ARG5)
+#define FOR_EACH4_A6(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2,    \
+                     ARG3, ARG4, ARG5, ARG6)                                   \
+  FOR_EACH4_A5(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, ARG3,    \
+               ARG4, ARG5)                                                     \
+  BASE_CASE(FIXED1, FIXED2, FIXED3, FIXED4, ARG6)
+#define FOR_EACH4_A7(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2,    \
+                     ARG3, ARG4, ARG5, ARG6, ARG7)                             \
+  FOR_EACH4_A6(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, ARG3,    \
+               ARG4, ARG5, ARG6)                                               \
+  BASE_CASE(FIXED1, FIXED2, FIXED3, FIXED4, ARG7)
+#define FOR_EACH4_A11(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2,   \
+                      ARG3, ARG4, ARG5, ARG6, ARG7, ARG8, ARG9, ARG10, ARG11)  \
+  FOR_EACH4_A7(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, ARG3,    \
+               ARG4, ARG5, ARG6, ARG7)                                         \
+  FOR_EACH4_A4(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG8, ARG9, ARG10,   \
+               ARG11)
+#define FOR_EACH4_A14(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2,   \
+                      ARG3, ARG4, ARG5, ARG6, ARG7, ARG8, ARG9, ARG10, ARG11,  \
+                      ARG12, ARG13, ARG14)                                     \
+  FOR_EACH4_A11(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, ARG3,   \
+                ARG4, ARG5, ARG6, ARG7, ARG8, ARG9, ARG10, ARG11)              \
+  FOR_EACH4_A3(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG12, ARG13, ARG14)
+
+#define FOR_EACH4(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ...)              \
+  GET_MACRO(__VA_ARGS__, FOR_EACH4##_A15, FOR_EACH4##_A14, FOR_EACH4##_A13,    \
+            FOR_EACH4##_A12, FOR_EACH4##_A11, FOR_EACH4##_A10, FOR_EACH4##_A9, \
+            FOR_EACH4##_A8, FOR_EACH4##_A7, FOR_EACH4##_A6, FOR_EACH4##_A5,    \
+            FOR_EACH4##_A4, FOR_EACH4##_A3, FOR_EACH4##_A2, FOR_EACH4##_A1,    \
+            _0, )                                                              \
+  (BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, __VA_ARGS__)
+
+#define FOR_EACH3_BASE(BASE_CASE, FIXED1, FIXED2, FIXED3, ARG1)                \
+  BASE_CASE(FIXED1, FIXED2, FIXED3, ARG1)
+#define FOR_EACH3(BASE_CASE, FIXED1, FIXED2, FIXED3, ...)                      \
+  FOR_EACH4(FOR_EACH3_BASE, BASE_CASE, FIXED1, FIXED2, FIXED3, __VA_ARGS__)
+
+#define FOR_EACH2_BASE(BASE_CASE, FIXED1, FIXED2, ARG1)                        \
+  BASE_CASE(FIXED1, FIXED2, ARG1)
+#define FOR_EACH2(BASE_CASE, FIXED1, FIXED2, ...)                              \
+  FOR_EACH3(FOR_EACH2_BASE, BASE_CASE, FIXED1, FIXED2, __VA_ARGS__)
+
+#define FOR_EACH1_BASE(BASE_CASE, FIXED1, ARG1) BASE_CASE(FIXED1, ARG1)
+#define FOR_EACH1(BASE_CASE, FIXED1, ...)                                      \
+  FOR_EACH2(FOR_EACH1_BASE, BASE_CASE, FIXED1, __VA_ARGS__)
+
+#define FOR_EACH0_BASE(BASE_CASE, ARG1) BASE_CASE(ARG1)
+#define FOR_EACH0(BASE_CASE, ...)                                              \
+  FOR_EACH1(FOR_EACH0_BASE, BASE_CASE, __VA_ARGS__)
+
+// Some helpers to unify implementation between different numbers of template
+// types.
+
+#define ONE_ARG_TYPENAME_TYPE typename T0
+#define TWO_ARGS_TYPENAME_TYPE typename T0, typename T1
+#define THREE_ARGS_TYPENAME_TYPE typename T0, typename T1, typename T2
+
+#define ONE_ARG_TEMPLATE_TYPE T0
+#define TWO_ARGS_TEMPLATE_TYPE T0, T1
+#define THREE_ARGS_TEMPLATE_TYPE T0, T1, T2
+
+#define ONE_ARG_TEMPLATE_TYPE_ARG T0 x
+#define TWO_ARGS_TEMPLATE_TYPE_ARG T0 x, T1 y
+#define THREE_ARGS_TEMPLATE_TYPE_ARG T0 x, T1 y, T2 z
+
+#define ONE_ARG_TEMPLATE_TYPE_ARG_REF T0 &x
+#define TWO_ARGS_TEMPLATE_TYPE_ARG_REF T0 &x, T1 &y
+#define THREE_ARGS_TEMPLATE_TYPE_ARG_REF T0 &x, T1 &y, T2 &z
+
+#define ONE_ARG_ARG x
+#define TWO_ARGS_ARG x, y
+#define THREE_ARGS_ARG x, y, z
+
+#define ONE_ARG_SIMPLIFIED_ARG                                                 \
+  simplify_if_swizzle_t<T0> { x }
+#define TWO_ARGS_SIMPLIFIED_ARG                                                \
+  simplify_if_swizzle_t<T0>{x}, simplify_if_swizzle_t<T1> { y }
+#define THREE_ARGS_SIMPLIFIED_ARG                                              \
+  simplify_if_swizzle_t<T0>{x}, simplify_if_swizzle_t<T1>{y},                  \
+      simplify_if_swizzle_t<T2> {                                              \
+    z                                                                          \
+  }
+
+#define TWO_ARGS_ARG_ROTATED y, x
+#define THREE_ARGS_ARG_ROTATED z, x, y
+
+#define ONE_ARG_CONVERTED_ARG detail::builtins::convert_arg(x)
+#define TWO_ARGS_CONVERTED_ARG                                                 \
+  detail::builtins::convert_arg(x), detail::builtins::convert_arg(y)
+#define THREE_ARGS_CONVERTED_ARG                                               \
+  detail::builtins::convert_arg(x), detail::builtins::convert_arg(y),          \
+      detail::builtins::convert_arg(z)
+// Parameter lists using `auto`, e.g. for generic lambdas.
+#define ONE_ARG_AUTO_ARG auto x
+#define TWO_ARGS_AUTO_ARG auto x, auto y
+#define THREE_ARGS_AUTO_ARG auto x, auto y, auto z
+// Parameter lists where every parameter has the same type TYPE.
+#define ONE_ARG_TYPE_ARG(TYPE) TYPE x
+#define TWO_ARGS_TYPE_ARG(TYPE) TYPE x, TYPE y
+#define THREE_ARGS_TYPE_ARG(TYPE) TYPE x, TYPE y, TYPE z
+// Type-only lists (no parameter names), e.g. for function declarations.
+#define ONE_ARG_TYPE(TYPE) TYPE
+#define TWO_ARGS_TYPE(TYPE) TYPE, TYPE
+#define THREE_ARGS_TYPE(TYPE) TYPE, TYPE, TYPE
+// Type-only lists of vec<TYPE, VL>.
+#define ONE_ARG_VEC_TYPE(TYPE, VL) vec<TYPE, VL>
+#define TWO_ARGS_VEC_TYPE(TYPE, VL) vec<TYPE, VL>, vec<TYPE, VL>
+#define THREE_ARGS_VEC_TYPE(TYPE, VL)                                          \
+  vec<TYPE, VL>, vec<TYPE, VL>, vec<TYPE, VL>
+// Parameter lists of vec<TYPE, VL>.
+#define ONE_ARG_VEC_TYPE_ARG(TYPE, VL) vec<TYPE, VL> x
+#define TWO_ARGS_VEC_TYPE_ARG(TYPE, VL) vec<TYPE, VL> x, vec<TYPE, VL> y
+#define THREE_ARGS_VEC_TYPE_ARG(TYPE, VL)                                      \
+  vec<TYPE, VL> x, vec<TYPE, VL> y, vec<TYPE, VL> z
+// Map an arity token to the token for one argument fewer.
+#define TWO_ARGS_LESS_ONE ONE_ARG
+#define THREE_ARGS_LESS_ONE TWO_ARGS
+// Token pasting with an extra level so that macro arguments expand first.
+#define SYCL_CONCAT_IMPL(A, B) A##B
+#define SYCL_CONCAT(A, B) SYCL_CONCAT_IMPL(A, B)
+// LESS_ONE(TWO_ARGS) -> ONE_ARG, LESS_ONE(THREE_ARGS) -> TWO_ARGS.
+#define LESS_ONE(NUM_ARGS) SYCL_CONCAT(NUM_ARGS, _LESS_ONE)
+// Element-type lists, passed as the trailing arguments of FOR_EACH* macros.
+// 3 floating-point types.
+#define FP_TYPES float, double, half
+// 6 signed integer types (plain char is listed here as well).
+#define SIGNED_TYPES char, signed char, short, int, long, long long
+// 5 unsigned integer types.
+#define UNSIGNED_TYPES                                                         \
+  unsigned char, unsigned short, unsigned int, unsigned long, unsigned long long
+// 11 types: SIGNED_TYPES followed by UNSIGNED_TYPES.
+#define INTEGER_TYPES SIGNED_TYPES, UNSIGNED_TYPES
+// Device-side NAME: marray -> detail::DELEGATOR, otherwise SCALAR_VEC_IMPL.
+#define DEVICE_IMPL_TEMPLATE_CUSTOM_DELEGATE(                                  \
+    NUM_ARGS, NAME, ENABLER, DELEGATOR, NS, /*SCALAR_VEC_IMPL*/...)            \
+  template <NUM_ARGS##_TYPENAME_TYPE>                                          \
+  detail::ENABLER<NUM_ARGS##_TEMPLATE_TYPE> NAME(                              \
+      NUM_ARGS##_TEMPLATE_TYPE_ARG) {                                          \
+    if constexpr (detail::is_marray_v<T0>) {                                   \
+      return detail::DELEGATOR(                                                \
+          [](NUM_ARGS##_AUTO_ARG) { return NS::NAME(NUM_ARGS##_ARG); },        \
+          NUM_ARGS##_ARG);                                                     \
+    } else {                                                                   \
+      return __VA_ARGS__(NUM_ARGS##_CONVERTED_ARG);                            \
+    }                                                                          \
+  }
+// Same, with builtin_marray_impl as DELEGATOR and sycl as NS.
+#define DEVICE_IMPL_TEMPLATE(NUM_ARGS, NAME, ENABLER, /*SCALAR_VEC_IMPL*/...)  \
+  DEVICE_IMPL_TEMPLATE_CUSTOM_DELEGATE(NUM_ARGS, NAME, ENABLER,                \
+                                       builtin_marray_impl, sycl, __VA_ARGS__)
+// Host-side NAME: forwards through detail::DELEGATOR to an extern __NAME_impl.
+// Use extern function declaration in function scope to save compile time.
+// Otherwise the FE has to parse multiple types/VLs/functions costing us around
+// 0.3s in compile-time. It also allows us to skip providing all the explicit
+// declarations through even more macro magic.
+#define HOST_IMPL_TEMPLATE_CUSTOM_DELEGATOR(                                   \
+    NUM_ARGS, NAME, ENABLER, FUNC_CLASS, RET_TYPE_TRAITS, DELEGATOR)           \
+  template <typename... Ts> auto __##FUNC_CLASS##_##NAME##_lambda(Ts... xs) {  \
+    /* Can't inline into the real lambda due to                                \
+     * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112867. Can't emulate a    \
+     * lambda through a local struct because templates are not allowed in      \
+     * local structs. Have to specify FUNC_CLASS to avoid                      \
+     * ambiguity between, e.g. sycl::__cos_lambda/sycl::native::__cos_lambda   \
+     * or between max in common functions and max in integer functions.        \
+     */                                                                        \
+    using ret_ty = typename detail::RET_TYPE_TRAITS<                           \
+        typename detail::first_type<Ts...>::type>::type;                       \
+    extern ret_ty __##NAME##_impl(Ts...);                                      \
+    return __##NAME##_impl(xs...);                                             \
+  }                                                                            \
+  template <NUM_ARGS##_TYPENAME_TYPE>                                          \
+  detail::ENABLER<NUM_ARGS##_TEMPLATE_TYPE> NAME(                              \
+      NUM_ARGS##_TEMPLATE_TYPE_ARG) {                                          \
+    return detail::DELEGATOR(                                                  \
+        [](auto... xs) { return __##FUNC_CLASS##_##NAME##_lambda(xs...); },    \
+        NUM_ARGS##_ARG);                                                       \
+  }
+// Same, with builtin_default_host_impl as DELEGATOR.
+#define HOST_IMPL_TEMPLATE(NUM_ARGS, NAME, ENABLER, FUNC_CLASS,                \
+                           RET_TYPE_TRAITS)                                    \
+  HOST_IMPL_TEMPLATE_CUSTOM_DELEGATOR(NUM_ARGS, NAME, ENABLER, FUNC_CLASS,     \
+                                      RET_TYPE_TRAITS,                         \
+                                      builtin_default_host_impl)
+// Host scalar overload of NAME for TYPE, forwarding to extern __NAME_impl.
+#define HOST_IMPL_SCALAR_RET_TYPE(NUM_ARGS, NAME, RET_TYPE, TYPE)              \
+  inline RET_TYPE NAME(NUM_ARGS##_TYPE_ARG(TYPE)) {                            \
+    extern RET_TYPE __##NAME##_impl(NUM_ARGS##_TYPE(TYPE));                    \
+    return __##NAME##_impl(NUM_ARGS##_ARG);                                    \
+  }
+// Same, for builtins whose return type equals the argument type.
+#define HOST_IMPL_SCALAR(NUM_ARGS, NAME, TYPE)                                 \
+  HOST_IMPL_SCALAR_RET_TYPE(NUM_ARGS, NAME, TYPE, TYPE)
diff --git a/sycl/include/sycl/detail/builtins/integer_functions.inc b/sycl/include/sycl/detail/builtins/integer_functions.inc
new file mode 100644
index 0000000000000..dfeb815e52494
--- /dev/null
+++ b/sycl/include/sycl/detail/builtins/integer_functions.inc
@@ -0,0 +1,225 @@
+//==------------------- integer_functions.inc ------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Intentionally insufficient set of includes and no "#pragma once".
+
+#include <sycl/detail/builtins/helper_macros.hpp>
+
+namespace sycl {
+inline namespace _V1 {
+BUILTIN_CREATE_ENABLER(builtin_enable_integer, default_ret_type,
+                       integer_elem_type, any_shape, same_elem_type)
+BUILTIN_CREATE_ENABLER(builtin_enable_integer_non_scalar, default_ret_type,
+                       integer_elem_type, non_scalar_only, same_elem_type)
+BUILTIN_CREATE_ENABLER(builtin_enable_suint32, default_ret_type,
+                       suint32_elem_type, any_shape, same_elem_type)
+// BUILTIN_GENINT: integer builtin NAME backed by a single implementation.
+#ifdef __SYCL_DEVICE_ONLY__
+#define BUILTIN_GENINT(NUM_ARGS, NAME)                                         \
+  DEVICE_IMPL_TEMPLATE(                                                        \
+      NUM_ARGS, NAME, builtin_enable_integer_t, [](auto... xs) {               \
+        using ret_ty =                                                         \
+            detail::builtin_enable_integer_t<NUM_ARGS##_TEMPLATE_TYPE>;        \
+        return detail::builtins::convert_result<ret_ty>(                       \
+            __spirv_ocl_##NAME(xs...));                                        \
+      })
+#else
+#define BUILTIN_GENINT(NUM_ARGS, NAME)                                         \
+  HOST_IMPL_TEMPLATE(NUM_ARGS, NAME, builtin_enable_integer_t, integer,        \
+                     default_ret_type)
+#endif
+// BUILTIN_GENINT_SU: on device, picks __spirv_ocl_s_NAME or __spirv_ocl_u_NAME
+// by the signedness of T0's element type; on host it is plain BUILTIN_GENINT.
+#ifdef __SYCL_DEVICE_ONLY__
+#define BUILTIN_GENINT_SU(NUM_ARGS, NAME)                                      \
+  DEVICE_IMPL_TEMPLATE(                                                        \
+      NUM_ARGS, NAME, builtin_enable_integer_t, [](auto... xs) {               \
+        using ret_ty =                                                         \
+            detail::builtin_enable_integer_t<NUM_ARGS##_TEMPLATE_TYPE>;        \
+        using detail::builtins::convert_result;                                \
+        if constexpr (std::is_signed_v<detail::get_elem_type_t<T0>>)           \
+          return convert_result<ret_ty>(__spirv_ocl_s_##NAME(xs...));          \
+        else                                                                   \
+          return convert_result<ret_ty>(__spirv_ocl_u_##NAME(xs...));          \
+      })
+#else
+#define BUILTIN_GENINT_SU(NUM_ARGS, NAME) BUILTIN_GENINT(NUM_ARGS, NAME)
+#endif
+// abs: the device path is hand-written to reinterpret the unsigned s_abs result.
+#if __SYCL_DEVICE_ONLY__
+DEVICE_IMPL_TEMPLATE(ONE_ARG, abs, builtin_enable_integer_t, [](auto x) {
+  using ret_ty = detail::builtin_enable_integer_t<T0>;
+  using detail::builtins::convert_result;
+  if constexpr (std::is_signed_v<detail::get_elem_type_t<T0>>)
+    // SPIR-V builtin returns unsigned type, SYCL's return type is signed
+    // with the following restriction:
+    //   > The behavior is undefined if the result cannot be represented by
+    //   > the return type
+    return convert_result<ret_ty>(bit_cast<T0>(__spirv_ocl_s_abs(x)));
+  else
+    return convert_result<ret_ty>(__spirv_ocl_u_abs(x));
+})
+#else
+BUILTIN_GENINT_SU(ONE_ARG, abs)
+#endif
+
+BUILTIN_GENINT_SU(TWO_ARGS, add_sat)
+// abs_diff: like abs, the signed SPIR-V builtin yields an unsigned result.
+#if __SYCL_DEVICE_ONLY__
+DEVICE_IMPL_TEMPLATE(
+    TWO_ARGS, abs_diff, builtin_enable_integer_t, [](auto... xs) {
+      using ret_ty = detail::builtin_enable_integer_t<T0>;
+      using detail::builtins::convert_result;
+      if constexpr (std::is_signed_v<detail::get_elem_type_t<T0>>) {
+        // SPIRV built-in returns [vector of] unsigned type(s).
+        auto ret = __spirv_ocl_s_abs_diff(xs...);
+        if constexpr (detail::is_vec_v<T0>) {
+          // SYCL 2020 revision 8's abs_diff returns T0 (or corresponding vec in
+          // case of a swizzle). The only way to produce signed ext_vector_type
+          // from unsigned is with C-style cast. Also note that element type of
+          // sycl::vec and ext_vector_type might be different, e.g.
+          // sycl::vec<char, N>::vector_t is
+          // signed char __attribute__((ext_vector_type(N))).
+          //
+          // TODO: Shouldn't be different from "abs" above.
+          return convert_result<ret_ty>((typename T0::vector_t)(ret));
+        } else {
+          return convert_result<ret_ty>(ret);
+        }
+      } else {
+        return convert_result<ret_ty>(__spirv_ocl_u_abs_diff(xs...));
+      }
+    })
+#else
+BUILTIN_GENINT_SU(TWO_ARGS, abs_diff)
+#endif
+// Builtins with distinct signed/unsigned SPIR-V implementations.
+BUILTIN_GENINT_SU(TWO_ARGS, hadd)
+BUILTIN_GENINT_SU(TWO_ARGS, mul_hi)
+BUILTIN_GENINT_SU(TWO_ARGS, rhadd)
+BUILTIN_GENINT_SU(TWO_ARGS, sub_sat)
+BUILTIN_GENINT_SU(THREE_ARGS, mad_hi)
+BUILTIN_GENINT_SU(THREE_ARGS, mad_sat)
+// max/min: besides the generic form, accept a scalar as the second operand.
+BUILTIN_GENINT_SU(TWO_ARGS, max)
+template <typename T>
+detail::builtin_enable_integer_non_scalar_t<T>
+max(T x, detail::get_elem_type_t<T> y) {
+  return max(detail::simplify_if_swizzle_t<T>{x},
+             detail::simplify_if_swizzle_t<T>{y});
+}
+
+BUILTIN_GENINT_SU(TWO_ARGS, min)
+template <typename T>
+detail::builtin_enable_integer_non_scalar_t<T>
+min(T x, detail::get_elem_type_t<T> y) {
+  return min(detail::simplify_if_swizzle_t<T>{x},
+             detail::simplify_if_swizzle_t<T>{y});
+}
+// Builtins with a single implementation for signed and unsigned types.
+BUILTIN_GENINT(ONE_ARG, clz)
+BUILTIN_GENINT(ONE_ARG, ctz)
+BUILTIN_GENINT(ONE_ARG, popcount)
+BUILTIN_GENINT(TWO_ARGS, rotate)
+
+#undef BUILTIN_GENINT
+#undef BUILTIN_GENINT_SU
+// mad24(x, y, z): the host path below computes x * y + z per element.
+#ifdef __SYCL_DEVICE_ONLY__
+DEVICE_IMPL_TEMPLATE(
+    THREE_ARGS, mad24, builtin_enable_suint32_t, [](auto... xs) {
+      if constexpr (std::is_same_v<int32_t, detail::get_elem_type_t<T0>>) {
+        return __spirv_ocl_s_mad24(xs...);
+      } else {
+        return __spirv_ocl_u_mad24(xs...);
+      }
+    })
+#else
+template <typename T0, typename T1, typename T2>
+detail::builtin_enable_suint32_t<T0, T1, T2> mad24(T0 x, T1 y, T2 z) {
+  if constexpr (detail::is_scalar_arithmetic_v<T0>) {
+    return x * y + z;
+  } else {
+    detail::simplify_if_swizzle_t<T0> res;
+    for (int i = 0; i < detail::num_elements<T0>::value; ++i)
+      res[i] = x[i] * y[i] + z[i];
+    return res;
+  }
+}
+#endif
+// mul24(x, y): the host path below computes x * y per element.
+#ifdef __SYCL_DEVICE_ONLY__
+DEVICE_IMPL_TEMPLATE(TWO_ARGS, mul24, builtin_enable_suint32_t, [](auto... xs) {
+  if constexpr (std::is_same_v<int32_t, detail::get_elem_type_t<T0>>) {
+    return __spirv_ocl_s_mul24(xs...);
+  } else {
+    return __spirv_ocl_u_mul24(xs...);
+  }
+})
+#else
+template <typename T0, typename T1>
+detail::builtin_enable_suint32_t<T0, T1> mul24(T0 x, T1 y) {
+  if constexpr (detail::is_scalar_arithmetic_v<T0>) {
+    return x * y;
+  } else {
+    detail::simplify_if_swizzle_t<T0> res;
+    for (int i = 0; i < detail::num_elements<T0>::value; ++i)
+      res[i] = x[i] * y[i];
+    return res;
+  }
+}
+#endif
+// Return type and constraints for upsample.
+namespace detail {
+// clang-format off
+template <typename T>
+using upsample_ret_type_t = change_elements_t<
+    typename map_type<get_elem_type_t<T>,
+                      int8_t, /*->*/ int16_t, uint8_t, /*->*/ uint16_t,
+                      int16_t, /*->*/ int32_t, uint16_t, /*->*/ uint32_t,
+                      int32_t, /*->*/ int64_t, uint32_t, /*->*/ uint64_t>::type,
+    T>;
+// clang-format on
+// T0/T1 must share shape and element size; T1's elements must be unsigned.
+template <typename T0, typename T1>
+inline constexpr bool enable_upsample_v =
+    builtin_same_shape_v<T0, T1> &&
+    check_type_in_v<get_elem_type_t<T0>, int8_t, uint8_t, int16_t, uint16_t,
+                    int32_t, uint32_t> &&
+    check_type_in_v<get_elem_type_t<T1>, uint8_t, uint16_t, uint32_t> &&
+    sizeof(get_elem_type_t<T0>) == sizeof(get_elem_type_t<T1>);
+// enable_if alias yielding upsample_ret_type_t<T0> when the check above holds.
+template <typename T0, typename T1>
+using enable_upsample_t =
+    std::enable_if_t<enable_upsample_v<T0, T1>, upsample_ret_type_t<T0>>;
+} // namespace detail
+// upsample(hi, lo): hi widened and shifted into the upper half, OR-ed with lo.
+#ifdef __SYCL_DEVICE_ONLY__
+// Result elements are wider than T0's, so marray is delegated per element.
+DEVICE_IMPL_TEMPLATE_CUSTOM_DELEGATE(
+    TWO_ARGS, upsample, enable_upsample_t, builtin_delegate_to_scalar, sycl,
+    [](auto... xs) {
+      // T0 may be an 8/16/32-bit integer: dispatch on signedness, not int32_t.
+      if constexpr (std::is_signed_v<detail::get_elem_type_t<T0>>)
+        return __spirv_ocl_s_upsample(xs...);
+      else
+        return __spirv_ocl_u_upsample(xs...);
+    })
+#else
+template <typename T0, typename T1>
+detail::enable_upsample_t<T0, T1> upsample(T0 x, T1 y) {
+  using namespace detail;
+  if constexpr (is_vec_or_swizzle_v<T0> || is_marray_v<T0>)
+    return builtin_delegate_to_scalar(
+        [](auto... xs) { return upsample(xs...); }, x, y);
+  else // Scalar: x fills the upper half of the wider type, y the lower half.
+    return upsample_ret_type_t<T0>{x} << (sizeof(T0) * 8) | y;
+}
+#endif
+} // namespace _V1
+} // namespace sycl
diff --git a/sycl/include/sycl/detail/builtins/math_functions.inc b/sycl/include/sycl/detail/builtins/math_functions.inc
new file mode 100644
index 0000000000000..faf83ef19d4c6
--- /dev/null
+++ b/sycl/include/sycl/detail/builtins/math_functions.inc
@@ -0,0 +1,554 @@
+//==------------------- math_functions.inc ---------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Intentionally insufficient set of includes and no "#pragma once".
+
+#include <sycl/detail/builtins/helper_macros.hpp>
+
+namespace sycl {
+inline namespace _V1 {
+namespace detail {
+template <typename... Ts> struct last_int_rest_same { // last arg: int
+  static constexpr bool value = []() constexpr {
+    constexpr auto N = sizeof...(Ts);
+    using first_type = typename first_type<Ts...>::type;
+    if (!builtin_same_shape_v<first_type>)
+      return false;
+    int i = 0; // 1-based index of the pack element being checked
+    using int_type =
+        std::conditional_t<is_vec_or_swizzle_v<first_type>, int32_t, int>;
+    return (
+        (... &&
+         (++i == N
+              ? /* last */ builtin_same_shape_v<Ts> // filter out "bad" types,
+                                                    // e.g. multi-ptr
+                    && std::is_same_v<get_elem_type_t<Ts>, int_type>
+              : /* not last  */ builtin_same_or_swizzle_v<first_type, Ts>)));
+  }();
+};
+template <typename... Ts> struct last_intptr_rest_same { // last arg: int ptr
+  static constexpr bool value = []() constexpr {
+    constexpr auto N = sizeof...(Ts);
+    using first_type = typename first_type<Ts...>::type;
+    if (!builtin_same_shape_v<first_type>)
+      return false;
+    int i = 0; // 1-based index of the pack element being checked
+    using int_type =
+        std::conditional_t<is_vec_or_swizzle_v<first_type>, int32_t, int>;
+    return (
+        (... &&
+         (++i == N
+              ? /* last */ is_multi_ptr_v<Ts> &&
+                    has_writeable_addr_space_v<Ts> &&
+                    builtin_same_shape_v<first_type, get_elem_type_t<Ts>> &&
+                    !is_swizzle_v<get_elem_type_t<Ts>> &&
+                    std::is_same_v<get_elem_type_t<get_elem_type_t<Ts>>,
+                                   int_type>
+              : /* not last  */ builtin_same_or_swizzle_v<first_type, Ts>)));
+  }();
+};
+} // namespace detail
+BUILTIN_CREATE_ENABLER(builtin_enable_math, default_ret_type, fp_elem_type,
+                       non_scalar_only, same_elem_type)
+BUILTIN_CREATE_ENABLER(builtin_enable_math_allow_scalar, default_ret_type,
+                       fp_elem_type, any_shape, same_elem_type)
+BUILTIN_CREATE_ENABLER(builtin_enable_last_int, default_ret_type, fp_elem_type,
+                       non_scalar_only, last_int_rest_same)
+BUILTIN_CREATE_ENABLER(builtin_enable_last_intptr_scalar, default_ret_type,
+                       fp_elem_type, scalar_only, last_intptr_rest_same)
+BUILTIN_CREATE_ENABLER(builtin_enable_last_intptr_non_scalar, default_ret_type,
+                       fp_elem_type, non_scalar_only, last_intptr_rest_same)
+// Plain bool check: same-or-swizzle arguments with float/double/half elements.
+namespace detail {
+// FIXME: get rid of these.
+template <typename... Ts>
+inline constexpr bool builtin_enable_math_allow_scalar_v =
+    builtin_same_or_swizzle_v<Ts...> &&
+    check_type_in_v<get_elem_type_t<typename first_type<Ts...>::type>, float,
+                    double, half>;
+} // namespace detail
+// BUILTIN_GENF / BUILTIN_GENF_NATIVE_OPT: define math builtin NAME.
+#ifdef __SYCL_DEVICE_ONLY__
+// Common between generic case and fast math optimized path. Note that vector
+// case is template with a single implementation between all three types, so we
+// have to introduce this VEC_IMPL parameter to be able to use native version
+// for floatN.
+#define BUILTIN_GENF_DEVICE_COMMON(NUM_ARGS, NAME, VEC_IMPL)                   \
+  inline double NAME(NUM_ARGS##_TYPE_ARG(double)) {                            \
+    return __spirv_ocl_##NAME(NUM_ARGS##_ARG);                                 \
+  }                                                                            \
+  inline half NAME(NUM_ARGS##_TYPE_ARG(half)) {                                \
+    return __spirv_ocl_##NAME(NUM_ARGS##_CONVERTED_ARG);                       \
+  }                                                                            \
+  DEVICE_IMPL_TEMPLATE(NUM_ARGS, NAME, builtin_enable_math_t, VEC_IMPL)
+// Generic case: the float overload and VEC_IMPL both use __spirv_ocl_NAME.
+#define BUILTIN_GENF(NUM_ARGS, NAME)                                           \
+  inline float NAME(NUM_ARGS##_TYPE_ARG(float)) {                              \
+    return __spirv_ocl_##NAME(NUM_ARGS##_ARG);                                 \
+  }                                                                            \
+  BUILTIN_GENF_DEVICE_COMMON(NUM_ARGS, NAME, __spirv_ocl_##NAME)
+// Fast-math case: maybe_fmf_NAME picks native_NAME if use_fast_math_v<T>.
+#define BUILTIN_GENF_NATIVE_OPT(NUM_ARGS, NAME)                                \
+  namespace detail {                                                           \
+  template <typename T>                                                        \
+  decltype(auto) maybe_fmf_##NAME(NUM_ARGS##_TYPE_ARG(T)) {                    \
+    if constexpr (use_fast_math_v<T>) {                                        \
+      return __spirv_ocl_native_##NAME(NUM_ARGS##_ARG);                        \
+    } else {                                                                   \
+      return __spirv_ocl_##NAME(NUM_ARGS##_ARG);                               \
+    }                                                                          \
+  }                                                                            \
+  }                                                                            \
+  inline float NAME(NUM_ARGS##_TYPE_ARG(float)) {                              \
+    return detail::maybe_fmf_##NAME(NUM_ARGS##_CONVERTED_ARG);                 \
+  }                                                                            \
+  BUILTIN_GENF_DEVICE_COMMON(NUM_ARGS, NAME, detail::maybe_fmf_##NAME)
+
+#else
+#define BUILTIN_GENF(NUM_ARGS, NAME)                                           \
+  FOR_EACH2(HOST_IMPL_SCALAR, NUM_ARGS, NAME, FP_TYPES)                        \
+  HOST_IMPL_TEMPLATE(NUM_ARGS, NAME, builtin_enable_math_t, math,              \
+                     default_ret_type)
+
+// Optimization only affects device code.
+#define BUILTIN_GENF_NATIVE_OPT(NUM_ARGS, NAME) BUILTIN_GENF(NUM_ARGS, NAME)
+#endif
+// One-, two- and three-argument math builtins without special handling.
+BUILTIN_GENF(ONE_ARG, acos)
+BUILTIN_GENF(ONE_ARG, acosh)
+BUILTIN_GENF(ONE_ARG, acospi)
+BUILTIN_GENF(ONE_ARG, asin)
+BUILTIN_GENF(ONE_ARG, asinh)
+BUILTIN_GENF(ONE_ARG, asinpi)
+BUILTIN_GENF(ONE_ARG, atan)
+BUILTIN_GENF(ONE_ARG, atanh)
+BUILTIN_GENF(ONE_ARG, atanpi)
+BUILTIN_GENF(ONE_ARG, cbrt)
+BUILTIN_GENF(ONE_ARG, ceil)
+BUILTIN_GENF(ONE_ARG, cosh)
+BUILTIN_GENF(ONE_ARG, cospi)
+BUILTIN_GENF(ONE_ARG, erf)
+BUILTIN_GENF(ONE_ARG, erfc)
+BUILTIN_GENF(ONE_ARG, expm1)
+BUILTIN_GENF(ONE_ARG, fabs)
+BUILTIN_GENF(ONE_ARG, floor)
+BUILTIN_GENF(ONE_ARG, lgamma)
+BUILTIN_GENF(ONE_ARG, log1p)
+BUILTIN_GENF(ONE_ARG, logb)
+BUILTIN_GENF(ONE_ARG, rint)
+BUILTIN_GENF(ONE_ARG, round)
+BUILTIN_GENF(ONE_ARG, sinh)
+BUILTIN_GENF(ONE_ARG, sinpi)
+BUILTIN_GENF(ONE_ARG, tanh)
+BUILTIN_GENF(ONE_ARG, tanpi)
+BUILTIN_GENF(ONE_ARG, tgamma)
+BUILTIN_GENF(ONE_ARG, trunc)
+BUILTIN_GENF(TWO_ARGS, atan2)
+BUILTIN_GENF(TWO_ARGS, atan2pi)
+BUILTIN_GENF(TWO_ARGS, copysign)
+BUILTIN_GENF(TWO_ARGS, fdim)
+BUILTIN_GENF(TWO_ARGS, fmod)
+BUILTIN_GENF(TWO_ARGS, hypot)
+BUILTIN_GENF(TWO_ARGS, maxmag)
+BUILTIN_GENF(TWO_ARGS, minmag)
+BUILTIN_GENF(TWO_ARGS, nextafter)
+BUILTIN_GENF(TWO_ARGS, pow)
+BUILTIN_GENF(TWO_ARGS, remainder)
+BUILTIN_GENF(THREE_ARGS, fma)
+BUILTIN_GENF(THREE_ARGS, mad)
+// fmax/fmin additionally accept a scalar as the second operand.
+#define BUILTIN_GENF_SCALAR_2ND(NAME)                                          \
+  BUILTIN_GENF(TWO_ARGS, NAME)                                                 \
+  template <typename T>                                                        \
+  detail::builtin_enable_math_t<T> NAME(T x, detail::get_elem_type_t<T> y) {   \
+    return NAME(detail::simplify_if_swizzle_t<T>{x},                           \
+                detail::simplify_if_swizzle_t<T>{y});                          \
+  }
+
+BUILTIN_GENF_SCALAR_2ND(fmax)
+BUILTIN_GENF_SCALAR_2ND(fmin)
+
+#undef BUILTIN_GENF_SCALAR_2ND
+// Builtins that may map to native implementations (device code only).
+BUILTIN_GENF_NATIVE_OPT(ONE_ARG, cos)
+BUILTIN_GENF_NATIVE_OPT(ONE_ARG, exp)
+BUILTIN_GENF_NATIVE_OPT(ONE_ARG, exp10)
+BUILTIN_GENF_NATIVE_OPT(ONE_ARG, exp2)
+BUILTIN_GENF_NATIVE_OPT(ONE_ARG, log)
+BUILTIN_GENF_NATIVE_OPT(ONE_ARG, log10)
+BUILTIN_GENF_NATIVE_OPT(ONE_ARG, log2)
+BUILTIN_GENF_NATIVE_OPT(ONE_ARG, rsqrt)
+BUILTIN_GENF_NATIVE_OPT(ONE_ARG, sin)
+BUILTIN_GENF_NATIVE_OPT(ONE_ARG, sqrt)
+BUILTIN_GENF_NATIVE_OPT(ONE_ARG, tan)
+BUILTIN_GENF_NATIVE_OPT(TWO_ARGS, powr)
+
+#undef BUILTIN_GENF_NATIVE_OPT
+#undef BUILTIN_GENF
+#undef BUILTIN_GENF_DEVICE_COMMON
+// Constraints and an element-wise driver for builtins taking a pointer.
+namespace detail {
+template <typename T0, typename T1>
+inline constexpr bool builtin_ptr_check_v =
+    is_multi_ptr_v<T1> && has_writeable_addr_space_v<T1> &&
+    is_valid_elem_type_v<T1, simplify_if_swizzle_t<T0>>;
+// Floating-point T0 (scalar allowed) together with a suitable pointer T1.
+template <typename T0, typename T1>
+inline constexpr bool builtin_enable_ptr_v =
+    builtin_enable_math_allow_scalar_v<T0> && builtin_ptr_check_v<T0, T1>;
+// Return type for scalar T0.
+template <typename T0, typename T1>
+using builtin_enable_ptr_scalar_t =
+    std::enable_if_t<builtin_enable_ptr_v<T0, T1> && is_scalar_arithmetic_v<T0>,
+                     T0>;
+// Return type for non-scalar T0 (swizzles are simplified).
+template <typename T0, typename T1>
+using builtin_enable_ptr_non_scalar_t =
+    std::enable_if_t<builtin_enable_ptr_v<T0, T1> &&
+                         !is_scalar_arithmetic_v<T0>,
+                     simplify_if_swizzle_t<T0>>;
+// Element-wise driver: r[i] = F(xs[i]..., p0 + i) for every element of T0.
+template <typename FuncTy, typename PtrTy, typename... Ts>
+auto builtin_delegate_ptr_impl(FuncTy F, PtrTy p, Ts... xs) {
+  using T0 = typename first_type<Ts...>::type;
+  // Simplify just in case, although most callers seem to do that on their own.
+  simplify_if_swizzle_t<T0> r{};
+
+  // TODO: Optimize for sizes. Make sure not to violate ANSI-aliasing rules
+  // for the pointer argument.
+  auto p0 = [&]() {
+    if constexpr (is_multi_ptr_v<PtrTy>)
+      return address_space_cast<PtrTy::address_space,
+                                get_multi_ptr_decoration_v<PtrTy>>(&(*p)[0]);
+    else
+      // Deprecated case of raw ptr/host impl.
+      return &(*p)[0];
+  }();
+
+  constexpr auto N = T0::size();
+  if constexpr (N <= 16)
+    loop<N>([&](auto i) { r[i] = F(xs[i]..., p0 + i); });
+  else
+    for (size_t i = 0; i < N; ++i)
+      r[i] = F(xs[i]..., p0 + i);
+  return r;
+}
+} // namespace detail
+// Public overloads of NAME taking a trailing pointer; forward to NAME_impl.
+#define LAST_PTR_SCALAR(NUM_ARGS, NAME, SCALAR_ENABLER, TYPE)                  \
+  template <typename PtrTy>                                                    \
+  detail::SCALAR_ENABLER<TYPE, PtrTy> NAME(                                    \
+      SYCL_CONCAT(LESS_ONE(NUM_ARGS), _TYPE_ARG)(TYPE), PtrTy p) {             \
+    return detail::NAME##_impl(SYCL_CONCAT(LESS_ONE(NUM_ARGS), _ARG), p);      \
+  }
+#define BUILTIN_LAST_PTR_COMMON(NUM_ARGS, NAME, SCALAR_ENABLER,                \
+                                NON_SCALAR_ENABLER)                            \
+  FOR_EACH3(LAST_PTR_SCALAR, NUM_ARGS, NAME, SCALAR_ENABLER, FP_TYPES)         \
+  template <SYCL_CONCAT(LESS_ONE(NUM_ARGS), _TYPENAME_TYPE), typename PtrTy>   \
+  detail::NON_SCALAR_ENABLER<SYCL_CONCAT(LESS_ONE(NUM_ARGS), _TEMPLATE_TYPE),  \
+                             PtrTy>                                            \
+  NAME(SYCL_CONCAT(LESS_ONE(NUM_ARGS), _TEMPLATE_TYPE_ARG), PtrTy p) {         \
+    return detail::NAME##_impl(SYCL_CONCAT(LESS_ONE(NUM_ARGS), _ARG), p);      \
+  }
+// BUILTIN_LAST_PTR: the device variant also defines detail::NAME_impl.
+#if __SYCL_DEVICE_ONLY__
+#define BUILTIN_LAST_PTR(NUM_ARGS, NAME, SCALAR_ENABLER, NON_SCALAR_ENABLER)   \
+  namespace detail {                                                           \
+  template <NUM_ARGS##_TYPENAME_TYPE>                                          \
+  auto NAME##_impl(NUM_ARGS##_TEMPLATE_TYPE_ARG_REF) {                         \
+    if constexpr (is_marray_v<T0>) {                                           \
+      return builtin_delegate_ptr_impl(                                        \
+          [](auto... xs) { return NAME##_impl(xs...); },                       \
+          NUM_ARGS##_ARG_ROTATED);                                             \
+    } else {                                                                   \
+      return __spirv_ocl_##NAME(NUM_ARGS##_CONVERTED_ARG);                     \
+    }                                                                          \
+  }                                                                            \
+  } /* namespace detail */                                                     \
+  BUILTIN_LAST_PTR_COMMON(NUM_ARGS, NAME, SCALAR_ENABLER, NON_SCALAR_ENABLER)
+#else
+#define BUILTIN_LAST_PTR(NUM_ARGS, NAME, SCALAR_ENABLER, NON_SCALAR_ENABLER)   \
+  BUILTIN_LAST_PTR_COMMON(NUM_ARGS, NAME, SCALAR_ENABLER, NON_SCALAR_ENABLER)
+#endif
+// Deprecated raw-pointer overloads for builtins with a trailing int pointer.
+namespace detail {
+template <typename T>
+using builtin_last_raw_intptr_t =
+    // FIXME: Should we allow marray here, or limit just to vec/swizzle/scalar?
+    //        If not, "enabler" has to be changed as well.
+    change_elements_t<std::conditional_t<is_marray_v<T>, int, int32_t>,
+                      simplify_if_swizzle_t<T>> *;
+}
+#define BUILTIN_LAST_RAW_INTPTR(NUM_ARGS, NAME)                                \
+  template <SYCL_CONCAT(LESS_ONE(NUM_ARGS), _TYPENAME_TYPE)>                   \
+  __SYCL_DEPRECATED("SYCL builtin functions with raw pointer arguments have "  \
+                    "been deprecated. Please use multi_ptr.")                  \
+  detail::builtin_enable_math_allow_scalar_t<SYCL_CONCAT(LESS_ONE(NUM_ARGS),   \
+                                                         _TEMPLATE_TYPE)>      \
+  NAME(SYCL_CONCAT(LESS_ONE(NUM_ARGS), _TEMPLATE_TYPE_ARG),                    \
+       detail::builtin_last_raw_intptr_t<T0> p) {                              \
+    return detail::NAME##_impl(SYCL_CONCAT(LESS_ONE(NUM_ARGS), _ARG), p);      \
+  }
+// BUILTIN_LAST_INTPTR: builtins whose last argument points to int storage.
+#ifdef __SYCL_DEVICE_ONLY__
+#define BUILTIN_LAST_INTPTR(NUM_ARGS, NAME)                                    \
+  BUILTIN_LAST_PTR(NUM_ARGS, NAME, builtin_enable_last_intptr_scalar_t,        \
+                   builtin_enable_last_intptr_non_scalar_t)                    \
+  BUILTIN_LAST_RAW_INTPTR(NUM_ARGS, NAME)
+#else
+#define LAST_INT_PTR_DECLARE_SCALAR(NUM_ARGS, NAME, TYPE)                      \
+  __SYCL_EXPORT TYPE NAME##_impl(SYCL_CONCAT(LESS_ONE(NUM_ARGS), _TYPE)(TYPE), \
+                                 int *);
+#define BUILTIN_LAST_INTPTR(NUM_ARGS, NAME)                                    \
+  namespace detail {                                                           \
+  FOR_EACH2(LAST_INT_PTR_DECLARE_SCALAR, NUM_ARGS, NAME, FP_TYPES)             \
+  template <SYCL_CONCAT(LESS_ONE(NUM_ARGS), _TYPENAME_TYPE), typename PtrTy>   \
+  auto NAME##_impl(SYCL_CONCAT(LESS_ONE(NUM_ARGS), _TEMPLATE_TYPE_ARG_REF),    \
+                   PtrTy p) {                                                  \
+    if constexpr (is_multi_ptr_v<PtrTy>) {                                     \
+      /* TODO: Can't really create multi_ptr on host... */                     \
+      return NAME##_impl(SYCL_CONCAT(LESS_ONE(NUM_ARGS), _ARG), p.get_raw());  \
+    } else {                                                                   \
+      return builtin_delegate_ptr_impl(                                        \
+          [](auto... xs) { return NAME##_impl(xs...); }, p,                    \
+          SYCL_CONCAT(LESS_ONE(NUM_ARGS), _SIMPLIFIED_ARG));                   \
+    }                                                                          \
+  }                                                                            \
+  } /* namespace detail */                                                     \
+  BUILTIN_LAST_PTR(NUM_ARGS, NAME, builtin_enable_last_intptr_scalar_t,        \
+                   builtin_enable_last_intptr_non_scalar_t)                    \
+  BUILTIN_LAST_RAW_INTPTR(NUM_ARGS, NAME)
+#endif
+
+BUILTIN_LAST_INTPTR(TWO_ARGS, frexp)
+BUILTIN_LAST_INTPTR(TWO_ARGS, lgamma_r)
+BUILTIN_LAST_INTPTR(THREE_ARGS, remquo)
+
+#undef BUILTIN_LAST_INTPTR
+#undef LAST_INT_PTR_DECLARE_SCALAR
+// fract: the host implementation stores floor(x) through y (see fract_impl).
+#ifndef __SYCL_DEVICE_ONLY__
+namespace detail {
+template <typename T0, typename T1> auto fract_impl(T0 &x, T1 &y) {
+  auto flr = floor(simplify_if_swizzle_t<T0>{x});
+  *y = flr;
+  return fmin(x - flr, nextafter(simplify_if_swizzle_t<T0>{1.0},
+                                 simplify_if_swizzle_t<T0>{0.0}));
+}
+} // namespace detail
+#endif
+BUILTIN_LAST_PTR(TWO_ARGS, fract, builtin_enable_ptr_scalar_t,
+                 builtin_enable_ptr_non_scalar_t)
+template <typename T0>
+__SYCL_DEPRECATED("SYCL builtin functions with raw pointer arguments have been "
+                  "deprecated. Please use multi_ptr.")
+detail::builtin_enable_math_allow_scalar_t<T0> fract(
+    T0 x, detail::simplify_if_swizzle_t<T0> *y) {
+  return detail::fract_impl(x, y);
+}
+// modf: host scalars use exported modf_impl, other shapes go element-wise.
+#ifndef __SYCL_DEVICE_ONLY__
+namespace detail {
+__SYCL_EXPORT float modf_impl(float, float *);
+__SYCL_EXPORT double modf_impl(double, double *);
+__SYCL_EXPORT half modf_impl(half, half *);
+template <typename T0, typename T1> auto modf_impl(T0 &x, T1 &&y) {
+  if constexpr (is_multi_ptr_v<std::remove_reference_t<T1>>) {
+    // TODO: Spec needs to be clarified, multi_ptr shouldn't be possible on
+    // host.
+    return modf_impl(x, y.get_raw());
+  } else {
+    return builtin_delegate_ptr_impl(
+        [](auto x, auto y) { return modf_impl(x, y); }, y,
+        simplify_if_swizzle_t<T0>{x});
+  }
+}
+} // namespace detail
+#endif
+BUILTIN_LAST_PTR(TWO_ARGS, modf, builtin_enable_ptr_scalar_t,
+                 builtin_enable_ptr_non_scalar_t)
+template <typename T0>
+__SYCL_DEPRECATED("SYCL builtin functions with raw pointer arguments have been "
+                  "deprecated. Please use multi_ptr.")
+detail::builtin_enable_math_allow_scalar_t<T0> modf(
+    T0 x, detail::simplify_if_swizzle_t<T0> *y) {
+  return detail::modf_impl(x, y);
+}
+
+#undef BUILTIN_LAST_PTR
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define BUILTIN_MATH_LAST_INT(NAME)                                            \
+  inline float NAME(float x, int y) { return __spirv_ocl_##NAME(x, y); }       \
+  inline double NAME(double x, int y) { return __spirv_ocl_##NAME(x, y); }     \
+  inline half NAME(half x, int y) {                                            \
+    return __spirv_ocl_##NAME(detail::builtins::convert_arg(x), y);            \
+  } /* inline: header-defined, same as the host-side scalar overloads */       \
+  DEVICE_IMPL_TEMPLATE(TWO_ARGS, NAME, builtin_enable_last_int_t,              \
+                       __spirv_ocl_##NAME)
+#else
+#define SCALAR_EXTERN_LAST_INT(NAME, TYPE)                                     \
+  inline TYPE NAME(TYPE x, int y) {                                            \
+    extern TYPE __##NAME##_impl(TYPE, int);                                    \
+    return __##NAME##_impl(x, y);                                              \
+  }
+#define BUILTIN_MATH_LAST_INT(NAME)                                            \
+  FOR_EACH1(SCALAR_EXTERN_LAST_INT, NAME, FP_TYPES)                            \
+  HOST_IMPL_TEMPLATE(TWO_ARGS, NAME, builtin_enable_last_int_t, math,          \
+                     default_ret_type)
+#endif
+
+BUILTIN_MATH_LAST_INT(pown)
+BUILTIN_MATH_LAST_INT(rootn)
+BUILTIN_MATH_LAST_INT(ldexp)
+template <typename T> detail::builtin_enable_math_t<T> ldexp(T x, int y) {
+  return ldexp(
+      detail::simplify_if_swizzle_t<T>{x},
+      detail::change_elements_t<int, detail::simplify_if_swizzle_t<T>>{y});
+}
+
+#undef BUILTIN_MATH_LAST_INT
+#undef SCALAR_EXTERN_LAST_INT
+
+namespace detail {
+#ifdef __SYCL_DEVICE_ONLY__
+template <typename T0, typename T1> auto sincos_impl(T0 &x, T1 &&y) {
+  if constexpr (is_marray_v<T0>) { // marray goes through the delegate helper
+    return builtin_delegate_ptr_impl(
+        [](auto... xs) { return sincos_impl(xs...); }, y, x);
+  } else {
+    using detail::builtins::convert_arg;
+    if constexpr (use_fast_math_v<T0>) {
+      // This is a performance optimization to ensure that sincos isn't slower
+      // than a pair of sin/cos executed separately. Theoretically, calling
+      // non-native sincos might be faster than calling native::sin plus
+      // native::cos separately and we'd need some kind of cost model to make
+      // the right decision (and move this entirely to the JIT/AOT compilers).
+      // However, in practice, this simpler solution seems to work just fine and
+      // matches how sin/cos above are optimized for the fast math path.
+      *y = __spirv_ocl_native_cos(convert_arg(x)); // cosine stored through y
+      return __spirv_ocl_native_sin(convert_arg(x)); // sine is the result
+    } else {
+      return __spirv_ocl_sincos(convert_arg(x), convert_arg(y));
+    }
+  }
+}
+#else
+__SYCL_EXPORT float sincos_impl(float, float *);
+__SYCL_EXPORT double sincos_impl(double, double *);
+__SYCL_EXPORT half sincos_impl(half, half *);
+template <typename T0, typename T1> auto sincos_impl(T0 &x, T1 &&y) {
+  if constexpr (is_multi_ptr_v<std::remove_reference_t<T1>>) {
+    // TODO: Spec needs to be clarified, multi_ptr shouldn't be possible on
+    // host. For now it is unwrapped and the call re-dispatches on get_raw().
+    return sincos_impl(x, y.get_raw());
+  } else { // y is not a multi_ptr: delegate through the pointer helper
+    return builtin_delegate_ptr_impl(
+        [](auto... xs) { return sincos_impl(xs...); }, y,
+        simplify_if_swizzle_t<T0>{x});
+  }
+}
+#endif
+} // namespace detail
+BUILTIN_LAST_PTR_COMMON(TWO_ARGS, sincos, builtin_enable_ptr_scalar_t,
+                        builtin_enable_ptr_non_scalar_t)
+template <typename T0>
+__SYCL_DEPRECATED("SYCL builtin functions with raw pointer arguments have been "
+                  "deprecated. Please use multi_ptr.")
+detail::builtin_enable_math_allow_scalar_t<T0> sincos(
+    T0 x, detail::simplify_if_swizzle_t<T0> *y) {
+  return detail::sincos_impl(x, y);
+}
+
+#undef BUILTIN_LAST_PTR_COMMON
+#undef LAST_PTR_SCALAR
+
+namespace detail {
+template <typename T>
+struct ilogb_ret_traits
+    : change_elements<std::conditional_t<is_vec_or_swizzle_v<T>, int32_t, int>,
+                      T> {};
+template <typename T>
+using builtin_enable_ilogb_t =
+    std::enable_if_t<builtin_enable_math_allow_scalar_v<T>,
+                     typename ilogb_ret_traits<T>::type>;
+} // namespace detail
+
+#ifdef __SYCL_DEVICE_ONLY__
+inline int ilogb(float x) { return __spirv_ocl_ilogb(x); }
+inline int ilogb(double x) { return __spirv_ocl_ilogb(x); }
+inline int ilogb(half x) {
+  return __spirv_ocl_ilogb(detail::builtins::convert_arg(x));
+}
+DEVICE_IMPL_TEMPLATE(ONE_ARG, ilogb, builtin_enable_ilogb_t, __spirv_ocl_ilogb)
+#else
+inline int ilogb(float x) {
+  extern int __ilogb_impl(float); // block-scope declaration only
+  return __ilogb_impl(x);
+}
+inline int ilogb(double x) {
+  extern int __ilogb_impl(double); // block-scope declaration only
+  return __ilogb_impl(x);
+}
+inline int ilogb(half x) {
+  extern int __ilogb_impl(half); // block-scope declaration only
+  return __ilogb_impl(x);
+}
+HOST_IMPL_TEMPLATE(ONE_ARG, ilogb, builtin_enable_ilogb_t, math,
+                   ilogb_ret_traits)
+#endif
+
+// nan implementation, as per
+// https://github.com/KhronosGroup/SYCL-Docs/pull/519.
+namespace detail {
+template <typename T>
+// clang-format off
+using nan_elem_result_type = change_elements_t<
+    typename map_type<get_elem_type_t<T>,
+                      uint32_t, /*->*/ float,
+                      uint64_t, /*->*/ double,
+                      uint16_t, /*->*/ half>::type,
+    T>;
+// clang-format on
+
+template <typename T>
+using builtin_enable_nan_t = std::enable_if_t<
+    (((is_vec_or_swizzle_v<T> || is_marray_v<T>)) &&
+     check_type_in_v<get_elem_type_t<T>, uint32_t, uint64_t, uint16_t>),
+    nan_elem_result_type<T>>;
+} // namespace detail
+
+#ifdef __SYCL_DEVICE_ONLY__
+inline float nan(uint32_t x) {
+  return __spirv_ocl_nan(detail::builtins::convert_arg(x));
+}
+inline double nan(uint64_t x) {
+  return __spirv_ocl_nan(detail::builtins::convert_arg(x));
+}
+inline half nan(uint16_t x) {
+  return __spirv_ocl_nan(detail::builtins::convert_arg(x));
+}
+DEVICE_IMPL_TEMPLATE(ONE_ARG, nan, builtin_enable_nan_t, __spirv_ocl_nan)
+#else
+inline float nan(uint32_t) { return std::numeric_limits<float>::quiet_NaN(); }
+inline double nan(uint64_t) { return std::numeric_limits<double>::quiet_NaN(); }
+// NOTE: half_type.hpp provides partial specialization for std::numeric_limits.
+inline half nan(uint16_t) { return std::numeric_limits<half>::quiet_NaN(); }
+template <typename T> detail::builtin_enable_nan_t<T> nan(T x) {
+  return detail::builtin_delegate_to_scalar([](auto x) { return nan(x); }, x);
+}
+#endif
+
+template <typename T>
+__SYCL_DEPRECATED("abs for floating point types is non-standard and has been "
+                  "deprecated. Please use fabs instead.")
+detail::builtin_enable_math_allow_scalar_t<T> abs(T x) {
+  return fabs(x);
+}
+} // namespace _V1
+} // namespace sycl
diff --git a/sycl/include/sycl/detail/builtins/native_math_functions.inc b/sycl/include/sycl/detail/builtins/native_math_functions.inc
new file mode 100644
index 0000000000000..01cb6f43e2c59
--- /dev/null
+++ b/sycl/include/sycl/detail/builtins/native_math_functions.inc
@@ -0,0 +1,53 @@
+//==------------------- native_math_functions.inc --------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Intentionally insufficient set of includes and no "#pragma once".
+
+#include <sycl/detail/builtins/helper_macros.hpp>
+
+namespace sycl {
+inline namespace _V1 {
+BUILTIN_CREATE_ENABLER(builtin_enable_native_math, default_ret_type,
+                       float_elem_type, non_scalar_only, same_elem_type)
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define BUILTIN_NATIVE(NUM_ARGS, NAME)                                         \
+  inline float NAME(NUM_ARGS##_TYPE_ARG(float)) {                              \
+    return __spirv_ocl_native_##NAME(NUM_ARGS##_ARG);                          \
+  }                                                                            \
+  DEVICE_IMPL_TEMPLATE_CUSTOM_DELEGATE(                                        \
+      NUM_ARGS, NAME, builtin_enable_native_math_t, builtin_marray_impl,       \
+      native, __spirv_ocl_native_##NAME)
+#else
+#define BUILTIN_NATIVE(NUM_ARGS, NAME)                                         \
+  HOST_IMPL_SCALAR(NUM_ARGS, NAME, float)                                      \
+  HOST_IMPL_TEMPLATE(NUM_ARGS, NAME, builtin_enable_native_math_t, native,     \
+                     default_ret_type)
+#endif
+
+namespace native {
+BUILTIN_NATIVE(ONE_ARG, cos)
+BUILTIN_NATIVE(ONE_ARG, exp)
+BUILTIN_NATIVE(ONE_ARG, exp10)
+BUILTIN_NATIVE(ONE_ARG, exp2)
+BUILTIN_NATIVE(ONE_ARG, log)
+BUILTIN_NATIVE(ONE_ARG, log10)
+BUILTIN_NATIVE(ONE_ARG, log2)
+BUILTIN_NATIVE(ONE_ARG, recip)
+BUILTIN_NATIVE(ONE_ARG, rsqrt)
+BUILTIN_NATIVE(ONE_ARG, sin)
+BUILTIN_NATIVE(ONE_ARG, sqrt)
+BUILTIN_NATIVE(ONE_ARG, tan)
+BUILTIN_NATIVE(TWO_ARGS, divide)
+BUILTIN_NATIVE(TWO_ARGS, powr)
+} // namespace native
+
+#undef BUILTIN_NATIVE
+#undef VEC_EXTERN
+} // namespace _V1
+} // namespace sycl
diff --git a/sycl/include/sycl/detail/builtins/relational_functions.inc b/sycl/include/sycl/detail/builtins/relational_functions.inc
new file mode 100644
index 0000000000000..64a7d67934da1
--- /dev/null
+++ b/sycl/include/sycl/detail/builtins/relational_functions.inc
@@ -0,0 +1,264 @@
+//==------------------- relational_functions.inc ---------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Intentionally insufficient set of includes and no "#pragma once".
+
+#include <sycl/detail/builtins/helper_macros.hpp>
+
+namespace sycl {
+inline namespace _V1 {
+namespace detail {
+template <typename T>
+struct rel_ret_traits
+    : std::conditional<is_scalar_arithmetic_v<T>, bool,
+                       std::conditional_t<
+                           is_marray_v<T>, marray<bool, num_elements<T>::value>,
+                           same_size_signed_int_t<simplify_if_swizzle_t<T>>>> {
+};
+} // namespace detail
+BUILTIN_CREATE_ENABLER(builtin_enable_rel, rel_ret_traits, fp_elem_type,
+                       non_scalar_only, same_elem_type)
+
+namespace detail {
+#ifdef __SYCL_DEVICE_ONLY__
+template <typename FuncTy, typename... Ts>
+auto builtin_device_rel_impl(FuncTy F, const Ts &...xs) {
+  using T = typename first_type<Ts...>::type; // dispatch on the first arg
+  if constexpr (detail::is_vec_or_swizzle_v<T>) {
+    // decltype(ret) is signed char ext_vector_type(N). Convert it to
+    // sycl::vec<signed char, N> first and then to the required return type of
+    // the relation builtin (vector of int16_t/int32_t/int64_t depending on the
+    // arguments' element type).
+    auto ret = F(builtins::convert_arg(xs)...);
+    vec<signed char, num_elements<T>::value> tmp{ret};
+    using res_elem_type =
+        make_type_t<get_elem_type_t<T>, type_list<int16_t, int32_t, int64_t>>;
+    static_assert(is_scalar_arithmetic_v<res_elem_type>);
+    return tmp.template convert<res_elem_type>();
+  } else if constexpr (std::is_same_v<T, half>) {
+    return bool{F(builtins::convert_arg(xs)...)}; // half: force a bool result
+  } else { // remaining types: F's result is returned as is
+    static_assert(!detail::is_swizzle_v<T>);
+    return F(builtins::convert_arg(xs)...);
+  }
+}
+#endif
+
+template <typename FuncTy, typename... Ts>
+auto builtin_delegate_rel_impl(FuncTy F, const Ts &...x) {
+  using T = typename first_type<Ts...>::type; // dispatch on the first arg
+  if constexpr ((... || is_swizzle_v<Ts>)) { // at least one swizzle argument
+    return F(simplify_if_swizzle_t<T>{x}...);
+  } else if constexpr (is_vec_v<T>) {
+    // TODO: using Res{} to avoid Werror. Not sure if ok.
+    vec<same_size_signed_int_t<get_elem_type_t<T>>, T::size()> Res{};
+    detail::loop<T::size()>(
+        [&](auto idx) { Res[idx] = F(x[idx]...) ? -1 : 0; }); // true -> -1
+    return Res;
+  } else {
+    // marray: one bool per element.
+    marray<bool, T::size()> Res;
+    // TODO: Can we optimize this? Note that using vector version isn't
+    // straightforward as it doesn't return booleans.
+    detail::loop<T::size()>([&](auto idx) { Res[idx] = F(x[idx]...); });
+    return Res;
+  }
+}
+} // namespace detail
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define BUILTIN_REL(NUM_ARGS, NAME, SPIRV_IMPL)                                \
+  bool NAME(NUM_ARGS##_TYPE_ARG(float)) { return SPIRV_IMPL(NUM_ARGS##_ARG); } \
+  bool NAME(NUM_ARGS##_TYPE_ARG(double)) {                                     \
+    return SPIRV_IMPL(NUM_ARGS##_ARG);                                         \
+  }                                                                            \
+  bool NAME(NUM_ARGS##_TYPE_ARG(half)) {                                       \
+    return SPIRV_IMPL(NUM_ARGS##_CONVERTED_ARG);                               \
+  } /* NOTE(review): scalar overloads above are not inline; confirm. */        \
+  template <NUM_ARGS##_TYPENAME_TYPE>                                          \
+  detail::builtin_enable_rel_t<NUM_ARGS##_TEMPLATE_TYPE> NAME(                 \
+      NUM_ARGS##_TEMPLATE_TYPE_ARG) {                                          \
+    if constexpr (detail::is_marray_v<T0>) /* marray: NAME per element */      \
+      return detail::builtin_delegate_rel_impl(                                \
+          [&](NUM_ARGS##_AUTO_ARG) { return NAME(NUM_ARGS##_ARG); },           \
+          NUM_ARGS##_ARG);                                                     \
+    else /* non-marray: one SPIRV_IMPL call on the whole value */              \
+      return detail::builtin_device_rel_impl(                                  \
+          [&](NUM_ARGS##_AUTO_ARG) { return SPIRV_IMPL(NUM_ARGS##_ARG); },     \
+          NUM_ARGS##_ARG);                                                     \
+  }
+#else
+#define BUILTIN_REL(NUM_ARGS, NAME, SPIRV_IMPL)                                \
+  FOR_EACH3(HOST_IMPL_SCALAR_RET_TYPE, NUM_ARGS, NAME, bool, FP_TYPES)         \
+  HOST_IMPL_TEMPLATE_CUSTOM_DELEGATOR(NUM_ARGS, NAME, builtin_enable_rel_t,    \
+                                      rel, rel_ret_traits,                     \
+                                      builtin_delegate_rel_impl)
+#endif
+
+BUILTIN_REL(TWO_ARGS, isequal, __spirv_FOrdEqual)
+BUILTIN_REL(TWO_ARGS, isnotequal, __spirv_FUnordNotEqual)
+BUILTIN_REL(TWO_ARGS, isgreater, __spirv_FOrdGreaterThan)
+BUILTIN_REL(TWO_ARGS, isgreaterequal, __spirv_FOrdGreaterThanEqual)
+BUILTIN_REL(TWO_ARGS, isless, __spirv_FOrdLessThan)
+BUILTIN_REL(TWO_ARGS, islessequal, __spirv_FOrdLessThanEqual)
+BUILTIN_REL(TWO_ARGS, islessgreater, __spirv_FOrdNotEqual)
+BUILTIN_REL(ONE_ARG, isfinite, __spirv_IsFinite)
+BUILTIN_REL(ONE_ARG, isinf, __spirv_IsInf)
+BUILTIN_REL(ONE_ARG, isnan, __spirv_IsNan)
+BUILTIN_REL(ONE_ARG, isnormal, __spirv_IsNormal)
+BUILTIN_REL(TWO_ARGS, isordered, __spirv_Ordered)
+BUILTIN_REL(TWO_ARGS, isunordered, __spirv_Unordered)
+BUILTIN_REL(ONE_ARG, signbit, __spirv_SignBitSet)
+
+#undef BUILTIN_REL
+
+#ifdef __SYCL_DEVICE_ONLY__
+DEVICE_IMPL_TEMPLATE(THREE_ARGS, bitselect, builtin_enable_generic_t,
+                     __spirv_ocl_bitselect)
+#else
+HOST_IMPL_TEMPLATE(THREE_ARGS, bitselect, builtin_enable_generic_t, rel,
+                   default_ret_type)
+#endif
+
+namespace detail {
+template <typename T>
+struct builtin_enable_rel_all_any
+    : std::enable_if<(is_marray_v<T> &&
+                      std::is_same_v<get_elem_type_t<T>, bool>) ||
+                         (is_vec_or_swizzle_v<T> &&
+                          check_type_in_v<get_elem_type_t<T>, int8_t, int16_t,
+                                          int32_t, int64_t>),
+                     std::conditional_t<is_marray_v<T>, bool, int>> {};
+
+template <typename T>
+struct builtin_enable_rel_all_any_deprecated
+    : std::enable_if<((is_scalar_arithmetic_v<T> || is_marray_v<T>)) &&
+                         check_type_in_v<get_elem_type_t<T>, signed char, short,
+                                         int, long, long long>,
+                     bool> {};
+} // namespace detail
+
+template <typename T>
+typename detail::builtin_enable_rel_all_any<T>::type any(T x) {
+  if constexpr (detail::is_marray_v<T>) { // marray of bool: plain truth test
+    return std::any_of(x.begin(), x.end(), [](bool x) { return x; });
+  } else { // vec/swizzle: an element counts as true when its MSB is set
+    for (size_t i = 0; i < detail::num_elements<T>::value; ++i)
+      if (detail::msbIsSet(x[i]))
+        return true;
+    return false;
+  }
+}
+
+template <typename T>
+__SYCL_DEPRECATED("This overload is deprecated in SYCL 2020.")
+typename detail::builtin_enable_rel_all_any_deprecated<T>::type any(T x) {
+  if constexpr (detail::is_marray_v<T>) { // marray: MSB test per element
+    return std::any_of(x.begin(), x.end(),
+                       [](auto x) { return detail::msbIsSet(x); });
+  } else { // scalar: MSB test on the value itself
+    return detail::msbIsSet(x);
+  }
+}
+
+template <typename T>
+typename detail::builtin_enable_rel_all_any<T>::type all(T x) {
+  if constexpr (detail::is_marray_v<T>) { // marray of bool: plain truth test
+    return std::all_of(x.begin(), x.end(), [](bool x) { return x; });
+  } else { // vec/swizzle: every element must have its MSB set
+    for (size_t i = 0; i < detail::num_elements<T>::value; ++i)
+      if (!detail::msbIsSet(x[i]))
+        return false;
+    return true;
+  }
+}
+
+template <typename T>
+__SYCL_DEPRECATED("This overload is deprecated in SYCL 2020.")
+typename detail::builtin_enable_rel_all_any_deprecated<T>::type all(T x) {
+  if constexpr (detail::is_marray_v<T>) { // marray: MSB test per element
+    return std::all_of(x.begin(), x.end(),
+                       [](auto x) { return detail::msbIsSet(x); });
+  } else { // scalar: MSB test on the value itself
+    return detail::msbIsSet(x);
+  }
+}
+namespace detail {
+template <typename T>
+inline constexpr bool is_rel_generic_scalar_v =
+    check_type_in_v<T, char, signed char, short, int, long, long long,
+                    unsigned char, unsigned short, unsigned int, unsigned long,
+                    unsigned long long, float, double, half>;
+template <typename T>
+inline constexpr bool is_rel_vector_elem_type_v =
+    check_type_in_v<T, int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
+                    uint32_t, uint64_t, float, double, half>;
+
+template <typename T0, typename T1, typename T2>
+struct rel_enable_select_marray_helper {
+  static constexpr bool check_T0 =
+      is_marray_v<T0> && is_rel_generic_scalar_v<get_elem_type_t<T0>>;
+  static constexpr bool check_T1 = std::is_same_v<T0, T1>;
+  static constexpr bool check_T2 =
+      is_marray_v<T2> && std::is_same_v<get_elem_type_t<T2>, bool> &&
+      num_elements<T0>::value == num_elements<T2>::value;
+
+  static constexpr bool value = check_T0 && check_T1 && check_T2;
+};
+
+template <typename T0, typename T1, typename T2>
+struct rel_enable_select_vec_helper {
+  using T0_simplified = simplify_if_swizzle_t<T0>;
+  using T1_simplified = simplify_if_swizzle_t<T1>;
+  using T2_simplified = simplify_if_swizzle_t<T2>; // not referenced below
+  using T0_elem_type = get_elem_type_t<T0>;
+  using T2_elem_type = get_elem_type_t<T2>;
+
+  static constexpr bool check_T0 =
+      is_vec_or_swizzle_v<T0> && is_rel_vector_elem_type_v<get_elem_type_t<T0>>;
+  static constexpr bool check_T1 = std::is_same_v<T0_simplified, T1_simplified>;
+  static constexpr bool check_T2 =
+      is_vec_or_swizzle_v<T2> && // T2 is select's condition argument
+      num_elements<T0>::value == num_elements<T2>::value &&
+      std ::is_integral_v<T2_elem_type> &&
+      sizeof(T0_elem_type) == sizeof(T2_elem_type); // same element width as T0
+  static constexpr bool value = check_T0 && check_T1 && check_T2;
+};
+
+template <typename T0, typename T1, typename T2>
+inline constexpr bool rel_enable_select_v =
+    detail::rel_enable_select_marray_helper<T0, T1, T2>::value ||
+    detail::rel_enable_select_vec_helper<T0, T1, T2>::value;
+} // namespace detail
+
+// __spirv_ocl_select doesn't behave as required by SYCL/OpenCL spec for vector
+// data types (MSB-related stuff).
+template <typename T>
+std::enable_if_t<detail::is_rel_generic_scalar_v<T>, T> select(T a, T b,
+                                                               bool c) {
+  return (c ? b : a); // b when c is true, a otherwise
+}
+
+template <typename T0, typename T1, typename T2>
+std::enable_if_t<detail::rel_enable_select_v<T0, T1, T2>,
+                 detail::simplify_if_swizzle_t<T0>>
+select(T0 a, T1 b, T2 c) {
+  if constexpr (detail::is_marray_v<T0>) {
+    T0 ret;
+    for (size_t i = 0; i < T0::size(); ++i)
+      ret[i] = (c[i] ? b[i] : a[i]); // marray: c[i] itself is the condition
+    return ret;
+  } else {
+    detail::simplify_if_swizzle_t<T0> ret;
+    for (size_t i = 0; i < ret.size(); ++i)
+      ret[i] = (detail::msbIsSet(c[i]) ? b[i] : a[i]); // MSB set picks b[i]
+    return ret;
+  }
+}
+} // namespace _V1
+} // namespace sycl
diff --git a/sycl/include/sycl/detail/type_traits.hpp b/sycl/include/sycl/detail/type_traits.hpp
index d7facf82bbb30..6d0ed81bcbcf0 100644
--- a/sycl/include/sycl/detail/type_traits.hpp
+++ b/sycl/include/sycl/detail/type_traits.hpp
@@ -155,7 +155,7 @@ template <class T> using marray_element_t = typename T::value_type;
 // get_elem_type
 // Get the element type of T. If T is a scalar, the element type is considered
 // the type of the scalar.
-template <typename T> struct get_elem_type {
+template <typename T, typename = void> struct get_elem_type {
   using type = T;
 };
 template <typename T, size_t N> struct get_elem_type<marray<T, N>> {
@@ -170,6 +170,29 @@ struct get_elem_type<SwizzleOp<VecT, OperationLeftT, OperationRightT,
                                OperationCurrentT, Indexes...>> {
   using type = typename get_elem_type<std::remove_cv_t<VecT>>::type;
 };
+
+template <typename ElementType, access::address_space Space,
+          access::decorated DecorateAddress>
+struct get_elem_type<multi_ptr<ElementType, Space, DecorateAddress>> {
+  using type = ElementType;
+};
+
+template <typename T, typename = void>
+struct is_ext_vector : std::false_type {}; // fallback: not an ext vector
+
+template <typename T>
+struct is_ext_vector<
+    T, std::void_t<decltype(__builtin_reduce_max(std::declval<T>()))>>
+    : std::true_type {}; // picked when __builtin_reduce_max accepts a T
+
+template <typename T>
+inline constexpr bool is_ext_vector_v = is_ext_vector<T>::value;
+
+template <typename T>
+struct get_elem_type<T, std::enable_if_t<is_ext_vector_v<T>>> {
+  using type = decltype(__builtin_reduce_max(std::declval<T>()));
+};
+
 template <typename T> using get_elem_type_t = typename get_elem_type<T>::type;
 
 // change_base_type_t
@@ -298,6 +321,9 @@ template <typename T>
 struct is_scalar_arithmetic
     : std::bool_constant<!is_vec<T>::value && is_arithmetic<T>::value> {};
 
+template <typename T>
+inline constexpr bool is_scalar_arithmetic_v = is_scalar_arithmetic<T>::value;
+
 template <typename T>
 struct is_vector_arithmetic
     : std::bool_constant<is_vec<T>::value && is_arithmetic<T>::value> {};
@@ -484,6 +510,28 @@ template <typename Ret, typename... Args> struct function_traits<Ret(Args...)> {
   using args_type = std::tuple<Args...>;
 };
 
+// No first_type_t due to
+// https://open-std.org/jtc1/sc22/wg21/docs/cwg_active.html#1430.
+template <typename T, typename... Ts> struct first_type {
+  using type = T;
+};
+
+template <typename T0, typename... Ts>
+inline constexpr bool all_same_v = (... && std::is_same_v<T0, Ts>);
+
+// Example usage:
+//   using mapped = map_type<type_to_map, from0, /*->*/ to0,
+//                                        from1, /*->*/ to1,
+//                                        ...>
+template <typename...> struct map_type {
+  using type = void; // end of the list: nothing matched
+};
+
+template <typename T, typename From, typename To, typename... Rest>
+struct map_type<T, From, To, Rest...> {
+  using type = std::conditional_t<std::is_same_v<From, T>, To,
+                                  typename map_type<T, Rest...>::type>;
+}; // first From equal to T selects its To; otherwise recurse on Rest
 } // namespace detail
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/include/sycl/detail/vector_convert.hpp b/sycl/include/sycl/detail/vector_convert.hpp
index e75aaa004b129..693e9c9f188df 100644
--- a/sycl/include/sycl/detail/vector_convert.hpp
+++ b/sycl/include/sycl/detail/vector_convert.hpp
@@ -57,13 +57,9 @@
 #include <sycl/detail/generic_type_traits.hpp> // for is_sigeninteger, is_s...
 
 #ifndef __SYCL_DEVICE_ONLY__
-
-#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
-#include <sycl/builtins_scalar_gen.hpp> // for ceil, floor, rint, trunc
-#else                                   // __INTEL_PREVIEW_BREAKING_CHANGES
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 #include <sycl/builtins_legacy_scalar.hpp> // for ceil, floor, rint, trunc
-#endif                                     // __INTEL_PREVIEW_BREAKING_CHANGES
-
+#endif
 #include <cfenv> // for fesetround, fegetround
 #endif
 
@@ -74,6 +70,19 @@ namespace sycl {
 enum class rounding_mode { automatic = 0, rte = 1, rtz = 2, rtp = 3, rtn = 4 };
 
 inline namespace _V1 {
+#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
+#ifndef __SYCL_DEVICE_ONLY__
+// TODO: Refactor includes so we can just "#include".
+inline float ceil(float);
+inline double ceil(double);
+inline float floor(float);
+inline double floor(double);
+inline float rint(float);
+inline double rint(double);
+inline float trunc(float);
+inline double trunc(double);
+#endif
+#endif
 namespace detail {
 
 template <typename T, typename R>
diff --git a/sycl/include/sycl/types.hpp b/sycl/include/sycl/types.hpp
index b9ec19748e05f..8debd52b8dfd8 100644
--- a/sycl/include/sycl/types.hpp
+++ b/sycl/include/sycl/types.hpp
@@ -1646,7 +1646,7 @@ class SwizzleOp {
 
   __SYCL2020_DEPRECATED("get_count() is deprecated, please use size() instead")
   size_t get_count() const { return size(); }
-  size_t size() const noexcept { return getNumElements(); }
+  static constexpr size_t size() noexcept { return getNumElements(); }
 
   template <int Num = getNumElements()>
   __SYCL2020_DEPRECATED(
diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt
index 3ea9217913b56..ddaa85f1a8dcb 100644
--- a/sycl/source/CMakeLists.txt
+++ b/sycl/source/CMakeLists.txt
@@ -155,7 +155,7 @@ function(add_sycl_rt_library LIB_NAME LIB_OBJ_NAME)
   endif()
 endfunction(add_sycl_rt_library)
 
-set(SYCL_SOURCES
+set(SYCL_COMMON_SOURCES
     "backend/opencl.cpp"
     "backend/level_zero.cpp"
     "backend.cpp"
@@ -163,11 +163,6 @@ set(SYCL_SOURCES
     "detail/allowlist.cpp"
     "detail/bindless_images.cpp"
     "detail/buffer_impl.cpp"
-    "detail/builtins_common.cpp"
-    "detail/builtins_geometric.cpp"
-    "detail/builtins_integer.cpp"
-    "detail/builtins_math.cpp"
-    "detail/builtins_relational.cpp"
     "detail/pi.cpp"
     "detail/common.cpp"
     "detail/config.cpp"
@@ -241,6 +236,25 @@ set(SYCL_SOURCES
     "$<$<OR:$<PLATFORM_ID:Linux>,$<PLATFORM_ID:Darwin>>:detail/posix_pi.cpp>"
 )
 
+set(SYCL_NON_PREVIEW_SOURCES "${SYCL_COMMON_SOURCES}"
+    "detail/builtins_common.cpp"
+    "detail/builtins_geometric.cpp"
+    "detail/builtins_integer.cpp"
+    "detail/builtins_math.cpp"
+    "detail/builtins_relational.cpp"
+)
+
+
+set(SYCL_PREVIEW_SOURCES "${SYCL_COMMON_SOURCES}"
+    "builtins/common_functions.cpp"
+    "builtins/geometric_functions.cpp"
+    "builtins/half_precision_math_functions.cpp"
+    "builtins/integer_functions.cpp"
+    "builtins/math_functions.cpp"
+    "builtins/native_math_functions.cpp"
+    "builtins/relational_functions.cpp"
+)
+
 if (MSVC)
   # MSVC provides two incompatible build variants for its CRT: release and debug
   # To avoid potential issues in user code we also need to provide two kinds
@@ -256,14 +270,14 @@ if (MSVC)
 
   set(WIN_DUPE "1")
   if (SYCL_ENABLE_XPTI_TRACING)
-    add_sycl_rt_library(sycl${SYCL_MAJOR_VERSION}d sycld_object XPTI_LIB xptid COMPILE_OPTIONS "/MDd" SOURCES ${SYCL_SOURCES})
+    add_sycl_rt_library(sycl${SYCL_MAJOR_VERSION}d sycld_object XPTI_LIB xptid COMPILE_OPTIONS "/MDd" SOURCES ${SYCL_NON_PREVIEW_SOURCES})
     if(SYCL_ENABLE_MAJOR_RELEASE_PREVIEW_LIB)
-      add_sycl_rt_library(sycl${SYCL_MAJOR_VERSION}-previewd sycl-previewd_object XPTI_LIB xptid COMPILE_OPTIONS "/MDd" "/D__INTEL_PREVIEW_BREAKING_CHANGES" SOURCES ${SYCL_SOURCES})
+      add_sycl_rt_library(sycl${SYCL_MAJOR_VERSION}-previewd sycl-previewd_object XPTI_LIB xptid COMPILE_OPTIONS "/MDd" "/D__INTEL_PREVIEW_BREAKING_CHANGES" SOURCES ${SYCL_PREVIEW_SOURCES})
     endif()
   else()
-    add_sycl_rt_library(sycl${SYCL_MAJOR_VERSION}d sycld_object COMPILE_OPTIONS "/MDd" SOURCES ${SYCL_SOURCES})
+    add_sycl_rt_library(sycl${SYCL_MAJOR_VERSION}d sycld_object COMPILE_OPTIONS "/MDd" SOURCES ${SYCL_NON_PREVIEW_SOURCES})
     if(SYCL_ENABLE_MAJOR_RELEASE_PREVIEW_LIB)
-      add_sycl_rt_library(sycl${SYCL_MAJOR_VERSION}-previewd sycl-previewd_object COMPILE_OPTIONS "/MDd" "/D__INTEL_PREVIEW_BREAKING_CHANGES" SOURCES ${SYCL_SOURCES})
+      add_sycl_rt_library(sycl${SYCL_MAJOR_VERSION}-previewd sycl-previewd_object COMPILE_OPTIONS "/MDd" "/D__INTEL_PREVIEW_BREAKING_CHANGES" SOURCES ${SYCL_PREVIEW_SOURCES})
     endif()
   endif()
   unset(WIN_DUPE)
@@ -288,14 +302,14 @@ set(LIB_NAME "sycl")
 endif()
 
 if (SYCL_ENABLE_XPTI_TRACING)
-  add_sycl_rt_library(${LIB_NAME} sycl_object XPTI_LIB xpti COMPILE_OPTIONS ${SYCL_EXTRA_OPTS}  SOURCES ${SYCL_SOURCES})
+  add_sycl_rt_library(${LIB_NAME} sycl_object XPTI_LIB xpti COMPILE_OPTIONS ${SYCL_EXTRA_OPTS}  SOURCES ${SYCL_NON_PREVIEW_SOURCES})
   if(SYCL_ENABLE_MAJOR_RELEASE_PREVIEW_LIB)
-    add_sycl_rt_library(${LIB_NAME}-preview sycl-preview_object XPTI_LIB xpti COMPILE_OPTIONS ${SYCL_EXTRA_OPTS} "-D__INTEL_PREVIEW_BREAKING_CHANGES" SOURCES ${SYCL_SOURCES})
+    add_sycl_rt_library(${LIB_NAME}-preview sycl-preview_object XPTI_LIB xpti COMPILE_OPTIONS ${SYCL_EXTRA_OPTS} "-D__INTEL_PREVIEW_BREAKING_CHANGES" SOURCES ${SYCL_PREVIEW_SOURCES})
   endif()
 else()
-  add_sycl_rt_library(${LIB_NAME} sycl_object COMPILE_OPTIONS ${SYCL_EXTRA_OPTS} SOURCES ${SYCL_SOURCES})
+  add_sycl_rt_library(${LIB_NAME} sycl_object COMPILE_OPTIONS ${SYCL_EXTRA_OPTS} SOURCES ${SYCL_NON_PREVIEW_SOURCES})
   if(SYCL_ENABLE_MAJOR_RELEASE_PREVIEW_LIB)
-    add_sycl_rt_library(${LIB_NAME}-preview sycl-preview_object COMPILE_OPTIONS ${SYCL_EXTRA_OPTS} "-D__INTEL_PREVIEW_BREAKING_CHANGES" SOURCES ${SYCL_SOURCES})
+    add_sycl_rt_library(${LIB_NAME}-preview sycl-preview_object COMPILE_OPTIONS ${SYCL_EXTRA_OPTS} "-D__INTEL_PREVIEW_BREAKING_CHANGES" SOURCES ${SYCL_PREVIEW_SOURCES})
   endif()
 endif()
 
diff --git a/sycl/source/builtins/common_functions.cpp b/sycl/source/builtins/common_functions.cpp
new file mode 100644
index 0000000000000..09742649ca24d
--- /dev/null
+++ b/sycl/source/builtins/common_functions.cpp
@@ -0,0 +1,78 @@
+//==------------------- common_functions.cpp -------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Define _USE_MATH_DEFINES to enforce math defines of macros like M_PI in
+// <cmath>. _USE_MATH_DEFINES is defined here before includes of SYCL header
+// files to avoid include of <cmath> via those SYCL headers with unset
+// _USE_MATH_DEFINES.
+//
+// Note that C++20 has std::numbers containing the constants but we're limited
+// to C++17.
+#define _USE_MATH_DEFINES
+#include <cmath>
+
+#include <sycl/builtins_preview.hpp>
+
+#include "host_helper_macros.hpp"
+
+namespace sycl {
+inline namespace _V1 {
+#define BUILTIN_COMMON(NUM_ARGS, NAME, IMPL)                                   \
+  HOST_IMPL(NAME, IMPL)                                                        \
+  EXPORT_SCALAR_AND_VEC_1_16(NUM_ARGS, NAME, FP_TYPES)
+
+BUILTIN_COMMON(ONE_ARG, degrees,
+               [](auto x) -> decltype(x) { return (180 / M_PI) * x; })
+
+BUILTIN_COMMON(ONE_ARG, radians,
+               [](auto x) -> decltype(x) { return (M_PI / 180) * x; })
+
+BUILTIN_COMMON(ONE_ARG, sign, [](auto x) -> decltype(x) {
+  using T = decltype(x);
+  if (std::isnan(x))
+    return T(0.0);
+  if (x > 0)
+    return T(1.0);
+  if (x < 0)
+    return T(-1.0);
+  /* x is +0.0 or -0.0 */
+  return x;
+})
+
+BUILTIN_COMMON(THREE_ARGS, mix, [](auto x, auto y, auto z) -> decltype(x) {
+  return x + (y - x) * z;
+})
+
+BUILTIN_COMMON(TWO_ARGS, step,
+               [](auto x, auto y) -> decltype(x) { return y < x ? 0.0 : 1.0; })
+
+BUILTIN_COMMON(THREE_ARGS, smoothstep,
+               [](auto x, auto y, auto z) -> decltype(x) {
+                 using T = decltype(x);
+                 auto t = sycl::clamp((z - x) / (y - x), T{0}, T{1});
+                 return t * t * (3 - 2 * t);
+               })
+
+BUILTIN_COMMON(TWO_ARGS, max,
+               [](auto x, auto y) -> decltype(x) { return (x < y ? y : x); })
+BUILTIN_COMMON(TWO_ARGS, min,
+               [](auto x, auto y) -> decltype(x) { return (y < x ? y : x); })
+
+// clamp is implemented for INTEGER_TYPES as well, so expand/inline
+// BUILTIN_COMMON manually.
+HOST_IMPL(clamp, [](auto x, auto y, auto z) -> decltype(x) {
+  using ElemTy = detail::get_elem_type_t<decltype(x)>;
+  if constexpr (std::is_integral_v<ElemTy>) {
+    return std::min(std::max(x, y), z);
+  } else {
+    return std::fmin(std::fmax(x, y), z);
+  }
+})
+EXPORT_SCALAR_AND_VEC_1_16(THREE_ARGS, clamp, INTEGER_TYPES, FP_TYPES)
+} // namespace _V1
+} // namespace sycl
diff --git a/sycl/source/builtins/geometric_functions.cpp b/sycl/source/builtins/geometric_functions.cpp
new file mode 100644
index 0000000000000..bc047ff8558c0
--- /dev/null
+++ b/sycl/source/builtins/geometric_functions.cpp
@@ -0,0 +1,75 @@
+//==------------------- geometric_functions.cpp ----------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <sycl/builtins_preview.hpp>
+
+#include "host_helper_macros.hpp"
+
+#include <cmath>
+
+namespace sycl {
+inline namespace _V1 {
+template <typename T> static inline T cross_host_impl(T p0, T p1) {
+  T result(0);
+  result.x() = p0.y() * p1.z() - p0.z() * p1.y();
+  result.y() = p0.z() * p1.x() - p0.x() * p1.z();
+  result.z() = p0.x() * p1.y() - p0.y() * p1.x();
+  return result;
+}
+EXPORT_VEC_3_4(TWO_ARGS, cross, FP_TYPES)
+
+template <typename T0, typename T1>
+static inline auto dot_host_impl(T0 x, T1 y) {
+  if constexpr (detail::is_scalar_arithmetic<T0>::value) {
+    return x * y;
+  } else {
+    auto R = x[0] * y[0];
+    for (size_t i = 1; i < detail::num_elements<T0>::value; ++i)
+      R += x[i] * y[i];
+    return R;
+  }
+}
+EXPORT_SCALAR_AND_VEC_2_4(TWO_ARGS, dot, FP_TYPES)
+
+template <typename T> static inline auto length_host_impl(T x) {
+  auto d = dot(x, x);
+  return static_cast<decltype(d)>(std::sqrt(d));
+}
+EXPORT_SCALAR_AND_VEC_2_4(ONE_ARG, length, FP_TYPES)
+// fast_length on host is the same as just length.
+template <typename T> static inline auto fast_length_host_impl(T x) {
+  return length_host_impl(x);
+}
+EXPORT_SCALAR_AND_VEC_2_4(ONE_ARG, fast_length, float)
+
+template <typename T0, typename T1>
+static inline auto distance_host_impl(T0 x, T1 y) {
+  return length(x - y);
+}
+EXPORT_SCALAR_AND_VEC_2_4(TWO_ARGS, distance, FP_TYPES)
+// fast_distance on host is the same as just distance.
+template <typename T0, typename T1>
+static inline auto fast_distance_host_impl(T0 x, T1 y) {
+  return distance_host_impl(x, y);
+}
+EXPORT_SCALAR_AND_VEC_2_4(TWO_ARGS, fast_distance, float)
+
+template <typename T> static inline auto normalize_host_impl(T x) {
+  auto len = length(x);
+  if (len == 0)
+    return x;
+  return x / len;
+}
+EXPORT_SCALAR_AND_VEC_2_4(ONE_ARG, normalize, FP_TYPES)
+// fast_normalize on host is the same as just normalize.
+template <typename T> static inline auto fast_normalize_host_impl(T x) {
+  return normalize_host_impl(x);
+}
+EXPORT_SCALAR_AND_VEC_2_4(ONE_ARG, fast_normalize, float)
+} // namespace _V1
+} // namespace sycl
diff --git a/sycl/source/builtins/half_precision_math_functions.cpp b/sycl/source/builtins/half_precision_math_functions.cpp
new file mode 100644
index 0000000000000..ada7677cca8a5
--- /dev/null
+++ b/sycl/source/builtins/half_precision_math_functions.cpp
@@ -0,0 +1,43 @@
+//==------------------- half_precision_math_functions.cpp ------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <sycl/builtins_preview.hpp>
+
+#include "host_helper_macros.hpp"
+
+#include <cmath>
+
+namespace sycl {
+inline namespace _V1 {
+namespace half_precision {
+#define BUILTIN_HALF_CUSTOM(NUM_ARGS, NAME, IMPL)                              \
+  HOST_IMPL(NAME, IMPL)                                                        \
+  EXPORT_SCALAR_AND_VEC_1_16_NS(NUM_ARGS, NAME, half_precision, float)
+
+#define BUILTIN_HALF(NUM_ARGS, NAME)                                           \
+  BUILTIN_HALF_CUSTOM(NUM_ARGS, NAME, std::NAME)
+
+BUILTIN_HALF(ONE_ARG, cos)
+BUILTIN_HALF_CUSTOM(TWO_ARGS, divide, [](auto x, auto y) { return x / y; })
+BUILTIN_HALF(ONE_ARG, exp)
+BUILTIN_HALF(ONE_ARG, exp2)
+BUILTIN_HALF_CUSTOM(ONE_ARG, exp10, [](auto x) { return std::pow(10.0f, x); })
+BUILTIN_HALF(ONE_ARG, log)
+BUILTIN_HALF(ONE_ARG, log2)
+BUILTIN_HALF(ONE_ARG, log10)
+BUILTIN_HALF_CUSTOM(TWO_ARGS, powr, [](auto x, auto y) {
+  return (x >= 0 ? std::pow(x, y) : x);
+})
+BUILTIN_HALF_CUSTOM(ONE_ARG, recip, [](auto x) { return 1.0f / x; })
+BUILTIN_HALF_CUSTOM(ONE_ARG, rsqrt, [](auto x) { return 1.0f / std::sqrt(x); })
+BUILTIN_HALF(ONE_ARG, sin)
+BUILTIN_HALF(ONE_ARG, sqrt)
+BUILTIN_HALF(ONE_ARG, tan)
+} // namespace half_precision
+} // namespace _V1
+} // namespace sycl
diff --git a/sycl/source/builtins/host_helper_macros.hpp b/sycl/source/builtins/host_helper_macros.hpp
new file mode 100644
index 0000000000000..484b0bc95fb8b
--- /dev/null
+++ b/sycl/source/builtins/host_helper_macros.hpp
@@ -0,0 +1,93 @@
+//==-- host_helper_macros.hpp -- Utility macros to implement sycl builtins -==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#define FOR_VEC_1_16(MACRO, ...)                                               \
+  MACRO(__VA_ARGS__, 1)                                                        \
+  MACRO(__VA_ARGS__, 2)                                                        \
+  MACRO(__VA_ARGS__, 3)                                                        \
+  MACRO(__VA_ARGS__, 4)                                                        \
+  MACRO(__VA_ARGS__, 8)                                                        \
+  MACRO(__VA_ARGS__, 16)
+
+#define FOR_VEC_2_4(MACRO, ...)                                                \
+  MACRO(__VA_ARGS__, 2)                                                        \
+  MACRO(__VA_ARGS__, 3)                                                        \
+  MACRO(__VA_ARGS__, 4)
+
+#define FOR_VEC_3_4(MACRO, ...)                                                \
+  MACRO(__VA_ARGS__, 3)                                                        \
+  MACRO(__VA_ARGS__, 4)
+
+// For both macros below, NS is the namespace of the original sycl builtin
+// function (e.g., sycl::cos vs native::cos). The header implementation uses
+// something like
+//
+//   float sin(float x) {
+//     extern float __sin_impl(float);
+//     return __sin_impl(x);
+//   }
+//
+// and that "extern" declaration isn't automatically matched against the symbol
+// we export. As such, verify the return type consistency using static_assert.
+#define EXPORT_SCALAR_NS(NUM_ARGS, NAME, NS, TYPE)                             \
+  __SYCL_EXPORT auto __##NAME##_impl(NUM_ARGS##_TYPE_ARG(TYPE))                \
+      -> decltype(NAME##_host_impl(NUM_ARGS##_ARG)) {                          \
+    static_assert(std::is_same_v<decltype(NAME##_host_impl(NUM_ARGS##_ARG)),   \
+                                 decltype(NS::NAME(NUM_ARGS##_ARG))>);         \
+    return NAME##_host_impl(NUM_ARGS##_ARG);                                   \
+  }
+#define EXPORT_VEC_NS(NUM_ARGS, NAME, NS, TYPE, VL)                            \
+  __SYCL_EXPORT auto __##NAME##_impl(NUM_ARGS##_VEC_TYPE_ARG(TYPE, VL))        \
+      -> decltype(NAME##_host_impl(NUM_ARGS##_ARG)) {                          \
+    static_assert(std::is_same_v<decltype(NAME##_host_impl(NUM_ARGS##_ARG)),   \
+                                 decltype(NS::NAME(NUM_ARGS##_ARG))>);         \
+    return NAME##_host_impl(NUM_ARGS##_ARG);                                   \
+  }
+
+#define EXPORT_SCALAR(NUM_ARGS, NAME, TYPE)                                    \
+  EXPORT_SCALAR_NS(NUM_ARGS, NAME, sycl, TYPE)
+#define EXPORT_VEC(NUM_ARGS, NAME, TYPE, VL)                                   \
+  EXPORT_VEC_NS(NUM_ARGS, NAME, sycl, TYPE, VL)
+
+#define EXPORT_SCALAR_AND_VEC_1_16_IMPL(NUM_ARGS, NAME, NS, TYPE)              \
+  EXPORT_SCALAR_NS(NUM_ARGS, NAME, NS, TYPE)                                   \
+  FOR_VEC_1_16(EXPORT_VEC_NS, NUM_ARGS, NAME, NS, TYPE)
+
+#define EXPORT_SCALAR_AND_VEC_2_4_IMPL(NUM_ARGS, NAME, TYPE)                   \
+  EXPORT_SCALAR(NUM_ARGS, NAME, TYPE)                                          \
+  FOR_VEC_2_4(EXPORT_VEC, NUM_ARGS, NAME, TYPE)
+
+#define EXPORT_VEC_3_4_IMPL(NUM_ARGS, NAME, TYPE)                              \
+  FOR_VEC_3_4(EXPORT_VEC, NUM_ARGS, NAME, TYPE)
+
+#define EXPORT_SCALAR_AND_VEC_1_16_NS(NUM_ARGS, NAME, NS, ...)                 \
+  FOR_EACH3(EXPORT_SCALAR_AND_VEC_1_16_IMPL, NUM_ARGS, NAME, NS, __VA_ARGS__)
+#define EXPORT_SCALAR_AND_VEC_1_16(NUM_ARGS, NAME, ...)                        \
+  EXPORT_SCALAR_AND_VEC_1_16_NS(NUM_ARGS, NAME, sycl, __VA_ARGS__)
+
+#define EXPORT_SCALAR_AND_VEC_2_4(NUM_ARGS, NAME, ...)                         \
+  FOR_EACH2(EXPORT_SCALAR_AND_VEC_2_4_IMPL, NUM_ARGS, NAME, __VA_ARGS__)
+#define EXPORT_VEC_3_4(NUM_ARGS, NAME, ...)                                    \
+  FOR_EACH2(EXPORT_VEC_3_4_IMPL, NUM_ARGS, NAME, __VA_ARGS__)
+
+#define HOST_IMPL(NAME, ...)                                                   \
+  template <typename... Ts> static auto NAME##_host_impl(Ts... xs) {           \
+    using namespace detail;                                                    \
+    if constexpr ((... || is_vec_v<Ts>)) {                                     \
+      using ret_elem_type = decltype(NAME##_host_impl(xs[0]...));              \
+      using T = typename first_type<Ts...>::type;                              \
+      vec<ret_elem_type, T::size()> r{};                                       \
+      loop<T::size()>(                                                         \
+          [&](auto idx) { r[idx] = NAME##_host_impl(xs[idx]...); });           \
+      return r;                                                                \
+    } else {                                                                   \
+      return __VA_ARGS__(xs...);                                               \
+    }                                                                          \
+  }
diff --git a/sycl/source/builtins/integer_functions.cpp b/sycl/source/builtins/integer_functions.cpp
new file mode 100644
index 0000000000000..26c4dd9a5788f
--- /dev/null
+++ b/sycl/source/builtins/integer_functions.cpp
@@ -0,0 +1,258 @@
+//==------------------- integer_functions.cpp ------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <sycl/builtins_preview.hpp>
+
+#include "host_helper_macros.hpp"
+
+namespace {
+// A helper function for mul_hi built-in for long
+template <typename T> inline T __get_high_half(T a0b0, T a0b1, T a1b0, T a1b1) {
+  constexpr int halfsize = (sizeof(T) * 8) / 2;
+  // To get the upper 64 bits:
+  // 64 bits from a1b1, upper 32 bits from [a1b0 + (a0b1 + a0b0>>32 (carry bit
+  // in 33rd bit))] with carry bit on 64th bit - use of hadd. Add the a1b1 to
+  // the above 32 bit result.
+  return a1b1 +
+         (sycl::hadd(a1b0, (a0b1 + (a0b0 >> halfsize))) >> (halfsize - 1));
+}
+
+// mul_hi helper for long: the four half-width partial products of a and b.
+template <typename T>
+inline void __get_half_products(T a, T b, T &a0b0, T &a0b1, T &a1b0, T &a1b1) {
+  constexpr int halfsize = (sizeof(T) * 8) / 2;
+  T a1 = a >> halfsize;
+  T a0 = (a << halfsize) >> halfsize;
+  T b1 = b >> halfsize;
+  T b0 = (b << halfsize) >> halfsize;
+
+  // a1b1 - for bits - [64-128)
+  // a1b0 a0b1 for bits - [32-96)
+  // a0b0 for bits - [0-64)
+  a1b1 = a1 * b1;
+  a0b1 = a0 * b1;
+  a1b0 = a1 * b0;
+  a0b0 = a0 * b0;
+}
+
+// Upper half of the unsigned product a * b; T is at least 64 bits wide.
+template <typename T> inline T __u_long_mul_hi(T a, T b) {
+  T lo_lo, lo_hi, hi_lo, hi_hi;
+  __get_half_products(a, b, lo_lo, lo_hi, hi_lo, hi_hi);
+  // Only the high half of the double-width product is of interest here.
+  return __get_high_half(lo_lo, lo_hi, hi_lo, hi_hi);
+}
+
+template <typename T> inline T __s_long_mul_hi(T a, T b) {
+  using UT = std::make_unsigned_t<T>;
+  // Negate in unsigned arithmetic: std::abs is undefined for the minimum T.
+  UT absA = a < 0 ? UT(0) - UT(a) : UT(a);
+  UT absB = b < 0 ? UT(0) - UT(b) : UT(b);
+  UT a0b0, a0b1, a1b0, a1b1;
+  __get_half_products(absA, absB, a0b0, a0b1, a1b0, a1b1);
+  T result = __get_high_half(a0b0, a0b1, a1b0, a1b1);
+
+  bool isResultNegative = (a < 0) != (b < 0);
+  if (isResultNegative) {
+    // Two's complement negation of the full product: invert the high half...
+    result = ~result;
+    // ...and carry into it only when the low half of the product is zero.
+    constexpr int halfsize = (sizeof(T) * 8) / 2;
+    UT low = a0b0 + ((a0b1 + a1b0) << halfsize);
+    if (low == 0)
+      ++result;
+  }
+
+  return result;
+}
+} // namespace
+
+namespace sycl {
+inline namespace _V1 {
+#define BUILTIN_GENINT(NUM_ARGS, NAME, IMPL)                                   \
+  HOST_IMPL(NAME, IMPL)                                                        \
+  EXPORT_SCALAR_AND_VEC_1_16(NUM_ARGS, NAME, INTEGER_TYPES)
+#define BUILTIN_GENINT_SU(NUM_ARGS, NAME, IMPL)                                \
+  BUILTIN_GENINT(NUM_ARGS, NAME, IMPL)
+
+BUILTIN_GENINT(ONE_ARG, abs, [](auto x) -> decltype(x) {
+  if constexpr (std::is_signed_v<decltype(x)>) {
+    return std::abs(x);
+  } else {
+    return x;
+  }
+})
+
+BUILTIN_GENINT_SU(TWO_ARGS, abs_diff, [](auto x, auto y) -> decltype(x) {
+  // From SYCL 2020 revision 8:
+  //
+  // > The subtraction is done without modulo overflow. The behavior is
+  // > undefined if the result cannot be represented by the return type.
+  return sycl::abs(x - y);
+})
+
+BUILTIN_GENINT_SU(TWO_ARGS, add_sat, [](auto x, auto y) -> decltype(x) {
+  using T = decltype(x);
+  if constexpr (std::is_signed_v<T>) {
+    if (x > 0 && y > 0)
+      return (x < (std::numeric_limits<T>::max() - y)
+                  ? (x + y)
+                  : std::numeric_limits<T>::max());
+    if (x < 0 && y < 0)
+      return (x > (std::numeric_limits<T>::min() - y)
+                  ? (x + y)
+                  : std::numeric_limits<T>::min());
+    return x + y;
+  } else {
+    return (x < (std::numeric_limits<T>::max() - y)
+                ? x + y
+                : std::numeric_limits<T>::max());
+  }
+})
+
+BUILTIN_GENINT_SU(TWO_ARGS, hadd, [](auto x, auto y) -> decltype(x) {
+  const decltype(x) one = 1;
+  return (x >> one) + (y >> one) + ((y & x) & one);
+})
+
+BUILTIN_GENINT_SU(TWO_ARGS, rhadd, [](auto x, auto y) -> decltype(x) {
+  const decltype(x) one = 1;
+  return (x >> one) + (y >> one) + ((y | x) & one);
+})
+
+BUILTIN_GENINT_SU(THREE_ARGS, mad_hi,
+                  [](auto x, auto y, auto z) -> decltype(x) {
+                    return sycl::mul_hi(x, y) + z;
+                  })
+
+BUILTIN_GENINT_SU(
+    THREE_ARGS, mad_sat, [](auto a, auto b, auto c) -> decltype(a) {
+      using T = decltype(a);
+      if constexpr (std::is_signed_v<T>) {
+        if constexpr (sizeof(T) == 8) {
+          using UT = std::make_unsigned_t<T>;
+          // Low half of the exact product; unsigned math wraps instead of
+          // invoking signed-overflow undefined behavior.
+          T low = T(UT(a) * UT(b));
+          T mulhi = __s_long_mul_hi(a, b);
+          // The product fits in T only if the high half is the sign extension
+          // of the low half. Comparing mulhi with 0/-1 alone saturates a zero
+          // product that has one negative operand and misses overflows whose
+          // high half is 0 or -1.
+          if (mulhi != (low < 0 ? T(-1) : T(0)))
+            return ((a < 0) != (b < 0)) ? std::numeric_limits<T>::min()
+                                        : std::numeric_limits<T>::max();
+          return sycl::add_sat(low, c);
+        } else {
+          using UPT = sycl::detail::make_larger_t<T>;
+          UPT mul = UPT(a) * UPT(b);
+          UPT res = mul + UPT(c);
+          const UPT max = std::numeric_limits<T>::max();
+          const UPT min = std::numeric_limits<T>::min();
+          res = std::min(std::max(res, min), max);
+          return T(res);
+        }
+      } else {
+        if constexpr (sizeof(T) == 8) {
+          T mulhi = __u_long_mul_hi(a, b);
+          // check mul_hi. If it is any value != 0.
+          if (mulhi != 0)
+            return std::numeric_limits<T>::max();
+          return sycl::add_sat(T(a * b), c);
+        } else {
+          using UPT = sycl::detail::make_larger_t<T>;
+          UPT mul = UPT(a) * UPT(b);
+          const UPT min = std::numeric_limits<T>::min();
+          const UPT max = std::numeric_limits<T>::max();
+          mul = std::min(std::max(mul, min), max);
+          return sycl::add_sat(T(mul), c);
+        }
+      }
+    })
+
+BUILTIN_GENINT_SU(TWO_ARGS, mul_hi, [](auto a, auto b) -> decltype(a) {
+  using T = decltype(a);
+  if constexpr (sizeof(T) == 8) {
+    if constexpr (std::is_signed_v<T>)
+      return __s_long_mul_hi(a, b);
+    else
+      return __u_long_mul_hi(a, b);
+  } else {
+    using UPT = sycl::detail::make_larger_t<T>;
+    UPT a_s = a;
+    UPT b_s = b;
+    UPT mul = a_s * b_s;
+    return (mul >> (sizeof(T) * 8));
+  }
+})
+
+BUILTIN_GENINT_SU(TWO_ARGS, sub_sat, [](auto x, auto y) -> decltype(x) {
+  using T = decltype(x);
+  if constexpr (std::is_signed_v<T>) {
+    using UT = std::make_unsigned_t<T>;
+    T result = UT(x) - UT(y);
+    // Saturate result if (+) - (-) = (-) or (-) - (+) = (+).
+    if (((x < 0) ^ (y < 0)) && ((x < 0) ^ (result < 0)))
+      result = result < 0 ? std::numeric_limits<T>::max()
+                          : std::numeric_limits<T>::min();
+    return result;
+  } else {
+    return (y < (x - std::numeric_limits<T>::min()))
+               ? (x - y)
+               : std::numeric_limits<T>::min();
+  }
+})
+
+BUILTIN_GENINT_SU(TWO_ARGS, max,
+                  [](auto x, auto y) -> decltype(x) { return x < y ? y : x; })
+
+BUILTIN_GENINT_SU(TWO_ARGS, min,
+                  [](auto x, auto y) -> decltype(x) { return y < x ? y : x; })
+
+template <typename T> static inline constexpr T __clz_impl(T x, T m, T n = 0) {
+  return (x & m) ? n : __clz_impl(x, T(m >> 1), ++n);
+}
+template <typename T> static inline constexpr T __clz(T x) {
+  using UT = std::make_unsigned_t<T>;
+  return (x == T(0)) ? sizeof(T) * 8
+                     : __clz_impl<UT>(x, sycl::detail::msbMask<UT>(x));
+}
+BUILTIN_GENINT(ONE_ARG, clz, __clz)
+
+template <typename T> static inline constexpr T __ctz_impl(T x, T m, T n = 0) {
+  return (x & m) ? n : __ctz_impl(x, T(m << 1), ++n);
+}
+
+template <typename T> static inline constexpr T __ctz(T x) {
+  using UT = std::make_unsigned_t<T>;
+  return (x == T(0)) ? sizeof(T) * 8 : __ctz_impl<UT>(x, 1);
+}
+BUILTIN_GENINT(ONE_ARG, ctz, __ctz)
+
+BUILTIN_GENINT(TWO_ARGS, rotate, [](auto x, auto n) -> decltype(x) {
+  using T = decltype(x);
+  using UT = std::make_unsigned_t<T>;
+  // Work in unsigned and keep both shift counts in [0, num_bits(T)): a zero
+  // rotation would otherwise shift right by the full width, which is UB.
+  constexpr UT size = sizeof(x) * 8;
+  UT xu = UT(x);
+  UT nu = UT(n) & (size - 1);
+  return (xu << nu) | (xu >> ((size - nu) & (size - 1)));
+})
+
+template <typename T>
+static inline constexpr T __popcount_impl(T x, size_t n = 0) {
+  return (x == T(0)) ? n : __popcount_impl(x >> 1, ((x & T(1)) ? ++n : n));
+}
+template <typename T> static inline constexpr T __popcount(T x) {
+  using UT = sycl::detail::make_unsigned_t<T>;
+  return __popcount_impl(UT(x));
+}
+BUILTIN_GENINT(ONE_ARG, popcount, __popcount)
+} // namespace _V1
+} // namespace sycl
diff --git a/sycl/source/builtins/math_functions.cpp b/sycl/source/builtins/math_functions.cpp
new file mode 100644
index 0000000000000..c65a181ce8c9e
--- /dev/null
+++ b/sycl/source/builtins/math_functions.cpp
@@ -0,0 +1,241 @@
+//==------------------- math_functions.cpp ---------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Define _USE_MATH_DEFINES to enforce math defines of macros like M_PI in
+// <cmath>. _USE_MATH_DEFINES is defined here before includes of SYCL header
+// files to avoid include of <cmath> via those SYCL headers with unset
+// _USE_MATH_DEFINES.
+#define _USE_MATH_DEFINES
+
+#include <cmath>
+
+#include <sycl/builtins_preview.hpp>
+
+#include "host_helper_macros.hpp"
+
+namespace sycl {
+inline namespace _V1 {
+#define BUILTIN_GENF_CUSTOM(NUM_ARGS, NAME, IMPL)                              \
+  HOST_IMPL(NAME, IMPL)                                                        \
+  EXPORT_SCALAR_AND_VEC_1_16(NUM_ARGS, NAME, FP_TYPES)
+
+// NOTE: "-> decltype(x)" here and below is needed for the half version: the
+// implementation implicitly converts half to float and computes the result in
+// float precision, so the return type must be specified to convert it back.
+#define BUILTIN_GENF(NUM_ARGS, NAME)                                           \
+  BUILTIN_GENF_CUSTOM(NUM_ARGS, NAME, [](NUM_ARGS##_AUTO_ARG) -> decltype(x) { \
+    return std::NAME(NUM_ARGS##_ARG);                                          \
+  })
+
+BUILTIN_GENF(ONE_ARG, acos)
+BUILTIN_GENF(ONE_ARG, acosh)
+BUILTIN_GENF_CUSTOM(ONE_ARG, acospi,
+                    [](auto x) -> decltype(x) { return std::acos(x) / M_PI; })
+BUILTIN_GENF(ONE_ARG, asin)
+BUILTIN_GENF(ONE_ARG, asinh)
+BUILTIN_GENF_CUSTOM(ONE_ARG, asinpi,
+                    [](auto x) -> decltype(x) { return std::asin(x) / M_PI; })
+BUILTIN_GENF(ONE_ARG, atan)
+BUILTIN_GENF(ONE_ARG, atanh)
+BUILTIN_GENF_CUSTOM(ONE_ARG, atanpi,
+                    [](auto x) -> decltype(x) { return std::atan(x) / M_PI; })
+BUILTIN_GENF(TWO_ARGS, atan2)
+BUILTIN_GENF_CUSTOM(TWO_ARGS, atan2pi, [](auto x, auto y) -> decltype(x) {
+  return std::atan2(x, y) / M_PI;
+})
+BUILTIN_GENF(ONE_ARG, cbrt)
+BUILTIN_GENF(ONE_ARG, ceil)
+BUILTIN_GENF(TWO_ARGS, copysign)
+BUILTIN_GENF(ONE_ARG, cos)
+BUILTIN_GENF(ONE_ARG, cosh)
+BUILTIN_GENF_CUSTOM(ONE_ARG, cospi, [](auto x) -> decltype(x) {
+  return std::sin(M_PI * (0.5 - x));
+})
+BUILTIN_GENF(ONE_ARG, erf)
+BUILTIN_GENF(ONE_ARG, erfc)
+BUILTIN_GENF(ONE_ARG, exp)
+BUILTIN_GENF(ONE_ARG, exp2)
+BUILTIN_GENF_CUSTOM(ONE_ARG, exp10,
+                    [](auto x) -> decltype(x) { return std::pow(10, x); })
+BUILTIN_GENF(ONE_ARG, expm1)
+BUILTIN_GENF(ONE_ARG, fabs)
+BUILTIN_GENF(TWO_ARGS, fdim)
+BUILTIN_GENF(ONE_ARG, floor)
+BUILTIN_GENF(THREE_ARGS, fma)
+BUILTIN_GENF(TWO_ARGS, fmax)
+BUILTIN_GENF(TWO_ARGS, fmin)
+BUILTIN_GENF(TWO_ARGS, fmod)
+BUILTIN_GENF(TWO_ARGS, hypot)
+BUILTIN_GENF(ONE_ARG, lgamma)
+BUILTIN_GENF(ONE_ARG, log)
+BUILTIN_GENF(ONE_ARG, log2)
+BUILTIN_GENF(ONE_ARG, log10)
+BUILTIN_GENF(ONE_ARG, log1p)
+BUILTIN_GENF(ONE_ARG, logb)
+BUILTIN_GENF_CUSTOM(THREE_ARGS, mad, [](auto x, auto y, auto z) -> decltype(x) {
+  return (x * y) + z;
+})
+BUILTIN_GENF_CUSTOM(TWO_ARGS, maxmag, [](auto x, auto y) -> decltype(x) {
+  if (std::fabs(x) > std::fabs(y))
+    return x;
+  if (std::fabs(y) > std::fabs(x))
+    return y;
+  return std::fmax(x, y);
+})
+BUILTIN_GENF_CUSTOM(TWO_ARGS, minmag, [](auto x, auto y) -> decltype(x) {
+  if (std::fabs(x) < std::fabs(y))
+    return x;
+  if (std::fabs(y) < std::fabs(x))
+    return y;
+  return std::fmin(x, y);
+})
+BUILTIN_GENF(TWO_ARGS, pow)
+BUILTIN_GENF_CUSTOM(TWO_ARGS, powr, [](auto x, auto y) -> decltype(x) {
+  using T = decltype(x);
+  return (x >= T(0)) ? T(std::pow(x, y)) : x;
+})
+BUILTIN_GENF(TWO_ARGS, remainder)
+BUILTIN_GENF(ONE_ARG, rint)
+BUILTIN_GENF(ONE_ARG, round)
+BUILTIN_GENF_CUSTOM(ONE_ARG, rsqrt, [](auto x) -> decltype(x) {
+  return decltype(x){1.0} / std::sqrt(x);
+})
+BUILTIN_GENF(ONE_ARG, sin)
+BUILTIN_GENF(ONE_ARG, sinh)
+BUILTIN_GENF_CUSTOM(ONE_ARG, sinpi,
+                    [](auto x) -> decltype(x) { return std::sin(M_PI * x); })
+BUILTIN_GENF(ONE_ARG, sqrt)
+BUILTIN_GENF(ONE_ARG, tan)
+BUILTIN_GENF(ONE_ARG, tanh)
+BUILTIN_GENF_CUSTOM(
+    ONE_ARG, tanpi,
+    [](auto x) -> decltype(x) { // For uniformity, place in range [0.0, 1.0).
+      double y = x - std::floor(x);
+      // Flip for better accuracy.
+      return 1.0 / std::tan((0.5 - y) * M_PI);
+    })
+BUILTIN_GENF(ONE_ARG, tgamma)
+BUILTIN_GENF(ONE_ARG, trunc)
+BUILTIN_GENF_CUSTOM(TWO_ARGS, nextafter, [](auto x, auto y) {
+  if constexpr (!std::is_same_v<decltype(x), half>) {
+    return std::nextafter(x, y);
+  } else {
+    // Copied from sycl_host_nextafter, not sure if it's valid when operating on
+    // sycl::half. That said, should be covered by
+    // sycl/test/regression/host_half_nextafter.cpp
+
+    if (std::isnan(static_cast<float>(x)))
+      return x;
+    if (std::isnan(static_cast<float>(y)) || x == y)
+      return y;
+
+    uint16_t x_bits = sycl::bit_cast<uint16_t>(x);
+    uint16_t x_sign = x_bits & 0x8000;
+    int16_t movement = (x > y ? -1 : 1) * (x_sign ? -1 : 1);
+    if (x_bits == x_sign && movement == -1) {
+      // Special case where we underflow in the decrement, in which case we turn
+      // it around and flip the sign. The overflow case does not need special
+      // handling.
+      movement = 1;
+      x_bits ^= 0x8000;
+    }
+    x_bits += movement;
+    return sycl::bit_cast<half>(x_bits);
+  }
+})
+
+namespace detail {
+__SYCL_EXPORT float frexp_impl(float x, int *p) { return std::frexp(x, p); }
+__SYCL_EXPORT double frexp_impl(double x, int *p) { return std::frexp(x, p); }
+__SYCL_EXPORT half frexp_impl(half x, int *p) { return std::frexp(x, p); }
+} // namespace detail
+
+namespace detail {
+template <typename T> static inline T __lgamma_r_impl(T x, int *signp) {
+  T g = std::tgamma(x); // Used for the sign only; overflows before lgamma.
+  *signp = std::signbit(sycl::detail::cast_if_host_half(g)) ? -1 : 1;
+  return std::lgamma(x);
+}
+
+__SYCL_EXPORT float lgamma_r_impl(float x, int *p) {
+  return __lgamma_r_impl(x, p);
+}
+__SYCL_EXPORT double lgamma_r_impl(double x, int *p) {
+  return __lgamma_r_impl(x, p);
+}
+__SYCL_EXPORT half lgamma_r_impl(half x, int *p) {
+  return __lgamma_r_impl(x, p);
+}
+} // namespace detail
+
+HOST_IMPL(ilogb, std::ilogb)
+EXPORT_SCALAR_AND_VEC_1_16(ONE_ARG, ilogb, FP_TYPES)
+
+namespace detail {
+__SYCL_EXPORT float modf_impl(float x, float *p) { return std::modf(x, p); }
+__SYCL_EXPORT double modf_impl(double x, double *p) { return std::modf(x, p); }
+__SYCL_EXPORT half modf_impl(half x, half *p) {
+  float val;
+  auto ret = std::modf(x, &val);
+  *p = val;
+  return ret;
+}
+} // namespace detail
+
+namespace detail {
+template <typename T> static inline T __sincos(T x, T *cosval) {
+  (*cosval) = std::cos(x);
+  return std::sin(x);
+}
+
+__SYCL_EXPORT float sincos_impl(float x, float *p) { return __sincos(x, p); }
+__SYCL_EXPORT double sincos_impl(double x, double *p) { return __sincos(x, p); }
+__SYCL_EXPORT half sincos_impl(half x, half *p) { return __sincos(x, p); }
+} // namespace detail
+
+#define EXPORT_VEC_LAST_INT(NAME, TYPE, VL)                                    \
+  vec<TYPE, VL> __SYCL_EXPORT __##NAME##_impl(vec<TYPE, VL> x,                 \
+                                              vec<int, VL> y) {                \
+    return NAME##_host_impl(x, y);                                             \
+  }
+#define EXPORT_VEC_LAST_INT_1_16(NAME, TYPE)                                   \
+  FOR_VEC_1_16(EXPORT_VEC_LAST_INT, NAME, TYPE)
+
+#define BUILTIN_MATH_LAST_INT(NAME, IMPL)                                      \
+  __SYCL_EXPORT float __##NAME##_impl(float x, int y) { return IMPL(x, y); }   \
+  __SYCL_EXPORT double __##NAME##_impl(double x, int y) { return IMPL(x, y); } \
+  __SYCL_EXPORT half __##NAME##_impl(half x, int y) { return IMPL(x, y); }     \
+  HOST_IMPL(NAME, NAME /* delegate to scalar */)                               \
+  FOR_EACH1(EXPORT_VEC_LAST_INT_1_16, NAME, FP_TYPES)
+
+BUILTIN_MATH_LAST_INT(pown, std::pow)
+BUILTIN_MATH_LAST_INT(rootn, [](auto x, auto y) -> decltype(x) {
+  return std::pow(x, decltype(x){1} / y);
+})
+BUILTIN_MATH_LAST_INT(ldexp, std::ldexp)
+
+namespace {
+template <typename T> auto __remquo_impl(T x, T y, int *z) {
+  T rem = std::remainder(x, y);
+  *z = static_cast<int>(std::round((x - rem) / y));
+  return rem;
+}
+} // namespace
+namespace detail {
+__SYCL_EXPORT float remquo_impl(float x, float y, int *z) {
+  return __remquo_impl(x, y, z);
+}
+__SYCL_EXPORT double remquo_impl(double x, double y, int *z) {
+  return __remquo_impl(x, y, z);
+}
+__SYCL_EXPORT half remquo_impl(half x, half y, int *z) {
+  return __remquo_impl(x, y, z);
+}
+} // namespace detail
+} // namespace _V1
+} // namespace sycl
diff --git a/sycl/source/builtins/native_math_functions.cpp b/sycl/source/builtins/native_math_functions.cpp
new file mode 100644
index 0000000000000..f741bd531d6b5
--- /dev/null
+++ b/sycl/source/builtins/native_math_functions.cpp
@@ -0,0 +1,44 @@
+//==------------------- native_math_functions.cpp --------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <sycl/builtins_preview.hpp>
+
+#include "host_helper_macros.hpp"
+
+#include <cmath>
+
+namespace sycl {
+inline namespace _V1 {
+namespace native {
+#define BUILTIN_NATIVE_CUSTOM(NUM_ARGS, NAME, IMPL) /* IMPL: host callable */  \
+  HOST_IMPL(NAME, IMPL)                                                        \
+  EXPORT_SCALAR_AND_VEC_1_16_NS(NUM_ARGS, NAME, native, float) // float only
+
+#define BUILTIN_NATIVE(NUM_ARGS, NAME)                                         \
+  BUILTIN_NATIVE_CUSTOM(NUM_ARGS, NAME, std::NAME) // <cmath> function NAME
+
+BUILTIN_NATIVE(ONE_ARG, cos) // BUILTIN_NATIVE entries call std::<name>
+BUILTIN_NATIVE_CUSTOM(TWO_ARGS, divide, [](auto x, auto y) { return x / y; })
+BUILTIN_NATIVE(ONE_ARG, exp)
+BUILTIN_NATIVE(ONE_ARG, exp2)
+BUILTIN_NATIVE_CUSTOM(ONE_ARG, exp10, [](auto x) { return std::pow(10.0f, x); })
+BUILTIN_NATIVE(ONE_ARG, log)
+BUILTIN_NATIVE(ONE_ARG, log2)
+BUILTIN_NATIVE(ONE_ARG, log10)
+BUILTIN_NATIVE_CUSTOM(TWO_ARGS, powr, [](auto x, auto y) {
+  return (x >= 0 ? std::pow(x, y) : x); // x < 0 or NaN: return x unchanged
+})
+BUILTIN_NATIVE_CUSTOM(ONE_ARG, recip, [](auto x) { return 1.0f / x; })
+BUILTIN_NATIVE_CUSTOM(ONE_ARG, rsqrt,
+                      [](auto x) { return 1.0f / std::sqrt(x); })
+BUILTIN_NATIVE(ONE_ARG, sin)
+BUILTIN_NATIVE(ONE_ARG, sqrt)
+BUILTIN_NATIVE(ONE_ARG, tan)
+} // namespace native
+} // namespace _V1
+} // namespace sycl
diff --git a/sycl/source/builtins/relational_functions.cpp b/sycl/source/builtins/relational_functions.cpp
new file mode 100644
index 0000000000000..b81af864fc10d
--- /dev/null
+++ b/sycl/source/builtins/relational_functions.cpp
@@ -0,0 +1,98 @@
+//==------------------- relational_functions.cpp ---------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <sycl/builtins_preview.hpp>
+
+#include "host_helper_macros.hpp"
+
+#include <bitset>
+#include <cmath>
+
+namespace sycl {
+inline namespace _V1 {
+
+#if defined(__GNUC__) && !defined(__clang__)
+// sycl::vec has UB in operator[] (aliasing violation) that causes the following
+// warning here. Note that the way this #pragma works is that we have to put it
+// around the macro definition, not where the macro is instantiated.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
+#define REL_BUILTIN_CUSTOM(NUM_ARGS, NAME, ...) /* ... = scalar functor */     \
+  template <typename... Ts> static auto NAME##_host_impl(Ts... xs) {           \
+    using namespace detail;                                                    \
+    if constexpr ((... || is_vec_v<Ts>)) { /* any argument is a vec */         \
+      return builtin_delegate_rel_impl( /* lambda re-enters this template */   \
+          [](auto... xs) { return NAME##_host_impl(xs...); }, xs...);          \
+    } else {                                                                   \
+      return __VA_ARGS__(xs...); /* all-scalar case: call the functor */       \
+    }                                                                          \
+  }                                                                            \
+  EXPORT_SCALAR_AND_VEC_1_16(NUM_ARGS, NAME, FP_TYPES) // exported for FP_TYPES
+#define REL_BUILTIN(NUM_ARGS, NAME)                                            \
+  REL_BUILTIN_CUSTOM(NUM_ARGS, NAME, std::NAME) // std::NAME from <cmath>
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+
+REL_BUILTIN_CUSTOM(TWO_ARGS, isequal, ([](auto x, auto y) { return x == y; }))
+REL_BUILTIN_CUSTOM(TWO_ARGS, isnotequal,
+                   ([](auto x, auto y) { return x != y; }))
+REL_BUILTIN_CUSTOM(TWO_ARGS, isgreater, ([](auto x, auto y) { return x > y; }))
+REL_BUILTIN_CUSTOM(TWO_ARGS, isgreaterequal,
+                   ([](auto x, auto y) { return x >= y; }))
+REL_BUILTIN_CUSTOM(TWO_ARGS, isless, ([](auto x, auto y) { return x < y; }))
+REL_BUILTIN_CUSTOM(TWO_ARGS, islessequal,
+                   ([](auto x, auto y) { return x <= y; }))
+REL_BUILTIN_CUSTOM(TWO_ARGS, islessgreater, // ordered and unequal (not just !=)
+                   ([](auto x, auto y) { return x < y || x > y; }))
+REL_BUILTIN(ONE_ARG, isfinite) // REL_BUILTIN: std:: function of the same name
+REL_BUILTIN(ONE_ARG, isinf)
+REL_BUILTIN(ONE_ARG, isnan)
+REL_BUILTIN(ONE_ARG, isnormal)
+REL_BUILTIN_CUSTOM(TWO_ARGS, isordered, // negation of std::isunordered
+                   ([](auto x, auto y) { return !std::isunordered(x, y); }))
+REL_BUILTIN_CUSTOM(TWO_ARGS, isunordered,
+                   ([](auto x, auto y) { return std::isunordered(x, y); }))
+#if defined(__GNUC__) && !defined(__clang__)
+// Build signbit at -O0: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112816
+#pragma GCC push_options
+#pragma GCC optimize("-O0")
+#endif
+
+REL_BUILTIN(ONE_ARG, signbit)
+
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC pop_options
+#endif
+
+HOST_IMPL(bitselect, [](auto x, auto y, auto z) { // scalar implementation
+  using T0 = decltype(x);
+  using T1 = decltype(y);
+  using T2 = decltype(z);
+  constexpr size_t N = sizeof(T0) * 8; // bit width, assuming 8-bit bytes
+  using bitset = std::bitset<N>;
+
+  static_assert(std::is_same_v<T0, T1> && std::is_same_v<T1, T2> &&
+                detail::is_scalar_arithmetic_v<T0>);
+
+  using utype = detail::make_type_t< // unsigned carrier for T0's bits
+      T0, detail::type_list<unsigned char, unsigned short, unsigned int,
+                            unsigned long, unsigned long long>>;
+  static_assert(sizeof(utype) == sizeof(T0)); // required by the bit_casts
+  bitset bx(bit_cast<utype>(x)), by(bit_cast<utype>(y)), bz(bit_cast<utype>(z));
+  bitset res = (bz & by) | (~bz & bx); // per bit: z set ? take y : take x
+  unsigned long long ures = res.to_ullong();
+  assert((ures & std::numeric_limits<utype>::max()) == ures); // fits in utype
+  return bit_cast<T0>(static_cast<utype>(ures));
+})
+EXPORT_SCALAR_AND_VEC_1_16(THREE_ARGS, bitselect, INTEGER_TYPES, FP_TYPES)
+} // namespace _V1
+} // namespace sycl
diff --git a/sycl/test-e2e/DeviceLib/built-ins/scalar_integer.cpp b/sycl/test-e2e/DeviceLib/built-ins/scalar_integer.cpp
index b968c2c87a3b3..314b6fd5515cc 100644
--- a/sycl/test-e2e/DeviceLib/built-ins/scalar_integer.cpp
+++ b/sycl/test-e2e/DeviceLib/built-ins/scalar_integer.cpp
@@ -415,7 +415,7 @@ int main() {
       myQueue.submit([&](s::handler &cgh) {
         auto AccR = BufR.get_access<s::access::mode::write>(cgh);
         cgh.single_task<class upsampleSC1UC1>(
-            [=]() { AccR[0] = s::upsample((char)0x10, (unsigned char)0x10); });
+            [=]() { AccR[0] = s::upsample((int8_t)0x10, (uint8_t)0x10); });
       });
     }
     assert(r == 0x1010);