Skip to content

Commit

Permalink
Merge pull request #1485 from IntelPython/work-around-sg-loadstore-is…
Browse files Browse the repository at this point in the history
…sues
  • Loading branch information
oleksandr-pavlyk authored Dec 9, 2023
2 parents 0b63d4f + 86153f3 commit a4369ac
Show file tree
Hide file tree
Showing 82 changed files with 862 additions and 260 deletions.
45 changes: 45 additions & 0 deletions dpctl/tensor/libtensor/include/kernels/alignment.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
//
// Data Parallel Control (dpctl)
//
// Copyright 2020-2023 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#pragma once

#include <cstddef>
#include <cstdint>

namespace dpctl
{
namespace tensor
{
namespace kernels
{
namespace alignment_utils
{

static constexpr size_t required_alignment = 64;

template <std::uintptr_t alignment, typename Ptr> bool is_aligned(Ptr p)
{
return !(reinterpret_cast<std::uintptr_t>(p) % alignment);
}

template <typename KernelName> class disabled_sg_loadstore_wrapper_krn;

} // end of namespace alignment_utils
} // end of namespace kernels
} // end of namespace tensor
} // end of namespace dpctl
42 changes: 36 additions & 6 deletions dpctl/tensor/libtensor/include/kernels/clip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include <pybind11/pybind11.h>
#include <type_traits>

#include "kernels/alignment.hpp"
#include "utils/math_utils.hpp"
#include "utils/offset_utils.hpp"
#include "utils/type_dispatch.hpp"
Expand All @@ -51,6 +52,11 @@ namespace td_ns = dpctl::tensor::type_dispatch;

using namespace dpctl::tensor::offset_utils;

using dpctl::tensor::kernels::alignment_utils::
disabled_sg_loadstore_wrapper_krn;
using dpctl::tensor::kernels::alignment_utils::is_aligned;
using dpctl::tensor::kernels::alignment_utils::required_alignment;

template <typename T> T clip(const T &x, const T &min, const T &max)
{
using dpctl::tensor::type_utils::is_complex;
Expand All @@ -73,7 +79,11 @@ template <typename T> T clip(const T &x, const T &min, const T &max)
}
}

template <typename T, int vec_sz = 4, int n_vecs = 2> class ClipContigFunctor
template <typename T,
int vec_sz = 4,
int n_vecs = 2,
bool enable_sg_loadstore = true>
class ClipContigFunctor
{
private:
size_t nelems = 0;
Expand All @@ -96,7 +106,7 @@ template <typename T, int vec_sz = 4, int n_vecs = 2> class ClipContigFunctor
void operator()(sycl::nd_item<1> ndit) const
{
using dpctl::tensor::type_utils::is_complex;
if constexpr (is_complex<T>::value) {
if constexpr (is_complex<T>::value || !enable_sg_loadstore) {
std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
size_t base = ndit.get_global_linear_id();

Expand Down Expand Up @@ -195,10 +205,30 @@ sycl::event clip_contig_impl(sycl::queue &q,
const auto gws_range = sycl::range<1>(n_groups * lws);
const auto lws_range = sycl::range<1>(lws);

cgh.parallel_for<clip_contig_kernel<T, vec_sz, n_vecs>>(
sycl::nd_range<1>(gws_range, lws_range),
ClipContigFunctor<T, vec_sz, n_vecs>(nelems, x_tp, min_tp, max_tp,
dst_tp));
if (is_aligned<required_alignment>(x_cp) &&
is_aligned<required_alignment>(min_cp) &&
is_aligned<required_alignment>(max_cp) &&
is_aligned<required_alignment>(dst_cp))
{
constexpr bool enable_sg_loadstore = true;
using KernelName = clip_contig_kernel<T, vec_sz, n_vecs>;

cgh.parallel_for<KernelName>(
sycl::nd_range<1>(gws_range, lws_range),
ClipContigFunctor<T, vec_sz, n_vecs, enable_sg_loadstore>(
nelems, x_tp, min_tp, max_tp, dst_tp));
}
else {
constexpr bool disable_sg_loadstore = false;
using InnerKernelName = clip_contig_kernel<T, vec_sz, n_vecs>;
using KernelName =
disabled_sg_loadstore_wrapper_krn<InnerKernelName>;

cgh.parallel_for<KernelName>(
sycl::nd_range<1>(gws_range, lws_range),
ClipContigFunctor<T, vec_sz, n_vecs, disable_sg_loadstore>(
nelems, x_tp, min_tp, max_tp, dst_tp));
}
});

return clip_ev;
Expand Down
43 changes: 37 additions & 6 deletions dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <sycl/sycl.hpp>
#include <type_traits>

#include "kernels/alignment.hpp"
#include "utils/offset_utils.hpp"
#include "utils/type_utils.hpp"

Expand All @@ -44,6 +45,11 @@ namespace copy_and_cast
namespace py = pybind11;
using namespace dpctl::tensor::offset_utils;

using dpctl::tensor::kernels::alignment_utils::
disabled_sg_loadstore_wrapper_krn;
using dpctl::tensor::kernels::alignment_utils::is_aligned;
using dpctl::tensor::kernels::alignment_utils::required_alignment;

template <typename srcT, typename dstT, typename IndexerT>
class copy_cast_generic_kernel;

Expand Down Expand Up @@ -200,7 +206,8 @@ template <typename srcT,
typename dstT,
typename CastFnT,
int vec_sz = 4,
int n_vecs = 2>
int n_vecs = 2,
bool enable_sg_loadstore = true>
class ContigCopyFunctor
{
private:
Expand All @@ -219,7 +226,9 @@ class ContigCopyFunctor
CastFnT fn{};

using dpctl::tensor::type_utils::is_complex;
if constexpr (is_complex<srcT>::value || is_complex<dstT>::value) {
if constexpr (!enable_sg_loadstore || is_complex<srcT>::value ||
is_complex<dstT>::value)
{
std::uint8_t sgSize = ndit.get_sub_group().get_local_range()[0];
size_t base = ndit.get_global_linear_id();

Expand Down Expand Up @@ -326,10 +335,32 @@ sycl::event copy_and_cast_contig_impl(sycl::queue &q,
const auto gws_range = sycl::range<1>(n_groups * lws);
const auto lws_range = sycl::range<1>(lws);

cgh.parallel_for<copy_cast_contig_kernel<srcTy, dstTy, n_vecs, vec_sz>>(
sycl::nd_range<1>(gws_range, lws_range),
ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
n_vecs>(nelems, src_tp, dst_tp));
if (is_aligned<required_alignment>(src_cp) &&
is_aligned<required_alignment>(dst_cp))
{
constexpr bool enable_sg_loadstore = true;
using KernelName =
copy_cast_contig_kernel<srcTy, dstTy, vec_sz, n_vecs>;

cgh.parallel_for<KernelName>(
sycl::nd_range<1>(gws_range, lws_range),
ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
n_vecs, enable_sg_loadstore>(nelems, src_tp,
dst_tp));
}
else {
constexpr bool disable_sg_loadstore = false;
using InnerKernelName =
copy_cast_contig_kernel<srcTy, dstTy, vec_sz, n_vecs>;
using KernelName =
disabled_sg_loadstore_wrapper_krn<InnerKernelName>;

cgh.parallel_for<KernelName>(
sycl::nd_range<1>(gws_range, lws_range),
ContigCopyFunctor<srcTy, dstTy, Caster<srcTy, dstTy>, vec_sz,
n_vecs, disable_sg_loadstore>(nelems, src_tp,
dst_tp));
}
});

return copy_and_cast_ev;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,15 @@ template <typename argT, typename resT> struct AbsFunctor
template <typename argT,
typename resT = argT,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
using AbsContigFunctor = elementwise_common::
UnaryContigFunctor<argT, resT, AbsFunctor<argT, resT>, vec_sz, n_vecs>;
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AbsContigFunctor =
elementwise_common::UnaryContigFunctor<argT,
resT,
AbsFunctor<argT, resT>,
vec_sz,
n_vecs,
enable_sg_loadstore>;

template <typename T> struct AbsOutputType
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,9 +145,15 @@ template <typename argT, typename resT> struct AcosFunctor
template <typename argTy,
typename resTy = argTy,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
using AcosContigFunctor = elementwise_common::
UnaryContigFunctor<argTy, resTy, AcosFunctor<argTy, resTy>, vec_sz, n_vecs>;
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AcosContigFunctor =
elementwise_common::UnaryContigFunctor<argTy,
resTy,
AcosFunctor<argTy, resTy>,
vec_sz,
n_vecs,
enable_sg_loadstore>;

template <typename argTy, typename resTy, typename IndexerT>
using AcosStridedFunctor = elementwise_common::
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,13 +167,15 @@ template <typename argT, typename resT> struct AcoshFunctor
template <typename argTy,
typename resTy = argTy,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AcoshContigFunctor =
elementwise_common::UnaryContigFunctor<argTy,
resTy,
AcoshFunctor<argTy, resTy>,
vec_sz,
n_vecs>;
n_vecs,
enable_sg_loadstore>;

template <typename argTy, typename resTy, typename IndexerT>
using AcoshStridedFunctor = elementwise_common::
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,14 +123,16 @@ template <typename argT1,
typename argT2,
typename resT,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AddContigFunctor =
elementwise_common::BinaryContigFunctor<argT1,
argT2,
resT,
AddFunctor<argT1, argT2, resT>,
vec_sz,
n_vecs>;
n_vecs,
enable_sg_loadstore>;

template <typename argT1, typename argT2, typename resT, typename IndexerT>
using AddStridedFunctor =
Expand Down Expand Up @@ -425,13 +427,15 @@ template <typename argT, typename resT> struct AddInplaceFunctor
template <typename argT,
typename resT,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AddInplaceContigFunctor = elementwise_common::BinaryInplaceContigFunctor<
argT,
resT,
AddInplaceFunctor<argT, resT>,
vec_sz,
n_vecs>;
n_vecs,
enable_sg_loadstore>;

template <typename argT, typename resT, typename IndexerT>
using AddInplaceStridedFunctor =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,15 @@ template <typename argT, typename resT> struct AngleFunctor
template <typename argTy,
typename resTy = argTy,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AngleContigFunctor =
elementwise_common::UnaryContigFunctor<argTy,
resTy,
AngleFunctor<argTy, resTy>,
vec_sz,
n_vecs>;
n_vecs,
enable_sg_loadstore>;

template <typename argTy, typename resTy, typename IndexerT>
using AngleStridedFunctor = elementwise_common::
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,9 +169,15 @@ template <typename argT, typename resT> struct AsinFunctor
template <typename argTy,
typename resTy = argTy,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
using AsinContigFunctor = elementwise_common::
UnaryContigFunctor<argTy, resTy, AsinFunctor<argTy, resTy>, vec_sz, n_vecs>;
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AsinContigFunctor =
elementwise_common::UnaryContigFunctor<argTy,
resTy,
AsinFunctor<argTy, resTy>,
vec_sz,
n_vecs,
enable_sg_loadstore>;

template <typename argTy, typename resTy, typename IndexerT>
using AsinStridedFunctor = elementwise_common::
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,15 @@ template <typename argT, typename resT> struct AsinhFunctor
template <typename argTy,
typename resTy = argTy,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AsinhContigFunctor =
elementwise_common::UnaryContigFunctor<argTy,
resTy,
AsinhFunctor<argTy, resTy>,
vec_sz,
n_vecs>;
n_vecs,
enable_sg_loadstore>;

template <typename argTy, typename resTy, typename IndexerT>
using AsinhStridedFunctor = elementwise_common::
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -145,9 +145,15 @@ template <typename argT, typename resT> struct AtanFunctor
template <typename argTy,
typename resTy = argTy,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
using AtanContigFunctor = elementwise_common::
UnaryContigFunctor<argTy, resTy, AtanFunctor<argTy, resTy>, vec_sz, n_vecs>;
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using AtanContigFunctor =
elementwise_common::UnaryContigFunctor<argTy,
resTy,
AtanFunctor<argTy, resTy>,
vec_sz,
n_vecs,
enable_sg_loadstore>;

template <typename argTy, typename resTy, typename IndexerT>
using AtanStridedFunctor = elementwise_common::
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,16 @@ template <typename argT1,
typename argT2,
typename resT,
unsigned int vec_sz = 4,
unsigned int n_vecs = 2>
unsigned int n_vecs = 2,
bool enable_sg_loadstore = true>
using Atan2ContigFunctor =
elementwise_common::BinaryContigFunctor<argT1,
argT2,
resT,
Atan2Functor<argT1, argT2, resT>,
vec_sz,
n_vecs>;
n_vecs,
enable_sg_loadstore>;

template <typename argT1, typename argT2, typename resT, typename IndexerT>
using Atan2StridedFunctor =
Expand Down
Loading

0 comments on commit a4369ac

Please sign in to comment.