From 0b353bfb158fbfb0aff580f3edfcb23d7e9f48ca Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 4 Dec 2024 14:14:01 -0800 Subject: [PATCH 01/30] Implements `top_k` The implementation leverages existing merge-sort code, and partially sorts the array in cases where a parial sort reduces the size of temporary memory allocation --- dpctl/tensor/CMakeLists.txt | 1 + dpctl/tensor/__init__.py | 2 + dpctl/tensor/_topk.py | 184 ++++++++ .../include/kernels/sorting/topk.hpp | 440 ++++++++++++++++++ .../tensor/libtensor/source/sorting/topk.cpp | 387 +++++++++++++++ .../tensor/libtensor/source/sorting/topk.hpp | 42 ++ .../libtensor/source/tensor_sorting.cpp | 2 + 7 files changed, 1058 insertions(+) create mode 100644 dpctl/tensor/_topk.py create mode 100644 dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp create mode 100644 dpctl/tensor/libtensor/source/sorting/topk.cpp create mode 100644 dpctl/tensor/libtensor/source/sorting/topk.hpp diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 882233b30c..75976f1b1f 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -115,6 +115,7 @@ set(_sorting_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_sort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp ) set(_sorting_radix_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 581c3d54f6..d83d41215f 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -201,6 +201,7 @@ ) from ._sorting import argsort, sort from ._testing import allclose +from ._topk import top_k from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type __all__ = [ @@ -387,4 +388,5 @@ "DLDeviceType", "take_along_axis", "put_along_axis", + "top_k", ] diff --git a/dpctl/tensor/_topk.py b/dpctl/tensor/_topk.py new file mode 100644 index 0000000000..a807aea689 --- /dev/null +++ b/dpctl/tensor/_topk.py @@ -0,0 +1,184 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator +from typing import NamedTuple + +import dpctl.tensor as dpt +import dpctl.tensor._tensor_impl as ti +import dpctl.utils as du +from dpctl.tensor._numpy_helper import normalize_axis_index + +from ._tensor_sorting_impl import _topk + + +def _get_top_k_largest(mode): + modes = {"largest": True, "smallest": False} + try: + return modes[mode] + except KeyError: + raise ValueError( + f"`mode` must be `largest` or `smallest`. Got `{mode}`." 
+ ) + + +class TopKResult(NamedTuple): + values: dpt.usm_ndarray + indices: dpt.usm_ndarray + + +def top_k(x, k, /, *, axis=None, mode="largest"): + """top_k(x, k, axis=None, mode="largest") + + Returns the `k` largest or smallest values and their indices in the input + array `x` along the specified axis `axis`. + + Args: + x (usm_ndarray): + input array. + k (int): + number of elements to find. Must be a positive integer value. + axis (Optional[int]): + axis along which to search. If `None`, the search will be performed + over the flattened array. Default: ``None``. + mode (Literal["largest", "smallest"]): + search mode. Must be one of the following modes: + - `"largest"`: return the `k` largest elements. + - `"smallest"`: return the `k` smallest elements. + Default: `"largest"`. + + Returns: + tuple[usm_ndarray, usm_ndarray]: + a namedtuple `(values, indices)` whose + + - first element `values` will be an array containing the `k` largest or + smallest elements of `x`. The array has the same data type as `x`. + If `axis` was `None`, `values` will be a one-dimensional array + with shape `(k,)` and otherwise, `values` will have shape + `x.shape[:axis] + (k,) + x.shape[axis+1:]` + + - second element `indices` will be an array containing indices of `x` + that result in `values`. The array will have the same shape as + `values` and will have the default array index data type. + """ + largest = _get_top_k_largest(mode) + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" + ) + + k = operator.index(k) + if k < 0: + raise ValueError("`k` must be a positive integer value") + + nd = x.ndim + if axis is None: + sz = x.size + if nd == 0: + return TopKResult( + dpt.copy(x, order="C"), + dpt.zeros_like( + x, dtype=ti.default_device_index_type(x.sycl_queue) + ), + ) + arr = x + n_search_dims = None + res_sh = k + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + sz = x.shape[axis] + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + n_search_dims = 1 + res_sh = arr.shape[: nd - 1] + (k,) + + if k > sz: + raise ValueError(f"`k`={k} is out of bounds {sz}") + + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + + res_usm_type = arr.usm_type + if arr.flags.c_contiguous: + vals = dpt.empty( + res_sh, + dtype=arr.dtype, + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + inds = dpt.empty( + res_sh, + dtype=ti.default_device_index_type(exec_q), + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + ht_ev, impl_ev = _topk( + src=arr, + trailing_dims_to_search=n_search_dims, + k=k, + largest=largest, + vals=vals, + inds=inds, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, impl_ev) + else: + tmp = dpt.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + vals = dpt.empty( + res_sh, + dtype=arr.dtype, + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + inds = dpt.empty( + res_sh, + dtype=ti.default_device_index_type(exec_q), + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + ht_ev, impl_ev = _topk( + src=tmp, + trailing_dims_to_search=n_search_dims, + k=k, + largest=largest, + vals=vals, + inds=inds, + sycl_queue=exec_q, + depends=[copy_ev], + ) 
+ _manager.add_event_pair(ht_ev, impl_ev) + if axis is not None and a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + vals = dpt.permute_dims(vals, inv_perm) + inds = dpt.permute_dims(inds, inv_perm) + return TopKResult(vals, inds) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp new file mode 100644 index 0000000000..7db3f3e3be --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -0,0 +1,440 @@ +//=== topk.hpp - Implementation of topk kernels ---*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor topk operation. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "merge_sort.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ + +namespace topk_detail +{ + +void scale_topk_params(const std::uint64_t nelems_per_slm, + const std::size_t sub_groups_per_work_group, + const std::uint32_t elems_per_wi, + const std::vector &sg_sizes, + std::size_t &lws, + std::size_t &nelems_wg_sorts) +{ + for (auto it = sg_sizes.rbegin(); it != sg_sizes.rend(); ++it) { + auto sg_size = *it; + lws = sub_groups_per_work_group * sg_size; + nelems_wg_sorts = elems_per_wi * lws; + if (nelems_wg_sorts < nelems_per_slm) { + return; + } + } + // should never reach + throw std::runtime_error("Could not construct top k kernel parameters"); +} + +} // namespace topk_detail + +template +class populate_index_data_full_sort_krn; + +template +class topk_map_to_rows_full_sort_krn; + +template class populate_index_data_krn; + +template class topk_map_to_rows_krn; + +template +sycl::event topk_full_sort_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a + // matrix when sorting over rows) + std::size_t sort_nelems, // size of each array to sort (length of rows, + // i.e. 
number of columns) + dpctl::tensor::ssize_t k, + const argTy *arg_tp, + argTy *vals_tp, + IndexTy *inds_tp, + const CompT &comp, + const std::vector &depends) +{ + IndexTy *index_data = + sycl::malloc_device(iter_nelems * sort_nelems, exec_q); + if (index_data == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + + sycl::event populate_indexed_data_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + auto const &range = sycl::range<1>(iter_nelems * sort_nelems); + + using KernelName = + populate_index_data_full_sort_krn; + + cgh.parallel_for(range, [=](sycl::id<1> id) { + std::size_t i = id[0]; + index_data[i] = static_cast(i); + }); + }); + + std::size_t sorted_block_size; + // Sort segments of the array + sycl::event base_sort_ev = + merge_sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, sort_nelems, index_data, index_data, comp, + sorted_block_size, // modified in place with size of sorted block + // size + {populate_indexed_data_ev}); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = merge_sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, sort_nelems, index_data, comp, sorted_block_size, + {base_sort_ev}); + + sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(merges_ev); + + using KernelName = + topk_map_to_rows_full_sort_krn; + + cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { + std::size_t gid = id[0]; + + std::size_t iter_gid = gid / k; + std::size_t axis_gid = gid - (iter_gid * k); + + std::size_t src_idx = iter_gid * sort_nelems + axis_gid; + std::size_t dst_idx = iter_gid * k + axis_gid; + + auto res_ind = index_data[src_idx]; + vals_tp[dst_idx] = arg_tp[res_ind]; + inds_tp[dst_idx] = res_ind % sort_nelems; + }); + }); + + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(write_out_ev); + const sycl::context &ctx = exec_q.get_context(); + + using dpctl::tensor::alloc_utils::sycl_free_noexcept; + cgh.host_task( + [ctx, index_data] { sycl_free_noexcept(index_data, ctx); }); + }); + + return cleanup_host_task_event; +}; + +template +class topk_over_work_group_krn; + +template > +sycl::event +topk_impl(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows + // in a matrix when sorting over rows) + std::size_t axis_nelems, // size of each array to sort (length of + // rows, i.e. 
number of columns) + dpctl::tensor::ssize_t k, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + dpctl::tensor::ssize_t iter_arg_offset, + dpctl::tensor::ssize_t iter_vals_offset, + dpctl::tensor::ssize_t iter_inds_offset, + dpctl::tensor::ssize_t axis_arg_offset, + dpctl::tensor::ssize_t axis_vals_offset, + dpctl::tensor::ssize_t axis_inds_offset, + const std::vector &depends) +{ + if (axis_nelems < static_cast(k)) { + throw std::runtime_error("Invalid sort axis size for value of k"); + } + + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + axis_arg_offset; + argTy *vals_tp = reinterpret_cast(vals_cp) + iter_vals_offset + + axis_vals_offset; + IndexTy *inds_tp = reinterpret_cast(inds_cp) + iter_inds_offset + + axis_inds_offset; + + using dpctl::tensor::kernels::IndexComp; + const IndexComp index_comp{arg_tp, ValueComp{}}; + + if (axis_nelems <= 512 || k >= 1024 || + static_cast(k) > axis_nelems / 2) + { + return topk_full_sort_impl(exec_q, iter_nelems, axis_nelems, k, arg_tp, + vals_tp, inds_tp, index_comp, depends); + } + else { + using PartialKernelName = + topk_over_work_group_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + const std::uint64_t device_local_memory_size = + dev.get_info(); + + // leave 512 bytes of local memory for RT + const std::uint64_t safety_margin = 512; + + const std::uint64_t nelems_per_slm = + (device_local_memory_size - safety_margin) / (2 * sizeof(IndexTy)); + + constexpr std::uint32_t sub_groups_per_work_group = 4; + const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2; + + std::size_t lws = sub_groups_per_work_group * max_sg_size; + + std::size_t sorted_block_size = elems_per_wi * lws; + if (sorted_block_size > nelems_per_slm) { + const std::vector sg_sizes = + dev.get_info(); + topk_detail::scale_topk_params( + nelems_per_slm, sub_groups_per_work_group, elems_per_wi, + sg_sizes, + lws, // modified by reference + sorted_block_size // modified by reference + ); + } + + // This assumption permits doing away with using a loop + assert(sorted_block_size % lws == 0); + + const std::size_t n_segments = + merge_sort_detail::quotient_ceil(axis_nelems, + sorted_block_size); + + // round k up for the later merge kernel + const dpctl::tensor::ssize_t round_k_to = elems_per_wi; + dpctl::tensor::ssize_t k_rounded = + merge_sort_detail::quotient_ceil( + k, round_k_to) * + round_k_to; + + // get length of tail for alloc size + auto rem = axis_nelems % sorted_block_size; + auto alloc_len = (rem && rem < static_cast(k_rounded)) + ? 
rem + k_rounded * (n_segments - 1) + : k_rounded * n_segments; + + // if allocation would be sufficiently large or k is larger than + // elements processed, use full sort + if (static_cast(k_rounded) >= axis_nelems || + static_cast(k_rounded) >= sorted_block_size || + alloc_len >= axis_nelems / 2) + { + return topk_full_sort_impl(exec_q, iter_nelems, axis_nelems, k, + arg_tp, vals_tp, inds_tp, index_comp, + depends); + } + + IndexTy *index_data = + sycl::malloc_device(iter_nelems * alloc_len, exec_q); + if (index_data == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + + // no need to populate index data: SLM will be populated with default + // values + + sycl::event base_sort_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.use_kernel_bundle(kb); + + sycl::range<1> global_range{iter_nelems * n_segments * lws}; + sycl::range<1> local_range{lws}; + + sycl::range<1> slm_range{sorted_block_size}; + sycl::local_accessor work_space(slm_range, cgh); + sycl::local_accessor scratch_space(slm_range, cgh); + + sycl::nd_range<1> ndRange(global_range, local_range); + + cgh.parallel_for( + ndRange, [=](sycl::nd_item<1> it) { + const std::size_t group_id = it.get_group_linear_id(); + const std::size_t iter_id = group_id / n_segments; + const std::size_t segment_id = + group_id - iter_id * n_segments; + const std::size_t lid = it.get_local_linear_id(); + + const std::size_t segment_start_idx = + segment_id * sorted_block_size; + const std::size_t segment_end_idx = std::min( + segment_start_idx + sorted_block_size, axis_nelems); + const std::size_t wg_chunk_size = + segment_end_idx - segment_start_idx; + + // load input into SLM + for (std::size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) + { + IndexTy v = (array_id < axis_nelems) + ? 
iter_id * axis_nelems + array_id + : IndexTy{}; + work_space[array_id - segment_start_idx] = v; + } + sycl::group_barrier(it.get_group()); + + const std::size_t chunk = + merge_sort_detail::quotient_ceil( + sorted_block_size, lws); + + const std::size_t chunk_start_idx = lid * chunk; + const std::size_t chunk_end_idx = + sycl::min(chunk_start_idx + chunk, wg_chunk_size); + + merge_sort_detail::leaf_sort_impl( + work_space, chunk_start_idx, chunk_end_idx, index_comp); + + sycl::group_barrier(it.get_group()); + + bool data_in_temp = false; + std::size_t n_chunks_merged = 1; + + // merge chunk while n_chunks_merged * chunk < wg_chunk_size + const std::size_t max_chunks_merged = + 1 + ((wg_chunk_size - 1) / chunk); + for (; n_chunks_merged < max_chunks_merged; + data_in_temp = !data_in_temp, n_chunks_merged *= 2) + { + const std::size_t nelems_sorted_so_far = + n_chunks_merged * chunk; + const std::size_t q = (lid / n_chunks_merged); + const std::size_t start_1 = sycl::min( + 2 * nelems_sorted_so_far * q, wg_chunk_size); + const std::size_t end_1 = sycl::min( + start_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t end_2 = sycl::min( + end_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t offset = + chunk * (lid - q * n_chunks_merged); + + if (data_in_temp) { + merge_sort_detail::merge_impl( + offset, scratch_space, work_space, start_1, + end_1, end_2, start_1, index_comp, chunk); + } + else { + merge_sort_detail::merge_impl( + offset, work_space, scratch_space, start_1, + end_1, end_2, start_1, index_comp, chunk); + } + sycl::group_barrier(it.get_group()); + } + + // output assumed to be structured as (iter_nelems, + // alloc_len) + const std::size_t k_segment_start_idx = + segment_id * k_rounded; + const std::size_t k_segment_end_idx = std::min( + k_segment_start_idx + k_rounded, alloc_len); + const auto &out_src = + (data_in_temp) ? 
scratch_space : work_space; + for (std::size_t array_id = k_segment_start_idx + lid; + array_id < k_segment_end_idx; array_id += lws) + { + if (lid < static_cast(k_rounded)) { + index_data[iter_id * alloc_len + array_id] = + out_src[array_id - k_segment_start_idx]; + } + } + }); + }); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = + merge_sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, alloc_len, index_data, index_comp, + k_rounded, {base_sort_ev}); + + // Write out top k of the merge-sorted memory + sycl::event write_topk_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(merges_ev); + + using KernelName = topk_map_to_rows_krn; + + cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { + std::size_t gid = id[0]; + + std::size_t iter_gid = gid / k; + std::size_t axis_gid = gid - (iter_gid * k); + + std::size_t src_idx = iter_gid * alloc_len + axis_gid; + std::size_t dst_idx = iter_gid * k + axis_gid; + + auto res_ind = index_data[src_idx]; + vals_tp[dst_idx] = arg_tp[res_ind]; + inds_tp[dst_idx] = res_ind % axis_nelems; + }); + }); + + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(write_topk_ev); + const sycl::context &ctx = exec_q.get_context(); + + using dpctl::tensor::alloc_utils::sycl_free_noexcept; + cgh.host_task( + [ctx, index_data] { sycl_free_noexcept(index_data, ctx); }); + }); + + return cleanup_host_task_event; + } +} + +} // end of namespace kernels +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/topk.cpp b/dpctl/tensor/libtensor/source/sorting/topk.cpp new file mode 100644 index 0000000000..af2f6ce027 --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/topk.cpp @@ -0,0 +1,387 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include +#include +#include + +#include "utils/math_utils.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/sorting/topk.hpp" +#include "rich_comparisons.hpp" +#include "topk.hpp" + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +typedef sycl::event (*topk_impl_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + py::ssize_t, + bool, + const char *, + char *, + char *, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + const std::vector &); + +static topk_impl_fn_ptr_t topk_dispatch_vector[td_ns::num_types]; + +namespace +{ + +template +sycl::event +topk_caller(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays to sort (num. of + // rows in a matrix when sorting over rows) + std::size_t axis_nelems, // size of each array to sort (length of + // rows, i.e. number of columns) + py::ssize_t k, + bool largest, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_vals_offset, + py::ssize_t iter_inds_offset, + py::ssize_t axis_arg_offset, + py::ssize_t axis_vals_offset, + py::ssize_t axis_inds_offset, + const std::vector &depends) +{ + using dpctl::tensor::kernels::topk_impl; + if (largest) { + using CompTy = + typename dpctl::tensor::py_internal::DescendingSorter::type; + return topk_impl( + exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp, + iter_arg_offset, iter_vals_offset, iter_inds_offset, + axis_arg_offset, axis_vals_offset, axis_inds_offset, depends); + } + else { + using CompTy = + typename dpctl::tensor::py_internal::AscendingSorter::type; + return topk_impl( + exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp, + iter_arg_offset, iter_vals_offset, iter_inds_offset, + axis_arg_offset, axis_vals_offset, axis_inds_offset, depends); + } +} + +} // namespace + +std::pair +py_topk(const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_search, + const py::ssize_t k, + const bool largest, + const dpctl::tensor::usm_ndarray &vals, + const dpctl::tensor::usm_ndarray &inds, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int vals_nd = vals.get_ndim(); + int inds_nd = inds.get_ndim(); + if (src_nd != vals_nd || src_nd != inds_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + int iteration_nd = src_nd - trailing_dims_to_search; + if (trailing_dims_to_search <= 0 || iteration_nd < 0) { + throw py::value_error( + "trailing_dims_to_search must be positive, but no " + "greater than rank of the array being searched"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *vals_shape_ptr = vals.get_shape_raw(); + const py::ssize_t *inds_shape_ptr = inds.get_shape_raw(); + + bool same_shapes = true; + std::size_t iter_nelems(1); + for (int i = 0; same_shapes && (i < iteration_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == vals_shape_ptr[i] && + src_shape_i == inds_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape 
does not match the input shape"); + } + + std::size_t vals_k(1); + std::size_t inds_k(1); + std::size_t axis_nelems(1); + for (int i = iteration_nd; i < src_nd; ++i) { + axis_nelems *= static_cast(src_shape_ptr[i]); + vals_k *= static_cast(vals_shape_ptr[i]); + inds_k *= static_cast(inds_shape_ptr[i]); + } + + bool valid_k = (vals_k == static_cast(k) && + inds_k == static_cast(k) && + axis_nelems >= static_cast(k)); + if (!valid_k) { + throw py::value_error( + "The value of k is invalid for the input and destination arrays"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, vals, inds})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(vals); + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(inds); + + if ((iter_nelems == 0) || (axis_nelems == 0)) { + // Nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, vals) || overlap(src, inds)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(vals, + k * iter_nelems); + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(inds, + k * iter_nelems); + + int src_typenum = src.get_typenum(); + int vals_typenum = vals.get_typenum(); + int inds_typenum = inds.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int vals_typeid = array_types.typenum_to_lookup_id(vals_typenum); + int inds_typeid = array_types.typenum_to_lookup_id(inds_typenum); + + if (src_typeid != vals_typeid) { + throw py::value_error("Input array and vals array must have " + "the same data type"); + } + + if (inds_typeid != static_cast(td_ns::typenum_t::INT64)) { + throw py::value_error("Inds array must have data type int64"); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_vals_c_contig = vals.is_c_contiguous(); + bool is_inds_c_contig = inds.is_c_contiguous(); + + if (is_src_c_contig && is_vals_c_contig && is_inds_c_contig) { + static constexpr py::ssize_t zero_offset = py::ssize_t(0); + + auto fn = topk_dispatch_vector[src_typeid]; + + sycl::event comp_ev = + fn(exec_q, iter_nelems, axis_nelems, k, largest, src.get_data(), + vals.get_data(), inds.get_data(), zero_offset, zero_offset, + zero_offset, zero_offset, zero_offset, zero_offset, depends); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, vals, inds}, {comp_ev}); + + return std::make_pair(keep_args_alive_ev, comp_ev); + } + + return std::make_pair(sycl::event(), sycl::event()); +} + +std::pair +py_topk(const dpctl::tensor::usm_ndarray &src, + const py::ssize_t k, + const bool largest, + const dpctl::tensor::usm_ndarray &vals, + const dpctl::tensor::usm_ndarray &inds, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int vals_nd = vals.get_ndim(); + int inds_nd = inds.get_ndim(); + if (vals_nd != 1 || inds_nd != 1) { + throw py::value_error("Output arrays must be one-dimensional"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *vals_shape_ptr = vals.get_shape_raw(); + const py::ssize_t *inds_shape_ptr = inds.get_shape_raw(); + + std::size_t axis_nelems(1); + for (int i = 0; i < src_nd; ++i) { + axis_nelems *= static_cast(src_shape_ptr[i]); + } + + bool valid_k = (axis_nelems 
>= static_cast(k) && + vals_shape_ptr[0] == k && inds_shape_ptr[0] == k); + if (!valid_k) { + throw py::value_error( + "The value of k is invalid for the input and destination arrays"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, vals, inds})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(vals); + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(inds); + + if (axis_nelems == 0) { + // Nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, vals) || overlap(src, inds)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(vals, k); + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(inds, k); + + int src_typenum = src.get_typenum(); + int vals_typenum = vals.get_typenum(); + int inds_typenum = inds.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int vals_typeid = array_types.typenum_to_lookup_id(vals_typenum); + int inds_typeid = array_types.typenum_to_lookup_id(inds_typenum); + + if (src_typeid != vals_typeid) { + throw py::value_error("Input array and vals array must have " + "the same data type"); + } + + if (inds_typeid != static_cast(td_ns::typenum_t::INT64)) { + throw py::value_error("Inds array must have data type int64"); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_vals_c_contig = vals.is_c_contiguous(); + bool is_inds_c_contig = inds.is_c_contiguous(); + + if (is_src_c_contig && is_vals_c_contig && is_inds_c_contig) { + static constexpr py::ssize_t zero_offset = py::ssize_t(0); + static constexpr std::size_t iter_nelems = 1; + + auto fn = topk_dispatch_vector[src_typeid]; + + sycl::event comp_ev = + fn(exec_q, iter_nelems, axis_nelems, k, largest, src.get_data(), + vals.get_data(), inds.get_data(), zero_offset, zero_offset, + zero_offset, zero_offset, zero_offset, zero_offset, depends); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, vals, inds}, {comp_ev}); + + return std::make_pair(keep_args_alive_ev, comp_ev); + } + + return std::make_pair(sycl::event(), sycl::event()); +} + +template struct TopKFactory +{ + fnT get() + { + using IdxT = std::int64_t; + return topk_caller; + } +}; + +void init_topk_dispatch_vectors(void) +{ + td_ns::DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(topk_dispatch_vector); +} + +void init_topk_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_topk_dispatch_vectors(); + + auto py_topk = [](const dpctl::tensor::usm_ndarray &src, + std::optional trailing_dims_to_search, + const py::ssize_t k, const bool largest, + const dpctl::tensor::usm_ndarray &vals, + const dpctl::tensor::usm_ndarray &inds, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + if (trailing_dims_to_search) { + return dpctl::tensor::py_internal::py_topk( + src, trailing_dims_to_search.value(), k, largest, vals, inds, + exec_q, depends); + } + else { + return dpctl::tensor::py_internal::py_topk(src, k, largest, vals, + inds, exec_q, depends); + } + }; + + m.def("_topk", py_topk, py::arg("src"), py::arg("trailing_dims_to_search"), + py::arg("k"), py::arg("largest"), py::arg("vals"), py::arg("inds"), + py::arg("sycl_queue"), py::arg("depends") = 
py::list()); +} + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/topk.hpp b/dpctl/tensor/libtensor/source/sorting/topk.hpp new file mode 100644 index 0000000000..37042457da --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/topk.hpp @@ -0,0 +1,42 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===--------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_topk_functions(py::module_); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_sorting.cpp b/dpctl/tensor/libtensor/source/tensor_sorting.cpp index 52d3ab67b4..09deac7b30 100644 --- a/dpctl/tensor/libtensor/source/tensor_sorting.cpp +++ b/dpctl/tensor/libtensor/source/tensor_sorting.cpp @@ -28,6 +28,7 @@ #include "sorting/merge_argsort.hpp" #include "sorting/merge_sort.hpp" #include "sorting/searchsorted.hpp" +#include "sorting/topk.hpp" namespace py = pybind11; @@ -36,4 +37,5 @@ PYBIND11_MODULE(_tensor_sorting_impl, m) dpctl::tensor::py_internal::init_merge_sort_functions(m); dpctl::tensor::py_internal::init_merge_argsort_functions(m); dpctl::tensor::py_internal::init_searchsorted_functions(m); + dpctl::tensor::py_internal::init_topk_functions(m); } From f8095731e6a1ffc757190e6d45014273ecadac9f Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 4 Dec 2024 16:16:25 -0800 Subject: [PATCH 02/30] Add `top_k` to doc sources --- .../doc_sources/api_reference/dpctl/tensor.sorting_functions.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/doc_sources/api_reference/dpctl/tensor.sorting_functions.rst b/docs/doc_sources/api_reference/dpctl/tensor.sorting_functions.rst index ae1605d988..ef20f4654c 100644 --- a/docs/doc_sources/api_reference/dpctl/tensor.sorting_functions.rst +++ b/docs/doc_sources/api_reference/dpctl/tensor.sorting_functions.rst @@ -10,3 +10,4 @@ Sorting functions argsort sort + top_k From b59e59d57c0e150a52aa92f2b89e407dfbe4a60d Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 5 Dec 2024 13:25:17 -0800 Subject: [PATCH 03/30] Use `std::size_t` for `k` instead of `py::ssize_t` Reduces amount of casting. `k` will need to fit in `py::ssize_t` regardless. 
--- .../include/kernels/sorting/topk.hpp | 25 ++++++++----------- .../tensor/libtensor/source/sorting/topk.cpp | 19 +++++++------- 2 files changed, 19 insertions(+), 25 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index 7db3f3e3be..f709a1737d 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -86,7 +86,7 @@ sycl::event topk_full_sort_impl( // matrix when sorting over rows) std::size_t sort_nelems, // size of each array to sort (length of rows, // i.e. number of columns) - dpctl::tensor::ssize_t k, + std::size_t k, const argTy *arg_tp, argTy *vals_tp, IndexTy *inds_tp, @@ -174,7 +174,7 @@ topk_impl(sycl::queue &exec_q, // in a matrix when sorting over rows) std::size_t axis_nelems, // size of each array to sort (length of // rows, i.e. number of columns) - dpctl::tensor::ssize_t k, + std::size_t k, const char *arg_cp, char *vals_cp, char *inds_cp, @@ -186,7 +186,7 @@ topk_impl(sycl::queue &exec_q, dpctl::tensor::ssize_t axis_inds_offset, const std::vector &depends) { - if (axis_nelems < static_cast(k)) { + if (axis_nelems < k) { throw std::runtime_error("Invalid sort axis size for value of k"); } @@ -200,9 +200,7 @@ topk_impl(sycl::queue &exec_q, using dpctl::tensor::kernels::IndexComp; const IndexComp index_comp{arg_tp, ValueComp{}}; - if (axis_nelems <= 512 || k >= 1024 || - static_cast(k) > axis_nelems / 2) - { + if (axis_nelems <= 512 || k >= 1024 || k > axis_nelems / 2) { return topk_full_sort_impl(exec_q, iter_nelems, axis_nelems, k, arg_tp, vals_tp, inds_tp, index_comp, depends); } @@ -256,22 +254,19 @@ topk_impl(sycl::queue &exec_q, sorted_block_size); // round k up for the later merge kernel - const dpctl::tensor::ssize_t round_k_to = elems_per_wi; - dpctl::tensor::ssize_t k_rounded = - merge_sort_detail::quotient_ceil( - k, round_k_to) * - round_k_to; + std::size_t k_rounded = + merge_sort_detail::quotient_ceil(k, elems_per_wi) * + elems_per_wi; // get length of tail for alloc size auto rem = axis_nelems % sorted_block_size; - auto alloc_len = (rem && rem < static_cast(k_rounded)) + auto alloc_len = (rem && rem < k_rounded) ? 
rem + k_rounded * (n_segments - 1) : k_rounded * n_segments; // if allocation would be sufficiently large or k is larger than // elements processed, use full sort - if (static_cast(k_rounded) >= axis_nelems || - static_cast(k_rounded) >= sorted_block_size || + if (k_rounded >= axis_nelems || k_rounded >= sorted_block_size || alloc_len >= axis_nelems / 2) { return topk_full_sort_impl(exec_q, iter_nelems, axis_nelems, k, @@ -386,7 +381,7 @@ topk_impl(sycl::queue &exec_q, for (std::size_t array_id = k_segment_start_idx + lid; array_id < k_segment_end_idx; array_id += lws) { - if (lid < static_cast(k_rounded)) { + if (lid < k_rounded) { index_data[iter_id * alloc_len + array_id] = out_src[array_id - k_segment_start_idx]; } diff --git a/dpctl/tensor/libtensor/source/sorting/topk.cpp b/dpctl/tensor/libtensor/source/sorting/topk.cpp index af2f6ce027..fe5eac9acb 100644 --- a/dpctl/tensor/libtensor/source/sorting/topk.cpp +++ b/dpctl/tensor/libtensor/source/sorting/topk.cpp @@ -58,7 +58,7 @@ namespace py_internal typedef sycl::event (*topk_impl_fn_ptr_t)(sycl::queue &, std::size_t, std::size_t, - py::ssize_t, + std::size_t, bool, const char *, char *, @@ -83,7 +83,7 @@ topk_caller(sycl::queue &exec_q, // rows in a matrix when sorting over rows) std::size_t axis_nelems, // size of each array to sort (length of // rows, i.e. number of columns) - py::ssize_t k, + std::size_t k, bool largest, const char *arg_cp, char *vals_cp, @@ -120,7 +120,7 @@ topk_caller(sycl::queue &exec_q, std::pair py_topk(const dpctl::tensor::usm_ndarray &src, const int trailing_dims_to_search, - const py::ssize_t k, + const std::size_t k, const bool largest, const dpctl::tensor::usm_ndarray &vals, const dpctl::tensor::usm_ndarray &inds, @@ -168,9 +168,7 @@ py_topk(const dpctl::tensor::usm_ndarray &src, inds_k *= static_cast(inds_shape_ptr[i]); } - bool valid_k = (vals_k == static_cast(k) && - inds_k == static_cast(k) && - axis_nelems >= static_cast(k)); + bool valid_k = (vals_k == k && inds_k == k && axis_nelems >= k); if (!valid_k) { throw py::value_error( "The value of k is invalid for the input and destination arrays"); @@ -243,7 +241,7 @@ py_topk(const dpctl::tensor::usm_ndarray &src, std::pair py_topk(const dpctl::tensor::usm_ndarray &src, - const py::ssize_t k, + const std::size_t k, const bool largest, const dpctl::tensor::usm_ndarray &vals, const dpctl::tensor::usm_ndarray &inds, @@ -266,8 +264,9 @@ py_topk(const dpctl::tensor::usm_ndarray &src, axis_nelems *= static_cast(src_shape_ptr[i]); } - bool valid_k = (axis_nelems >= static_cast(k) && - vals_shape_ptr[0] == k && inds_shape_ptr[0] == k); + bool valid_k = + (axis_nelems >= k && static_cast(vals_shape_ptr[0]) == k && + static_cast(inds_shape_ptr[0]) == k); if (!valid_k) { throw py::value_error( "The value of k is invalid for the input and destination arrays"); @@ -360,7 +359,7 @@ void init_topk_functions(py::module_ m) auto py_topk = [](const dpctl::tensor::usm_ndarray &src, std::optional trailing_dims_to_search, - const py::ssize_t k, const bool largest, + const std::size_t k, const bool largest, const dpctl::tensor::usm_ndarray &vals, const dpctl::tensor::usm_ndarray &inds, sycl::queue &exec_q, From b4d7ba4b010141f4b459e2ea5dc45b5b2ef48fa4 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 5 Dec 2024 15:20:11 -0800 Subject: [PATCH 04/30] Add implementation of `top_k` using radix sort --- .../include/kernels/sorting/topk.hpp | 202 +++++++++++++----- .../tensor/libtensor/source/sorting/topk.cpp | 54 +++-- 2 files changed, 193 insertions(+), 63 deletions(-) 
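Notes: for reference, a minimal usage sketch of the API added by this series
(illustrative only, not part of the patch; assumes a SYCL device is available
and the series is applied). Which of the merge-sort or radix-sort code paths
is taken depends on the input data type, per the `use_radix_sort` trait in the
diff below.

    # illustrative usage sketch, not part of the patch
    import dpctl.tensor as dpt

    x = dpt.asarray([7, 1, 9, 3, 5], dtype=dpt.int64)
    y = dpt.asarray([0.7, 0.1, 0.9, 0.3], dtype=dpt.float32)

    # three largest values of x and their indices, as a namedtuple
    vals, inds = dpt.top_k(x, 3)

    # two smallest values of y; result has fields .values and .indices
    res = dpt.top_k(y, 2, mode="smallest")
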
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index f709a1737d..11f3e851fb 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -35,6 +35,7 @@ #include "kernels/dpctl_tensor_types.hpp" #include "merge_sort.hpp" +#include "radix_sort.hpp" #include "utils/sycl_alloc_utils.hpp" #include @@ -70,31 +71,25 @@ void scale_topk_params(const std::uint64_t nelems_per_slm, } // namespace topk_detail template -class populate_index_data_full_sort_krn; +class topk_populate_index_data_krn; template -class topk_map_to_rows_full_sort_krn; - -template class populate_index_data_krn; - -template class topk_map_to_rows_krn; +class topk_full_merge_map_back_krn; template -sycl::event topk_full_sort_impl( - sycl::queue &exec_q, - std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a - // matrix when sorting over rows) - std::size_t sort_nelems, // size of each array to sort (length of rows, - // i.e. number of columns) - std::size_t k, - const argTy *arg_tp, - argTy *vals_tp, - IndexTy *inds_tp, - const CompT &comp, - const std::vector &depends) +sycl::event +topk_full_merge_sort_impl(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays + std::size_t axis_nelems, // size of each sub-array + std::size_t k, + const argTy *arg_tp, + argTy *vals_tp, + IndexTy *inds_tp, + const CompT &comp, + const std::vector &depends) { IndexTy *index_data = - sycl::malloc_device(iter_nelems * sort_nelems, exec_q); + sycl::malloc_device(iter_nelems * axis_nelems, exec_q); if (index_data == nullptr) { throw std::runtime_error("Unable to allocate device_memory"); } @@ -103,10 +98,10 @@ sycl::event topk_full_sort_impl( exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - auto const &range = sycl::range<1>(iter_nelems * sort_nelems); + auto const &range = sycl::range<1>(iter_nelems * axis_nelems); using KernelName = - populate_index_data_full_sort_krn; + topk_populate_index_data_krn; cgh.parallel_for(range, [=](sycl::id<1> id) { std::size_t i = id[0]; @@ -118,21 +113,20 @@ sycl::event topk_full_sort_impl( // Sort segments of the array sycl::event base_sort_ev = merge_sort_detail::sort_over_work_group_contig_impl( - exec_q, iter_nelems, sort_nelems, index_data, index_data, comp, + exec_q, iter_nelems, axis_nelems, index_data, index_data, comp, sorted_block_size, // modified in place with size of sorted block // size {populate_indexed_data_ev}); // Merge segments in parallel until all elements are sorted sycl::event merges_ev = merge_sort_detail::merge_sorted_block_contig_impl( - exec_q, iter_nelems, sort_nelems, index_data, comp, sorted_block_size, + exec_q, iter_nelems, axis_nelems, index_data, comp, sorted_block_size, {base_sort_ev}); sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(merges_ev); - using KernelName = - topk_map_to_rows_full_sort_krn; + using KernelName = topk_full_merge_map_back_krn; cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { std::size_t gid = id[0]; @@ -140,12 +134,12 @@ sycl::event topk_full_sort_impl( std::size_t iter_gid = gid / k; std::size_t axis_gid = gid - (iter_gid * k); - std::size_t src_idx = iter_gid * sort_nelems + axis_gid; + std::size_t src_idx = iter_gid * axis_nelems + axis_gid; std::size_t dst_idx = iter_gid * k + axis_gid; auto res_ind = index_data[src_idx]; vals_tp[dst_idx] = arg_tp[res_ind]; - inds_tp[dst_idx] = res_ind % sort_nelems; + 
inds_tp[dst_idx] = res_ind % axis_nelems; }); }); @@ -162,29 +156,32 @@ sycl::event topk_full_sort_impl( return cleanup_host_task_event; }; +template +class topk_partial_merge_map_back_krn; + template class topk_over_work_group_krn; template > -sycl::event -topk_impl(sycl::queue &exec_q, - std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows - // in a matrix when sorting over rows) - std::size_t axis_nelems, // size of each array to sort (length of - // rows, i.e. number of columns) - std::size_t k, - const char *arg_cp, - char *vals_cp, - char *inds_cp, - dpctl::tensor::ssize_t iter_arg_offset, - dpctl::tensor::ssize_t iter_vals_offset, - dpctl::tensor::ssize_t iter_inds_offset, - dpctl::tensor::ssize_t axis_arg_offset, - dpctl::tensor::ssize_t axis_vals_offset, - dpctl::tensor::ssize_t axis_inds_offset, - const std::vector &depends) +sycl::event topk_merge_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows + // in a matrix when sorting over rows) + std::size_t axis_nelems, // size of each array to sort (length of + // rows, i.e. number of columns) + std::size_t k, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + dpctl::tensor::ssize_t iter_arg_offset, + dpctl::tensor::ssize_t iter_vals_offset, + dpctl::tensor::ssize_t iter_inds_offset, + dpctl::tensor::ssize_t axis_arg_offset, + dpctl::tensor::ssize_t axis_vals_offset, + dpctl::tensor::ssize_t axis_inds_offset, + const std::vector &depends) { if (axis_nelems < k) { throw std::runtime_error("Invalid sort axis size for value of k"); @@ -201,8 +198,9 @@ topk_impl(sycl::queue &exec_q, const IndexComp index_comp{arg_tp, ValueComp{}}; if (axis_nelems <= 512 || k >= 1024 || k > axis_nelems / 2) { - return topk_full_sort_impl(exec_q, iter_nelems, axis_nelems, k, arg_tp, - vals_tp, inds_tp, index_comp, depends); + return topk_full_merge_sort_impl(exec_q, iter_nelems, axis_nelems, k, + arg_tp, vals_tp, inds_tp, index_comp, + depends); } else { using PartialKernelName = @@ -269,9 +267,9 @@ topk_impl(sycl::queue &exec_q, if (k_rounded >= axis_nelems || k_rounded >= sorted_block_size || alloc_len >= axis_nelems / 2) { - return topk_full_sort_impl(exec_q, iter_nelems, axis_nelems, k, - arg_tp, vals_tp, inds_tp, index_comp, - depends); + return topk_full_merge_sort_impl(exec_q, iter_nelems, axis_nelems, + k, arg_tp, vals_tp, inds_tp, + index_comp, depends); } IndexTy *index_data = @@ -399,7 +397,8 @@ topk_impl(sycl::queue &exec_q, sycl::event write_topk_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(merges_ev); - using KernelName = topk_map_to_rows_krn; + using KernelName = + topk_partial_merge_map_back_krn; cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { std::size_t gid = id[0]; @@ -430,6 +429,109 @@ topk_impl(sycl::queue &exec_q, } } +template class topk_iota_krn; + +template class topk_radix_map_back_krn; + +template +sycl::event topk_radix_impl(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays + std::size_t axis_nelems, // size of each sub-array + std::size_t k, + bool ascending, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + dpctl::tensor::ssize_t iter_arg_offset, + dpctl::tensor::ssize_t iter_vals_offset, + dpctl::tensor::ssize_t iter_inds_offset, + dpctl::tensor::ssize_t axis_arg_offset, + dpctl::tensor::ssize_t axis_vals_offset, + dpctl::tensor::ssize_t axis_inds_offset, + const std::vector &depends) +{ + if (axis_nelems < k) { + throw std::runtime_error("Invalid sort axis size for value of k"); + } + + const argTy 
*arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + axis_arg_offset; + argTy *vals_tp = reinterpret_cast(vals_cp) + iter_vals_offset + + axis_vals_offset; + IndexTy *inds_tp = reinterpret_cast(inds_cp) + iter_inds_offset + + axis_inds_offset; + + const std::size_t total_nelems = iter_nelems * axis_nelems; + const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64; + IndexTy *workspace = sycl::malloc_device( + padded_total_nelems + total_nelems, exec_q); + + IndexTy *tmp_tp = sycl::malloc_device(total_nelems, exec_q); + + if (nullptr == workspace || nullptr == tmp_tp) { + throw std::runtime_error( + "Not enough device memory for radix sort topk"); + } + + using IdentityProjT = radix_sort_details::IdentityProj; + using IndexedProjT = + radix_sort_details::IndexedProj; + const IndexedProjT proj_op{arg_tp, IdentityProjT{}}; + + sycl::event iota_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = topk_iota_krn; + + cgh.parallel_for( + sycl::range<1>(total_nelems), [=](sycl::id<1> id) { + size_t i = id[0]; + IndexTy sort_id = static_cast(i); + workspace[i] = sort_id; + }); + }); + + sycl::event radix_sort_ev = + radix_sort_details::parallel_radix_sort_impl( + exec_q, iter_nelems, axis_nelems, workspace, tmp_tp, proj_op, + ascending, {iota_ev}); + + // Write out top k of the temporary + sycl::event write_topk_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(radix_sort_ev); + + using KernelName = topk_radix_map_back_krn; + + cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { + std::size_t gid = id[0]; + + std::size_t iter_gid = gid / k; + std::size_t axis_gid = gid - (iter_gid * k); + + std::size_t src_idx = iter_gid * axis_nelems + axis_gid; + std::size_t dst_idx = iter_gid * k + axis_gid; + + IndexTy res_ind = tmp_tp[src_idx]; + vals_tp[dst_idx] = arg_tp[res_ind]; + inds_tp[dst_idx] = res_ind % axis_nelems; + }); + }); + + sycl::event cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(write_topk_ev); + + const sycl::context &ctx = exec_q.get_context(); + + using dpctl::tensor::alloc_utils::sycl_free_noexcept; + cgh.host_task([ctx, workspace, tmp_tp] { + sycl_free_noexcept(workspace, ctx); + sycl_free_noexcept(tmp_tp, ctx); + }); + }); + + return cleanup_ev; +} + } // end of namespace kernels } // end of namespace tensor } // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/topk.cpp b/dpctl/tensor/libtensor/source/sorting/topk.cpp index fe5eac9acb..d3a9e951ac 100644 --- a/dpctl/tensor/libtensor/source/sorting/topk.cpp +++ b/dpctl/tensor/libtensor/source/sorting/topk.cpp @@ -76,6 +76,23 @@ static topk_impl_fn_ptr_t topk_dispatch_vector[td_ns::num_types]; namespace { +template +struct use_radix_sort : public std::false_type +{ +}; + +template +struct use_radix_sort< + T, + std::enable_if_t, + std::is_same, + std::is_same, + std::is_same, + std::is_same>::value>> + : public std::true_type +{ +}; + template sycl::event topk_caller(sycl::queue &exec_q, @@ -96,22 +113,33 @@ topk_caller(sycl::queue &exec_q, py::ssize_t axis_inds_offset, const std::vector &depends) { - using dpctl::tensor::kernels::topk_impl; - if (largest) { - using CompTy = - typename dpctl::tensor::py_internal::DescendingSorter::type; - return topk_impl( - exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp, - iter_arg_offset, iter_vals_offset, iter_inds_offset, + if constexpr (use_radix_sort::value) { + using dpctl::tensor::kernels::topk_radix_impl; + auto ascending = !largest; + return 
topk_radix_impl( + exec_q, iter_nelems, axis_nelems, k, ascending, arg_cp, vals_cp, + inds_cp, iter_arg_offset, iter_vals_offset, iter_inds_offset, axis_arg_offset, axis_vals_offset, axis_inds_offset, depends); } else { - using CompTy = - typename dpctl::tensor::py_internal::AscendingSorter::type; - return topk_impl( - exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp, - iter_arg_offset, iter_vals_offset, iter_inds_offset, - axis_arg_offset, axis_vals_offset, axis_inds_offset, depends); + using dpctl::tensor::kernels::topk_merge_impl; + if (largest) { + using CompTy = + typename dpctl::tensor::py_internal::DescendingSorter< + argTy>::type; + return topk_merge_impl( + exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp, + iter_arg_offset, iter_vals_offset, iter_inds_offset, + axis_arg_offset, axis_vals_offset, axis_inds_offset, depends); + } + else { + using CompTy = typename dpctl::tensor::py_internal::AscendingSorter< + argTy>::type; + return topk_merge_impl( + exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp, + iter_arg_offset, iter_vals_offset, iter_inds_offset, + axis_arg_offset, axis_vals_offset, axis_inds_offset, depends); + } } } From c37f1de0cb9fde37445382634c2d28c53cb8d75f Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 11 Dec 2024 13:36:53 -0800 Subject: [PATCH 05/30] Reduce code duplication by using std::optional in py_topk directly Instead of using an overload to handle the `axis=None` case, use std::optional and check for trailing_dims_to_search in validation logic --- .../tensor/libtensor/source/sorting/topk.cpp | 239 ++++++------------ 1 file changed, 71 insertions(+), 168 deletions(-) diff --git a/dpctl/tensor/libtensor/source/sorting/topk.cpp b/dpctl/tensor/libtensor/source/sorting/topk.cpp index d3a9e951ac..dea20fd494 100644 --- a/dpctl/tensor/libtensor/source/sorting/topk.cpp +++ b/dpctl/tensor/libtensor/source/sorting/topk.cpp @@ -94,24 +94,21 @@ struct use_radix_sort< }; template -sycl::event -topk_caller(sycl::queue &exec_q, - std::size_t iter_nelems, // number of sub-arrays to sort (num. of - // rows in a matrix when sorting over rows) - std::size_t axis_nelems, // size of each array to sort (length of - // rows, i.e. 
number of columns) - std::size_t k, - bool largest, - const char *arg_cp, - char *vals_cp, - char *inds_cp, - py::ssize_t iter_arg_offset, - py::ssize_t iter_vals_offset, - py::ssize_t iter_inds_offset, - py::ssize_t axis_arg_offset, - py::ssize_t axis_vals_offset, - py::ssize_t axis_inds_offset, - const std::vector &depends) +sycl::event topk_caller(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays + std::size_t axis_nelems, // size of each sub-array + std::size_t k, + bool largest, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_vals_offset, + py::ssize_t iter_inds_offset, + py::ssize_t axis_arg_offset, + py::ssize_t axis_vals_offset, + py::ssize_t axis_inds_offset, + const std::vector &depends) { if constexpr (use_radix_sort::value) { using dpctl::tensor::kernels::topk_radix_impl; @@ -147,7 +144,7 @@ topk_caller(sycl::queue &exec_q, std::pair py_topk(const dpctl::tensor::usm_ndarray &src, - const int trailing_dims_to_search, + std::optional trailing_dims_to_search, const std::size_t k, const bool largest, const dpctl::tensor::usm_ndarray &vals, @@ -158,48 +155,70 @@ py_topk(const dpctl::tensor::usm_ndarray &src, int src_nd = src.get_ndim(); int vals_nd = vals.get_ndim(); int inds_nd = inds.get_ndim(); - if (src_nd != vals_nd || src_nd != inds_nd) { - throw py::value_error("The input and output arrays must have " - "the same array ranks"); - } - int iteration_nd = src_nd - trailing_dims_to_search; - if (trailing_dims_to_search <= 0 || iteration_nd < 0) { - throw py::value_error( - "trailing_dims_to_search must be positive, but no " - "greater than rank of the array being searched"); - } const py::ssize_t *src_shape_ptr = src.get_shape_raw(); const py::ssize_t *vals_shape_ptr = vals.get_shape_raw(); const py::ssize_t *inds_shape_ptr = inds.get_shape_raw(); - bool same_shapes = true; + std::size_t axis_nelems(1); std::size_t iter_nelems(1); - for (int i = 0; same_shapes && (i < iteration_nd); ++i) { - auto src_shape_i = src_shape_ptr[i]; - same_shapes = same_shapes && (src_shape_i == vals_shape_ptr[i] && - src_shape_i == inds_shape_ptr[i]); - iter_nelems *= static_cast(src_shape_i); - } + if (trailing_dims_to_search.has_value()) { + if (src_nd != vals_nd || src_nd != inds_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } - if (!same_shapes) { - throw py::value_error( - "Destination shape does not match the input shape"); - } + auto trailing_dims = trailing_dims_to_search.value(); + int iter_nd = src_nd - trailing_dims; + if (trailing_dims <= 0 || iter_nd < 0) { + throw py::value_error( + "trailing_dims_to_search must be positive, but no " + "greater than rank of the array being searched"); + } - std::size_t vals_k(1); - std::size_t inds_k(1); - std::size_t axis_nelems(1); - for (int i = iteration_nd; i < src_nd; ++i) { - axis_nelems *= static_cast(src_shape_ptr[i]); - vals_k *= static_cast(vals_shape_ptr[i]); - inds_k *= static_cast(inds_shape_ptr[i]); + bool same_shapes = true; + for (int i = 0; same_shapes && (i < iter_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == vals_shape_ptr[i] && + src_shape_i == inds_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + std::size_t vals_k(1); + std::size_t inds_k(1); + for (int i = iter_nd; i < src_nd; ++i) { + axis_nelems *= 
static_cast(src_shape_ptr[i]); + vals_k *= static_cast(vals_shape_ptr[i]); + inds_k *= static_cast(inds_shape_ptr[i]); + } + + bool valid_k = (vals_k == k && inds_k == k && axis_nelems >= k); + if (!valid_k) { + throw py::value_error("The value of k is invalid for the input and " + "destination arrays"); + } } + else { + if (vals_nd != 1 || inds_nd != 1) { + throw py::value_error("Output arrays must be one-dimensional"); + } - bool valid_k = (vals_k == k && inds_k == k && axis_nelems >= k); - if (!valid_k) { - throw py::value_error( - "The value of k is invalid for the input and destination arrays"); + for (int i = 0; i < src_nd; ++i) { + axis_nelems *= static_cast(src_shape_ptr[i]); + } + + bool valid_k = (axis_nelems >= k && + static_cast(vals_shape_ptr[0]) == k && + static_cast(inds_shape_ptr[0]) == k); + if (!valid_k) { + throw py::value_error("The value of k is invalid for the input and " + "destination arrays"); + } } if (!dpctl::utils::queues_are_compatible(exec_q, {src, vals, inds})) { @@ -267,103 +286,6 @@ py_topk(const dpctl::tensor::usm_ndarray &src, return std::make_pair(sycl::event(), sycl::event()); } -std::pair -py_topk(const dpctl::tensor::usm_ndarray &src, - const std::size_t k, - const bool largest, - const dpctl::tensor::usm_ndarray &vals, - const dpctl::tensor::usm_ndarray &inds, - sycl::queue &exec_q, - const std::vector &depends) -{ - int src_nd = src.get_ndim(); - int vals_nd = vals.get_ndim(); - int inds_nd = inds.get_ndim(); - if (vals_nd != 1 || inds_nd != 1) { - throw py::value_error("Output arrays must be one-dimensional"); - } - - const py::ssize_t *src_shape_ptr = src.get_shape_raw(); - const py::ssize_t *vals_shape_ptr = vals.get_shape_raw(); - const py::ssize_t *inds_shape_ptr = inds.get_shape_raw(); - - std::size_t axis_nelems(1); - for (int i = 0; i < src_nd; ++i) { - axis_nelems *= static_cast(src_shape_ptr[i]); - } - - bool valid_k = - (axis_nelems >= k && static_cast(vals_shape_ptr[0]) == k && - static_cast(inds_shape_ptr[0]) == k); - if (!valid_k) { - throw py::value_error( - "The value of k is invalid for the input and destination arrays"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {src, vals, inds})) { - throw py::value_error( - "Execution queue is not compatible with allocation queues"); - } - - dpctl::tensor::validation::CheckWritable::throw_if_not_writable(vals); - dpctl::tensor::validation::CheckWritable::throw_if_not_writable(inds); - - if (axis_nelems == 0) { - // Nothing to do - return std::make_pair(sycl::event(), sycl::event()); - } - - auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); - if (overlap(src, vals) || overlap(src, inds)) { - throw py::value_error("Arrays index overlapping segments of memory"); - } - - dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(vals, k); - - dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(inds, k); - - int src_typenum = src.get_typenum(); - int vals_typenum = vals.get_typenum(); - int inds_typenum = inds.get_typenum(); - - const auto &array_types = td_ns::usm_ndarray_types(); - int src_typeid = array_types.typenum_to_lookup_id(src_typenum); - int vals_typeid = array_types.typenum_to_lookup_id(vals_typenum); - int inds_typeid = array_types.typenum_to_lookup_id(inds_typenum); - - if (src_typeid != vals_typeid) { - throw py::value_error("Input array and vals array must have " - "the same data type"); - } - - if (inds_typeid != static_cast(td_ns::typenum_t::INT64)) { - throw py::value_error("Inds array must have data type int64"); - } - - bool is_src_c_contig = 
src.is_c_contiguous(); - bool is_vals_c_contig = vals.is_c_contiguous(); - bool is_inds_c_contig = inds.is_c_contiguous(); - - if (is_src_c_contig && is_vals_c_contig && is_inds_c_contig) { - static constexpr py::ssize_t zero_offset = py::ssize_t(0); - static constexpr std::size_t iter_nelems = 1; - - auto fn = topk_dispatch_vector[src_typeid]; - - sycl::event comp_ev = - fn(exec_q, iter_nelems, axis_nelems, k, largest, src.get_data(), - vals.get_data(), inds.get_data(), zero_offset, zero_offset, - zero_offset, zero_offset, zero_offset, zero_offset, depends); - - sycl::event keep_args_alive_ev = - dpctl::utils::keep_args_alive(exec_q, {src, vals, inds}, {comp_ev}); - - return std::make_pair(keep_args_alive_ev, comp_ev); - } - - return std::make_pair(sycl::event(), sycl::event()); -} - template struct TopKFactory { fnT get() @@ -385,26 +307,7 @@ void init_topk_functions(py::module_ m) { dpctl::tensor::py_internal::init_topk_dispatch_vectors(); - auto py_topk = [](const dpctl::tensor::usm_ndarray &src, - std::optional trailing_dims_to_search, - const std::size_t k, const bool largest, - const dpctl::tensor::usm_ndarray &vals, - const dpctl::tensor::usm_ndarray &inds, - sycl::queue &exec_q, - const std::vector &depends) - -> std::pair { - if (trailing_dims_to_search) { - return dpctl::tensor::py_internal::py_topk( - src, trailing_dims_to_search.value(), k, largest, vals, inds, - exec_q, depends); - } - else { - return dpctl::tensor::py_internal::py_topk(src, k, largest, vals, - inds, exec_q, depends); - } - }; - - m.def("_topk", py_topk, py::arg("src"), py::arg("trailing_dims_to_search"), + m.def("_topk", &py_topk, py::arg("src"), py::arg("trailing_dims_to_search"), py::arg("k"), py::arg("largest"), py::arg("vals"), py::arg("inds"), py::arg("sycl_queue"), py::arg("depends") = py::list()); } From 1f0e9fd3d777879aaedaf0df95c38edb33640a5d Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 11 Dec 2024 14:51:58 -0800 Subject: [PATCH 06/30] Clean up top_k docstring --- dpctl/tensor/_topk.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/dpctl/tensor/_topk.py b/dpctl/tensor/_topk.py index a807aea689..2984f95751 100644 --- a/dpctl/tensor/_topk.py +++ b/dpctl/tensor/_topk.py @@ -56,23 +56,24 @@ def top_k(x, k, /, *, axis=None, mode="largest"): over the flattened array. Default: ``None``. mode (Literal["largest", "smallest"]): search mode. Must be one of the following modes: - - `"largest"`: return the `k` largest elements. - - `"smallest"`: return the `k` smallest elements. + + - `"largest"`: return the `k` largest elements. + - `"smallest"`: return the `k` smallest elements. + Default: `"largest"`. Returns: - tuple[usm_ndarray, usm_ndarray]: + tuple[usm_ndarray, usm_ndarray] a namedtuple `(values, indices)` whose - - first element `values` will be an array containing the `k` largest or - smallest elements of `x`. The array has the same data type as `x`. - If `axis` was `None`, `values` will be a one-dimensional array - with shape `(k,)` and otherwise, `values` will have shape - `x.shape[:axis] + (k,) + x.shape[axis+1:]` - - - second element `indices` will be an array containing indices of `x` - that result in `values`. The array will have the same shape as - `values` and will have the default array index data type. + * first element `values` will be an array containing the `k` + largest or smallest elements of `x`. The array has the same data + type as `x`. 
If `axis` was `None`, `values` will be a + one-dimensional array with shape `(k,)` and otherwise, `values` + will have shape `x.shape[:axis] + (k,) + x.shape[axis+1:]` + * second element `indices` will be an array containing indices of + `x` that result in `values`. The array will have the same shape + as `values` and will have the default array index data type. """ largest = _get_top_k_largest(mode) if not isinstance(x, dpt.usm_ndarray): From fa74e5edbc92b2ee3f172f76772f507b3574b452 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 12 Dec 2024 12:40:29 -0800 Subject: [PATCH 07/30] Move top_k into _sorting.py --- dpctl/tensor/__init__.py | 3 +- dpctl/tensor/_sorting.py | 165 ++++++++++++++++++++++++++++++++++ dpctl/tensor/_topk.py | 185 --------------------------------------- 3 files changed, 166 insertions(+), 187 deletions(-) delete mode 100644 dpctl/tensor/_topk.py diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index d83d41215f..490cdfd23c 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -199,9 +199,8 @@ unique_inverse, unique_values, ) -from ._sorting import argsort, sort +from ._sorting import argsort, sort, top_k from ._testing import allclose -from ._topk import top_k from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type __all__ = [ diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py index d5026a6ee8..63f2d25b86 100644 --- a/dpctl/tensor/_sorting.py +++ b/dpctl/tensor/_sorting.py @@ -14,6 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import operator +from typing import NamedTuple + import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti import dpctl.utils as du @@ -24,6 +27,7 @@ _argsort_descending, _sort_ascending, _sort_descending, + _topk, ) from ._tensor_sorting_radix_impl import ( _radix_argsort_ascending, @@ -267,3 +271,164 @@ def argsort(x, axis=-1, descending=False, stable=True, kind=None): inv_perm = sorted(range(nd), key=lambda d: perm[d]) res = dpt.permute_dims(res, inv_perm) return res + + +def _get_top_k_largest(mode): + modes = {"largest": True, "smallest": False} + try: + return modes[mode] + except KeyError: + raise ValueError( + f"`mode` must be `largest` or `smallest`. Got `{mode}`." + ) + + +class TopKResult(NamedTuple): + values: dpt.usm_ndarray + indices: dpt.usm_ndarray + + +def top_k(x, k, /, *, axis=None, mode="largest"): + """top_k(x, k, axis=None, mode="largest") + + Returns the `k` largest or smallest values and their indices in the input + array `x` along the specified axis `axis`. + + Args: + x (usm_ndarray): + input array. + k (int): + number of elements to find. Must be a positive integer value. + axis (Optional[int]): + axis along which to search. If `None`, the search will be performed + over the flattened array. Default: ``None``. + mode (Literal["largest", "smallest"]): + search mode. Must be one of the following modes: + + - `"largest"`: return the `k` largest elements. + - `"smallest"`: return the `k` smallest elements. + + Default: `"largest"`. + + Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, indices)` whose + + * first element `values` will be an array containing the `k` + largest or smallest elements of `x`. The array has the same data + type as `x`. 
If `axis` was `None`, `values` will be a + one-dimensional array with shape `(k,)` and otherwise, `values` + will have shape `x.shape[:axis] + (k,) + x.shape[axis+1:]` + * second element `indices` will be an array containing indices of + `x` that result in `values`. The array will have the same shape + as `values` and will have the default array index data type. + """ + largest = _get_top_k_largest(mode) + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" + ) + + k = operator.index(k) + if k < 0: + raise ValueError("`k` must be a positive integer value") + + nd = x.ndim + if axis is None: + sz = x.size + if nd == 0: + return TopKResult( + dpt.copy(x, order="C"), + dpt.zeros_like( + x, dtype=ti.default_device_index_type(x.sycl_queue) + ), + ) + arr = x + n_search_dims = None + res_sh = k + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + sz = x.shape[axis] + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + n_search_dims = 1 + res_sh = arr.shape[: nd - 1] + (k,) + + if k > sz: + raise ValueError(f"`k`={k} is out of bounds {sz}") + + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + + res_usm_type = arr.usm_type + if arr.flags.c_contiguous: + vals = dpt.empty( + res_sh, + dtype=arr.dtype, + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + inds = dpt.empty( + res_sh, + dtype=ti.default_device_index_type(exec_q), + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + ht_ev, impl_ev = _topk( + src=arr, + trailing_dims_to_search=n_search_dims, + k=k, + largest=largest, + vals=vals, + inds=inds, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, impl_ev) + else: + tmp = dpt.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + vals = dpt.empty( + res_sh, + dtype=arr.dtype, + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + inds = dpt.empty( + res_sh, + dtype=ti.default_device_index_type(exec_q), + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + ht_ev, impl_ev = _topk( + src=tmp, + trailing_dims_to_search=n_search_dims, + k=k, + largest=largest, + vals=vals, + inds=inds, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, impl_ev) + if axis is not None and a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + vals = dpt.permute_dims(vals, inv_perm) + inds = dpt.permute_dims(inds, inv_perm) + + return TopKResult(vals, inds) diff --git a/dpctl/tensor/_topk.py b/dpctl/tensor/_topk.py deleted file mode 100644 index 2984f95751..0000000000 --- a/dpctl/tensor/_topk.py +++ /dev/null @@ -1,185 +0,0 @@ -# Data Parallel Control (dpctl) -# -# Copyright 2020-2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import operator -from typing import NamedTuple - -import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as ti -import dpctl.utils as du -from dpctl.tensor._numpy_helper import normalize_axis_index - -from ._tensor_sorting_impl import _topk - - -def _get_top_k_largest(mode): - modes = {"largest": True, "smallest": False} - try: - return modes[mode] - except KeyError: - raise ValueError( - f"`mode` must be `largest` or `smallest`. Got `{mode}`." - ) - - -class TopKResult(NamedTuple): - values: dpt.usm_ndarray - indices: dpt.usm_ndarray - - -def top_k(x, k, /, *, axis=None, mode="largest"): - """top_k(x, k, axis=None, mode="largest") - - Returns the `k` largest or smallest values and their indices in the input - array `x` along the specified axis `axis`. - - Args: - x (usm_ndarray): - input array. - k (int): - number of elements to find. Must be a positive integer value. - axis (Optional[int]): - axis along which to search. If `None`, the search will be performed - over the flattened array. Default: ``None``. - mode (Literal["largest", "smallest"]): - search mode. Must be one of the following modes: - - - `"largest"`: return the `k` largest elements. - - `"smallest"`: return the `k` smallest elements. - - Default: `"largest"`. - - Returns: - tuple[usm_ndarray, usm_ndarray] - a namedtuple `(values, indices)` whose - - * first element `values` will be an array containing the `k` - largest or smallest elements of `x`. The array has the same data - type as `x`. If `axis` was `None`, `values` will be a - one-dimensional array with shape `(k,)` and otherwise, `values` - will have shape `x.shape[:axis] + (k,) + x.shape[axis+1:]` - * second element `indices` will be an array containing indices of - `x` that result in `values`. The array will have the same shape - as `values` and will have the default array index data type. 
- """ - largest = _get_top_k_largest(mode) - if not isinstance(x, dpt.usm_ndarray): - raise TypeError( - f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" - ) - - k = operator.index(k) - if k < 0: - raise ValueError("`k` must be a positive integer value") - - nd = x.ndim - if axis is None: - sz = x.size - if nd == 0: - return TopKResult( - dpt.copy(x, order="C"), - dpt.zeros_like( - x, dtype=ti.default_device_index_type(x.sycl_queue) - ), - ) - arr = x - n_search_dims = None - res_sh = k - else: - axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") - sz = x.shape[axis] - a1 = axis + 1 - if a1 == nd: - perm = list(range(nd)) - arr = x - else: - perm = [i for i in range(nd) if i != axis] + [ - axis, - ] - arr = dpt.permute_dims(x, perm) - n_search_dims = 1 - res_sh = arr.shape[: nd - 1] + (k,) - - if k > sz: - raise ValueError(f"`k`={k} is out of bounds {sz}") - - exec_q = x.sycl_queue - _manager = du.SequentialOrderManager[exec_q] - dep_evs = _manager.submitted_events - - res_usm_type = arr.usm_type - if arr.flags.c_contiguous: - vals = dpt.empty( - res_sh, - dtype=arr.dtype, - usm_type=res_usm_type, - order="C", - sycl_queue=exec_q, - ) - inds = dpt.empty( - res_sh, - dtype=ti.default_device_index_type(exec_q), - usm_type=res_usm_type, - order="C", - sycl_queue=exec_q, - ) - ht_ev, impl_ev = _topk( - src=arr, - trailing_dims_to_search=n_search_dims, - k=k, - largest=largest, - vals=vals, - inds=inds, - sycl_queue=exec_q, - depends=dep_evs, - ) - _manager.add_event_pair(ht_ev, impl_ev) - else: - tmp = dpt.empty_like(arr, order="C") - ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs - ) - _manager.add_event_pair(ht_ev, copy_ev) - vals = dpt.empty( - res_sh, - dtype=arr.dtype, - usm_type=res_usm_type, - order="C", - sycl_queue=exec_q, - ) - inds = dpt.empty( - res_sh, - dtype=ti.default_device_index_type(exec_q), - usm_type=res_usm_type, - order="C", - sycl_queue=exec_q, - ) - ht_ev, impl_ev = _topk( - src=tmp, - trailing_dims_to_search=n_search_dims, - k=k, - largest=largest, - vals=vals, - inds=inds, - sycl_queue=exec_q, - depends=[copy_ev], - ) - _manager.add_event_pair(ht_ev, impl_ev) - if axis is not None and a1 != nd: - inv_perm = sorted(range(nd), key=lambda d: perm[d]) - vals = dpt.permute_dims(vals, inv_perm) - inds = dpt.permute_dims(inds, inv_perm) - return TopKResult(vals, inds) From cd8388109ea9fdaedb48a6c7767656719c98e572 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 12 Dec 2024 16:00:19 -0800 Subject: [PATCH 08/30] top_k raises when k > 1 and input is a scalar --- dpctl/tensor/_sorting.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py index 63f2d25b86..19cd2f2836 100644 --- a/dpctl/tensor/_sorting.py +++ b/dpctl/tensor/_sorting.py @@ -337,6 +337,8 @@ def top_k(x, k, /, *, axis=None, mode="largest"): if axis is None: sz = x.size if nd == 0: + if k > 1: + raise ValueError(f"`k`={k} is out of bounds 1") return TopKResult( dpt.copy(x, order="C"), dpt.zeros_like( From 70c2ef0a78e928204a84b9d0e49748bad75da940 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 12 Dec 2024 16:59:58 -0800 Subject: [PATCH 09/30] Fix bug in top_k partial merge sort implementation rounded value of k must be divisible by the merge sort chunk size --- .../libtensor/include/kernels/sorting/topk.hpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp 
b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index 11f3e851fb..2674f877c9 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -36,6 +36,7 @@ #include "kernels/dpctl_tensor_types.hpp" #include "merge_sort.hpp" #include "radix_sort.hpp" +#include "search_sorted_detail.hpp" #include "utils/sycl_alloc_utils.hpp" #include @@ -247,14 +248,16 @@ sycl::event topk_merge_impl( // This assumption permits doing away with using a loop assert(sorted_block_size % lws == 0); + using search_sorted_detail::quotient_ceil; const std::size_t n_segments = - merge_sort_detail::quotient_ceil(axis_nelems, - sorted_block_size); + quotient_ceil(axis_nelems, sorted_block_size); - // round k up for the later merge kernel + // round k up for the later merge kernel if necessary + const std::size_t round_k_to = dev.has(sycl::aspect::cpu) ? 32 : 4; std::size_t k_rounded = - merge_sort_detail::quotient_ceil(k, elems_per_wi) * - elems_per_wi; + (k < round_k_to) + ? k + : quotient_ceil(k, round_k_to) * round_k_to; // get length of tail for alloc size auto rem = axis_nelems % sorted_block_size; @@ -322,8 +325,7 @@ sycl::event topk_merge_impl( sycl::group_barrier(it.get_group()); const std::size_t chunk = - merge_sort_detail::quotient_ceil( - sorted_block_size, lws); + quotient_ceil(sorted_block_size, lws); const std::size_t chunk_start_idx = lid * chunk; const std::size_t chunk_end_idx = From 65f14bec064c4fe8b4ec75824824d2b0d4407135 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 13 Dec 2024 18:48:05 -0600 Subject: [PATCH 10/30] Add test file for top_k functionality --- dpctl/tests/test_usm_ndarray_top_k.py | 94 +++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 dpctl/tests/test_usm_ndarray_top_k.py diff --git a/dpctl/tests/test_usm_ndarray_top_k.py b/dpctl/tests/test_usm_ndarray_top_k.py new file mode 100644 index 0000000000..954c21a990 --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_top_k.py @@ -0,0 +1,94 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +@pytest.mark.parametrize("n", [33, 255, 511, 1021, 8193]) +def test_topk_1d_largest(dtype, n): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + o = dpt.ones(n, dtype=dtype) + z = dpt.zeros(n, dtype=dtype) + zo = dpt.concat((o, z)) + inp = dpt.roll(zo, 734) + k = 5 + + s = dpt.top_k(inp, k, mode="largest") + assert s.values.shape == (k,) + assert s.values.dtype == inp.dtype + assert s.indices.shape == (k,) + assert dpt.all(s.values == dpt.ones(k, dtype=dtype)) + assert dpt.all(s.values == inp[s.indices]) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +@pytest.mark.parametrize("n", [33, 255, 257, 513, 1021, 8193]) +def test_topk_1d_smallest(dtype, n): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + o = dpt.ones(n, dtype=dtype) + z = dpt.zeros(n, dtype=dtype) + zo = dpt.concat((o, z)) + inp = dpt.roll(zo, 734) + k = 5 + + s = dpt.top_k(inp, k, mode="smallest") + assert s.values.shape == (k,) + assert s.values.dtype == inp.dtype + assert s.indices.shape == (k,) + assert dpt.all(s.values == dpt.zeros(k, dtype=dtype)) + assert dpt.all(s.values == inp[s.indices]) From 275827ae9f2976a52a6559b0908498e2d41fe712 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 20 Dec 2024 16:51:19 -0600 Subject: [PATCH 11/30] Add sort_utils.hpp with iota_impl Reuse that function call in sorting code-base where argsort is used. --- .../include/kernels/sorting/merge_sort.hpp | 16 ++- .../include/kernels/sorting/radix_sort.hpp | 51 ++++----- .../include/kernels/sorting/sort_utils.hpp | 103 ++++++++++++++++++ .../include/kernels/sorting/topk.hpp | 34 ++++-- 4 files changed, 162 insertions(+), 42 deletions(-) create mode 100644 dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp index f3b5030c48..002da0ff9b 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp @@ -33,6 +33,7 @@ #include "kernels/dpctl_tensor_types.hpp" #include "kernels/sorting/search_sorted_detail.hpp" +#include "kernels/sorting/sort_utils.hpp" namespace dpctl { @@ -811,20 +812,27 @@ sycl::event stable_argsort_axis1_contig_impl( const size_t total_nelems = iter_nelems * sort_nelems; + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + using IotaKernelName = populate_index_data_krn; + +#if 1 + sycl::event populate_indexed_data_ev = iota_impl( + exec_q, res_tp, total_nelems, depends); + +#else sycl::event populate_indexed_data_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); const sycl::range<1> range{total_nelems}; - using KernelName = - populate_index_data_krn; - - cgh.parallel_for(range, [=](sycl::id<1> id) { + cgh.parallel_for(range, [=](sycl::id<1> id) { size_t i = id[0]; res_tp[i] = static_cast(i); }); }); +#endif // Sort segments of the array sycl::event base_sort_ev = diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp index dc3da24315..84edb1dd3b 100644 
--- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -38,6 +38,7 @@ #include #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/sort_utils.hpp" #include "utils/sycl_alloc_utils.hpp" namespace dpctl @@ -1256,9 +1257,7 @@ struct subgroup_radix_sort const uint16_t id = wi * block_size + i; if (id < n) values[i] = std::move( - this_input_arr[iter_val_offset + - static_cast( - id)]); + this_input_arr[iter_val_offset + id]); } while (true) { @@ -1272,8 +1271,7 @@ struct subgroup_radix_sort // counting phase auto pcounter = get_accessor_pointer(counter_acc) + - static_cast(wi) + - iter_counter_offset; + (wi + iter_counter_offset); // initialize counters #pragma unroll @@ -1348,19 +1346,15 @@ struct subgroup_radix_sort // scan contiguous numbers uint16_t bin_sum[bin_count]; - bin_sum[0] = - counter_acc[iter_counter_offset + - static_cast( - wi * bin_count)]; + const std::size_t counter_offset0 = + iter_counter_offset + wi * bin_count; + bin_sum[0] = counter_acc[counter_offset0]; #pragma unroll for (uint16_t i = 1; i < bin_count; ++i) bin_sum[i] = bin_sum[i - 1] + - counter_acc - [iter_counter_offset + - static_cast( - wi * bin_count + i)]; + counter_acc[counter_offset0 + i]; sycl::group_barrier(ndit.get_group()); @@ -1374,10 +1368,7 @@ struct subgroup_radix_sort // add to local sum, generate exclusive scan result #pragma unroll for (uint16_t i = 0; i < bin_count; ++i) - counter_acc[iter_counter_offset + - static_cast( - wi * bin_count + i + - 1)] = + counter_acc[counter_offset0 + i + 1] = sum_scan + bin_sum[i]; if (wi == 0) @@ -1407,10 +1398,8 @@ struct subgroup_radix_sort if (r < n) { // move the values to source range and // destroy the values - this_output_arr - [iter_val_offset + - static_cast(r)] = - std::move(values[i]); + this_output_arr[iter_val_offset + r] = + std::move(values[i]); } } @@ -1422,8 +1411,7 @@ struct subgroup_radix_sort for (uint16_t i = 0; i < block_size; ++i) { const uint16_t r = indices[i]; if (r < n) - exchange_acc[iter_exchange_offset + - static_cast(r)] = + exchange_acc[iter_exchange_offset + r] = std::move(values[i]); } @@ -1435,8 +1423,7 @@ struct subgroup_radix_sort if (id < n) values[i] = std::move( exchange_acc[iter_exchange_offset + - static_cast( - id)]); + id]); } sycl::group_barrier(ndit.get_group()); @@ -1795,18 +1782,26 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q, radix_sort_details::IndexedProj; const IndexedProjT proj_op{arg_tp, IdentityProjT{}}; + using IotaKernelName = radix_argsort_iota_krn; + +#if 1 + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event iota_ev = iota_impl( + exec_q, workspace, total_nelems, depends); +#else + sycl::event iota_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - using KernelName = radix_argsort_iota_krn; - - cgh.parallel_for( + cgh.parallel_for( sycl::range<1>(total_nelems), [=](sycl::id<1> id) { size_t i = id[0]; IndexTy sort_id = static_cast(i); workspace[i] = sort_id; }); }); +#endif sycl::event radix_sort_ev = radix_sort_details::parallel_radix_sort_impl( diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp new file mode 100644 index 0000000000..1b1c361d75 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp @@ -0,0 +1,103 @@ +//=== sorting.hpp - Implementation of sorting kernels ---*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// 
Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor sort/argsort operations. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace sort_utils_detail +{ + +namespace syclexp = sycl::ext::oneapi::experimental; + +template +sycl::event iota_impl(sycl::queue &exec_q, + T *data, + std::size_t nelems, + const std::vector &dependent_events) +{ + constexpr std::uint32_t lws = 256; + constexpr std::uint32_t n_wi = 4; + const std::size_t n_groups = (nelems + n_wi * lws - 1) / (n_wi * lws); + + sycl::range<1> gRange{n_groups * lws}; + sycl::range<1> lRange{lws}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + sycl::event e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_events); + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_id(); + const auto &sg = it.get_sub_group(); + const std::uint32_t lane_id = sg.get_local_id()[0]; + + const std::size_t offset = (gid - lane_id) * n_wi; + const std::uint32_t max_sgSize = sg.get_max_local_range()[0]; + + std::array stripe{}; +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + stripe[i] = T(offset + lane_id + i * max_sgSize); + } + + if (offset + n_wi * max_sgSize < nelems) { + constexpr auto group_ls_props = syclexp::properties{ + syclexp::data_placement_striped + // , syclexp::full_group + }; + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&data[offset]); + + syclexp::group_store(sg, sycl::span{&stripe[0], n_wi}, + out_multi_ptr, group_ls_props); + } + else { + for (std::size_t idx = offset + lane_id; idx < nelems; + idx += max_sgSize) + { + data[idx] = T(idx); + } + } + }); + }); + + return e; +} + +} // end of namespace sort_utils_detail +} // end of namespace kernels +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index 2674f877c9..da9e0bd812 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -34,9 +34,10 @@ #include #include "kernels/dpctl_tensor_types.hpp" -#include "merge_sort.hpp" -#include "radix_sort.hpp" -#include "search_sorted_detail.hpp" +#include "kernels/sorting/merge_sort.hpp" +#include "kernels/sorting/radix_sort.hpp" +#include "kernels/sorting/search_sorted_detail.hpp" +#include "kernels/sorting/sort_utils.hpp" #include "utils/sycl_alloc_utils.hpp" #include @@ -95,20 +96,26 @@ topk_full_merge_sort_impl(sycl::queue &exec_q, throw std::runtime_error("Unable to allocate device_memory"); } + using IotaKernelName = 
topk_populate_index_data_krn; + +#if 1 + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event populate_indexed_data_ev = iota_impl( + exec_q, index_data, iter_nelems * axis_nelems, depends); +#else sycl::event populate_indexed_data_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); auto const &range = sycl::range<1>(iter_nelems * axis_nelems); - using KernelName = - topk_populate_index_data_krn; - - cgh.parallel_for(range, [=](sycl::id<1> id) { + cgh.parallel_for(range, [=](sycl::id<1> id) { std::size_t i = id[0]; index_data[i] = static_cast(i); }); }); +#endif std::size_t sorted_block_size; // Sort segments of the array @@ -480,18 +487,25 @@ sycl::event topk_radix_impl(sycl::queue &exec_q, radix_sort_details::IndexedProj; const IndexedProjT proj_op{arg_tp, IdentityProjT{}}; + using IotaKernelName = topk_iota_krn; + +#if 1 + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event iota_ev = iota_impl( + exec_q, workspace, total_nelems, depends); +#else sycl::event iota_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - using KernelName = topk_iota_krn; - - cgh.parallel_for( + cgh.parallel_for( sycl::range<1>(total_nelems), [=](sycl::id<1> id) { size_t i = id[0]; IndexTy sort_id = static_cast(i); workspace[i] = sort_id; }); }); +#endif sycl::event radix_sort_ev = radix_sort_details::parallel_radix_sort_impl( From cd1243fad1fd3cc19f6812a16a61d064473ac83a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 22 Dec 2024 14:55:45 -0600 Subject: [PATCH 12/30] Simplify write-out kernels in topk implementation (avoid recomputing gid) --- .../include/kernels/sorting/topk.hpp | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index da9e0bd812..7ea91ad148 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -410,17 +410,17 @@ sycl::event topk_merge_impl( topk_partial_merge_map_back_krn; cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { - std::size_t gid = id[0]; + const std::size_t gid = id[0]; - std::size_t iter_gid = gid / k; - std::size_t axis_gid = gid - (iter_gid * k); + const std::size_t iter_gid = gid / k; + const std::size_t axis_gid = gid - (iter_gid * k); - std::size_t src_idx = iter_gid * alloc_len + axis_gid; - std::size_t dst_idx = iter_gid * k + axis_gid; + const std::size_t src_idx = iter_gid * alloc_len + axis_gid; + const std::size_t dst_idx = gid; - auto res_ind = index_data[src_idx]; + const auto res_ind = index_data[src_idx]; vals_tp[dst_idx] = arg_tp[res_ind]; - inds_tp[dst_idx] = res_ind % axis_nelems; + inds_tp[dst_idx] = (res_ind % axis_nelems); }); }); @@ -519,17 +519,17 @@ sycl::event topk_radix_impl(sycl::queue &exec_q, using KernelName = topk_radix_map_back_krn; cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { - std::size_t gid = id[0]; + const std::size_t gid = id[0]; - std::size_t iter_gid = gid / k; - std::size_t axis_gid = gid - (iter_gid * k); + const std::size_t iter_gid = gid / k; + const std::size_t axis_gid = gid - (iter_gid * k); - std::size_t src_idx = iter_gid * axis_nelems + axis_gid; - std::size_t dst_idx = iter_gid * k + axis_gid; + const std::size_t src_idx = iter_gid * axis_nelems + axis_gid; + const std::size_t dst_idx = gid; - IndexTy res_ind = tmp_tp[src_idx]; + const IndexTy res_ind = tmp_tp[src_idx]; vals_tp[dst_idx] = 
arg_tp[res_ind]; - inds_tp[dst_idx] = res_ind % axis_nelems; + inds_tp[dst_idx] = (res_ind % axis_nelems); }); }); From 823c2014632783a3b389003cdda930a86ab269af Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 22 Dec 2024 14:56:56 -0600 Subject: [PATCH 13/30] Add explicit read into registers --- .../libtensor/include/kernels/sorting/topk.hpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index 7ea91ad148..6698aa50de 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -145,8 +145,9 @@ topk_full_merge_sort_impl(sycl::queue &exec_q, std::size_t src_idx = iter_gid * axis_nelems + axis_gid; std::size_t dst_idx = iter_gid * k + axis_gid; - auto res_ind = index_data[src_idx]; - vals_tp[dst_idx] = arg_tp[res_ind]; + const IndexTy res_ind = index_data[src_idx]; + const argTy v = arg_tp[res_ind]; + vals_tp[dst_idx] = v; inds_tp[dst_idx] = res_ind % axis_nelems; }); }); @@ -418,8 +419,9 @@ sycl::event topk_merge_impl( const std::size_t src_idx = iter_gid * alloc_len + axis_gid; const std::size_t dst_idx = gid; - const auto res_ind = index_data[src_idx]; - vals_tp[dst_idx] = arg_tp[res_ind]; + const IndexTy res_ind = index_data[src_idx]; + const argTy v = arg_tp[res_ind]; + vals_tp[dst_idx] = v; inds_tp[dst_idx] = (res_ind % axis_nelems); }); }); @@ -528,7 +530,8 @@ sycl::event topk_radix_impl(sycl::queue &exec_q, const std::size_t dst_idx = gid; const IndexTy res_ind = tmp_tp[src_idx]; - vals_tp[dst_idx] = arg_tp[res_ind]; + const argTy v = arg_tp[res_ind]; + vals_tp[dst_idx] = v; inds_tp[dst_idx] = (res_ind % axis_nelems); }); }); From 3a4d8ab164fe1d9fb9e2b364b1500e7d69cefb84 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 20 Dec 2024 16:54:15 -0600 Subject: [PATCH 14/30] Use unique_ptr as temporary owner of USM allocation Until it is passed over to the host function, and unique_ptr's ownership is released. Also reduced allocation sizes, where too much was being allocated. Introduce smart_malloc_device, etc. The smart_malloc_device(count, q) makes USM allocation and returns a unique_ptr which owns the allocation. The function throws an exception (std::runtime_error) if USM allocation is not successful. Introduce async_smart_free. This function intends to replace use of host_task submissions to manage USM temporary deallocations. The usage is as follows: ``` // returns unique_ptr auto alloc_owner = smart_malloc_device(count, q); // get raw pointer for use in kernels T *data = alloc_owner.get(); [..SNIP..] // submit host_task that releases the unique_ptr // after the host task was successfully submitted // and ownership of USM allocation is transfered to // the said host task sycl::event ht_ev = async_smart_free(q, dependent_events, alloc_owner); [...SNIP...] 
``` --- .../include/kernels/sorting/radix_sort.hpp | 66 +++++-------- .../include/kernels/sorting/topk.hpp | 70 +++++--------- .../include/utils/sycl_alloc_utils.hpp | 93 ++++++++++++++++++- 3 files changed, 137 insertions(+), 92 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp index 84edb1dd3b..de0011e2ab 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -1588,11 +1588,11 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, using CountT = std::uint32_t; // memory for storing count and offset values - CountT *count_ptr = - sycl::malloc_device(n_iters * n_counts, exec_q); - if (nullptr == count_ptr) { - throw std::runtime_error("Could not allocate USM-device memory"); - } + auto count_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + n_iters * n_counts, exec_q); + + CountT *count_ptr = count_owner.get(); constexpr std::uint32_t zero_radix_iter{0}; @@ -1605,25 +1605,17 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, n_counts, count_ptr, proj_op, is_ascending, depends); - sort_ev = exec_q.submit([=](sycl::handler &cgh) { - cgh.depends_on(sort_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, count_ptr]() { sycl_free_noexcept(count_ptr, ctx); }); - }); + sort_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {sort_ev}, count_owner); return sort_ev; } - ValueT *tmp_arr = - sycl::malloc_device(n_iters * n_to_sort, exec_q); - if (nullptr == tmp_arr) { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(count_ptr, exec_q); - throw std::runtime_error("Could not allocate USM-device memory"); - } + auto tmp_arr_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + n_iters * n_to_sort, exec_q); + + ValueT *tmp_arr = tmp_arr_owner.get(); // iterations per each bucket assert("Number of iterations must be even" && radix_iters % 2 == 0); @@ -1657,17 +1649,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } } - sort_ev = exec_q.submit([=](sycl::handler &cgh) { - cgh.depends_on(sort_ev); - - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, count_ptr, tmp_arr]() { - sycl_free_noexcept(tmp_arr, ctx); - sycl_free_noexcept(count_ptr, ctx); - }); - }); + sort_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {sort_ev}, tmp_arr_owner, count_owner); } return sort_ev; @@ -1769,13 +1752,12 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q, reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; const std::size_t total_nelems = iter_nelems * sort_nelems; - const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64; - IndexTy *workspace = sycl::malloc_device( - padded_total_nelems + total_nelems, exec_q); + auto workspace_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(total_nelems, + exec_q); - if (nullptr == workspace) { - throw std::runtime_error("Could not allocate workspace on device"); - } + // get raw USM pointer + IndexTy *workspace = workspace_owner.get(); using IdentityProjT = radix_sort_details::IdentityProj; using IndexedProjT = @@ -1820,14 +1802,8 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q, }); }); - sycl::event cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(map_back_ev); - - 
const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, workspace] { sycl_free_noexcept(workspace, ctx); }); - }); + sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {map_back_ev}, workspace_owner); return cleanup_ev; } diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index 6698aa50de..d1d686a2dd 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -30,9 +30,10 @@ #include #include #include -#include #include +#include + #include "kernels/dpctl_tensor_types.hpp" #include "kernels/sorting/merge_sort.hpp" #include "kernels/sorting/radix_sort.hpp" @@ -90,11 +91,11 @@ topk_full_merge_sort_impl(sycl::queue &exec_q, const CompT &comp, const std::vector &depends) { - IndexTy *index_data = - sycl::malloc_device(iter_nelems * axis_nelems, exec_q); - if (index_data == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } + auto index_data_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * axis_nelems, exec_q); + // extract USM pointer + IndexTy *index_data = index_data_owner.get(); using IotaKernelName = topk_populate_index_data_krn; @@ -153,14 +154,8 @@ topk_full_merge_sort_impl(sycl::queue &exec_q, }); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(write_out_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, index_data] { sycl_free_noexcept(index_data, ctx); }); - }); + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {write_out_ev}, + index_data_owner); return cleanup_host_task_event; }; @@ -283,11 +278,11 @@ sycl::event topk_merge_impl( index_comp, depends); } - IndexTy *index_data = - sycl::malloc_device(iter_nelems * alloc_len, exec_q); - if (index_data == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } + auto index_data_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * alloc_len, exec_q); + // get raw USM pointer + IndexTy *index_data = index_data_owner.get(); // no need to populate index data: SLM will be populated with default // values @@ -427,14 +422,8 @@ sycl::event topk_merge_impl( }); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(write_topk_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, index_data] { sycl_free_noexcept(index_data, ctx); }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {write_topk_ev}, index_data_owner); return cleanup_host_task_event; } @@ -474,15 +463,13 @@ sycl::event topk_radix_impl(sycl::queue &exec_q, const std::size_t total_nelems = iter_nelems * axis_nelems; const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64; - IndexTy *workspace = sycl::malloc_device( - padded_total_nelems + total_nelems, exec_q); + auto workspace_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + padded_total_nelems + total_nelems, exec_q); - IndexTy *tmp_tp = sycl::malloc_device(total_nelems, exec_q); - - if (nullptr == workspace || nullptr == tmp_tp) { - throw std::runtime_error( - "Not enough device memory for radix sort topk"); - } + // get raw USM pointer + IndexTy *workspace = 
workspace_owner.get(); + IndexTy *tmp_tp = workspace + padded_total_nelems; using IdentityProjT = radix_sort_details::IdentityProj; using IndexedProjT = @@ -536,17 +523,8 @@ sycl::event topk_radix_impl(sycl::queue &exec_q, }); }); - sycl::event cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(write_topk_ev); - - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, workspace, tmp_tp] { - sycl_free_noexcept(workspace, ctx); - sycl_free_noexcept(tmp_tp, ctx); - }); - }); + sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {write_topk_ev}, workspace_owner); return cleanup_ev; } diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp index 3ad5f6f36a..87476221d8 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp @@ -28,6 +28,9 @@ #include #include +#include +#include +#include #include "sycl/sycl.hpp" @@ -73,11 +76,99 @@ void sycl_free_noexcept(T *ptr, const sycl::context &ctx) noexcept } } -template void sycl_free_noexcept(T *ptr, sycl::queue &q) noexcept +template +void sycl_free_noexcept(T *ptr, const sycl::queue &q) noexcept { sycl_free_noexcept(ptr, q.get_context()); } +class USMDeleter +{ +private: + sycl::context ctx_; + +public: + USMDeleter(const sycl::queue &q) : ctx_(q.get_context()) {} + USMDeleter(const sycl::context &ctx) : ctx_(ctx) {} + + template void operator()(T *ptr) const + { + sycl_free_noexcept(ptr, ctx_); + } +}; + +template +std::unique_ptr +smart_malloc(std::size_t count, + const sycl::queue &q, + sycl::usm::alloc kind, + const sycl::property_list &propList = {}) +{ + T *ptr = sycl::malloc(count, q, kind, propList); + if (nullptr == ptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + + auto usm_deleter = USMDeleter(q); + return std::unique_ptr(ptr, usm_deleter); +} + +template +std::unique_ptr +smart_malloc_device(std::size_t count, + const sycl::queue &q, + const sycl::property_list &propList = {}) +{ + return smart_malloc(count, q, sycl::usm::alloc::device, propList); +} + +template +std::unique_ptr +smart_malloc_shared(std::size_t count, + const sycl::queue &q, + const sycl::property_list &propList = {}) +{ + return smart_malloc(count, q, sycl::usm::alloc::shared, propList); +} + +template +std::unique_ptr +smart_malloc_jost(std::size_t count, + const sycl::queue &q, + const sycl::property_list &propList = {}) +{ + return smart_malloc(count, q, sycl::usm::alloc::host, propList); +} + +template +sycl::event async_smart_free(sycl::queue &exec_q, + const std::vector &depends, + Args &&...args) +{ + constexpr std::size_t n = sizeof...(Args); + + std::vector ptrs; + ptrs.reserve(n); + (ptrs.push_back(reinterpret_cast(args.get())), ...); + + std::vector dels; + dels.reserve(n); + (dels.push_back(args.get_deleter()), ...); + + sycl::event ht_e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.host_task([ptrs, dels]() { + for (size_t i = 0; i < ptrs.size(); ++i) { + dels[i](ptrs[i]); + } + }); + }); + (args.release(), ...); + + return ht_e; +} + } // end of namespace alloc_utils } // end of namespace tensor } // end of namespace dpctl From b15e3020f0b116398b24a99c034e6001d75ec1da Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 25 Dec 2024 06:41:40 -0600 Subject: [PATCH 15/30] 
Factor out write-out kernel into separate detail function Replaced three duplicates of the same kernel with calls to this function. --- .../include/kernels/sorting/topk.hpp | 110 +++++++++--------- 1 file changed, 52 insertions(+), 58 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index d1d686a2dd..3d07b26802 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -71,6 +71,41 @@ void scale_topk_params(const std::uint64_t nelems_per_slm, throw std::runtime_error("Could not construct top k kernel parameters"); } +template +sycl::event write_out_impl(sycl::queue &exec_q, + std::size_t iter_nelems, + std::size_t k, + const argTy *arg_tp, + const IndexTy *index_data, + std::size_t iter_index_stride, + std::size_t axis_nelems, + argTy *vals_tp, + IndexTy *inds_tp, + const std::vector &depends) +{ + sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { + const std::size_t gid = id[0]; + + const std::size_t iter_gid = gid / k; + const std::size_t axis_gid = gid - (iter_gid * k); + + const std::size_t src_idx = iter_gid * iter_index_stride + axis_gid; + const std::size_t dst_idx = gid; + + const IndexTy res_ind = index_data[src_idx]; + const argTy v = arg_tp[res_ind]; + + vals_tp[dst_idx] = v; + inds_tp[dst_idx] = (res_ind % axis_nelems); + }); + }); + + return write_out_ev; +} + } // namespace topk_detail template @@ -132,26 +167,13 @@ topk_full_merge_sort_impl(sycl::queue &exec_q, exec_q, iter_nelems, axis_nelems, index_data, comp, sorted_block_size, {base_sort_ev}); - sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(merges_ev); - - using KernelName = topk_full_merge_map_back_krn; - - cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { - std::size_t gid = id[0]; - - std::size_t iter_gid = gid / k; - std::size_t axis_gid = gid - (iter_gid * k); + using WriteOutKernelName = + topk_full_merge_map_back_krn; - std::size_t src_idx = iter_gid * axis_nelems + axis_gid; - std::size_t dst_idx = iter_gid * k + axis_gid; - - const IndexTy res_ind = index_data[src_idx]; - const argTy v = arg_tp[res_ind]; - vals_tp[dst_idx] = v; - inds_tp[dst_idx] = res_ind % axis_nelems; - }); - }); + sycl::event write_out_ev = + topk_detail::write_out_impl( + exec_q, iter_nelems, k, arg_tp, index_data, axis_nelems, + axis_nelems, vals_tp, inds_tp, {merges_ev}); sycl::event cleanup_host_task_event = dpctl::tensor::alloc_utils::async_smart_free(exec_q, {write_out_ev}, @@ -399,27 +421,13 @@ sycl::event topk_merge_impl( k_rounded, {base_sort_ev}); // Write out top k of the merge-sorted memory - sycl::event write_topk_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(merges_ev); - - using KernelName = - topk_partial_merge_map_back_krn; + using WriteOutKernelName = + topk_partial_merge_map_back_krn; - cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { - const std::size_t gid = id[0]; - - const std::size_t iter_gid = gid / k; - const std::size_t axis_gid = gid - (iter_gid * k); - - const std::size_t src_idx = iter_gid * alloc_len + axis_gid; - const std::size_t dst_idx = gid; - - const IndexTy res_ind = index_data[src_idx]; - const argTy v = arg_tp[res_ind]; - vals_tp[dst_idx] = v; - inds_tp[dst_idx] = (res_ind % axis_nelems); - }); - }); + sycl::event write_topk_ev = + topk_detail::write_out_impl( + exec_q, 
iter_nelems, k, arg_tp, index_data, alloc_len, + axis_nelems, vals_tp, inds_tp, {merges_ev}); sycl::event cleanup_host_task_event = dpctl::tensor::alloc_utils::async_smart_free( @@ -502,26 +510,12 @@ sycl::event topk_radix_impl(sycl::queue &exec_q, ascending, {iota_ev}); // Write out top k of the temporary - sycl::event write_topk_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(radix_sort_ev); - - using KernelName = topk_radix_map_back_krn; + using WriteOutKernelName = topk_radix_map_back_krn; - cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { - const std::size_t gid = id[0]; - - const std::size_t iter_gid = gid / k; - const std::size_t axis_gid = gid - (iter_gid * k); - - const std::size_t src_idx = iter_gid * axis_nelems + axis_gid; - const std::size_t dst_idx = gid; - - const IndexTy res_ind = tmp_tp[src_idx]; - const argTy v = arg_tp[res_ind]; - vals_tp[dst_idx] = v; - inds_tp[dst_idx] = (res_ind % axis_nelems); - }); - }); + sycl::event write_topk_ev = + topk_detail::write_out_impl( + exec_q, iter_nelems, k, arg_tp, tmp_tp, axis_nelems, axis_nelems, + vals_tp, inds_tp, {radix_sort_ev}); sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( exec_q, {write_topk_ev}, workspace_owner); From 4ef615e4e12fcda623c5aacbf51316f2b3ffe4f7 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 25 Dec 2024 06:43:46 -0600 Subject: [PATCH 16/30] Typo: smart_malloc_jost -> smart_malloc_host --- dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp index 87476221d8..72440ef969 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp @@ -133,7 +133,7 @@ smart_malloc_shared(std::size_t count, template std::unique_ptr -smart_malloc_jost(std::size_t count, +smart_malloc_host(std::size_t count, const sycl::queue &q, const sycl::property_list &propList = {}) { From 2608aea15a7b860126c8b475c85c82b5303a8ba7 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 25 Dec 2024 11:05:22 -0600 Subject: [PATCH 17/30] Reimplemented map_back_impl to process few elements per work-item Factored out map_back_impl projects indexing from flat index to a row-wise index. Removed dead code excluded by preprocessor conditional. 
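For reference, a minimal host-side sketch of the projection the map-back step
performs (illustrative only; the name `map_back_host` and the use of
`std::vector` are not part of this change): argsort yields linear indices into
the flattened `iter_nelems x row_size` buffer, and map-back keeps only each
index's position within its own row.

```
// Illustrative host equivalent of the map-back projection (not part of this patch).
#include <cstddef>
#include <vector>

// flat_inds holds linear indices into a flattened (n_rows x row_size) buffer;
// the row-local index is simply linear_index % row_size.
std::vector<std::size_t> map_back_host(const std::vector<std::size_t> &flat_inds,
                                       std::size_t row_size)
{
    std::vector<std::size_t> row_local(flat_inds.size());
    for (std::size_t i = 0; i < flat_inds.size(); ++i) {
        row_local[i] = flat_inds[i] % row_size;
    }
    return row_local;
}
```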
--- .../include/kernels/sorting/merge_sort.hpp | 33 ++--------- .../include/kernels/sorting/radix_sort.hpp | 28 ++-------- .../include/kernels/sorting/sort_utils.hpp | 21 +++++++ .../include/kernels/sorting/topk.hpp | 55 ++++++++++++++----- 4 files changed, 70 insertions(+), 67 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp index 002da0ff9b..dbf40c10fe 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp @@ -816,24 +816,9 @@ sycl::event stable_argsort_axis1_contig_impl( using IotaKernelName = populate_index_data_krn; -#if 1 sycl::event populate_indexed_data_ev = iota_impl( exec_q, res_tp, total_nelems, depends); -#else - sycl::event populate_indexed_data_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - const sycl::range<1> range{total_nelems}; - - cgh.parallel_for(range, [=](sycl::id<1> id) { - size_t i = id[0]; - res_tp[i] = static_cast(i); - }); - }); -#endif - // Sort segments of the array sycl::event base_sort_ev = merge_sort_detail::sort_over_work_group_contig_impl( @@ -847,21 +832,11 @@ sycl::event stable_argsort_axis1_contig_impl( exec_q, iter_nelems, sort_nelems, res_tp, index_comp, sorted_block_size, {base_sort_ev}); - sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(merges_ev); - - auto temp_acc = - merge_sort_detail::GetReadOnlyAccess{}(res_tp, - cgh); + using MapBackKernelName = index_map_to_rows_krn; + using dpctl::tensor::kernels::sort_utils_detail::map_back_impl; - using KernelName = index_map_to_rows_krn; - - const sycl::range<1> range{total_nelems}; - - cgh.parallel_for(range, [=](sycl::id<1> id) { - res_tp[id] = (temp_acc[id] % sort_nelems); - }); - }); + sycl::event write_out_ev = map_back_impl( + exec_q, total_nelems, res_tp, res_tp, sort_nelems, {merges_ev}); return write_out_ev; } diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp index de0011e2ab..811aa581ff 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -1766,41 +1766,21 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q, using IotaKernelName = radix_argsort_iota_krn; -#if 1 using dpctl::tensor::kernels::sort_utils_detail::iota_impl; sycl::event iota_ev = iota_impl( exec_q, workspace, total_nelems, depends); -#else - - sycl::event iota_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - cgh.parallel_for( - sycl::range<1>(total_nelems), [=](sycl::id<1> id) { - size_t i = id[0]; - IndexTy sort_id = static_cast(i); - workspace[i] = sort_id; - }); - }); -#endif sycl::event radix_sort_ev = radix_sort_details::parallel_radix_sort_impl( exec_q, iter_nelems, sort_nelems, workspace, res_tp, proj_op, sort_ascending, {iota_ev}); - sycl::event map_back_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(radix_sort_ev); - - using KernelName = radix_argsort_index_write_out_krn; + using MapBackKernelName = radix_argsort_index_write_out_krn; + using dpctl::tensor::kernels::sort_utils_detail::map_back_impl; - cgh.parallel_for( - sycl::range<1>(total_nelems), [=](sycl::id<1> id) { - IndexTy linear_index = res_tp[id]; - res_tp[id] = (linear_index % sort_nelems); - }); - }); + sycl::event map_back_ev = map_back_impl( + exec_q, total_nelems, res_tp, res_tp, 
sort_nelems, {radix_sort_ev}); sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( exec_q, {map_back_ev}, workspace_owner); diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp index 1b1c361d75..f62a6c3fa0 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp @@ -97,6 +97,27 @@ sycl::event iota_impl(sycl::queue &exec_q, return e; } +template +sycl::event map_back_impl(sycl::queue &exec_q, + std::size_t nelems, + const IndexTy *flat_index_data, + IndexTy *reduced_index_data, + std::size_t row_size, + const std::vector &dependent_events) +{ + sycl::event map_back_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_events); + + cgh.parallel_for( + sycl::range<1>(nelems), [=](sycl::id<1> id) { + const IndexTy linear_index = flat_index_data[id]; + reduced_index_data[id] = (linear_index % row_size); + }); + }); + + return map_back_ev; +} + } // end of namespace sort_utils_detail } // end of namespace kernels } // end of namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index 3d07b26802..6b235a1427 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -83,23 +83,50 @@ sycl::event write_out_impl(sycl::queue &exec_q, IndexTy *inds_tp, const std::vector &depends) { - sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { - const std::size_t gid = id[0]; + constexpr std::uint32_t lws = 64; + constexpr std::uint32_t n_wi = 4; + const std::size_t nelems = iter_nelems * k; + const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws); - const std::size_t iter_gid = gid / k; - const std::size_t axis_gid = gid - (iter_gid * k); + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{n_groups * lws}; + sycl::nd_range<1> ndRange{gRange, lRange}; - const std::size_t src_idx = iter_gid * iter_index_stride + axis_gid; - const std::size_t dst_idx = gid; - - const IndexTy res_ind = index_data[src_idx]; - const argTy v = arg_tp[res_ind]; + sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); - vals_tp[dst_idx] = v; - inds_tp[dst_idx] = (res_ind % axis_nelems); + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_linear_id(); + const auto &sg = it.get_sub_group(); + const std::uint32_t lane_id = sg.get_local_id()[0]; + const std::uint32_t sg_size = sg.get_max_local_range()[0]; + + const std::size_t start_id = + (gid - lane_id) * sg_size * n_wi + lane_id; + +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + const std::size_t data_id = start_id + i * sg_size; + + if (data_id < nelems) { + const std::size_t iter_id = data_id / k; + + /* + const std::size_t axis_gid = data_id - (iter_gid * k); + const std::size_t src_idx = iter_gid * iter_index_stride + + axis_gid; + */ + const std::size_t src_idx = + data_id + iter_id * (iter_index_stride - k); + + const IndexTy res_ind = index_data[src_idx]; + const argTy v = arg_tp[res_ind]; + + const std::size_t dst_idx = data_id; + vals_tp[dst_idx] = v; + inds_tp[dst_idx] = (res_ind % axis_nelems); + } + } }); }); From 15da303f33eaff1d4e4b38c9f5f64bbe03fb5dcd Mon Sep 17 00:00:00 2001 From: Oleksandr 
Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 25 Dec 2024 16:51:13 -0600 Subject: [PATCH 18/30] Replace use of std::log2(size_t_value) Replaced it with a hand-written implementation of ceil_log2(n), such that n <= (decltype(n){1} << ceil_log2(n)) is true for all positive values of `n` in the range. --- .../include/kernels/sorting/radix_sort.hpp | 41 ++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp index 811aa581ff..887841acb8 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -63,6 +63,45 @@ class radix_sort_reorder_peer_kernel; template class radix_sort_reorder_kernel; +/*! @brief Computes smallest exponent such that `n <= (1 << exponent)` */ +template && + sizeof(SizeT) == sizeof(std::uint64_t), + int> = 0> +std::uint32_t ceil_log2(SizeT n) +{ + if (n <= 1) + return std::uint32_t{1}; + + std::uint32_t exp{1}; + --n; + if (n >= (SizeT{1} << 32)) { + n >>= 32; + exp += 32; + } + if (n >= (SizeT{1} << 16)) { + n >>= 16; + exp += 16; + } + if (n >= (SizeT{1} << 8)) { + n >>= 8; + exp += 8; + } + if (n >= (SizeT{1} << 4)) { + n >>= 4; + exp += 4; + } + if (n >= (SizeT{1} << 2)) { + n >>= 2; + exp += 2; + } + if (n >= (SizeT{1} << 1)) { + n >>= 1; + ++exp; + } + return exp; +} + //---------------------------------------------------------- // bitwise order-preserving conversions to unsigned integers //---------------------------------------------------------- @@ -1145,7 +1184,7 @@ struct subgroup_radix_sort const std::size_t max_slm_size = dev.template get_info() / 2; - const auto n_uniform = 1 << (std::uint32_t(std::log2(n - 1)) + 1); + const auto n_uniform = 1 << ceil_log2(n); const auto req_slm_size_val = sizeof(T) * n_uniform; return ((req_slm_size_val + req_slm_size_counters) <= max_slm_size) From fb9bc436cf68624b4be5d0bcb99ad7108b8ca959 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 26 Dec 2024 21:03:59 -0600 Subject: [PATCH 19/30] Remove dead code disabled via preprocessor conditional --- .../include/kernels/sorting/topk.hpp | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index 6b235a1427..0ff3fc4723 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -161,24 +161,10 @@ topk_full_merge_sort_impl(sycl::queue &exec_q, using IotaKernelName = topk_populate_index_data_krn; -#if 1 using dpctl::tensor::kernels::sort_utils_detail::iota_impl; sycl::event populate_indexed_data_ev = iota_impl( exec_q, index_data, iter_nelems * axis_nelems, depends); -#else - sycl::event populate_indexed_data_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - auto const &range = sycl::range<1>(iter_nelems * axis_nelems); - - cgh.parallel_for(range, [=](sycl::id<1> id) { - std::size_t i = id[0]; - index_data[i] = static_cast(i); - }); - }); -#endif std::size_t sorted_block_size; // Sort segments of the array @@ -513,23 +499,10 @@ sycl::event topk_radix_impl(sycl::queue &exec_q, using IotaKernelName = topk_iota_krn; -#if 1 using dpctl::tensor::kernels::sort_utils_detail::iota_impl; sycl::event iota_ev = iota_impl( exec_q, workspace, 
total_nelems, depends); -#else - sycl::event iota_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - cgh.parallel_for( - sycl::range<1>(total_nelems), [=](sycl::id<1> id) { - size_t i = id[0]; - IndexTy sort_id = static_cast(i); - workspace[i] = sort_id; - }); - }); -#endif sycl::event radix_sort_ev = radix_sort_details::parallel_radix_sort_impl( From 6cf36b1760132b2f2e1d200808cfaa79858c11f2 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 25 Dec 2024 13:20:00 -0600 Subject: [PATCH 20/30] Add information displayed on failure, renamed variables Add check of computed against expected indices --- dpctl/tests/test_usm_ndarray_top_k.py | 97 +++++++++++++++++++++++---- 1 file changed, 83 insertions(+), 14 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_top_k.py b/dpctl/tests/test_usm_ndarray_top_k.py index 954c21a990..b0f3f798bd 100644 --- a/dpctl/tests/test_usm_ndarray_top_k.py +++ b/dpctl/tests/test_usm_ndarray_top_k.py @@ -20,6 +20,38 @@ from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported +def _expected_largest_inds(inp, n, shift, k): + "Computed expected top_k indices for mode='largest'" + assert k < n + ones_start_id = shift % (2 * n) + + alloc_dev = inp.device + + if ones_start_id < n: + expected_inds = dpt.arange( + ones_start_id, ones_start_id + k, dtype="i8", device=alloc_dev + ) + else: + # wrap-around + ones_end_id = (ones_start_id + n) % (2 * n) + if ones_end_id >= k: + expected_inds = dpt.arange(k, dtype="i8", device=alloc_dev) + else: + expected_inds = dpt.concat( + ( + dpt.arange(ones_end_id, dtype="i8", device=alloc_dev), + dpt.arange( + ones_start_id, + ones_start_id + k - ones_end_id, + dtype="i8", + device=alloc_dev, + ), + ) + ) + + return expected_inds + + @pytest.mark.parametrize( "dtype", [ @@ -38,23 +70,57 @@ "c16", ], ) -@pytest.mark.parametrize("n", [33, 255, 511, 1021, 8193]) -def test_topk_1d_largest(dtype, n): +@pytest.mark.parametrize("n", [33, 43, 255, 511, 1021, 8193]) +def test_top_k_1d_largest(dtype, n): q = get_queue_or_skip() skip_if_dtype_not_supported(dtype, q) + shift, k = 734, 5 o = dpt.ones(n, dtype=dtype) z = dpt.zeros(n, dtype=dtype) - zo = dpt.concat((o, z)) - inp = dpt.roll(zo, 734) - k = 5 + oz = dpt.concat((o, z)) + inp = dpt.roll(oz, shift) + + expected_inds = _expected_largest_inds(oz, n, shift, k) s = dpt.top_k(inp, k, mode="largest") assert s.values.shape == (k,) assert s.values.dtype == inp.dtype assert s.indices.shape == (k,) - assert dpt.all(s.values == dpt.ones(k, dtype=dtype)) - assert dpt.all(s.values == inp[s.indices]) + assert dpt.all(s.indices == expected_inds) + assert dpt.all(s.values == dpt.ones(k, dtype=dtype)), s.values + assert dpt.all(s.values == inp[s.indices]), s.indices + + +def _expected_smallest_inds(inp, n, shift, k): + "Computed expected top_k indices for mode='smallest'" + assert k < n + zeros_start_id = (n + shift) % (2 * n) + zeros_end_id = (shift) % (2 * n) + + alloc_dev = inp.device + + if zeros_start_id < zeros_end_id: + expected_inds = dpt.arange( + zeros_start_id, zeros_start_id + k, dtype="i8", device=alloc_dev + ) + else: + if zeros_end_id >= k: + expected_inds = dpt.arange(k, dtype="i8", device=alloc_dev) + else: + expected_inds = dpt.concat( + ( + dpt.arange(zeros_end_id, dtype="i8", device=alloc_dev), + dpt.arange( + zeros_start_id, + zeros_start_id + k - zeros_end_id, + dtype="i8", + device=alloc_dev, + ), + ) + ) + + return expected_inds @pytest.mark.parametrize( @@ -75,20 +141,23 @@ def 
test_topk_1d_largest(dtype, n): "c16", ], ) -@pytest.mark.parametrize("n", [33, 255, 257, 513, 1021, 8193]) -def test_topk_1d_smallest(dtype, n): +@pytest.mark.parametrize("n", [37, 39, 61, 255, 257, 513, 1021, 8193]) +def test_top_k_1d_smallest(dtype, n): q = get_queue_or_skip() skip_if_dtype_not_supported(dtype, q) + shift, k = 734, 5 o = dpt.ones(n, dtype=dtype) z = dpt.zeros(n, dtype=dtype) - zo = dpt.concat((o, z)) - inp = dpt.roll(zo, 734) - k = 5 + oz = dpt.concat((o, z)) + inp = dpt.roll(oz, shift) + + expected_inds = _expected_smallest_inds(oz, n, shift, k) s = dpt.top_k(inp, k, mode="smallest") assert s.values.shape == (k,) assert s.values.dtype == inp.dtype assert s.indices.shape == (k,) - assert dpt.all(s.values == dpt.zeros(k, dtype=dtype)) - assert dpt.all(s.values == inp[s.indices]) + assert dpt.all(s.indices == expected_inds) + assert dpt.all(s.values == dpt.zeros(k, dtype=dtype)), s.values + assert dpt.all(s.values == inp[s.indices]), s.indices From 4ec252e5c5a51bd24ee843ccb1a52d4bb0c81c08 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 27 Dec 2024 08:56:21 -0600 Subject: [PATCH 21/30] Add comment to explain ceil_log2 algo --- dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp index 887841acb8..335b285bbf 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -75,6 +75,8 @@ std::uint32_t ceil_log2(SizeT n) std::uint32_t exp{1}; --n; + // if n > 2^b, n = q * 2^b + r for q > 0 and 0 <= r < 2^b + // ceil_log2(q * 2^b + r) == ceil_log2(q * 2^b) == q + ceil_log2(n1) if (n >= (SizeT{1} << 32)) { n >>= 32; exp += 32; } From 1436522844d305ef09053e347c059ceaba38405e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 27 Dec 2024 08:57:01 -0600 Subject: [PATCH 22/30] Add static asserts to async_smart_free One asserts that at least one unique pointer is specified. Another that specified arguments are unique pointers with USMDeleter. 
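The intent of the two static asserts can be illustrated with a reduced, self-contained sketch: a trait that accepts only unique_ptr instances using the expected deleter, folded over the argument pack. The real code checks for the library's USMDeleter; the Deleter, valid_smart_ptr, and check_all names below are illustrative stand-ins rather than the patch's API.

#include <memory>
#include <type_traits>

// stand-in for the USM deleter used by smart_malloc_* in the patch
struct Deleter
{
    template <typename T> void operator()(T *p) const { delete p; }
};

template <typename T> struct valid_smart_ptr : std::false_type {};
template <typename T>
struct valid_smart_ptr<std::unique_ptr<T, Deleter>> : std::true_type {};
template <typename T>
struct valid_smart_ptr<std::unique_ptr<T, Deleter> &> : std::true_type {};

template <typename... Args> void check_all(Args &&...)
{
    // at least one owner must be handed over for deferred deallocation
    static_assert(sizeof...(Args) > 0, "at least one smart pointer required");
    // every argument must be a unique_ptr with the expected deleter
    static_assert((... && valid_smart_ptr<Args>::value),
                  "arguments must be unique_ptr created with the USM deleter");
}

int main()
{
    std::unique_ptr<int, Deleter> owner(new int{42});
    check_all(owner); // compiles; check_all(42) would trip the static_assert
    return 0;
}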
--- .../include/utils/sycl_alloc_utils.hpp | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp index 72440ef969..f67e1bba1f 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "sycl/sycl.hpp" @@ -140,12 +141,50 @@ smart_malloc_host(std::size_t count, return smart_malloc(count, q, sycl::usm::alloc::host, propList); } +namespace +{ +template struct valid_smart_ptr : public std::false_type +{ +}; + +template +struct valid_smart_ptr &> + : public std::is_same +{ +}; + +template +struct valid_smart_ptr> + : public std::is_same +{ +}; + +// base case +template struct all_valid_smart_ptrs +{ + static constexpr bool value = true; +}; + +template +struct all_valid_smart_ptrs +{ + static constexpr bool value = valid_smart_ptr::value && + (all_valid_smart_ptrs::value); +}; +} // namespace + template sycl::event async_smart_free(sycl::queue &exec_q, const std::vector &depends, Args &&...args) { constexpr std::size_t n = sizeof...(Args); + static_assert( + n > 0, "async_smart_free requires at least one smart pointer argument"); + + static_assert( + all_valid_smart_ptrs::value, + "async_smart_free requires unique_ptr created with smart_malloc"); std::vector ptrs; ptrs.reserve(n); From f0e3245f4466e3197a0bff12d40b57197db32a89 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 27 Dec 2024 18:08:51 -0800 Subject: [PATCH 23/30] Skip top_k test known to fail for some CPU architectures --- dpctl/tests/test_usm_ndarray_top_k.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_top_k.py b/dpctl/tests/test_usm_ndarray_top_k.py index b0f3f798bd..01a83ef293 100644 --- a/dpctl/tests/test_usm_ndarray_top_k.py +++ b/dpctl/tests/test_usm_ndarray_top_k.py @@ -55,7 +55,7 @@ def _expected_largest_inds(inp, n, shift, k): @pytest.mark.parametrize( "dtype", [ - "i1", + pytest.param("i1", marks=pytest.mark.skip(reason="CPU bug")), "u1", "i2", "u2", @@ -74,6 +74,8 @@ def _expected_largest_inds(inp, n, shift, k): def test_top_k_1d_largest(dtype, n): q = get_queue_or_skip() skip_if_dtype_not_supported(dtype, q) + if dtype == "i1": + pytest.skip() shift, k = 734, 5 o = dpt.ones(n, dtype=dtype) @@ -126,7 +128,7 @@ def _expected_smallest_inds(inp, n, shift, k): @pytest.mark.parametrize( "dtype", [ - "i1", + pytest.param("i1", marks=pytest.mark.skip(reason="CPU bug")), "u1", "i2", "u2", From 8f4dcdd05fb1e5118da91810aee08b7cb700efdf Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 31 Dec 2024 10:15:11 -0600 Subject: [PATCH 24/30] Fixed blunder in work-item id to data_id computation gid-lane_id is already a multiple of sg_size. 
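The corrected indexing can be sanity-checked on the host: in the sub-group-strided layout, the work-item with global id gid starts at (gid - lane_id) * n_wi + lane_id and steps by sg_size, so consecutive lanes touch consecutive elements and every element is visited exactly once. A standalone sketch follows (plain C++; the sg_size, n_wi, and range values are illustrative, and lane_id is modelled as gid % sg_size rather than queried from a sub-group):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

int main()
{
    constexpr std::uint32_t sg_size = 16; // modelled sub-group size
    constexpr std::uint32_t n_wi = 4;     // elements per work-item
    const std::size_t nelems = 1000;      // elements to cover
    const std::size_t n_items = 1024;     // global range, multiple of sg_size

    std::vector<int> visits(nelems, 0);
    for (std::size_t gid = 0; gid < n_items; ++gid) {
        const std::uint32_t lane_id = gid % sg_size;
        // corrected formula: gid - lane_id is already a multiple of sg_size,
        // so it is scaled by n_wi only (the previous code multiplied by
        // sg_size again, which skipped over ranges of elements)
        const std::size_t start_id = (gid - lane_id) * n_wi + lane_id;
        for (std::uint32_t i = 0; i < n_wi; ++i) {
            const std::size_t data_id = start_id + i * sg_size;
            if (data_id < nelems) {
                ++visits[data_id];
            }
        }
    }
    for (std::size_t i = 0; i < nelems; ++i) {
        assert(visits[i] == 1); // each element is visited exactly once
    }
    return 0;
}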
--- dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index 0ff3fc4723..1828c88eb4 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -101,8 +101,7 @@ sycl::event write_out_impl(sycl::queue &exec_q, const std::uint32_t lane_id = sg.get_local_id()[0]; const std::uint32_t sg_size = sg.get_max_local_range()[0]; - const std::size_t start_id = - (gid - lane_id) * sg_size * n_wi + lane_id; + const std::size_t start_id = (gid - lane_id) * n_wi + lane_id; #pragma unroll for (std::uint32_t i = 0; i < n_wi; ++i) { From b6806ba880c7db1f46076484317993dedf5c7408 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 31 Dec 2024 10:16:11 -0600 Subject: [PATCH 25/30] Replace map_back_impl in sort_utils Change kernel to process few data elements in the work-item. --- .../include/kernels/sorting/sort_utils.hpp | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp index f62a6c3fa0..a81f528852 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp @@ -105,14 +105,35 @@ sycl::event map_back_impl(sycl::queue &exec_q, std::size_t row_size, const std::vector &dependent_events) { + constexpr std::uint32_t lws = 64; + constexpr std::uint32_t n_wi = 4; + const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws); + + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{n_groups * lws}; + sycl::nd_range<1> ndRange{gRange, lRange}; + sycl::event map_back_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(dependent_events); - cgh.parallel_for( - sycl::range<1>(nelems), [=](sycl::id<1> id) { - const IndexTy linear_index = flat_index_data[id]; - reduced_index_data[id] = (linear_index % row_size); - }); + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_linear_id(); + const auto &sg = it.get_sub_group(); + const std::uint32_t lane_id = sg.get_local_id()[0]; + const std::uint32_t sg_size = sg.get_max_local_range()[0]; + + const std::size_t start_id = (gid - lane_id) * n_wi + lane_id; + +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + const std::size_t data_id = start_id + i * sg_size; + + if (data_id < nelems) { + const IndexTy linear_index = flat_index_data[data_id]; + reduced_index_data[data_id] = (linear_index % row_size); + } + } + }); }); return map_back_ev; From b47d9f3ca96e31eada000ab4c2aa87adf6400f32 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 2 Jan 2025 10:40:26 -0600 Subject: [PATCH 26/30] Use get_global_linear_id instead of get_global_id and rely on implicit conversion --- dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp index a81f528852..d1f166f945 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp @@ -58,7 +58,7 
@@ sycl::event iota_impl(sycl::queue &exec_q, sycl::event e = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(dependent_events); cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { - const std::size_t gid = it.get_global_id(); + const std::size_t gid = it.get_global_linear_id(); const auto &sg = it.get_sub_group(); const std::uint32_t lane_id = sg.get_local_id()[0]; From 0493485a85f47dc33ba0c6d266a4e3caa89afd6e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 2 Jan 2025 10:41:03 -0600 Subject: [PATCH 27/30] Counters in one-workgroup kernel to use uint16_t from uint32_t Counters can not exceed uint16_t max, because the kernel assumes that the number of elements to sort fits into uint16_t. The change reduces the kernel SLM footprint. Also, remove use of std::move, uint16_t->std::uint16_t, etc Replace size_t->std::size_t, uint32_t->std::uint32_t Use `if constexpr` in order-preservign-cast for better readability. --- .../include/kernels/sorting/radix_sort.hpp | 159 ++++++++++-------- 1 file changed, 86 insertions(+), 73 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp index 335b285bbf..dd20e647ff 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -70,13 +70,14 @@ template = 0> std::uint32_t ceil_log2(SizeT n) { + // if n > 2^b, n = q * 2^b + r for q > 0 and 0 <= r < 2^b + // floor_log2(q * 2^b + r) == floor_log2(q * 2^b) == q + floor_log2(n1) + // ceil_log2(n) == 1 + floor_log2(n-1) if (n <= 1) return std::uint32_t{1}; std::uint32_t exp{1}; --n; - // if n > 2^b, n = q * 2^b + r for q > 0 and 0 <= r < 2^b - // ceil_log2(q * 2^b + r) == ceil_log2(q * 2^b) == q + ceil_log2(n1) if (n >= (SizeT{1} << 32)) { n >>= 32; exp += 32; @@ -137,16 +138,20 @@ template order_preserving_cast(IntT val) { using UIntT = std::make_unsigned_t; - // ascending_mask: 100..0 - constexpr UIntT ascending_mask = - (UIntT(1) << std::numeric_limits::digits); - // descending_mask: 011..1 - constexpr UIntT descending_mask = (std::numeric_limits::max() >> 1); - - constexpr UIntT mask = (is_ascending) ? ascending_mask : descending_mask; const UIntT uint_val = sycl::bit_cast(val); - return (uint_val ^ mask); + if constexpr (is_ascending) { + // ascending_mask: 100..0 + constexpr UIntT ascending_mask = + (UIntT(1) << std::numeric_limits::digits); + return (uint_val ^ ascending_mask); + } + else { + // descending_mask: 011..1 + constexpr UIntT descending_mask = + (std::numeric_limits::max() >> 1); + return (uint_val ^ descending_mask); + } } template std::uint16_t order_preserving_cast(sycl::half val) @@ -1045,10 +1050,10 @@ template class radix_sort_one_wg_krn; template + std::uint16_t req_sub_group_size = (block_size < 4 ? 
32 : 16)> struct subgroup_radix_sort { private: @@ -1062,8 +1067,8 @@ struct subgroup_radix_sort public: template sycl::event operator()(sycl::queue &exec_q, - size_t n_iters, - size_t n_to_sort, + std::size_t n_iters, + std::size_t n_to_sort, ValueT *input_ptr, OutputT *output_ptr, ProjT proj_op, @@ -1160,8 +1165,8 @@ struct subgroup_radix_sort }; static_assert(wg_size <= 1024); - static constexpr uint16_t bin_count = (1 << radix); - static constexpr uint16_t counter_buf_sz = wg_size * bin_count + 1; + static constexpr std::uint16_t bin_count = (1 << radix); + static constexpr std::uint16_t counter_buf_sz = wg_size * bin_count + 1; enum class temp_allocations { @@ -1177,7 +1182,7 @@ struct subgroup_radix_sort assert(n <= (SizeT(1) << 16)); constexpr auto req_slm_size_counters = - counter_buf_sz * sizeof(uint32_t); + counter_buf_sz * sizeof(std::uint16_t); const auto &dev = exec_q.get_device(); @@ -1212,9 +1217,9 @@ struct subgroup_radix_sort typename SLM_value_tag, typename SLM_counter_tag> sycl::event operator()(sycl::queue &exec_q, - size_t n_iters, - size_t n_batch_size, - size_t n_values, + std::size_t n_iters, + std::size_t n_batch_size, + std::size_t n_values, InputT *input_arr, OutputT *output_arr, const ProjT &proj_op, @@ -1228,7 +1233,7 @@ struct subgroup_radix_sort assert(n_values <= static_cast(block_size) * static_cast(wg_size)); - uint16_t n = static_cast(n_values); + const std::uint16_t n = static_cast(n_values); static_assert(std::is_same_v, OutputT>); using ValueT = OutputT; @@ -1237,17 +1242,18 @@ struct subgroup_radix_sort TempBuf buf_val( n_batch_size, static_cast(block_size * wg_size)); - TempBuf buf_count( + TempBuf buf_count( n_batch_size, static_cast(counter_buf_sz)); sycl::range<1> lRange{wg_size}; sycl::event sort_ev; - std::vector deps = depends; + std::vector deps{depends}; - std::size_t n_batches = (n_iters + n_batch_size - 1) / n_batch_size; + const std::size_t n_batches = + (n_iters + n_batch_size - 1) / n_batch_size; - for (size_t batch_id = 0; batch_id < n_batches; ++batch_id) { + for (std::size_t batch_id = 0; batch_id < n_batches; ++batch_id) { const std::size_t block_start = batch_id * n_batch_size; @@ -1286,46 +1292,49 @@ struct subgroup_radix_sort const std::size_t iter_exchange_offset = iter_id * exchange_acc_iter_stride; - uint16_t wi = ndit.get_local_linear_id(); - uint16_t begin_bit = 0; + std::uint16_t wi = ndit.get_local_linear_id(); + std::uint16_t begin_bit = 0; - constexpr uint16_t end_bit = + constexpr std::uint16_t end_bit = number_of_bits_in_type(); -// copy from input array into values + // copy from input array into values #pragma unroll - for (uint16_t i = 0; i < block_size; ++i) { - const uint16_t id = wi * block_size + i; - if (id < n) - values[i] = std::move( - this_input_arr[iter_val_offset + id]); + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t id = wi * block_size + i; + values[i] = + (id < n) ? 
this_input_arr[iter_val_offset + id] + : ValueT{}; } while (true) { // indices for indirect access in the "re-order" // phase - uint16_t indices[block_size]; + std::uint16_t indices[block_size]; { // pointers to bucket's counters - uint32_t *counters[block_size]; + std::uint16_t *counters[block_size]; // counting phase auto pcounter = get_accessor_pointer(counter_acc) + (wi + iter_counter_offset); -// initialize counters + // initialize counters #pragma unroll - for (uint16_t i = 0; i < bin_count; ++i) - pcounter[i * wg_size] = std::uint32_t{0}; + for (std::uint16_t i = 0; i < bin_count; ++i) + pcounter[i * wg_size] = std::uint16_t{0}; sycl::group_barrier(ndit.get_group()); if (is_ascending) { #pragma unroll - for (uint16_t i = 0; i < block_size; ++i) { - const uint16_t id = wi * block_size + i; - constexpr uint16_t bin_mask = + for (std::uint16_t i = 0; i < block_size; + ++i) + { + const std::uint16_t id = + wi * block_size + i; + constexpr std::uint16_t bin_mask = bin_count - 1; // points to the padded element, i.e. id @@ -1334,7 +1343,7 @@ struct subgroup_radix_sort default_out_of_range_bin_id = bin_mask; - const uint16_t bin = + const std::uint16_t bin = (id < n) ? get_bucket_id( order_preserving_cast< @@ -1352,9 +1361,12 @@ struct subgroup_radix_sort } else { #pragma unroll - for (uint16_t i = 0; i < block_size; ++i) { - const uint16_t id = wi * block_size + i; - constexpr uint16_t bin_mask = + for (std::uint16_t i = 0; i < block_size; + ++i) + { + const std::uint16_t id = + wi * block_size + i; + constexpr std::uint16_t bin_mask = bin_count - 1; // points to the padded element, i.e. id @@ -1363,7 +1375,7 @@ struct subgroup_radix_sort default_out_of_range_bin_id = bin_mask; - const uint16_t bin = + const std::uint16_t bin = (id < n) ? get_bucket_id( order_preserving_cast< @@ -1386,13 +1398,14 @@ struct subgroup_radix_sort { // scan contiguous numbers - uint16_t bin_sum[bin_count]; + std::uint16_t bin_sum[bin_count]; const std::size_t counter_offset0 = iter_counter_offset + wi * bin_count; bin_sum[0] = counter_acc[counter_offset0]; #pragma unroll - for (uint16_t i = 1; i < bin_count; ++i) + for (std::uint16_t i = 1; i < bin_count; + ++i) bin_sum[i] = bin_sum[i - 1] + counter_acc[counter_offset0 + i]; @@ -1400,15 +1413,16 @@ struct subgroup_radix_sort sycl::group_barrier(ndit.get_group()); // exclusive scan local sum - uint16_t sum_scan = + std::uint16_t sum_scan = sycl::exclusive_scan_over_group( ndit.get_group(), bin_sum[bin_count - 1], - sycl::plus()); + sycl::plus()); // add to local sum, generate exclusive scan result #pragma unroll - for (uint16_t i = 0; i < bin_count; ++i) + for (std::uint16_t i = 0; i < bin_count; + ++i) counter_acc[counter_offset0 + i + 1] = sum_scan + bin_sum[i]; @@ -1420,11 +1434,13 @@ struct subgroup_radix_sort } #pragma unroll - for (uint16_t i = 0; i < block_size; ++i) { + for (std::uint16_t i = 0; i < block_size; ++i) { // a global index is a local offset plus a // global base index indices[i] += *counters[i]; } + + sycl::group_barrier(ndit.get_group()); } begin_bit += radix; @@ -1432,39 +1448,36 @@ struct subgroup_radix_sort // "re-order" phase sycl::group_barrier(ndit.get_group()); if (begin_bit >= end_bit) { -// the last iteration - writing out the result + // the last iteration - writing out the result #pragma unroll - for (uint16_t i = 0; i < block_size; ++i) { - const uint16_t r = indices[i]; + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t r = indices[i]; if (r < n) { - // move the values to source range and - // destroy the 
values this_output_arr[iter_val_offset + r] = - std::move(values[i]); + values[i]; } } return; } -// data exchange + // data exchange #pragma unroll - for (uint16_t i = 0; i < block_size; ++i) { - const uint16_t r = indices[i]; + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t r = indices[i]; if (r < n) exchange_acc[iter_exchange_offset + r] = - std::move(values[i]); + values[i]; } sycl::group_barrier(ndit.get_group()); #pragma unroll - for (uint16_t i = 0; i < block_size; ++i) { - const uint16_t id = wi * block_size + i; + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t id = wi * block_size + i; if (id < n) - values[i] = std::move( - exchange_acc[iter_exchange_offset + - id]); + values[i] = + exchange_acc[iter_exchange_offset + id]; } sycl::group_barrier(ndit.get_group()); @@ -1736,10 +1749,10 @@ radix_sort_axis1_contig_impl(sycl::queue &exec_q, const bool sort_ascending, // number of sub-arrays to sort (num. of rows in a // matrix when sorting over rows) - size_t iter_nelems, + std::size_t iter_nelems, // size of each array to sort (length of rows, // i.e. number of columns) - size_t sort_nelems, + std::size_t sort_nelems, const char *arg_cp, char *res_cp, ssize_t iter_arg_offset, @@ -1775,10 +1788,10 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q, const bool sort_ascending, // number of sub-arrays to sort (num. of // rows in a matrix when sorting over rows) - size_t iter_nelems, + std::size_t iter_nelems, // size of each array to sort (length of // rows, i.e. number of columns) - size_t sort_nelems, + std::size_t sort_nelems, const char *arg_cp, char *res_cp, ssize_t iter_arg_offset, From 5125e11eb4b4d5fe1822746e574b17630a9c4b3e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 3 Jan 2025 19:18:45 -0600 Subject: [PATCH 28/30] Apply work-around for failing tests with CPU device and short sub-groups The team developing OpenCL:CPU device runtime and compiler was notified. See CMPLRLLVM-64592 Once fixed, the work-around should be removed. 
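The gating condition relies on the sub-group size the runtime actually selects for the kernel, which is obtained by building an executable kernel_bundle for the kernel id and querying the kernel_device_specific::max_sub_group_size descriptor. A minimal sketch of that pattern is shown below; it assumes MyKernel is a kernel name class used by a parallel_for elsewhere in the program, and is not the literal code of the patch.

#include <cstdint>
#include <sycl/sycl.hpp>

class MyKernel; // kernel name type used by some parallel_for in the program

std::uint32_t queried_max_sub_group_size(sycl::queue &q)
{
    const auto kernel_id = sycl::get_kernel_id<MyKernel>();
    auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
        q.get_context(), {q.get_device()}, {kernel_id});
    const auto krn = kb.get_kernel(kernel_id);
    // device-specific query: the sub-group size chosen for this kernel
    return krn.get_info<
        sycl::info::kernel_device_specific::max_sub_group_size>(
        q.get_device());
}

// The work-around is then applied only for CPU devices with short sub-groups,
// along the lines of:
//   bool needed = q.get_device().has(sycl::aspect::cpu)
//                 && (queried_max_sub_group_size(q) < 16);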
--- .../include/kernels/sorting/radix_sort.hpp | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp index dd20e647ff..15f22b334e 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -1253,6 +1253,24 @@ struct subgroup_radix_sort const std::size_t n_batches = (n_iters + n_batch_size - 1) / n_batch_size; + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + const auto &krn = kb.get_kernel(kernel_id); + + const std::uint32_t krn_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + // due to a bug in CPU device implementation, an additional + // synchronization is necessary for short sub-group sizes + const bool work_around_needed = + exec_q.get_device().has(sycl::aspect::cpu) && + (krn_sg_size < 16); + for (std::size_t batch_id = 0; batch_id < n_batches; ++batch_id) { const std::size_t block_start = batch_id * n_batch_size; @@ -1269,6 +1287,7 @@ struct subgroup_radix_sort sort_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(deps); + cgh.use_kernel_bundle(kb); // allocation to use for value exchanges auto exchange_acc = buf_val.get_acc(cgh); @@ -1357,6 +1376,11 @@ struct subgroup_radix_sort counters[i] = &pcounter[bin * wg_size]; indices[i] = *counters[i]; *counters[i] = indices[i] + 1; + + if (work_around_needed) { + sycl::group_barrier( + ndit.get_group()); + } } } else { @@ -1389,6 +1413,11 @@ struct subgroup_radix_sort counters[i] = &pcounter[bin * wg_size]; indices[i] = *counters[i]; *counters[i] = indices[i] + 1; + + if (work_around_needed) { + sycl::group_barrier( + ndit.get_group()); + } } } From 505b64c419a42b2fe890fd523c96ab8d9e4db163 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Sat, 28 Dec 2024 11:59:35 -0600 Subject: [PATCH 29/30] Remove skipping of tests for i1/i2 dtypes since work-around was applied in C++. 
Add tests for 2d input arrays, for axis=0 and axis=1 Add a test for non-contiguous input, 0d input, validation 100% coverage of top_k function implementation achieved --- dpctl/tests/test_usm_ndarray_top_k.py | 162 +++++++++++++++++++++++++- 1 file changed, 156 insertions(+), 6 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_top_k.py b/dpctl/tests/test_usm_ndarray_top_k.py index 01a83ef293..a27853d8c8 100644 --- a/dpctl/tests/test_usm_ndarray_top_k.py +++ b/dpctl/tests/test_usm_ndarray_top_k.py @@ -55,7 +55,7 @@ def _expected_largest_inds(inp, n, shift, k): @pytest.mark.parametrize( "dtype", [ - pytest.param("i1", marks=pytest.mark.skip(reason="CPU bug")), + "i1", "u1", "i2", "u2", @@ -74,8 +74,6 @@ def _expected_largest_inds(inp, n, shift, k): def test_top_k_1d_largest(dtype, n): q = get_queue_or_skip() skip_if_dtype_not_supported(dtype, q) - if dtype == "i1": - pytest.skip() shift, k = 734, 5 o = dpt.ones(n, dtype=dtype) @@ -89,9 +87,9 @@ def test_top_k_1d_largest(dtype, n): assert s.values.shape == (k,) assert s.values.dtype == inp.dtype assert s.indices.shape == (k,) - assert dpt.all(s.indices == expected_inds) assert dpt.all(s.values == dpt.ones(k, dtype=dtype)), s.values assert dpt.all(s.values == inp[s.indices]), s.indices + assert dpt.all(s.indices == expected_inds), (s.indices, expected_inds) def _expected_smallest_inds(inp, n, shift, k): @@ -128,7 +126,7 @@ def _expected_smallest_inds(inp, n, shift, k): @pytest.mark.parametrize( "dtype", [ - pytest.param("i1", marks=pytest.mark.skip(reason="CPU bug")), + "i1", "u1", "i2", "u2", @@ -160,6 +158,158 @@ def test_top_k_1d_smallest(dtype, n): assert s.values.shape == (k,) assert s.values.dtype == inp.dtype assert s.indices.shape == (k,) - assert dpt.all(s.indices == expected_inds) assert dpt.all(s.values == dpt.zeros(k, dtype=dtype)), s.values assert dpt.all(s.values == inp[s.indices]), s.indices + assert dpt.all(s.indices == expected_inds), (s.indices, expected_inds) + + +@pytest.mark.parametrize( + "dtype", + [ + # skip short types to ensure that m*n can be represented + # in the type + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +@pytest.mark.parametrize("n", [37, 39, 61, 255, 257, 513, 1021, 8193]) +def test_top_k_2d_largest(dtype, n): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + m, k = 8, 3 + if dtype == "f2" and m * n > 2000: + pytest.skip( + "f2 can not distinguish between large integers used in this test" + ) + + x = dpt.reshape(dpt.arange(m * n, dtype=dtype), (m, n)) + + r = dpt.top_k(x, k, axis=1) + + assert r.values.shape == (m, k) + assert r.indices.shape == (m, k) + expected_inds = dpt.reshape(dpt.arange(n, dtype=r.indices.dtype), (1, n))[ + :, -k: + ] + assert expected_inds.shape == (1, k) + assert dpt.all( + dpt.sort(r.indices, axis=1) == dpt.sort(expected_inds, axis=1) + ), (r.indices, expected_inds) + expected_vals = x[:, -k:] + assert dpt.all( + dpt.sort(r.values, axis=1) == dpt.sort(expected_vals, axis=1) + ) + + +@pytest.mark.parametrize( + "dtype", + [ + # skip short types to ensure that m*n can be represented + # in the type + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +@pytest.mark.parametrize("n", [37, 39, 61, 255, 257, 513, 1021, 8193]) +def test_top_k_2d_smallest(dtype, n): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + m, k = 8, 3 + if dtype == "f2" and m * n > 2000: + pytest.skip( + "f2 can not distinguish between large integers used in this test" + ) + + x = dpt.reshape(dpt.arange(m 
* n, dtype=dtype), (m, n)) + + r = dpt.top_k(x, k, axis=1, mode="smallest") + + assert r.values.shape == (m, k) + assert r.indices.shape == (m, k) + expected_inds = dpt.reshape(dpt.arange(n, dtype=r.indices.dtype), (1, n))[ + :, :k + ] + assert dpt.all( + dpt.sort(r.indices, axis=1) == dpt.sort(expected_inds, axis=1) + ) + assert dpt.all(dpt.sort(r.values, axis=1) == dpt.sort(x[:, :k], axis=1)) + + +def test_top_k_0d(): + get_queue_or_skip() + + a = dpt.ones(tuple(), dtype="i4") + assert a.ndim == 0 + assert a.size == 1 + + r = dpt.top_k(a, 1) + assert r.values == a + assert r.indices == dpt.zeros_like(a, dtype=r.indices.dtype) + + +def test_top_k_noncontig(): + get_queue_or_skip() + + a = dpt.arange(256, dtype=dpt.int32)[::2] + r = dpt.top_k(a, 3) + + assert dpt.all(dpt.sort(r.values) == dpt.asarray([250, 252, 254])), r.values + assert dpt.all( + dpt.sort(r.indices) == dpt.asarray([125, 126, 127]) + ), r.indices + + +def test_top_k_axis0(): + get_queue_or_skip() + + m, n, k = 128, 8, 3 + x = dpt.reshape(dpt.arange(m * n, dtype=dpt.int32), (m, n)) + + r = dpt.top_k(x, k, axis=0, mode="smallest") + assert r.values.shape == (k, n) + assert r.indices.shape == (k, n) + expected_inds = dpt.reshape(dpt.arange(m, dtype=r.indices.dtype), (m, 1))[ + :k, : + ] + assert dpt.all( + dpt.sort(r.indices, axis=0) == dpt.sort(expected_inds, axis=0) + ) + assert dpt.all(dpt.sort(r.values, axis=0) == dpt.sort(x[:k, :], axis=0)) + + +def test_top_k_validation(): + get_queue_or_skip() + x = dpt.ones(10, dtype=dpt.int64) + with pytest.raises(ValueError): + # k must be positive + dpt.top_k(x, -1) + with pytest.raises(TypeError): + # argument should be usm_ndarray + dpt.top_k(list(), 2) + x2 = dpt.reshape(x, (2, 5)) + with pytest.raises(ValueError): + # k must not exceed array dimension + # along specified axis + dpt.top_k(x2, 100, axis=1) + with pytest.raises(ValueError): + # for 0d arrays, k must be 1 + dpt.top_k(x[0], 2) + with pytest.raises(ValueError): + # mode must be "largest", or "smallest" + dpt.top_k(x, 2, mode="invalid") From 8c6abf5fecb3ab09b6f742a5dc764986d5476016 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Sun, 5 Jan 2025 13:08:19 -0800 Subject: [PATCH 30/30] Add entry to changelog for top_k --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fec5e79f17..e7138a6a73 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +* Added `dpctl.tensor.top_k` per Python Array API specification: [#1921](https://github.com/IntelPython/dpctl/pull/1921) + ### Changed * Improved performance of copy-and-cast operations from `numpy.ndarray` to `tensor.usm_ndarray` for contiguous inputs [gh-1829](https://github.com/IntelPython/dpctl/pull/1829)