From 0b353bfb158fbfb0aff580f3edfcb23d7e9f48ca Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 4 Dec 2024 14:14:01 -0800 Subject: [PATCH 01/30] Implements `top_k` The implementation leverages existing merge-sort code, and partially sorts the array in cases where a parial sort reduces the size of temporary memory allocation --- dpctl/tensor/CMakeLists.txt | 1 + dpctl/tensor/__init__.py | 2 + dpctl/tensor/_topk.py | 184 ++++++++ .../include/kernels/sorting/topk.hpp | 440 ++++++++++++++++++ .../tensor/libtensor/source/sorting/topk.cpp | 387 +++++++++++++++ .../tensor/libtensor/source/sorting/topk.hpp | 42 ++ .../libtensor/source/tensor_sorting.cpp | 2 + 7 files changed, 1058 insertions(+) create mode 100644 dpctl/tensor/_topk.py create mode 100644 dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp create mode 100644 dpctl/tensor/libtensor/source/sorting/topk.cpp create mode 100644 dpctl/tensor/libtensor/source/sorting/topk.hpp diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 882233b30c..75976f1b1f 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -115,6 +115,7 @@ set(_sorting_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_sort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/merge_argsort.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/searchsorted.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/topk.cpp ) set(_sorting_radix_sources ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sorting/radix_sort.cpp diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index 581c3d54f6..d83d41215f 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -201,6 +201,7 @@ ) from ._sorting import argsort, sort from ._testing import allclose +from ._topk import top_k from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type __all__ = [ @@ -387,4 +388,5 @@ "DLDeviceType", "take_along_axis", "put_along_axis", + "top_k", ] diff --git a/dpctl/tensor/_topk.py b/dpctl/tensor/_topk.py new file mode 100644 index 0000000000..a807aea689 --- /dev/null +++ b/dpctl/tensor/_topk.py @@ -0,0 +1,184 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import operator +from typing import NamedTuple + +import dpctl.tensor as dpt +import dpctl.tensor._tensor_impl as ti +import dpctl.utils as du +from dpctl.tensor._numpy_helper import normalize_axis_index + +from ._tensor_sorting_impl import _topk + + +def _get_top_k_largest(mode): + modes = {"largest": True, "smallest": False} + try: + return modes[mode] + except KeyError: + raise ValueError( + f"`mode` must be `largest` or `smallest`. Got `{mode}`." 
+ ) + + +class TopKResult(NamedTuple): + values: dpt.usm_ndarray + indices: dpt.usm_ndarray + + +def top_k(x, k, /, *, axis=None, mode="largest"): + """top_k(x, k, axis=None, mode="largest") + + Returns the `k` largest or smallest values and their indices in the input + array `x` along the specified axis `axis`. + + Args: + x (usm_ndarray): + input array. + k (int): + number of elements to find. Must be a positive integer value. + axis (Optional[int]): + axis along which to search. If `None`, the search will be performed + over the flattened array. Default: ``None``. + mode (Literal["largest", "smallest"]): + search mode. Must be one of the following modes: + - `"largest"`: return the `k` largest elements. + - `"smallest"`: return the `k` smallest elements. + Default: `"largest"`. + + Returns: + tuple[usm_ndarray, usm_ndarray]: + a namedtuple `(values, indices)` whose + + - first element `values` will be an array containing the `k` largest or + smallest elements of `x`. The array has the same data type as `x`. + If `axis` was `None`, `values` will be a one-dimensional array + with shape `(k,)` and otherwise, `values` will have shape + `x.shape[:axis] + (k,) + x.shape[axis+1:]` + + - second element `indices` will be an array containing indices of `x` + that result in `values`. The array will have the same shape as + `values` and will have the default array index data type. + """ + largest = _get_top_k_largest(mode) + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" + ) + + k = operator.index(k) + if k < 0: + raise ValueError("`k` must be a positive integer value") + + nd = x.ndim + if axis is None: + sz = x.size + if nd == 0: + return TopKResult( + dpt.copy(x, order="C"), + dpt.zeros_like( + x, dtype=ti.default_device_index_type(x.sycl_queue) + ), + ) + arr = x + n_search_dims = None + res_sh = k + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + sz = x.shape[axis] + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + n_search_dims = 1 + res_sh = arr.shape[: nd - 1] + (k,) + + if k > sz: + raise ValueError(f"`k`={k} is out of bounds {sz}") + + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + + res_usm_type = arr.usm_type + if arr.flags.c_contiguous: + vals = dpt.empty( + res_sh, + dtype=arr.dtype, + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + inds = dpt.empty( + res_sh, + dtype=ti.default_device_index_type(exec_q), + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + ht_ev, impl_ev = _topk( + src=arr, + trailing_dims_to_search=n_search_dims, + k=k, + largest=largest, + vals=vals, + inds=inds, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, impl_ev) + else: + tmp = dpt.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + vals = dpt.empty( + res_sh, + dtype=arr.dtype, + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + inds = dpt.empty( + res_sh, + dtype=ti.default_device_index_type(exec_q), + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + ht_ev, impl_ev = _topk( + src=tmp, + trailing_dims_to_search=n_search_dims, + k=k, + largest=largest, + vals=vals, + inds=inds, + sycl_queue=exec_q, + depends=[copy_ev], + ) 
+ _manager.add_event_pair(ht_ev, impl_ev) + if axis is not None and a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + vals = dpt.permute_dims(vals, inv_perm) + inds = dpt.permute_dims(inds, inv_perm) + return TopKResult(vals, inds) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp new file mode 100644 index 0000000000..7db3f3e3be --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -0,0 +1,440 @@ +//=== topk.hpp - Implementation of topk kernels ---*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor topk operation. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kernels/dpctl_tensor_types.hpp" +#include "merge_sort.hpp" +#include "utils/sycl_alloc_utils.hpp" +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ + +namespace topk_detail +{ + +void scale_topk_params(const std::uint64_t nelems_per_slm, + const std::size_t sub_groups_per_work_group, + const std::uint32_t elems_per_wi, + const std::vector &sg_sizes, + std::size_t &lws, + std::size_t &nelems_wg_sorts) +{ + for (auto it = sg_sizes.rbegin(); it != sg_sizes.rend(); ++it) { + auto sg_size = *it; + lws = sub_groups_per_work_group * sg_size; + nelems_wg_sorts = elems_per_wi * lws; + if (nelems_wg_sorts < nelems_per_slm) { + return; + } + } + // should never reach + throw std::runtime_error("Could not construct top k kernel parameters"); +} + +} // namespace topk_detail + +template +class populate_index_data_full_sort_krn; + +template +class topk_map_to_rows_full_sort_krn; + +template class populate_index_data_krn; + +template class topk_map_to_rows_krn; + +template +sycl::event topk_full_sort_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a + // matrix when sorting over rows) + std::size_t sort_nelems, // size of each array to sort (length of rows, + // i.e. 
number of columns) + dpctl::tensor::ssize_t k, + const argTy *arg_tp, + argTy *vals_tp, + IndexTy *inds_tp, + const CompT &comp, + const std::vector &depends) +{ + IndexTy *index_data = + sycl::malloc_device(iter_nelems * sort_nelems, exec_q); + if (index_data == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + + sycl::event populate_indexed_data_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + auto const &range = sycl::range<1>(iter_nelems * sort_nelems); + + using KernelName = + populate_index_data_full_sort_krn; + + cgh.parallel_for(range, [=](sycl::id<1> id) { + std::size_t i = id[0]; + index_data[i] = static_cast(i); + }); + }); + + std::size_t sorted_block_size; + // Sort segments of the array + sycl::event base_sort_ev = + merge_sort_detail::sort_over_work_group_contig_impl( + exec_q, iter_nelems, sort_nelems, index_data, index_data, comp, + sorted_block_size, // modified in place with size of sorted block + // size + {populate_indexed_data_ev}); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = merge_sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, sort_nelems, index_data, comp, sorted_block_size, + {base_sort_ev}); + + sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(merges_ev); + + using KernelName = + topk_map_to_rows_full_sort_krn; + + cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { + std::size_t gid = id[0]; + + std::size_t iter_gid = gid / k; + std::size_t axis_gid = gid - (iter_gid * k); + + std::size_t src_idx = iter_gid * sort_nelems + axis_gid; + std::size_t dst_idx = iter_gid * k + axis_gid; + + auto res_ind = index_data[src_idx]; + vals_tp[dst_idx] = arg_tp[res_ind]; + inds_tp[dst_idx] = res_ind % sort_nelems; + }); + }); + + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(write_out_ev); + const sycl::context &ctx = exec_q.get_context(); + + using dpctl::tensor::alloc_utils::sycl_free_noexcept; + cgh.host_task( + [ctx, index_data] { sycl_free_noexcept(index_data, ctx); }); + }); + + return cleanup_host_task_event; +}; + +template +class topk_over_work_group_krn; + +template > +sycl::event +topk_impl(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows + // in a matrix when sorting over rows) + std::size_t axis_nelems, // size of each array to sort (length of + // rows, i.e. 
number of columns) + dpctl::tensor::ssize_t k, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + dpctl::tensor::ssize_t iter_arg_offset, + dpctl::tensor::ssize_t iter_vals_offset, + dpctl::tensor::ssize_t iter_inds_offset, + dpctl::tensor::ssize_t axis_arg_offset, + dpctl::tensor::ssize_t axis_vals_offset, + dpctl::tensor::ssize_t axis_inds_offset, + const std::vector &depends) +{ + if (axis_nelems < static_cast(k)) { + throw std::runtime_error("Invalid sort axis size for value of k"); + } + + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + axis_arg_offset; + argTy *vals_tp = reinterpret_cast(vals_cp) + iter_vals_offset + + axis_vals_offset; + IndexTy *inds_tp = reinterpret_cast(inds_cp) + iter_inds_offset + + axis_inds_offset; + + using dpctl::tensor::kernels::IndexComp; + const IndexComp index_comp{arg_tp, ValueComp{}}; + + if (axis_nelems <= 512 || k >= 1024 || + static_cast(k) > axis_nelems / 2) + { + return topk_full_sort_impl(exec_q, iter_nelems, axis_nelems, k, arg_tp, + vals_tp, inds_tp, index_comp, depends); + } + else { + using PartialKernelName = + topk_over_work_group_krn; + + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + auto krn = kb.get_kernel(kernel_id); + + const std::uint32_t max_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + const std::uint64_t device_local_memory_size = + dev.get_info(); + + // leave 512 bytes of local memory for RT + const std::uint64_t safety_margin = 512; + + const std::uint64_t nelems_per_slm = + (device_local_memory_size - safety_margin) / (2 * sizeof(IndexTy)); + + constexpr std::uint32_t sub_groups_per_work_group = 4; + const std::uint32_t elems_per_wi = dev.has(sycl::aspect::cpu) ? 8 : 2; + + std::size_t lws = sub_groups_per_work_group * max_sg_size; + + std::size_t sorted_block_size = elems_per_wi * lws; + if (sorted_block_size > nelems_per_slm) { + const std::vector sg_sizes = + dev.get_info(); + topk_detail::scale_topk_params( + nelems_per_slm, sub_groups_per_work_group, elems_per_wi, + sg_sizes, + lws, // modified by reference + sorted_block_size // modified by reference + ); + } + + // This assumption permits doing away with using a loop + assert(sorted_block_size % lws == 0); + + const std::size_t n_segments = + merge_sort_detail::quotient_ceil(axis_nelems, + sorted_block_size); + + // round k up for the later merge kernel + const dpctl::tensor::ssize_t round_k_to = elems_per_wi; + dpctl::tensor::ssize_t k_rounded = + merge_sort_detail::quotient_ceil( + k, round_k_to) * + round_k_to; + + // get length of tail for alloc size + auto rem = axis_nelems % sorted_block_size; + auto alloc_len = (rem && rem < static_cast(k_rounded)) + ? 
rem + k_rounded * (n_segments - 1) + : k_rounded * n_segments; + + // if allocation would be sufficiently large or k is larger than + // elements processed, use full sort + if (static_cast(k_rounded) >= axis_nelems || + static_cast(k_rounded) >= sorted_block_size || + alloc_len >= axis_nelems / 2) + { + return topk_full_sort_impl(exec_q, iter_nelems, axis_nelems, k, + arg_tp, vals_tp, inds_tp, index_comp, + depends); + } + + IndexTy *index_data = + sycl::malloc_device(iter_nelems * alloc_len, exec_q); + if (index_data == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + + // no need to populate index data: SLM will be populated with default + // values + + sycl::event base_sort_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.use_kernel_bundle(kb); + + sycl::range<1> global_range{iter_nelems * n_segments * lws}; + sycl::range<1> local_range{lws}; + + sycl::range<1> slm_range{sorted_block_size}; + sycl::local_accessor work_space(slm_range, cgh); + sycl::local_accessor scratch_space(slm_range, cgh); + + sycl::nd_range<1> ndRange(global_range, local_range); + + cgh.parallel_for( + ndRange, [=](sycl::nd_item<1> it) { + const std::size_t group_id = it.get_group_linear_id(); + const std::size_t iter_id = group_id / n_segments; + const std::size_t segment_id = + group_id - iter_id * n_segments; + const std::size_t lid = it.get_local_linear_id(); + + const std::size_t segment_start_idx = + segment_id * sorted_block_size; + const std::size_t segment_end_idx = std::min( + segment_start_idx + sorted_block_size, axis_nelems); + const std::size_t wg_chunk_size = + segment_end_idx - segment_start_idx; + + // load input into SLM + for (std::size_t array_id = segment_start_idx + lid; + array_id < segment_end_idx; array_id += lws) + { + IndexTy v = (array_id < axis_nelems) + ? 
iter_id * axis_nelems + array_id + : IndexTy{}; + work_space[array_id - segment_start_idx] = v; + } + sycl::group_barrier(it.get_group()); + + const std::size_t chunk = + merge_sort_detail::quotient_ceil( + sorted_block_size, lws); + + const std::size_t chunk_start_idx = lid * chunk; + const std::size_t chunk_end_idx = + sycl::min(chunk_start_idx + chunk, wg_chunk_size); + + merge_sort_detail::leaf_sort_impl( + work_space, chunk_start_idx, chunk_end_idx, index_comp); + + sycl::group_barrier(it.get_group()); + + bool data_in_temp = false; + std::size_t n_chunks_merged = 1; + + // merge chunk while n_chunks_merged * chunk < wg_chunk_size + const std::size_t max_chunks_merged = + 1 + ((wg_chunk_size - 1) / chunk); + for (; n_chunks_merged < max_chunks_merged; + data_in_temp = !data_in_temp, n_chunks_merged *= 2) + { + const std::size_t nelems_sorted_so_far = + n_chunks_merged * chunk; + const std::size_t q = (lid / n_chunks_merged); + const std::size_t start_1 = sycl::min( + 2 * nelems_sorted_so_far * q, wg_chunk_size); + const std::size_t end_1 = sycl::min( + start_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t end_2 = sycl::min( + end_1 + nelems_sorted_so_far, wg_chunk_size); + const std::size_t offset = + chunk * (lid - q * n_chunks_merged); + + if (data_in_temp) { + merge_sort_detail::merge_impl( + offset, scratch_space, work_space, start_1, + end_1, end_2, start_1, index_comp, chunk); + } + else { + merge_sort_detail::merge_impl( + offset, work_space, scratch_space, start_1, + end_1, end_2, start_1, index_comp, chunk); + } + sycl::group_barrier(it.get_group()); + } + + // output assumed to be structured as (iter_nelems, + // alloc_len) + const std::size_t k_segment_start_idx = + segment_id * k_rounded; + const std::size_t k_segment_end_idx = std::min( + k_segment_start_idx + k_rounded, alloc_len); + const auto &out_src = + (data_in_temp) ? 
scratch_space : work_space; + for (std::size_t array_id = k_segment_start_idx + lid; + array_id < k_segment_end_idx; array_id += lws) + { + if (lid < static_cast(k_rounded)) { + index_data[iter_id * alloc_len + array_id] = + out_src[array_id - k_segment_start_idx]; + } + } + }); + }); + + // Merge segments in parallel until all elements are sorted + sycl::event merges_ev = + merge_sort_detail::merge_sorted_block_contig_impl( + exec_q, iter_nelems, alloc_len, index_data, index_comp, + k_rounded, {base_sort_ev}); + + // Write out top k of the merge-sorted memory + sycl::event write_topk_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(merges_ev); + + using KernelName = topk_map_to_rows_krn; + + cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { + std::size_t gid = id[0]; + + std::size_t iter_gid = gid / k; + std::size_t axis_gid = gid - (iter_gid * k); + + std::size_t src_idx = iter_gid * alloc_len + axis_gid; + std::size_t dst_idx = iter_gid * k + axis_gid; + + auto res_ind = index_data[src_idx]; + vals_tp[dst_idx] = arg_tp[res_ind]; + inds_tp[dst_idx] = res_ind % axis_nelems; + }); + }); + + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(write_topk_ev); + const sycl::context &ctx = exec_q.get_context(); + + using dpctl::tensor::alloc_utils::sycl_free_noexcept; + cgh.host_task( + [ctx, index_data] { sycl_free_noexcept(index_data, ctx); }); + }); + + return cleanup_host_task_event; + } +} + +} // end of namespace kernels +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/topk.cpp b/dpctl/tensor/libtensor/source/sorting/topk.cpp new file mode 100644 index 0000000000..af2f6ce027 --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/topk.cpp @@ -0,0 +1,387 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. 
+//===--------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include +#include +#include + +#include "utils/math_utils.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/output_validation.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +#include "kernels/sorting/topk.hpp" +#include "rich_comparisons.hpp" +#include "topk.hpp" + +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +typedef sycl::event (*topk_impl_fn_ptr_t)(sycl::queue &, + std::size_t, + std::size_t, + py::ssize_t, + bool, + const char *, + char *, + char *, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + py::ssize_t, + const std::vector &); + +static topk_impl_fn_ptr_t topk_dispatch_vector[td_ns::num_types]; + +namespace +{ + +template +sycl::event +topk_caller(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays to sort (num. of + // rows in a matrix when sorting over rows) + std::size_t axis_nelems, // size of each array to sort (length of + // rows, i.e. number of columns) + py::ssize_t k, + bool largest, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_vals_offset, + py::ssize_t iter_inds_offset, + py::ssize_t axis_arg_offset, + py::ssize_t axis_vals_offset, + py::ssize_t axis_inds_offset, + const std::vector &depends) +{ + using dpctl::tensor::kernels::topk_impl; + if (largest) { + using CompTy = + typename dpctl::tensor::py_internal::DescendingSorter::type; + return topk_impl( + exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp, + iter_arg_offset, iter_vals_offset, iter_inds_offset, + axis_arg_offset, axis_vals_offset, axis_inds_offset, depends); + } + else { + using CompTy = + typename dpctl::tensor::py_internal::AscendingSorter::type; + return topk_impl( + exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp, + iter_arg_offset, iter_vals_offset, iter_inds_offset, + axis_arg_offset, axis_vals_offset, axis_inds_offset, depends); + } +} + +} // namespace + +std::pair +py_topk(const dpctl::tensor::usm_ndarray &src, + const int trailing_dims_to_search, + const py::ssize_t k, + const bool largest, + const dpctl::tensor::usm_ndarray &vals, + const dpctl::tensor::usm_ndarray &inds, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int vals_nd = vals.get_ndim(); + int inds_nd = inds.get_ndim(); + if (src_nd != vals_nd || src_nd != inds_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } + int iteration_nd = src_nd - trailing_dims_to_search; + if (trailing_dims_to_search <= 0 || iteration_nd < 0) { + throw py::value_error( + "trailing_dims_to_search must be positive, but no " + "greater than rank of the array being searched"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *vals_shape_ptr = vals.get_shape_raw(); + const py::ssize_t *inds_shape_ptr = inds.get_shape_raw(); + + bool same_shapes = true; + std::size_t iter_nelems(1); + for (int i = 0; same_shapes && (i < iteration_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == vals_shape_ptr[i] && + src_shape_i == inds_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape 
does not match the input shape"); + } + + std::size_t vals_k(1); + std::size_t inds_k(1); + std::size_t axis_nelems(1); + for (int i = iteration_nd; i < src_nd; ++i) { + axis_nelems *= static_cast(src_shape_ptr[i]); + vals_k *= static_cast(vals_shape_ptr[i]); + inds_k *= static_cast(inds_shape_ptr[i]); + } + + bool valid_k = (vals_k == static_cast(k) && + inds_k == static_cast(k) && + axis_nelems >= static_cast(k)); + if (!valid_k) { + throw py::value_error( + "The value of k is invalid for the input and destination arrays"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, vals, inds})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(vals); + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(inds); + + if ((iter_nelems == 0) || (axis_nelems == 0)) { + // Nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, vals) || overlap(src, inds)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(vals, + k * iter_nelems); + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(inds, + k * iter_nelems); + + int src_typenum = src.get_typenum(); + int vals_typenum = vals.get_typenum(); + int inds_typenum = inds.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int vals_typeid = array_types.typenum_to_lookup_id(vals_typenum); + int inds_typeid = array_types.typenum_to_lookup_id(inds_typenum); + + if (src_typeid != vals_typeid) { + throw py::value_error("Input array and vals array must have " + "the same data type"); + } + + if (inds_typeid != static_cast(td_ns::typenum_t::INT64)) { + throw py::value_error("Inds array must have data type int64"); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_vals_c_contig = vals.is_c_contiguous(); + bool is_inds_c_contig = inds.is_c_contiguous(); + + if (is_src_c_contig && is_vals_c_contig && is_inds_c_contig) { + static constexpr py::ssize_t zero_offset = py::ssize_t(0); + + auto fn = topk_dispatch_vector[src_typeid]; + + sycl::event comp_ev = + fn(exec_q, iter_nelems, axis_nelems, k, largest, src.get_data(), + vals.get_data(), inds.get_data(), zero_offset, zero_offset, + zero_offset, zero_offset, zero_offset, zero_offset, depends); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, vals, inds}, {comp_ev}); + + return std::make_pair(keep_args_alive_ev, comp_ev); + } + + return std::make_pair(sycl::event(), sycl::event()); +} + +std::pair +py_topk(const dpctl::tensor::usm_ndarray &src, + const py::ssize_t k, + const bool largest, + const dpctl::tensor::usm_ndarray &vals, + const dpctl::tensor::usm_ndarray &inds, + sycl::queue &exec_q, + const std::vector &depends) +{ + int src_nd = src.get_ndim(); + int vals_nd = vals.get_ndim(); + int inds_nd = inds.get_ndim(); + if (vals_nd != 1 || inds_nd != 1) { + throw py::value_error("Output arrays must be one-dimensional"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *vals_shape_ptr = vals.get_shape_raw(); + const py::ssize_t *inds_shape_ptr = inds.get_shape_raw(); + + std::size_t axis_nelems(1); + for (int i = 0; i < src_nd; ++i) { + axis_nelems *= static_cast(src_shape_ptr[i]); + } + + bool valid_k = (axis_nelems 
>= static_cast(k) && + vals_shape_ptr[0] == k && inds_shape_ptr[0] == k); + if (!valid_k) { + throw py::value_error( + "The value of k is invalid for the input and destination arrays"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, vals, inds})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(vals); + dpctl::tensor::validation::CheckWritable::throw_if_not_writable(inds); + + if (axis_nelems == 0) { + // Nothing to do + return std::make_pair(sycl::event(), sycl::event()); + } + + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, vals) || overlap(src, inds)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(vals, k); + + dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(inds, k); + + int src_typenum = src.get_typenum(); + int vals_typenum = vals.get_typenum(); + int inds_typenum = inds.get_typenum(); + + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int vals_typeid = array_types.typenum_to_lookup_id(vals_typenum); + int inds_typeid = array_types.typenum_to_lookup_id(inds_typenum); + + if (src_typeid != vals_typeid) { + throw py::value_error("Input array and vals array must have " + "the same data type"); + } + + if (inds_typeid != static_cast(td_ns::typenum_t::INT64)) { + throw py::value_error("Inds array must have data type int64"); + } + + bool is_src_c_contig = src.is_c_contiguous(); + bool is_vals_c_contig = vals.is_c_contiguous(); + bool is_inds_c_contig = inds.is_c_contiguous(); + + if (is_src_c_contig && is_vals_c_contig && is_inds_c_contig) { + static constexpr py::ssize_t zero_offset = py::ssize_t(0); + static constexpr std::size_t iter_nelems = 1; + + auto fn = topk_dispatch_vector[src_typeid]; + + sycl::event comp_ev = + fn(exec_q, iter_nelems, axis_nelems, k, largest, src.get_data(), + vals.get_data(), inds.get_data(), zero_offset, zero_offset, + zero_offset, zero_offset, zero_offset, zero_offset, depends); + + sycl::event keep_args_alive_ev = + dpctl::utils::keep_args_alive(exec_q, {src, vals, inds}, {comp_ev}); + + return std::make_pair(keep_args_alive_ev, comp_ev); + } + + return std::make_pair(sycl::event(), sycl::event()); +} + +template struct TopKFactory +{ + fnT get() + { + using IdxT = std::int64_t; + return topk_caller; + } +}; + +void init_topk_dispatch_vectors(void) +{ + td_ns::DispatchVectorBuilder + dvb; + dvb.populate_dispatch_vector(topk_dispatch_vector); +} + +void init_topk_functions(py::module_ m) +{ + dpctl::tensor::py_internal::init_topk_dispatch_vectors(); + + auto py_topk = [](const dpctl::tensor::usm_ndarray &src, + std::optional trailing_dims_to_search, + const py::ssize_t k, const bool largest, + const dpctl::tensor::usm_ndarray &vals, + const dpctl::tensor::usm_ndarray &inds, + sycl::queue &exec_q, + const std::vector &depends) + -> std::pair { + if (trailing_dims_to_search) { + return dpctl::tensor::py_internal::py_topk( + src, trailing_dims_to_search.value(), k, largest, vals, inds, + exec_q, depends); + } + else { + return dpctl::tensor::py_internal::py_topk(src, k, largest, vals, + inds, exec_q, depends); + } + }; + + m.def("_topk", py_topk, py::arg("src"), py::arg("trailing_dims_to_search"), + py::arg("k"), py::arg("largest"), py::arg("vals"), py::arg("inds"), + py::arg("sycl_queue"), py::arg("depends") = 
py::list()); +} + +} // end of namespace py_internal +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/topk.hpp b/dpctl/tensor/libtensor/source/sorting/topk.hpp new file mode 100644 index 0000000000..37042457da --- /dev/null +++ b/dpctl/tensor/libtensor/source/sorting/topk.hpp @@ -0,0 +1,42 @@ +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_sorting_impl +/// extension. +//===--------------------------------------------------------------------===// + +#pragma once + +#include + +namespace py = pybind11; + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +extern void init_topk_functions(py::module_); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_sorting.cpp b/dpctl/tensor/libtensor/source/tensor_sorting.cpp index 52d3ab67b4..09deac7b30 100644 --- a/dpctl/tensor/libtensor/source/tensor_sorting.cpp +++ b/dpctl/tensor/libtensor/source/tensor_sorting.cpp @@ -28,6 +28,7 @@ #include "sorting/merge_argsort.hpp" #include "sorting/merge_sort.hpp" #include "sorting/searchsorted.hpp" +#include "sorting/topk.hpp" namespace py = pybind11; @@ -36,4 +37,5 @@ PYBIND11_MODULE(_tensor_sorting_impl, m) dpctl::tensor::py_internal::init_merge_sort_functions(m); dpctl::tensor::py_internal::init_merge_argsort_functions(m); dpctl::tensor::py_internal::init_searchsorted_functions(m); + dpctl::tensor::py_internal::init_topk_functions(m); } From f8095731e6a1ffc757190e6d45014273ecadac9f Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 4 Dec 2024 16:16:25 -0800 Subject: [PATCH 02/30] Add `top_k` to doc sources --- .../doc_sources/api_reference/dpctl/tensor.sorting_functions.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/doc_sources/api_reference/dpctl/tensor.sorting_functions.rst b/docs/doc_sources/api_reference/dpctl/tensor.sorting_functions.rst index ae1605d988..ef20f4654c 100644 --- a/docs/doc_sources/api_reference/dpctl/tensor.sorting_functions.rst +++ b/docs/doc_sources/api_reference/dpctl/tensor.sorting_functions.rst @@ -10,3 +10,4 @@ Sorting functions argsort sort + top_k From b59e59d57c0e150a52aa92f2b89e407dfbe4a60d Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 5 Dec 2024 13:25:17 -0800 Subject: [PATCH 03/30] Use `std::size_t` for `k` instead of `py::ssize_t` Reduces amount of casting. `k` will need to fit in `py::ssize_t` regardless. 
--- .../include/kernels/sorting/topk.hpp | 25 ++++++++----------- .../tensor/libtensor/source/sorting/topk.cpp | 19 +++++++------- 2 files changed, 19 insertions(+), 25 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index 7db3f3e3be..f709a1737d 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -86,7 +86,7 @@ sycl::event topk_full_sort_impl( // matrix when sorting over rows) std::size_t sort_nelems, // size of each array to sort (length of rows, // i.e. number of columns) - dpctl::tensor::ssize_t k, + std::size_t k, const argTy *arg_tp, argTy *vals_tp, IndexTy *inds_tp, @@ -174,7 +174,7 @@ topk_impl(sycl::queue &exec_q, // in a matrix when sorting over rows) std::size_t axis_nelems, // size of each array to sort (length of // rows, i.e. number of columns) - dpctl::tensor::ssize_t k, + std::size_t k, const char *arg_cp, char *vals_cp, char *inds_cp, @@ -186,7 +186,7 @@ topk_impl(sycl::queue &exec_q, dpctl::tensor::ssize_t axis_inds_offset, const std::vector &depends) { - if (axis_nelems < static_cast(k)) { + if (axis_nelems < k) { throw std::runtime_error("Invalid sort axis size for value of k"); } @@ -200,9 +200,7 @@ topk_impl(sycl::queue &exec_q, using dpctl::tensor::kernels::IndexComp; const IndexComp index_comp{arg_tp, ValueComp{}}; - if (axis_nelems <= 512 || k >= 1024 || - static_cast(k) > axis_nelems / 2) - { + if (axis_nelems <= 512 || k >= 1024 || k > axis_nelems / 2) { return topk_full_sort_impl(exec_q, iter_nelems, axis_nelems, k, arg_tp, vals_tp, inds_tp, index_comp, depends); } @@ -256,22 +254,19 @@ topk_impl(sycl::queue &exec_q, sorted_block_size); // round k up for the later merge kernel - const dpctl::tensor::ssize_t round_k_to = elems_per_wi; - dpctl::tensor::ssize_t k_rounded = - merge_sort_detail::quotient_ceil( - k, round_k_to) * - round_k_to; + std::size_t k_rounded = + merge_sort_detail::quotient_ceil(k, elems_per_wi) * + elems_per_wi; // get length of tail for alloc size auto rem = axis_nelems % sorted_block_size; - auto alloc_len = (rem && rem < static_cast(k_rounded)) + auto alloc_len = (rem && rem < k_rounded) ? 
rem + k_rounded * (n_segments - 1) : k_rounded * n_segments; // if allocation would be sufficiently large or k is larger than // elements processed, use full sort - if (static_cast(k_rounded) >= axis_nelems || - static_cast(k_rounded) >= sorted_block_size || + if (k_rounded >= axis_nelems || k_rounded >= sorted_block_size || alloc_len >= axis_nelems / 2) { return topk_full_sort_impl(exec_q, iter_nelems, axis_nelems, k, @@ -386,7 +381,7 @@ topk_impl(sycl::queue &exec_q, for (std::size_t array_id = k_segment_start_idx + lid; array_id < k_segment_end_idx; array_id += lws) { - if (lid < static_cast(k_rounded)) { + if (lid < k_rounded) { index_data[iter_id * alloc_len + array_id] = out_src[array_id - k_segment_start_idx]; } diff --git a/dpctl/tensor/libtensor/source/sorting/topk.cpp b/dpctl/tensor/libtensor/source/sorting/topk.cpp index af2f6ce027..fe5eac9acb 100644 --- a/dpctl/tensor/libtensor/source/sorting/topk.cpp +++ b/dpctl/tensor/libtensor/source/sorting/topk.cpp @@ -58,7 +58,7 @@ namespace py_internal typedef sycl::event (*topk_impl_fn_ptr_t)(sycl::queue &, std::size_t, std::size_t, - py::ssize_t, + std::size_t, bool, const char *, char *, @@ -83,7 +83,7 @@ topk_caller(sycl::queue &exec_q, // rows in a matrix when sorting over rows) std::size_t axis_nelems, // size of each array to sort (length of // rows, i.e. number of columns) - py::ssize_t k, + std::size_t k, bool largest, const char *arg_cp, char *vals_cp, @@ -120,7 +120,7 @@ topk_caller(sycl::queue &exec_q, std::pair py_topk(const dpctl::tensor::usm_ndarray &src, const int trailing_dims_to_search, - const py::ssize_t k, + const std::size_t k, const bool largest, const dpctl::tensor::usm_ndarray &vals, const dpctl::tensor::usm_ndarray &inds, @@ -168,9 +168,7 @@ py_topk(const dpctl::tensor::usm_ndarray &src, inds_k *= static_cast(inds_shape_ptr[i]); } - bool valid_k = (vals_k == static_cast(k) && - inds_k == static_cast(k) && - axis_nelems >= static_cast(k)); + bool valid_k = (vals_k == k && inds_k == k && axis_nelems >= k); if (!valid_k) { throw py::value_error( "The value of k is invalid for the input and destination arrays"); @@ -243,7 +241,7 @@ py_topk(const dpctl::tensor::usm_ndarray &src, std::pair py_topk(const dpctl::tensor::usm_ndarray &src, - const py::ssize_t k, + const std::size_t k, const bool largest, const dpctl::tensor::usm_ndarray &vals, const dpctl::tensor::usm_ndarray &inds, @@ -266,8 +264,9 @@ py_topk(const dpctl::tensor::usm_ndarray &src, axis_nelems *= static_cast(src_shape_ptr[i]); } - bool valid_k = (axis_nelems >= static_cast(k) && - vals_shape_ptr[0] == k && inds_shape_ptr[0] == k); + bool valid_k = + (axis_nelems >= k && static_cast(vals_shape_ptr[0]) == k && + static_cast(inds_shape_ptr[0]) == k); if (!valid_k) { throw py::value_error( "The value of k is invalid for the input and destination arrays"); @@ -360,7 +359,7 @@ void init_topk_functions(py::module_ m) auto py_topk = [](const dpctl::tensor::usm_ndarray &src, std::optional trailing_dims_to_search, - const py::ssize_t k, const bool largest, + const std::size_t k, const bool largest, const dpctl::tensor::usm_ndarray &vals, const dpctl::tensor::usm_ndarray &inds, sycl::queue &exec_q, From b4d7ba4b010141f4b459e2ea5dc45b5b2ef48fa4 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 5 Dec 2024 15:20:11 -0800 Subject: [PATCH 04/30] Add implementation of `top_k` using radix sort --- .../include/kernels/sorting/topk.hpp | 202 +++++++++++++----- .../tensor/libtensor/source/sorting/topk.cpp | 54 +++-- 2 files changed, 193 insertions(+), 63 deletions(-) 
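Notes: for reference, a minimal usage sketch of the API added by this series
(illustrative only, not part of the patch; assumes a SYCL device is available
and the series is applied). Which of the merge-sort or radix-sort code paths
is taken depends on the input data type, per the `use_radix_sort` trait in the
diff below.

    # illustrative usage sketch, not part of the patch
    import dpctl.tensor as dpt

    x = dpt.asarray([7, 1, 9, 3, 5], dtype=dpt.int64)
    y = dpt.asarray([0.7, 0.1, 0.9, 0.3], dtype=dpt.float32)

    # three largest values of x and their indices, as a namedtuple
    vals, inds = dpt.top_k(x, 3)

    # two smallest values of y; result has fields .values and .indices
    res = dpt.top_k(y, 2, mode="smallest")
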
diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index f709a1737d..11f3e851fb 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -35,6 +35,7 @@ #include "kernels/dpctl_tensor_types.hpp" #include "merge_sort.hpp" +#include "radix_sort.hpp" #include "utils/sycl_alloc_utils.hpp" #include @@ -70,31 +71,25 @@ void scale_topk_params(const std::uint64_t nelems_per_slm, } // namespace topk_detail template -class populate_index_data_full_sort_krn; +class topk_populate_index_data_krn; template -class topk_map_to_rows_full_sort_krn; - -template class populate_index_data_krn; - -template class topk_map_to_rows_krn; +class topk_full_merge_map_back_krn; template -sycl::event topk_full_sort_impl( - sycl::queue &exec_q, - std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows in a - // matrix when sorting over rows) - std::size_t sort_nelems, // size of each array to sort (length of rows, - // i.e. number of columns) - std::size_t k, - const argTy *arg_tp, - argTy *vals_tp, - IndexTy *inds_tp, - const CompT &comp, - const std::vector &depends) +sycl::event +topk_full_merge_sort_impl(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays + std::size_t axis_nelems, // size of each sub-array + std::size_t k, + const argTy *arg_tp, + argTy *vals_tp, + IndexTy *inds_tp, + const CompT &comp, + const std::vector &depends) { IndexTy *index_data = - sycl::malloc_device(iter_nelems * sort_nelems, exec_q); + sycl::malloc_device(iter_nelems * axis_nelems, exec_q); if (index_data == nullptr) { throw std::runtime_error("Unable to allocate device_memory"); } @@ -103,10 +98,10 @@ sycl::event topk_full_sort_impl( exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - auto const &range = sycl::range<1>(iter_nelems * sort_nelems); + auto const &range = sycl::range<1>(iter_nelems * axis_nelems); using KernelName = - populate_index_data_full_sort_krn; + topk_populate_index_data_krn; cgh.parallel_for(range, [=](sycl::id<1> id) { std::size_t i = id[0]; @@ -118,21 +113,20 @@ sycl::event topk_full_sort_impl( // Sort segments of the array sycl::event base_sort_ev = merge_sort_detail::sort_over_work_group_contig_impl( - exec_q, iter_nelems, sort_nelems, index_data, index_data, comp, + exec_q, iter_nelems, axis_nelems, index_data, index_data, comp, sorted_block_size, // modified in place with size of sorted block // size {populate_indexed_data_ev}); // Merge segments in parallel until all elements are sorted sycl::event merges_ev = merge_sort_detail::merge_sorted_block_contig_impl( - exec_q, iter_nelems, sort_nelems, index_data, comp, sorted_block_size, + exec_q, iter_nelems, axis_nelems, index_data, comp, sorted_block_size, {base_sort_ev}); sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(merges_ev); - using KernelName = - topk_map_to_rows_full_sort_krn; + using KernelName = topk_full_merge_map_back_krn; cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { std::size_t gid = id[0]; @@ -140,12 +134,12 @@ sycl::event topk_full_sort_impl( std::size_t iter_gid = gid / k; std::size_t axis_gid = gid - (iter_gid * k); - std::size_t src_idx = iter_gid * sort_nelems + axis_gid; + std::size_t src_idx = iter_gid * axis_nelems + axis_gid; std::size_t dst_idx = iter_gid * k + axis_gid; auto res_ind = index_data[src_idx]; vals_tp[dst_idx] = arg_tp[res_ind]; - inds_tp[dst_idx] = res_ind % sort_nelems; + 
inds_tp[dst_idx] = res_ind % axis_nelems; }); }); @@ -162,29 +156,32 @@ sycl::event topk_full_sort_impl( return cleanup_host_task_event; }; +template +class topk_partial_merge_map_back_krn; + template class topk_over_work_group_krn; template > -sycl::event -topk_impl(sycl::queue &exec_q, - std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows - // in a matrix when sorting over rows) - std::size_t axis_nelems, // size of each array to sort (length of - // rows, i.e. number of columns) - std::size_t k, - const char *arg_cp, - char *vals_cp, - char *inds_cp, - dpctl::tensor::ssize_t iter_arg_offset, - dpctl::tensor::ssize_t iter_vals_offset, - dpctl::tensor::ssize_t iter_inds_offset, - dpctl::tensor::ssize_t axis_arg_offset, - dpctl::tensor::ssize_t axis_vals_offset, - dpctl::tensor::ssize_t axis_inds_offset, - const std::vector &depends) +sycl::event topk_merge_impl( + sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays to sort (num. of rows + // in a matrix when sorting over rows) + std::size_t axis_nelems, // size of each array to sort (length of + // rows, i.e. number of columns) + std::size_t k, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + dpctl::tensor::ssize_t iter_arg_offset, + dpctl::tensor::ssize_t iter_vals_offset, + dpctl::tensor::ssize_t iter_inds_offset, + dpctl::tensor::ssize_t axis_arg_offset, + dpctl::tensor::ssize_t axis_vals_offset, + dpctl::tensor::ssize_t axis_inds_offset, + const std::vector &depends) { if (axis_nelems < k) { throw std::runtime_error("Invalid sort axis size for value of k"); @@ -201,8 +198,9 @@ topk_impl(sycl::queue &exec_q, const IndexComp index_comp{arg_tp, ValueComp{}}; if (axis_nelems <= 512 || k >= 1024 || k > axis_nelems / 2) { - return topk_full_sort_impl(exec_q, iter_nelems, axis_nelems, k, arg_tp, - vals_tp, inds_tp, index_comp, depends); + return topk_full_merge_sort_impl(exec_q, iter_nelems, axis_nelems, k, + arg_tp, vals_tp, inds_tp, index_comp, + depends); } else { using PartialKernelName = @@ -269,9 +267,9 @@ topk_impl(sycl::queue &exec_q, if (k_rounded >= axis_nelems || k_rounded >= sorted_block_size || alloc_len >= axis_nelems / 2) { - return topk_full_sort_impl(exec_q, iter_nelems, axis_nelems, k, - arg_tp, vals_tp, inds_tp, index_comp, - depends); + return topk_full_merge_sort_impl(exec_q, iter_nelems, axis_nelems, + k, arg_tp, vals_tp, inds_tp, + index_comp, depends); } IndexTy *index_data = @@ -399,7 +397,8 @@ topk_impl(sycl::queue &exec_q, sycl::event write_topk_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(merges_ev); - using KernelName = topk_map_to_rows_krn; + using KernelName = + topk_partial_merge_map_back_krn; cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { std::size_t gid = id[0]; @@ -430,6 +429,109 @@ topk_impl(sycl::queue &exec_q, } } +template class topk_iota_krn; + +template class topk_radix_map_back_krn; + +template +sycl::event topk_radix_impl(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays + std::size_t axis_nelems, // size of each sub-array + std::size_t k, + bool ascending, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + dpctl::tensor::ssize_t iter_arg_offset, + dpctl::tensor::ssize_t iter_vals_offset, + dpctl::tensor::ssize_t iter_inds_offset, + dpctl::tensor::ssize_t axis_arg_offset, + dpctl::tensor::ssize_t axis_vals_offset, + dpctl::tensor::ssize_t axis_inds_offset, + const std::vector &depends) +{ + if (axis_nelems < k) { + throw std::runtime_error("Invalid sort axis size for value of k"); + } + + const argTy 
*arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + axis_arg_offset; + argTy *vals_tp = reinterpret_cast(vals_cp) + iter_vals_offset + + axis_vals_offset; + IndexTy *inds_tp = reinterpret_cast(inds_cp) + iter_inds_offset + + axis_inds_offset; + + const std::size_t total_nelems = iter_nelems * axis_nelems; + const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64; + IndexTy *workspace = sycl::malloc_device( + padded_total_nelems + total_nelems, exec_q); + + IndexTy *tmp_tp = sycl::malloc_device(total_nelems, exec_q); + + if (nullptr == workspace || nullptr == tmp_tp) { + throw std::runtime_error( + "Not enough device memory for radix sort topk"); + } + + using IdentityProjT = radix_sort_details::IdentityProj; + using IndexedProjT = + radix_sort_details::IndexedProj; + const IndexedProjT proj_op{arg_tp, IdentityProjT{}}; + + sycl::event iota_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using KernelName = topk_iota_krn; + + cgh.parallel_for( + sycl::range<1>(total_nelems), [=](sycl::id<1> id) { + size_t i = id[0]; + IndexTy sort_id = static_cast(i); + workspace[i] = sort_id; + }); + }); + + sycl::event radix_sort_ev = + radix_sort_details::parallel_radix_sort_impl( + exec_q, iter_nelems, axis_nelems, workspace, tmp_tp, proj_op, + ascending, {iota_ev}); + + // Write out top k of the temporary + sycl::event write_topk_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(radix_sort_ev); + + using KernelName = topk_radix_map_back_krn; + + cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { + std::size_t gid = id[0]; + + std::size_t iter_gid = gid / k; + std::size_t axis_gid = gid - (iter_gid * k); + + std::size_t src_idx = iter_gid * axis_nelems + axis_gid; + std::size_t dst_idx = iter_gid * k + axis_gid; + + IndexTy res_ind = tmp_tp[src_idx]; + vals_tp[dst_idx] = arg_tp[res_ind]; + inds_tp[dst_idx] = res_ind % axis_nelems; + }); + }); + + sycl::event cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(write_topk_ev); + + const sycl::context &ctx = exec_q.get_context(); + + using dpctl::tensor::alloc_utils::sycl_free_noexcept; + cgh.host_task([ctx, workspace, tmp_tp] { + sycl_free_noexcept(workspace, ctx); + sycl_free_noexcept(tmp_tp, ctx); + }); + }); + + return cleanup_ev; +} + } // end of namespace kernels } // end of namespace tensor } // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sorting/topk.cpp b/dpctl/tensor/libtensor/source/sorting/topk.cpp index fe5eac9acb..d3a9e951ac 100644 --- a/dpctl/tensor/libtensor/source/sorting/topk.cpp +++ b/dpctl/tensor/libtensor/source/sorting/topk.cpp @@ -76,6 +76,23 @@ static topk_impl_fn_ptr_t topk_dispatch_vector[td_ns::num_types]; namespace { +template +struct use_radix_sort : public std::false_type +{ +}; + +template +struct use_radix_sort< + T, + std::enable_if_t, + std::is_same, + std::is_same, + std::is_same, + std::is_same>::value>> + : public std::true_type +{ +}; + template sycl::event topk_caller(sycl::queue &exec_q, @@ -96,22 +113,33 @@ topk_caller(sycl::queue &exec_q, py::ssize_t axis_inds_offset, const std::vector &depends) { - using dpctl::tensor::kernels::topk_impl; - if (largest) { - using CompTy = - typename dpctl::tensor::py_internal::DescendingSorter::type; - return topk_impl( - exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp, - iter_arg_offset, iter_vals_offset, iter_inds_offset, + if constexpr (use_radix_sort::value) { + using dpctl::tensor::kernels::topk_radix_impl; + auto ascending = !largest; + return 
topk_radix_impl( + exec_q, iter_nelems, axis_nelems, k, ascending, arg_cp, vals_cp, + inds_cp, iter_arg_offset, iter_vals_offset, iter_inds_offset, axis_arg_offset, axis_vals_offset, axis_inds_offset, depends); } else { - using CompTy = - typename dpctl::tensor::py_internal::AscendingSorter::type; - return topk_impl( - exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp, - iter_arg_offset, iter_vals_offset, iter_inds_offset, - axis_arg_offset, axis_vals_offset, axis_inds_offset, depends); + using dpctl::tensor::kernels::topk_merge_impl; + if (largest) { + using CompTy = + typename dpctl::tensor::py_internal::DescendingSorter< + argTy>::type; + return topk_merge_impl( + exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp, + iter_arg_offset, iter_vals_offset, iter_inds_offset, + axis_arg_offset, axis_vals_offset, axis_inds_offset, depends); + } + else { + using CompTy = typename dpctl::tensor::py_internal::AscendingSorter< + argTy>::type; + return topk_merge_impl( + exec_q, iter_nelems, axis_nelems, k, arg_cp, vals_cp, inds_cp, + iter_arg_offset, iter_vals_offset, iter_inds_offset, + axis_arg_offset, axis_vals_offset, axis_inds_offset, depends); + } } } From c37f1de0cb9fde37445382634c2d28c53cb8d75f Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 11 Dec 2024 13:36:53 -0800 Subject: [PATCH 05/30] Reduce code duplication by using std::optional in py_topk directly Instead of using an overload to handle the `axis=None` case, use std::optional and check for trailing_dims_to_search in validation logic --- .../tensor/libtensor/source/sorting/topk.cpp | 239 ++++++------------ 1 file changed, 71 insertions(+), 168 deletions(-) diff --git a/dpctl/tensor/libtensor/source/sorting/topk.cpp b/dpctl/tensor/libtensor/source/sorting/topk.cpp index d3a9e951ac..dea20fd494 100644 --- a/dpctl/tensor/libtensor/source/sorting/topk.cpp +++ b/dpctl/tensor/libtensor/source/sorting/topk.cpp @@ -94,24 +94,21 @@ struct use_radix_sort< }; template -sycl::event -topk_caller(sycl::queue &exec_q, - std::size_t iter_nelems, // number of sub-arrays to sort (num. of - // rows in a matrix when sorting over rows) - std::size_t axis_nelems, // size of each array to sort (length of - // rows, i.e. 
number of columns) - std::size_t k, - bool largest, - const char *arg_cp, - char *vals_cp, - char *inds_cp, - py::ssize_t iter_arg_offset, - py::ssize_t iter_vals_offset, - py::ssize_t iter_inds_offset, - py::ssize_t axis_arg_offset, - py::ssize_t axis_vals_offset, - py::ssize_t axis_inds_offset, - const std::vector &depends) +sycl::event topk_caller(sycl::queue &exec_q, + std::size_t iter_nelems, // number of sub-arrays + std::size_t axis_nelems, // size of each sub-array + std::size_t k, + bool largest, + const char *arg_cp, + char *vals_cp, + char *inds_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_vals_offset, + py::ssize_t iter_inds_offset, + py::ssize_t axis_arg_offset, + py::ssize_t axis_vals_offset, + py::ssize_t axis_inds_offset, + const std::vector &depends) { if constexpr (use_radix_sort::value) { using dpctl::tensor::kernels::topk_radix_impl; @@ -147,7 +144,7 @@ topk_caller(sycl::queue &exec_q, std::pair py_topk(const dpctl::tensor::usm_ndarray &src, - const int trailing_dims_to_search, + std::optional trailing_dims_to_search, const std::size_t k, const bool largest, const dpctl::tensor::usm_ndarray &vals, @@ -158,48 +155,70 @@ py_topk(const dpctl::tensor::usm_ndarray &src, int src_nd = src.get_ndim(); int vals_nd = vals.get_ndim(); int inds_nd = inds.get_ndim(); - if (src_nd != vals_nd || src_nd != inds_nd) { - throw py::value_error("The input and output arrays must have " - "the same array ranks"); - } - int iteration_nd = src_nd - trailing_dims_to_search; - if (trailing_dims_to_search <= 0 || iteration_nd < 0) { - throw py::value_error( - "trailing_dims_to_search must be positive, but no " - "greater than rank of the array being searched"); - } const py::ssize_t *src_shape_ptr = src.get_shape_raw(); const py::ssize_t *vals_shape_ptr = vals.get_shape_raw(); const py::ssize_t *inds_shape_ptr = inds.get_shape_raw(); - bool same_shapes = true; + std::size_t axis_nelems(1); std::size_t iter_nelems(1); - for (int i = 0; same_shapes && (i < iteration_nd); ++i) { - auto src_shape_i = src_shape_ptr[i]; - same_shapes = same_shapes && (src_shape_i == vals_shape_ptr[i] && - src_shape_i == inds_shape_ptr[i]); - iter_nelems *= static_cast(src_shape_i); - } + if (trailing_dims_to_search.has_value()) { + if (src_nd != vals_nd || src_nd != inds_nd) { + throw py::value_error("The input and output arrays must have " + "the same array ranks"); + } - if (!same_shapes) { - throw py::value_error( - "Destination shape does not match the input shape"); - } + auto trailing_dims = trailing_dims_to_search.value(); + int iter_nd = src_nd - trailing_dims; + if (trailing_dims <= 0 || iter_nd < 0) { + throw py::value_error( + "trailing_dims_to_search must be positive, but no " + "greater than rank of the array being searched"); + } - std::size_t vals_k(1); - std::size_t inds_k(1); - std::size_t axis_nelems(1); - for (int i = iteration_nd; i < src_nd; ++i) { - axis_nelems *= static_cast(src_shape_ptr[i]); - vals_k *= static_cast(vals_shape_ptr[i]); - inds_k *= static_cast(inds_shape_ptr[i]); + bool same_shapes = true; + for (int i = 0; same_shapes && (i < iter_nd); ++i) { + auto src_shape_i = src_shape_ptr[i]; + same_shapes = same_shapes && (src_shape_i == vals_shape_ptr[i] && + src_shape_i == inds_shape_ptr[i]); + iter_nelems *= static_cast(src_shape_i); + } + + if (!same_shapes) { + throw py::value_error( + "Destination shape does not match the input shape"); + } + + std::size_t vals_k(1); + std::size_t inds_k(1); + for (int i = iter_nd; i < src_nd; ++i) { + axis_nelems *= 
static_cast(src_shape_ptr[i]); + vals_k *= static_cast(vals_shape_ptr[i]); + inds_k *= static_cast(inds_shape_ptr[i]); + } + + bool valid_k = (vals_k == k && inds_k == k && axis_nelems >= k); + if (!valid_k) { + throw py::value_error("The value of k is invalid for the input and " + "destination arrays"); + } } + else { + if (vals_nd != 1 || inds_nd != 1) { + throw py::value_error("Output arrays must be one-dimensional"); + } - bool valid_k = (vals_k == k && inds_k == k && axis_nelems >= k); - if (!valid_k) { - throw py::value_error( - "The value of k is invalid for the input and destination arrays"); + for (int i = 0; i < src_nd; ++i) { + axis_nelems *= static_cast(src_shape_ptr[i]); + } + + bool valid_k = (axis_nelems >= k && + static_cast(vals_shape_ptr[0]) == k && + static_cast(inds_shape_ptr[0]) == k); + if (!valid_k) { + throw py::value_error("The value of k is invalid for the input and " + "destination arrays"); + } } if (!dpctl::utils::queues_are_compatible(exec_q, {src, vals, inds})) { @@ -267,103 +286,6 @@ py_topk(const dpctl::tensor::usm_ndarray &src, return std::make_pair(sycl::event(), sycl::event()); } -std::pair -py_topk(const dpctl::tensor::usm_ndarray &src, - const std::size_t k, - const bool largest, - const dpctl::tensor::usm_ndarray &vals, - const dpctl::tensor::usm_ndarray &inds, - sycl::queue &exec_q, - const std::vector &depends) -{ - int src_nd = src.get_ndim(); - int vals_nd = vals.get_ndim(); - int inds_nd = inds.get_ndim(); - if (vals_nd != 1 || inds_nd != 1) { - throw py::value_error("Output arrays must be one-dimensional"); - } - - const py::ssize_t *src_shape_ptr = src.get_shape_raw(); - const py::ssize_t *vals_shape_ptr = vals.get_shape_raw(); - const py::ssize_t *inds_shape_ptr = inds.get_shape_raw(); - - std::size_t axis_nelems(1); - for (int i = 0; i < src_nd; ++i) { - axis_nelems *= static_cast(src_shape_ptr[i]); - } - - bool valid_k = - (axis_nelems >= k && static_cast(vals_shape_ptr[0]) == k && - static_cast(inds_shape_ptr[0]) == k); - if (!valid_k) { - throw py::value_error( - "The value of k is invalid for the input and destination arrays"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {src, vals, inds})) { - throw py::value_error( - "Execution queue is not compatible with allocation queues"); - } - - dpctl::tensor::validation::CheckWritable::throw_if_not_writable(vals); - dpctl::tensor::validation::CheckWritable::throw_if_not_writable(inds); - - if (axis_nelems == 0) { - // Nothing to do - return std::make_pair(sycl::event(), sycl::event()); - } - - auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); - if (overlap(src, vals) || overlap(src, inds)) { - throw py::value_error("Arrays index overlapping segments of memory"); - } - - dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(vals, k); - - dpctl::tensor::validation::AmpleMemory::throw_if_not_ample(inds, k); - - int src_typenum = src.get_typenum(); - int vals_typenum = vals.get_typenum(); - int inds_typenum = inds.get_typenum(); - - const auto &array_types = td_ns::usm_ndarray_types(); - int src_typeid = array_types.typenum_to_lookup_id(src_typenum); - int vals_typeid = array_types.typenum_to_lookup_id(vals_typenum); - int inds_typeid = array_types.typenum_to_lookup_id(inds_typenum); - - if (src_typeid != vals_typeid) { - throw py::value_error("Input array and vals array must have " - "the same data type"); - } - - if (inds_typeid != static_cast(td_ns::typenum_t::INT64)) { - throw py::value_error("Inds array must have data type int64"); - } - - bool is_src_c_contig = 
src.is_c_contiguous(); - bool is_vals_c_contig = vals.is_c_contiguous(); - bool is_inds_c_contig = inds.is_c_contiguous(); - - if (is_src_c_contig && is_vals_c_contig && is_inds_c_contig) { - static constexpr py::ssize_t zero_offset = py::ssize_t(0); - static constexpr std::size_t iter_nelems = 1; - - auto fn = topk_dispatch_vector[src_typeid]; - - sycl::event comp_ev = - fn(exec_q, iter_nelems, axis_nelems, k, largest, src.get_data(), - vals.get_data(), inds.get_data(), zero_offset, zero_offset, - zero_offset, zero_offset, zero_offset, zero_offset, depends); - - sycl::event keep_args_alive_ev = - dpctl::utils::keep_args_alive(exec_q, {src, vals, inds}, {comp_ev}); - - return std::make_pair(keep_args_alive_ev, comp_ev); - } - - return std::make_pair(sycl::event(), sycl::event()); -} - template struct TopKFactory { fnT get() @@ -385,26 +307,7 @@ void init_topk_functions(py::module_ m) { dpctl::tensor::py_internal::init_topk_dispatch_vectors(); - auto py_topk = [](const dpctl::tensor::usm_ndarray &src, - std::optional trailing_dims_to_search, - const std::size_t k, const bool largest, - const dpctl::tensor::usm_ndarray &vals, - const dpctl::tensor::usm_ndarray &inds, - sycl::queue &exec_q, - const std::vector &depends) - -> std::pair { - if (trailing_dims_to_search) { - return dpctl::tensor::py_internal::py_topk( - src, trailing_dims_to_search.value(), k, largest, vals, inds, - exec_q, depends); - } - else { - return dpctl::tensor::py_internal::py_topk(src, k, largest, vals, - inds, exec_q, depends); - } - }; - - m.def("_topk", py_topk, py::arg("src"), py::arg("trailing_dims_to_search"), + m.def("_topk", &py_topk, py::arg("src"), py::arg("trailing_dims_to_search"), py::arg("k"), py::arg("largest"), py::arg("vals"), py::arg("inds"), py::arg("sycl_queue"), py::arg("depends") = py::list()); } From 1f0e9fd3d777879aaedaf0df95c38edb33640a5d Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 11 Dec 2024 14:51:58 -0800 Subject: [PATCH 06/30] Clean up top_k docstring --- dpctl/tensor/_topk.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/dpctl/tensor/_topk.py b/dpctl/tensor/_topk.py index a807aea689..2984f95751 100644 --- a/dpctl/tensor/_topk.py +++ b/dpctl/tensor/_topk.py @@ -56,23 +56,24 @@ def top_k(x, k, /, *, axis=None, mode="largest"): over the flattened array. Default: ``None``. mode (Literal["largest", "smallest"]): search mode. Must be one of the following modes: - - `"largest"`: return the `k` largest elements. - - `"smallest"`: return the `k` smallest elements. + + - `"largest"`: return the `k` largest elements. + - `"smallest"`: return the `k` smallest elements. + Default: `"largest"`. Returns: - tuple[usm_ndarray, usm_ndarray]: + tuple[usm_ndarray, usm_ndarray] a namedtuple `(values, indices)` whose - - first element `values` will be an array containing the `k` largest or - smallest elements of `x`. The array has the same data type as `x`. - If `axis` was `None`, `values` will be a one-dimensional array - with shape `(k,)` and otherwise, `values` will have shape - `x.shape[:axis] + (k,) + x.shape[axis+1:]` - - - second element `indices` will be an array containing indices of `x` - that result in `values`. The array will have the same shape as - `values` and will have the default array index data type. + * first element `values` will be an array containing the `k` + largest or smallest elements of `x`. The array has the same data + type as `x`. 
If `axis` was `None`, `values` will be a + one-dimensional array with shape `(k,)` and otherwise, `values` + will have shape `x.shape[:axis] + (k,) + x.shape[axis+1:]` + * second element `indices` will be an array containing indices of + `x` that result in `values`. The array will have the same shape + as `values` and will have the default array index data type. """ largest = _get_top_k_largest(mode) if not isinstance(x, dpt.usm_ndarray): From fa74e5edbc92b2ee3f172f76772f507b3574b452 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 12 Dec 2024 12:40:29 -0800 Subject: [PATCH 07/30] Move top_k into _sorting.py --- dpctl/tensor/__init__.py | 3 +- dpctl/tensor/_sorting.py | 165 ++++++++++++++++++++++++++++++++++ dpctl/tensor/_topk.py | 185 --------------------------------------- 3 files changed, 166 insertions(+), 187 deletions(-) delete mode 100644 dpctl/tensor/_topk.py diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index d83d41215f..490cdfd23c 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -199,9 +199,8 @@ unique_inverse, unique_values, ) -from ._sorting import argsort, sort +from ._sorting import argsort, sort, top_k from ._testing import allclose -from ._topk import top_k from ._type_utils import can_cast, finfo, iinfo, isdtype, result_type __all__ = [ diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py index d5026a6ee8..63f2d25b86 100644 --- a/dpctl/tensor/_sorting.py +++ b/dpctl/tensor/_sorting.py @@ -14,6 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import operator +from typing import NamedTuple + import dpctl.tensor as dpt import dpctl.tensor._tensor_impl as ti import dpctl.utils as du @@ -24,6 +27,7 @@ _argsort_descending, _sort_ascending, _sort_descending, + _topk, ) from ._tensor_sorting_radix_impl import ( _radix_argsort_ascending, @@ -267,3 +271,164 @@ def argsort(x, axis=-1, descending=False, stable=True, kind=None): inv_perm = sorted(range(nd), key=lambda d: perm[d]) res = dpt.permute_dims(res, inv_perm) return res + + +def _get_top_k_largest(mode): + modes = {"largest": True, "smallest": False} + try: + return modes[mode] + except KeyError: + raise ValueError( + f"`mode` must be `largest` or `smallest`. Got `{mode}`." + ) + + +class TopKResult(NamedTuple): + values: dpt.usm_ndarray + indices: dpt.usm_ndarray + + +def top_k(x, k, /, *, axis=None, mode="largest"): + """top_k(x, k, axis=None, mode="largest") + + Returns the `k` largest or smallest values and their indices in the input + array `x` along the specified axis `axis`. + + Args: + x (usm_ndarray): + input array. + k (int): + number of elements to find. Must be a positive integer value. + axis (Optional[int]): + axis along which to search. If `None`, the search will be performed + over the flattened array. Default: ``None``. + mode (Literal["largest", "smallest"]): + search mode. Must be one of the following modes: + + - `"largest"`: return the `k` largest elements. + - `"smallest"`: return the `k` smallest elements. + + Default: `"largest"`. + + Returns: + tuple[usm_ndarray, usm_ndarray] + a namedtuple `(values, indices)` whose + + * first element `values` will be an array containing the `k` + largest or smallest elements of `x`. The array has the same data + type as `x`. 
If `axis` was `None`, `values` will be a + one-dimensional array with shape `(k,)` and otherwise, `values` + will have shape `x.shape[:axis] + (k,) + x.shape[axis+1:]` + * second element `indices` will be an array containing indices of + `x` that result in `values`. The array will have the same shape + as `values` and will have the default array index data type. + """ + largest = _get_top_k_largest(mode) + if not isinstance(x, dpt.usm_ndarray): + raise TypeError( + f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" + ) + + k = operator.index(k) + if k < 0: + raise ValueError("`k` must be a positive integer value") + + nd = x.ndim + if axis is None: + sz = x.size + if nd == 0: + return TopKResult( + dpt.copy(x, order="C"), + dpt.zeros_like( + x, dtype=ti.default_device_index_type(x.sycl_queue) + ), + ) + arr = x + n_search_dims = None + res_sh = k + else: + axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") + sz = x.shape[axis] + a1 = axis + 1 + if a1 == nd: + perm = list(range(nd)) + arr = x + else: + perm = [i for i in range(nd) if i != axis] + [ + axis, + ] + arr = dpt.permute_dims(x, perm) + n_search_dims = 1 + res_sh = arr.shape[: nd - 1] + (k,) + + if k > sz: + raise ValueError(f"`k`={k} is out of bounds {sz}") + + exec_q = x.sycl_queue + _manager = du.SequentialOrderManager[exec_q] + dep_evs = _manager.submitted_events + + res_usm_type = arr.usm_type + if arr.flags.c_contiguous: + vals = dpt.empty( + res_sh, + dtype=arr.dtype, + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + inds = dpt.empty( + res_sh, + dtype=ti.default_device_index_type(exec_q), + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + ht_ev, impl_ev = _topk( + src=arr, + trailing_dims_to_search=n_search_dims, + k=k, + largest=largest, + vals=vals, + inds=inds, + sycl_queue=exec_q, + depends=dep_evs, + ) + _manager.add_event_pair(ht_ev, impl_ev) + else: + tmp = dpt.empty_like(arr, order="C") + ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( + src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs + ) + _manager.add_event_pair(ht_ev, copy_ev) + vals = dpt.empty( + res_sh, + dtype=arr.dtype, + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + inds = dpt.empty( + res_sh, + dtype=ti.default_device_index_type(exec_q), + usm_type=res_usm_type, + order="C", + sycl_queue=exec_q, + ) + ht_ev, impl_ev = _topk( + src=tmp, + trailing_dims_to_search=n_search_dims, + k=k, + largest=largest, + vals=vals, + inds=inds, + sycl_queue=exec_q, + depends=[copy_ev], + ) + _manager.add_event_pair(ht_ev, impl_ev) + if axis is not None and a1 != nd: + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + vals = dpt.permute_dims(vals, inv_perm) + inds = dpt.permute_dims(inds, inv_perm) + + return TopKResult(vals, inds) diff --git a/dpctl/tensor/_topk.py b/dpctl/tensor/_topk.py deleted file mode 100644 index 2984f95751..0000000000 --- a/dpctl/tensor/_topk.py +++ /dev/null @@ -1,185 +0,0 @@ -# Data Parallel Control (dpctl) -# -# Copyright 2020-2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import operator -from typing import NamedTuple - -import dpctl.tensor as dpt -import dpctl.tensor._tensor_impl as ti -import dpctl.utils as du -from dpctl.tensor._numpy_helper import normalize_axis_index - -from ._tensor_sorting_impl import _topk - - -def _get_top_k_largest(mode): - modes = {"largest": True, "smallest": False} - try: - return modes[mode] - except KeyError: - raise ValueError( - f"`mode` must be `largest` or `smallest`. Got `{mode}`." - ) - - -class TopKResult(NamedTuple): - values: dpt.usm_ndarray - indices: dpt.usm_ndarray - - -def top_k(x, k, /, *, axis=None, mode="largest"): - """top_k(x, k, axis=None, mode="largest") - - Returns the `k` largest or smallest values and their indices in the input - array `x` along the specified axis `axis`. - - Args: - x (usm_ndarray): - input array. - k (int): - number of elements to find. Must be a positive integer value. - axis (Optional[int]): - axis along which to search. If `None`, the search will be performed - over the flattened array. Default: ``None``. - mode (Literal["largest", "smallest"]): - search mode. Must be one of the following modes: - - - `"largest"`: return the `k` largest elements. - - `"smallest"`: return the `k` smallest elements. - - Default: `"largest"`. - - Returns: - tuple[usm_ndarray, usm_ndarray] - a namedtuple `(values, indices)` whose - - * first element `values` will be an array containing the `k` - largest or smallest elements of `x`. The array has the same data - type as `x`. If `axis` was `None`, `values` will be a - one-dimensional array with shape `(k,)` and otherwise, `values` - will have shape `x.shape[:axis] + (k,) + x.shape[axis+1:]` - * second element `indices` will be an array containing indices of - `x` that result in `values`. The array will have the same shape - as `values` and will have the default array index data type. 
- """ - largest = _get_top_k_largest(mode) - if not isinstance(x, dpt.usm_ndarray): - raise TypeError( - f"Expected type dpctl.tensor.usm_ndarray, got {type(x)}" - ) - - k = operator.index(k) - if k < 0: - raise ValueError("`k` must be a positive integer value") - - nd = x.ndim - if axis is None: - sz = x.size - if nd == 0: - return TopKResult( - dpt.copy(x, order="C"), - dpt.zeros_like( - x, dtype=ti.default_device_index_type(x.sycl_queue) - ), - ) - arr = x - n_search_dims = None - res_sh = k - else: - axis = normalize_axis_index(axis, ndim=nd, msg_prefix="axis") - sz = x.shape[axis] - a1 = axis + 1 - if a1 == nd: - perm = list(range(nd)) - arr = x - else: - perm = [i for i in range(nd) if i != axis] + [ - axis, - ] - arr = dpt.permute_dims(x, perm) - n_search_dims = 1 - res_sh = arr.shape[: nd - 1] + (k,) - - if k > sz: - raise ValueError(f"`k`={k} is out of bounds {sz}") - - exec_q = x.sycl_queue - _manager = du.SequentialOrderManager[exec_q] - dep_evs = _manager.submitted_events - - res_usm_type = arr.usm_type - if arr.flags.c_contiguous: - vals = dpt.empty( - res_sh, - dtype=arr.dtype, - usm_type=res_usm_type, - order="C", - sycl_queue=exec_q, - ) - inds = dpt.empty( - res_sh, - dtype=ti.default_device_index_type(exec_q), - usm_type=res_usm_type, - order="C", - sycl_queue=exec_q, - ) - ht_ev, impl_ev = _topk( - src=arr, - trailing_dims_to_search=n_search_dims, - k=k, - largest=largest, - vals=vals, - inds=inds, - sycl_queue=exec_q, - depends=dep_evs, - ) - _manager.add_event_pair(ht_ev, impl_ev) - else: - tmp = dpt.empty_like(arr, order="C") - ht_ev, copy_ev = ti._copy_usm_ndarray_into_usm_ndarray( - src=arr, dst=tmp, sycl_queue=exec_q, depends=dep_evs - ) - _manager.add_event_pair(ht_ev, copy_ev) - vals = dpt.empty( - res_sh, - dtype=arr.dtype, - usm_type=res_usm_type, - order="C", - sycl_queue=exec_q, - ) - inds = dpt.empty( - res_sh, - dtype=ti.default_device_index_type(exec_q), - usm_type=res_usm_type, - order="C", - sycl_queue=exec_q, - ) - ht_ev, impl_ev = _topk( - src=tmp, - trailing_dims_to_search=n_search_dims, - k=k, - largest=largest, - vals=vals, - inds=inds, - sycl_queue=exec_q, - depends=[copy_ev], - ) - _manager.add_event_pair(ht_ev, impl_ev) - if axis is not None and a1 != nd: - inv_perm = sorted(range(nd), key=lambda d: perm[d]) - vals = dpt.permute_dims(vals, inv_perm) - inds = dpt.permute_dims(inds, inv_perm) - return TopKResult(vals, inds) From cd8388109ea9fdaedb48a6c7767656719c98e572 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 12 Dec 2024 16:00:19 -0800 Subject: [PATCH 08/30] top_k raises when k > 1 and input is a scalar --- dpctl/tensor/_sorting.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dpctl/tensor/_sorting.py b/dpctl/tensor/_sorting.py index 63f2d25b86..19cd2f2836 100644 --- a/dpctl/tensor/_sorting.py +++ b/dpctl/tensor/_sorting.py @@ -337,6 +337,8 @@ def top_k(x, k, /, *, axis=None, mode="largest"): if axis is None: sz = x.size if nd == 0: + if k > 1: + raise ValueError(f"`k`={k} is out of bounds 1") return TopKResult( dpt.copy(x, order="C"), dpt.zeros_like( From 70c2ef0a78e928204a84b9d0e49748bad75da940 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 12 Dec 2024 16:59:58 -0800 Subject: [PATCH 09/30] Fix bug in top_k partial merge sort implementation rounded value of k must be divisible by the merge sort chunk size --- .../libtensor/include/kernels/sorting/topk.hpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp 
b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index 11f3e851fb..2674f877c9 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -36,6 +36,7 @@ #include "kernels/dpctl_tensor_types.hpp" #include "merge_sort.hpp" #include "radix_sort.hpp" +#include "search_sorted_detail.hpp" #include "utils/sycl_alloc_utils.hpp" #include @@ -247,14 +248,16 @@ sycl::event topk_merge_impl( // This assumption permits doing away with using a loop assert(sorted_block_size % lws == 0); + using search_sorted_detail::quotient_ceil; const std::size_t n_segments = - merge_sort_detail::quotient_ceil(axis_nelems, - sorted_block_size); + quotient_ceil(axis_nelems, sorted_block_size); - // round k up for the later merge kernel + // round k up for the later merge kernel if necessary + const std::size_t round_k_to = dev.has(sycl::aspect::cpu) ? 32 : 4; std::size_t k_rounded = - merge_sort_detail::quotient_ceil(k, elems_per_wi) * - elems_per_wi; + (k < round_k_to) + ? k + : quotient_ceil(k, round_k_to) * round_k_to; // get length of tail for alloc size auto rem = axis_nelems % sorted_block_size; @@ -322,8 +325,7 @@ sycl::event topk_merge_impl( sycl::group_barrier(it.get_group()); const std::size_t chunk = - merge_sort_detail::quotient_ceil( - sorted_block_size, lws); + quotient_ceil(sorted_block_size, lws); const std::size_t chunk_start_idx = lid * chunk; const std::size_t chunk_end_idx = From 65f14bec064c4fe8b4ec75824824d2b0d4407135 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 13 Dec 2024 18:48:05 -0600 Subject: [PATCH 10/30] Add test file for top_k functionality --- dpctl/tests/test_usm_ndarray_top_k.py | 94 +++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 dpctl/tests/test_usm_ndarray_top_k.py diff --git a/dpctl/tests/test_usm_ndarray_top_k.py b/dpctl/tests/test_usm_ndarray_top_k.py new file mode 100644 index 0000000000..954c21a990 --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_top_k.py @@ -0,0 +1,94 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +@pytest.mark.parametrize("n", [33, 255, 511, 1021, 8193]) +def test_topk_1d_largest(dtype, n): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + o = dpt.ones(n, dtype=dtype) + z = dpt.zeros(n, dtype=dtype) + zo = dpt.concat((o, z)) + inp = dpt.roll(zo, 734) + k = 5 + + s = dpt.top_k(inp, k, mode="largest") + assert s.values.shape == (k,) + assert s.values.dtype == inp.dtype + assert s.indices.shape == (k,) + assert dpt.all(s.values == dpt.ones(k, dtype=dtype)) + assert dpt.all(s.values == inp[s.indices]) + + +@pytest.mark.parametrize( + "dtype", + [ + "i1", + "u1", + "i2", + "u2", + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +@pytest.mark.parametrize("n", [33, 255, 257, 513, 1021, 8193]) +def test_topk_1d_smallest(dtype, n): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + o = dpt.ones(n, dtype=dtype) + z = dpt.zeros(n, dtype=dtype) + zo = dpt.concat((o, z)) + inp = dpt.roll(zo, 734) + k = 5 + + s = dpt.top_k(inp, k, mode="smallest") + assert s.values.shape == (k,) + assert s.values.dtype == inp.dtype + assert s.indices.shape == (k,) + assert dpt.all(s.values == dpt.zeros(k, dtype=dtype)) + assert dpt.all(s.values == inp[s.indices]) From 275827ae9f2976a52a6559b0908498e2d41fe712 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 20 Dec 2024 16:51:19 -0600 Subject: [PATCH 11/30] Add sort_utils.hpp with iota_impl Reuse that function call in sorting code-base where argsort is used. --- .../include/kernels/sorting/merge_sort.hpp | 16 ++- .../include/kernels/sorting/radix_sort.hpp | 51 ++++----- .../include/kernels/sorting/sort_utils.hpp | 103 ++++++++++++++++++ .../include/kernels/sorting/topk.hpp | 34 ++++-- 4 files changed, 162 insertions(+), 42 deletions(-) create mode 100644 dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp index f3b5030c48..002da0ff9b 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp @@ -33,6 +33,7 @@ #include "kernels/dpctl_tensor_types.hpp" #include "kernels/sorting/search_sorted_detail.hpp" +#include "kernels/sorting/sort_utils.hpp" namespace dpctl { @@ -811,20 +812,27 @@ sycl::event stable_argsort_axis1_contig_impl( const size_t total_nelems = iter_nelems * sort_nelems; + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + using IotaKernelName = populate_index_data_krn; + +#if 1 + sycl::event populate_indexed_data_ev = iota_impl( + exec_q, res_tp, total_nelems, depends); + +#else sycl::event populate_indexed_data_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); const sycl::range<1> range{total_nelems}; - using KernelName = - populate_index_data_krn; - - cgh.parallel_for(range, [=](sycl::id<1> id) { + cgh.parallel_for(range, [=](sycl::id<1> id) { size_t i = id[0]; res_tp[i] = static_cast(i); }); }); +#endif // Sort segments of the array sycl::event base_sort_ev = diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp index dc3da24315..84edb1dd3b 100644 
--- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -38,6 +38,7 @@ #include #include "kernels/dpctl_tensor_types.hpp" +#include "kernels/sorting/sort_utils.hpp" #include "utils/sycl_alloc_utils.hpp" namespace dpctl @@ -1256,9 +1257,7 @@ struct subgroup_radix_sort const uint16_t id = wi * block_size + i; if (id < n) values[i] = std::move( - this_input_arr[iter_val_offset + - static_cast( - id)]); + this_input_arr[iter_val_offset + id]); } while (true) { @@ -1272,8 +1271,7 @@ struct subgroup_radix_sort // counting phase auto pcounter = get_accessor_pointer(counter_acc) + - static_cast(wi) + - iter_counter_offset; + (wi + iter_counter_offset); // initialize counters #pragma unroll @@ -1348,19 +1346,15 @@ struct subgroup_radix_sort // scan contiguous numbers uint16_t bin_sum[bin_count]; - bin_sum[0] = - counter_acc[iter_counter_offset + - static_cast( - wi * bin_count)]; + const std::size_t counter_offset0 = + iter_counter_offset + wi * bin_count; + bin_sum[0] = counter_acc[counter_offset0]; #pragma unroll for (uint16_t i = 1; i < bin_count; ++i) bin_sum[i] = bin_sum[i - 1] + - counter_acc - [iter_counter_offset + - static_cast( - wi * bin_count + i)]; + counter_acc[counter_offset0 + i]; sycl::group_barrier(ndit.get_group()); @@ -1374,10 +1368,7 @@ struct subgroup_radix_sort // add to local sum, generate exclusive scan result #pragma unroll for (uint16_t i = 0; i < bin_count; ++i) - counter_acc[iter_counter_offset + - static_cast( - wi * bin_count + i + - 1)] = + counter_acc[counter_offset0 + i + 1] = sum_scan + bin_sum[i]; if (wi == 0) @@ -1407,10 +1398,8 @@ struct subgroup_radix_sort if (r < n) { // move the values to source range and // destroy the values - this_output_arr - [iter_val_offset + - static_cast(r)] = - std::move(values[i]); + this_output_arr[iter_val_offset + r] = + std::move(values[i]); } } @@ -1422,8 +1411,7 @@ struct subgroup_radix_sort for (uint16_t i = 0; i < block_size; ++i) { const uint16_t r = indices[i]; if (r < n) - exchange_acc[iter_exchange_offset + - static_cast(r)] = + exchange_acc[iter_exchange_offset + r] = std::move(values[i]); } @@ -1435,8 +1423,7 @@ struct subgroup_radix_sort if (id < n) values[i] = std::move( exchange_acc[iter_exchange_offset + - static_cast( - id)]); + id]); } sycl::group_barrier(ndit.get_group()); @@ -1795,18 +1782,26 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q, radix_sort_details::IndexedProj; const IndexedProjT proj_op{arg_tp, IdentityProjT{}}; + using IotaKernelName = radix_argsort_iota_krn; + +#if 1 + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event iota_ev = iota_impl( + exec_q, workspace, total_nelems, depends); +#else + sycl::event iota_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - using KernelName = radix_argsort_iota_krn; - - cgh.parallel_for( + cgh.parallel_for( sycl::range<1>(total_nelems), [=](sycl::id<1> id) { size_t i = id[0]; IndexTy sort_id = static_cast(i); workspace[i] = sort_id; }); }); +#endif sycl::event radix_sort_ev = radix_sort_details::parallel_radix_sort_impl( diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp new file mode 100644 index 0000000000..1b1c361d75 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp @@ -0,0 +1,103 @@ +//=== sorting.hpp - Implementation of sorting kernels ---*-C++-*--/===// +// +// Data Parallel Control (dpctl) +// +// 
Copyright 2020-2024 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for tensor sort/argsort operations. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ +namespace sort_utils_detail +{ + +namespace syclexp = sycl::ext::oneapi::experimental; + +template +sycl::event iota_impl(sycl::queue &exec_q, + T *data, + std::size_t nelems, + const std::vector &dependent_events) +{ + constexpr std::uint32_t lws = 256; + constexpr std::uint32_t n_wi = 4; + const std::size_t n_groups = (nelems + n_wi * lws - 1) / (n_wi * lws); + + sycl::range<1> gRange{n_groups * lws}; + sycl::range<1> lRange{lws}; + sycl::nd_range<1> ndRange{gRange, lRange}; + + sycl::event e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_events); + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_id(); + const auto &sg = it.get_sub_group(); + const std::uint32_t lane_id = sg.get_local_id()[0]; + + const std::size_t offset = (gid - lane_id) * n_wi; + const std::uint32_t max_sgSize = sg.get_max_local_range()[0]; + + std::array stripe{}; +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + stripe[i] = T(offset + lane_id + i * max_sgSize); + } + + if (offset + n_wi * max_sgSize < nelems) { + constexpr auto group_ls_props = syclexp::properties{ + syclexp::data_placement_striped + // , syclexp::full_group + }; + + auto out_multi_ptr = sycl::address_space_cast< + sycl::access::address_space::global_space, + sycl::access::decorated::yes>(&data[offset]); + + syclexp::group_store(sg, sycl::span{&stripe[0], n_wi}, + out_multi_ptr, group_ls_props); + } + else { + for (std::size_t idx = offset + lane_id; idx < nelems; + idx += max_sgSize) + { + data[idx] = T(idx); + } + } + }); + }); + + return e; +} + +} // end of namespace sort_utils_detail +} // end of namespace kernels +} // end of namespace tensor +} // end of namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index 2674f877c9..da9e0bd812 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -34,9 +34,10 @@ #include #include "kernels/dpctl_tensor_types.hpp" -#include "merge_sort.hpp" -#include "radix_sort.hpp" -#include "search_sorted_detail.hpp" +#include "kernels/sorting/merge_sort.hpp" +#include "kernels/sorting/radix_sort.hpp" +#include "kernels/sorting/search_sorted_detail.hpp" +#include "kernels/sorting/sort_utils.hpp" #include "utils/sycl_alloc_utils.hpp" #include @@ -95,20 +96,26 @@ topk_full_merge_sort_impl(sycl::queue &exec_q, throw std::runtime_error("Unable to allocate device_memory"); } + using IotaKernelName = 
topk_populate_index_data_krn; + +#if 1 + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event populate_indexed_data_ev = iota_impl( + exec_q, index_data, iter_nelems * axis_nelems, depends); +#else sycl::event populate_indexed_data_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); auto const &range = sycl::range<1>(iter_nelems * axis_nelems); - using KernelName = - topk_populate_index_data_krn; - - cgh.parallel_for(range, [=](sycl::id<1> id) { + cgh.parallel_for(range, [=](sycl::id<1> id) { std::size_t i = id[0]; index_data[i] = static_cast(i); }); }); +#endif std::size_t sorted_block_size; // Sort segments of the array @@ -480,18 +487,25 @@ sycl::event topk_radix_impl(sycl::queue &exec_q, radix_sort_details::IndexedProj; const IndexedProjT proj_op{arg_tp, IdentityProjT{}}; + using IotaKernelName = topk_iota_krn; + +#if 1 + using dpctl::tensor::kernels::sort_utils_detail::iota_impl; + + sycl::event iota_ev = iota_impl( + exec_q, workspace, total_nelems, depends); +#else sycl::event iota_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); - using KernelName = topk_iota_krn; - - cgh.parallel_for( + cgh.parallel_for( sycl::range<1>(total_nelems), [=](sycl::id<1> id) { size_t i = id[0]; IndexTy sort_id = static_cast(i); workspace[i] = sort_id; }); }); +#endif sycl::event radix_sort_ev = radix_sort_details::parallel_radix_sort_impl( From cd1243fad1fd3cc19f6812a16a61d064473ac83a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 22 Dec 2024 14:55:45 -0600 Subject: [PATCH 12/30] Simplify write-out kernels in topk implementation (avoid recomputing gid) --- .../include/kernels/sorting/topk.hpp | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index da9e0bd812..7ea91ad148 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -410,17 +410,17 @@ sycl::event topk_merge_impl( topk_partial_merge_map_back_krn; cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { - std::size_t gid = id[0]; + const std::size_t gid = id[0]; - std::size_t iter_gid = gid / k; - std::size_t axis_gid = gid - (iter_gid * k); + const std::size_t iter_gid = gid / k; + const std::size_t axis_gid = gid - (iter_gid * k); - std::size_t src_idx = iter_gid * alloc_len + axis_gid; - std::size_t dst_idx = iter_gid * k + axis_gid; + const std::size_t src_idx = iter_gid * alloc_len + axis_gid; + const std::size_t dst_idx = gid; - auto res_ind = index_data[src_idx]; + const auto res_ind = index_data[src_idx]; vals_tp[dst_idx] = arg_tp[res_ind]; - inds_tp[dst_idx] = res_ind % axis_nelems; + inds_tp[dst_idx] = (res_ind % axis_nelems); }); }); @@ -519,17 +519,17 @@ sycl::event topk_radix_impl(sycl::queue &exec_q, using KernelName = topk_radix_map_back_krn; cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { - std::size_t gid = id[0]; + const std::size_t gid = id[0]; - std::size_t iter_gid = gid / k; - std::size_t axis_gid = gid - (iter_gid * k); + const std::size_t iter_gid = gid / k; + const std::size_t axis_gid = gid - (iter_gid * k); - std::size_t src_idx = iter_gid * axis_nelems + axis_gid; - std::size_t dst_idx = iter_gid * k + axis_gid; + const std::size_t src_idx = iter_gid * axis_nelems + axis_gid; + const std::size_t dst_idx = gid; - IndexTy res_ind = tmp_tp[src_idx]; + const IndexTy res_ind = tmp_tp[src_idx]; vals_tp[dst_idx] = 
arg_tp[res_ind]; - inds_tp[dst_idx] = res_ind % axis_nelems; + inds_tp[dst_idx] = (res_ind % axis_nelems); }); }); From 823c2014632783a3b389003cdda930a86ab269af Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 22 Dec 2024 14:56:56 -0600 Subject: [PATCH 13/30] Add explicit read into registers --- .../libtensor/include/kernels/sorting/topk.hpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index 7ea91ad148..6698aa50de 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -145,8 +145,9 @@ topk_full_merge_sort_impl(sycl::queue &exec_q, std::size_t src_idx = iter_gid * axis_nelems + axis_gid; std::size_t dst_idx = iter_gid * k + axis_gid; - auto res_ind = index_data[src_idx]; - vals_tp[dst_idx] = arg_tp[res_ind]; + const IndexTy res_ind = index_data[src_idx]; + const argTy v = arg_tp[res_ind]; + vals_tp[dst_idx] = v; inds_tp[dst_idx] = res_ind % axis_nelems; }); }); @@ -418,8 +419,9 @@ sycl::event topk_merge_impl( const std::size_t src_idx = iter_gid * alloc_len + axis_gid; const std::size_t dst_idx = gid; - const auto res_ind = index_data[src_idx]; - vals_tp[dst_idx] = arg_tp[res_ind]; + const IndexTy res_ind = index_data[src_idx]; + const argTy v = arg_tp[res_ind]; + vals_tp[dst_idx] = v; inds_tp[dst_idx] = (res_ind % axis_nelems); }); }); @@ -528,7 +530,8 @@ sycl::event topk_radix_impl(sycl::queue &exec_q, const std::size_t dst_idx = gid; const IndexTy res_ind = tmp_tp[src_idx]; - vals_tp[dst_idx] = arg_tp[res_ind]; + const argTy v = arg_tp[res_ind]; + vals_tp[dst_idx] = v; inds_tp[dst_idx] = (res_ind % axis_nelems); }); }); From 3a4d8ab164fe1d9fb9e2b364b1500e7d69cefb84 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 20 Dec 2024 16:54:15 -0600 Subject: [PATCH 14/30] Use unique_ptr as temporary owner of USM allocation Until it is passed over to the host function, and unique_ptr's ownership is released. Also reduced allocation sizes, where too much was being allocated. Introduce smart_malloc_device, etc. The smart_malloc_device(count, q) makes USM allocation and returns a unique_ptr which owns the allocation. The function throws an exception (std::runtime_error) if USM allocation is not successful. Introduce async_smart_free. This function intends to replace use of host_task submissions to manage USM temporary deallocations. The usage is as follows: ``` // returns unique_ptr auto alloc_owner = smart_malloc_device(count, q); // get raw pointer for use in kernels T *data = alloc_owner.get(); [..SNIP..] // submit host_task that releases the unique_ptr // after the host task was successfully submitted // and ownership of USM allocation is transfered to // the said host task sycl::event ht_ev = async_smart_free(q, dependent_events, alloc_owner); [...SNIP...] 
``` --- .../include/kernels/sorting/radix_sort.hpp | 66 +++++-------- .../include/kernels/sorting/topk.hpp | 70 +++++--------- .../include/utils/sycl_alloc_utils.hpp | 93 ++++++++++++++++++- 3 files changed, 137 insertions(+), 92 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp index 84edb1dd3b..de0011e2ab 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -1588,11 +1588,11 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, using CountT = std::uint32_t; // memory for storing count and offset values - CountT *count_ptr = - sycl::malloc_device(n_iters * n_counts, exec_q); - if (nullptr == count_ptr) { - throw std::runtime_error("Could not allocate USM-device memory"); - } + auto count_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + n_iters * n_counts, exec_q); + + CountT *count_ptr = count_owner.get(); constexpr std::uint32_t zero_radix_iter{0}; @@ -1605,25 +1605,17 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, n_counts, count_ptr, proj_op, is_ascending, depends); - sort_ev = exec_q.submit([=](sycl::handler &cgh) { - cgh.depends_on(sort_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, count_ptr]() { sycl_free_noexcept(count_ptr, ctx); }); - }); + sort_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {sort_ev}, count_owner); return sort_ev; } - ValueT *tmp_arr = - sycl::malloc_device(n_iters * n_to_sort, exec_q); - if (nullptr == tmp_arr) { - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - sycl_free_noexcept(count_ptr, exec_q); - throw std::runtime_error("Could not allocate USM-device memory"); - } + auto tmp_arr_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + n_iters * n_to_sort, exec_q); + + ValueT *tmp_arr = tmp_arr_owner.get(); // iterations per each bucket assert("Number of iterations must be even" && radix_iters % 2 == 0); @@ -1657,17 +1649,8 @@ sycl::event parallel_radix_sort_impl(sycl::queue &exec_q, } } - sort_ev = exec_q.submit([=](sycl::handler &cgh) { - cgh.depends_on(sort_ev); - - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, count_ptr, tmp_arr]() { - sycl_free_noexcept(tmp_arr, ctx); - sycl_free_noexcept(count_ptr, ctx); - }); - }); + sort_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {sort_ev}, tmp_arr_owner, count_owner); } return sort_ev; @@ -1769,13 +1752,12 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q, reinterpret_cast(res_cp) + iter_res_offset + sort_res_offset; const std::size_t total_nelems = iter_nelems * sort_nelems; - const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64; - IndexTy *workspace = sycl::malloc_device( - padded_total_nelems + total_nelems, exec_q); + auto workspace_owner = + dpctl::tensor::alloc_utils::smart_malloc_device(total_nelems, + exec_q); - if (nullptr == workspace) { - throw std::runtime_error("Could not allocate workspace on device"); - } + // get raw USM pointer + IndexTy *workspace = workspace_owner.get(); using IdentityProjT = radix_sort_details::IdentityProj; using IndexedProjT = @@ -1820,14 +1802,8 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q, }); }); - sycl::event cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(map_back_ev); - - 
const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, workspace] { sycl_free_noexcept(workspace, ctx); }); - }); + sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {map_back_ev}, workspace_owner); return cleanup_ev; } diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index 6698aa50de..d1d686a2dd 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -30,9 +30,10 @@ #include #include #include -#include #include +#include + #include "kernels/dpctl_tensor_types.hpp" #include "kernels/sorting/merge_sort.hpp" #include "kernels/sorting/radix_sort.hpp" @@ -90,11 +91,11 @@ topk_full_merge_sort_impl(sycl::queue &exec_q, const CompT &comp, const std::vector &depends) { - IndexTy *index_data = - sycl::malloc_device(iter_nelems * axis_nelems, exec_q); - if (index_data == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } + auto index_data_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * axis_nelems, exec_q); + // extract USM pointer + IndexTy *index_data = index_data_owner.get(); using IotaKernelName = topk_populate_index_data_krn; @@ -153,14 +154,8 @@ topk_full_merge_sort_impl(sycl::queue &exec_q, }); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(write_out_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, index_data] { sycl_free_noexcept(index_data, ctx); }); - }); + dpctl::tensor::alloc_utils::async_smart_free(exec_q, {write_out_ev}, + index_data_owner); return cleanup_host_task_event; }; @@ -283,11 +278,11 @@ sycl::event topk_merge_impl( index_comp, depends); } - IndexTy *index_data = - sycl::malloc_device(iter_nelems * alloc_len, exec_q); - if (index_data == nullptr) { - throw std::runtime_error("Unable to allocate device_memory"); - } + auto index_data_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + iter_nelems * alloc_len, exec_q); + // get raw USM pointer + IndexTy *index_data = index_data_owner.get(); // no need to populate index data: SLM will be populated with default // values @@ -427,14 +422,8 @@ sycl::event topk_merge_impl( }); sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(write_topk_ev); - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task( - [ctx, index_data] { sycl_free_noexcept(index_data, ctx); }); - }); + dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {write_topk_ev}, index_data_owner); return cleanup_host_task_event; } @@ -474,15 +463,13 @@ sycl::event topk_radix_impl(sycl::queue &exec_q, const std::size_t total_nelems = iter_nelems * axis_nelems; const std::size_t padded_total_nelems = ((total_nelems + 63) / 64) * 64; - IndexTy *workspace = sycl::malloc_device( - padded_total_nelems + total_nelems, exec_q); + auto workspace_owner = + dpctl::tensor::alloc_utils::smart_malloc_device( + padded_total_nelems + total_nelems, exec_q); - IndexTy *tmp_tp = sycl::malloc_device(total_nelems, exec_q); - - if (nullptr == workspace || nullptr == tmp_tp) { - throw std::runtime_error( - "Not enough device memory for radix sort topk"); - } + // get raw USM pointer + IndexTy *workspace = 
workspace_owner.get(); + IndexTy *tmp_tp = workspace + padded_total_nelems; using IdentityProjT = radix_sort_details::IdentityProj; using IndexedProjT = @@ -536,17 +523,8 @@ sycl::event topk_radix_impl(sycl::queue &exec_q, }); }); - sycl::event cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(write_topk_ev); - - const sycl::context &ctx = exec_q.get_context(); - - using dpctl::tensor::alloc_utils::sycl_free_noexcept; - cgh.host_task([ctx, workspace, tmp_tp] { - sycl_free_noexcept(workspace, ctx); - sycl_free_noexcept(tmp_tp, ctx); - }); - }); + sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( + exec_q, {write_topk_ev}, workspace_owner); return cleanup_ev; } diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp index 3ad5f6f36a..87476221d8 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp @@ -28,6 +28,9 @@ #include #include +#include +#include +#include #include "sycl/sycl.hpp" @@ -73,11 +76,99 @@ void sycl_free_noexcept(T *ptr, const sycl::context &ctx) noexcept } } -template void sycl_free_noexcept(T *ptr, sycl::queue &q) noexcept +template +void sycl_free_noexcept(T *ptr, const sycl::queue &q) noexcept { sycl_free_noexcept(ptr, q.get_context()); } +class USMDeleter +{ +private: + sycl::context ctx_; + +public: + USMDeleter(const sycl::queue &q) : ctx_(q.get_context()) {} + USMDeleter(const sycl::context &ctx) : ctx_(ctx) {} + + template void operator()(T *ptr) const + { + sycl_free_noexcept(ptr, ctx_); + } +}; + +template +std::unique_ptr +smart_malloc(std::size_t count, + const sycl::queue &q, + sycl::usm::alloc kind, + const sycl::property_list &propList = {}) +{ + T *ptr = sycl::malloc(count, q, kind, propList); + if (nullptr == ptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + + auto usm_deleter = USMDeleter(q); + return std::unique_ptr(ptr, usm_deleter); +} + +template +std::unique_ptr +smart_malloc_device(std::size_t count, + const sycl::queue &q, + const sycl::property_list &propList = {}) +{ + return smart_malloc(count, q, sycl::usm::alloc::device, propList); +} + +template +std::unique_ptr +smart_malloc_shared(std::size_t count, + const sycl::queue &q, + const sycl::property_list &propList = {}) +{ + return smart_malloc(count, q, sycl::usm::alloc::shared, propList); +} + +template +std::unique_ptr +smart_malloc_jost(std::size_t count, + const sycl::queue &q, + const sycl::property_list &propList = {}) +{ + return smart_malloc(count, q, sycl::usm::alloc::host, propList); +} + +template +sycl::event async_smart_free(sycl::queue &exec_q, + const std::vector &depends, + Args &&...args) +{ + constexpr std::size_t n = sizeof...(Args); + + std::vector ptrs; + ptrs.reserve(n); + (ptrs.push_back(reinterpret_cast(args.get())), ...); + + std::vector dels; + dels.reserve(n); + (dels.push_back(args.get_deleter()), ...); + + sycl::event ht_e = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.host_task([ptrs, dels]() { + for (size_t i = 0; i < ptrs.size(); ++i) { + dels[i](ptrs[i]); + } + }); + }); + (args.release(), ...); + + return ht_e; +} + } // end of namespace alloc_utils } // end of namespace tensor } // end of namespace dpctl From b15e3020f0b116398b24a99c034e6001d75ec1da Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 25 Dec 2024 06:41:40 -0600 Subject: [PATCH 15/30] 
Factor out write-out kernel into separate detail function Replaced three duplicates of the same kernel with calls to this function. --- .../include/kernels/sorting/topk.hpp | 110 +++++++++--------- 1 file changed, 52 insertions(+), 58 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index d1d686a2dd..3d07b26802 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -71,6 +71,41 @@ void scale_topk_params(const std::uint64_t nelems_per_slm, throw std::runtime_error("Could not construct top k kernel parameters"); } +template +sycl::event write_out_impl(sycl::queue &exec_q, + std::size_t iter_nelems, + std::size_t k, + const argTy *arg_tp, + const IndexTy *index_data, + std::size_t iter_index_stride, + std::size_t axis_nelems, + argTy *vals_tp, + IndexTy *inds_tp, + const std::vector &depends) +{ + sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { + const std::size_t gid = id[0]; + + const std::size_t iter_gid = gid / k; + const std::size_t axis_gid = gid - (iter_gid * k); + + const std::size_t src_idx = iter_gid * iter_index_stride + axis_gid; + const std::size_t dst_idx = gid; + + const IndexTy res_ind = index_data[src_idx]; + const argTy v = arg_tp[res_ind]; + + vals_tp[dst_idx] = v; + inds_tp[dst_idx] = (res_ind % axis_nelems); + }); + }); + + return write_out_ev; +} + } // namespace topk_detail template @@ -132,26 +167,13 @@ topk_full_merge_sort_impl(sycl::queue &exec_q, exec_q, iter_nelems, axis_nelems, index_data, comp, sorted_block_size, {base_sort_ev}); - sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(merges_ev); - - using KernelName = topk_full_merge_map_back_krn; - - cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { - std::size_t gid = id[0]; - - std::size_t iter_gid = gid / k; - std::size_t axis_gid = gid - (iter_gid * k); + using WriteOutKernelName = + topk_full_merge_map_back_krn; - std::size_t src_idx = iter_gid * axis_nelems + axis_gid; - std::size_t dst_idx = iter_gid * k + axis_gid; - - const IndexTy res_ind = index_data[src_idx]; - const argTy v = arg_tp[res_ind]; - vals_tp[dst_idx] = v; - inds_tp[dst_idx] = res_ind % axis_nelems; - }); - }); + sycl::event write_out_ev = + topk_detail::write_out_impl( + exec_q, iter_nelems, k, arg_tp, index_data, axis_nelems, + axis_nelems, vals_tp, inds_tp, {merges_ev}); sycl::event cleanup_host_task_event = dpctl::tensor::alloc_utils::async_smart_free(exec_q, {write_out_ev}, @@ -399,27 +421,13 @@ sycl::event topk_merge_impl( k_rounded, {base_sort_ev}); // Write out top k of the merge-sorted memory - sycl::event write_topk_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(merges_ev); - - using KernelName = - topk_partial_merge_map_back_krn; + using WriteOutKernelName = + topk_partial_merge_map_back_krn; - cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { - const std::size_t gid = id[0]; - - const std::size_t iter_gid = gid / k; - const std::size_t axis_gid = gid - (iter_gid * k); - - const std::size_t src_idx = iter_gid * alloc_len + axis_gid; - const std::size_t dst_idx = gid; - - const IndexTy res_ind = index_data[src_idx]; - const argTy v = arg_tp[res_ind]; - vals_tp[dst_idx] = v; - inds_tp[dst_idx] = (res_ind % axis_nelems); - }); - }); + sycl::event write_topk_ev = + topk_detail::write_out_impl( + exec_q, 
iter_nelems, k, arg_tp, index_data, alloc_len, + axis_nelems, vals_tp, inds_tp, {merges_ev}); sycl::event cleanup_host_task_event = dpctl::tensor::alloc_utils::async_smart_free( @@ -502,26 +510,12 @@ sycl::event topk_radix_impl(sycl::queue &exec_q, ascending, {iota_ev}); // Write out top k of the temporary - sycl::event write_topk_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(radix_sort_ev); - - using KernelName = topk_radix_map_back_krn; + using WriteOutKernelName = topk_radix_map_back_krn; - cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { - const std::size_t gid = id[0]; - - const std::size_t iter_gid = gid / k; - const std::size_t axis_gid = gid - (iter_gid * k); - - const std::size_t src_idx = iter_gid * axis_nelems + axis_gid; - const std::size_t dst_idx = gid; - - const IndexTy res_ind = tmp_tp[src_idx]; - const argTy v = arg_tp[res_ind]; - vals_tp[dst_idx] = v; - inds_tp[dst_idx] = (res_ind % axis_nelems); - }); - }); + sycl::event write_topk_ev = + topk_detail::write_out_impl( + exec_q, iter_nelems, k, arg_tp, tmp_tp, axis_nelems, axis_nelems, + vals_tp, inds_tp, {radix_sort_ev}); sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( exec_q, {write_topk_ev}, workspace_owner); From 4ef615e4e12fcda623c5aacbf51316f2b3ffe4f7 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 25 Dec 2024 06:43:46 -0600 Subject: [PATCH 16/30] Typo: smart_malloc_jost -> smart_malloc_host --- dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp index 87476221d8..72440ef969 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp @@ -133,7 +133,7 @@ smart_malloc_shared(std::size_t count, template std::unique_ptr -smart_malloc_jost(std::size_t count, +smart_malloc_host(std::size_t count, const sycl::queue &q, const sycl::property_list &propList = {}) { From 2608aea15a7b860126c8b475c85c82b5303a8ba7 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 25 Dec 2024 11:05:22 -0600 Subject: [PATCH 17/30] Reimplemented map_back_impl to process few elements per work-item Factored out map_back_impl projects indexing from flat index to a row-wise index. Removed dead code excluded by preprocessor conditional. 
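For reference, a minimal host-side sketch of the projection the map-back step
performs (illustrative only; the name `map_back_host` and the use of
`std::vector` are not part of this change): argsort yields linear indices into
the flattened `iter_nelems x row_size` buffer, and map-back keeps only each
index's position within its own row.

```
// Illustrative host equivalent of the map-back projection (not part of this patch).
#include <cstddef>
#include <vector>

// flat_inds holds linear indices into a flattened (n_rows x row_size) buffer;
// the row-local index is simply linear_index % row_size.
std::vector<std::size_t> map_back_host(const std::vector<std::size_t> &flat_inds,
                                       std::size_t row_size)
{
    std::vector<std::size_t> row_local(flat_inds.size());
    for (std::size_t i = 0; i < flat_inds.size(); ++i) {
        row_local[i] = flat_inds[i] % row_size;
    }
    return row_local;
}
```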
--- .../include/kernels/sorting/merge_sort.hpp | 33 ++--------- .../include/kernels/sorting/radix_sort.hpp | 28 ++-------- .../include/kernels/sorting/sort_utils.hpp | 21 +++++++ .../include/kernels/sorting/topk.hpp | 55 ++++++++++++++----- 4 files changed, 70 insertions(+), 67 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp index 002da0ff9b..dbf40c10fe 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/merge_sort.hpp @@ -816,24 +816,9 @@ sycl::event stable_argsort_axis1_contig_impl( using IotaKernelName = populate_index_data_krn; -#if 1 sycl::event populate_indexed_data_ev = iota_impl( exec_q, res_tp, total_nelems, depends); -#else - sycl::event populate_indexed_data_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - const sycl::range<1> range{total_nelems}; - - cgh.parallel_for(range, [=](sycl::id<1> id) { - size_t i = id[0]; - res_tp[i] = static_cast(i); - }); - }); -#endif - // Sort segments of the array sycl::event base_sort_ev = merge_sort_detail::sort_over_work_group_contig_impl( @@ -847,21 +832,11 @@ sycl::event stable_argsort_axis1_contig_impl( exec_q, iter_nelems, sort_nelems, res_tp, index_comp, sorted_block_size, {base_sort_ev}); - sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(merges_ev); - - auto temp_acc = - merge_sort_detail::GetReadOnlyAccess{}(res_tp, - cgh); + using MapBackKernelName = index_map_to_rows_krn; + using dpctl::tensor::kernels::sort_utils_detail::map_back_impl; - using KernelName = index_map_to_rows_krn; - - const sycl::range<1> range{total_nelems}; - - cgh.parallel_for(range, [=](sycl::id<1> id) { - res_tp[id] = (temp_acc[id] % sort_nelems); - }); - }); + sycl::event write_out_ev = map_back_impl( + exec_q, total_nelems, res_tp, res_tp, sort_nelems, {merges_ev}); return write_out_ev; } diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp index de0011e2ab..811aa581ff 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -1766,41 +1766,21 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q, using IotaKernelName = radix_argsort_iota_krn; -#if 1 using dpctl::tensor::kernels::sort_utils_detail::iota_impl; sycl::event iota_ev = iota_impl( exec_q, workspace, total_nelems, depends); -#else - - sycl::event iota_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - cgh.parallel_for( - sycl::range<1>(total_nelems), [=](sycl::id<1> id) { - size_t i = id[0]; - IndexTy sort_id = static_cast(i); - workspace[i] = sort_id; - }); - }); -#endif sycl::event radix_sort_ev = radix_sort_details::parallel_radix_sort_impl( exec_q, iter_nelems, sort_nelems, workspace, res_tp, proj_op, sort_ascending, {iota_ev}); - sycl::event map_back_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(radix_sort_ev); - - using KernelName = radix_argsort_index_write_out_krn; + using MapBackKernelName = radix_argsort_index_write_out_krn; + using dpctl::tensor::kernels::sort_utils_detail::map_back_impl; - cgh.parallel_for( - sycl::range<1>(total_nelems), [=](sycl::id<1> id) { - IndexTy linear_index = res_tp[id]; - res_tp[id] = (linear_index % sort_nelems); - }); - }); + sycl::event map_back_ev = map_back_impl( + exec_q, total_nelems, res_tp, res_tp, 
sort_nelems, {radix_sort_ev}); sycl::event cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free( exec_q, {map_back_ev}, workspace_owner); diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp index 1b1c361d75..f62a6c3fa0 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp @@ -97,6 +97,27 @@ sycl::event iota_impl(sycl::queue &exec_q, return e; } +template +sycl::event map_back_impl(sycl::queue &exec_q, + std::size_t nelems, + const IndexTy *flat_index_data, + IndexTy *reduced_index_data, + std::size_t row_size, + const std::vector &dependent_events) +{ + sycl::event map_back_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_events); + + cgh.parallel_for( + sycl::range<1>(nelems), [=](sycl::id<1> id) { + const IndexTy linear_index = flat_index_data[id]; + reduced_index_data[id] = (linear_index % row_size); + }); + }); + + return map_back_ev; +} + } // end of namespace sort_utils_detail } // end of namespace kernels } // end of namespace tensor diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index 3d07b26802..6b235a1427 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -83,23 +83,50 @@ sycl::event write_out_impl(sycl::queue &exec_q, IndexTy *inds_tp, const std::vector &depends) { - sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - cgh.parallel_for(iter_nelems * k, [=](sycl::id<1> id) { - const std::size_t gid = id[0]; + constexpr std::uint32_t lws = 64; + constexpr std::uint32_t n_wi = 4; + const std::size_t nelems = iter_nelems * k; + const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws); - const std::size_t iter_gid = gid / k; - const std::size_t axis_gid = gid - (iter_gid * k); + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{n_groups * lws}; + sycl::nd_range<1> ndRange{gRange, lRange}; - const std::size_t src_idx = iter_gid * iter_index_stride + axis_gid; - const std::size_t dst_idx = gid; - - const IndexTy res_ind = index_data[src_idx]; - const argTy v = arg_tp[res_ind]; + sycl::event write_out_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); - vals_tp[dst_idx] = v; - inds_tp[dst_idx] = (res_ind % axis_nelems); + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_linear_id(); + const auto &sg = it.get_sub_group(); + const std::uint32_t lane_id = sg.get_local_id()[0]; + const std::uint32_t sg_size = sg.get_max_local_range()[0]; + + const std::size_t start_id = + (gid - lane_id) * sg_size * n_wi + lane_id; + +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + const std::size_t data_id = start_id + i * sg_size; + + if (data_id < nelems) { + const std::size_t iter_id = data_id / k; + + /* + const std::size_t axis_gid = data_id - (iter_gid * k); + const std::size_t src_idx = iter_gid * iter_index_stride + + axis_gid; + */ + const std::size_t src_idx = + data_id + iter_id * (iter_index_stride - k); + + const IndexTy res_ind = index_data[src_idx]; + const argTy v = arg_tp[res_ind]; + + const std::size_t dst_idx = data_id; + vals_tp[dst_idx] = v; + inds_tp[dst_idx] = (res_ind % axis_nelems); + } + } }); }); From 15da303f33eaff1d4e4b38c9f5f64bbe03fb5dcd Mon Sep 17 00:00:00 2001 From: Oleksandr 
Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 25 Dec 2024 16:51:13 -0600 Subject: [PATCH 18/30] Replace use of std::log2(size_t_value) Replaced it with a hand-written implementation of ceil_log2(n), such that n <= (decltype(n){1} << ceil_log2(n)) is true for all positive values of `n` in the range. --- .../include/kernels/sorting/radix_sort.hpp | 41 ++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp index 811aa581ff..887841acb8 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -63,6 +63,45 @@ class radix_sort_reorder_peer_kernel; template class radix_sort_reorder_kernel; +/*! @brief Computes smallest exponent such that `n <= (1 << exponent)` */ +template && + sizeof(SizeT) == sizeof(std::uint64_t), + int> = 0> +std::uint32_t ceil_log2(SizeT n) +{ + if (n <= 1) + return std::uint32_t{1}; + + std::uint32_t exp{1}; + --n; + if (n >= (SizeT{1} << 32)) { + n >>= 32; + exp += 32; + } + if (n >= (SizeT{1} << 16)) { + n >>= 16; + exp += 16; + } + if (n >= (SizeT{1} << 8)) { + n >>= 8; + exp += 8; + } + if (n >= (SizeT{1} << 4)) { + n >>= 4; + exp += 4; + } + if (n >= (SizeT{1} << 2)) { + n >>= 2; + exp += 2; + } + if (n >= (SizeT{1} << 1)) { + n >>= 1; + ++exp; + } + return exp; +} + //---------------------------------------------------------- // bitwise order-preserving conversions to unsigned integers //---------------------------------------------------------- @@ -1145,7 +1184,7 @@ struct subgroup_radix_sort const std::size_t max_slm_size = dev.template get_info() / 2; - const auto n_uniform = 1 << (std::uint32_t(std::log2(n - 1)) + 1); + const auto n_uniform = 1 << ceil_log2(n); const auto req_slm_size_val = sizeof(T) * n_uniform; return ((req_slm_size_val + req_slm_size_counters) <= max_slm_size) From fb9bc436cf68624b4be5d0bcb99ad7108b8ca959 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 26 Dec 2024 21:03:59 -0600 Subject: [PATCH 19/30] Remove dead code disabled via preprocessor conditional --- .../include/kernels/sorting/topk.hpp | 27 ------------------- 1 file changed, 27 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index 6b235a1427..0ff3fc4723 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -161,24 +161,10 @@ topk_full_merge_sort_impl(sycl::queue &exec_q, using IotaKernelName = topk_populate_index_data_krn; -#if 1 using dpctl::tensor::kernels::sort_utils_detail::iota_impl; sycl::event populate_indexed_data_ev = iota_impl( exec_q, index_data, iter_nelems * axis_nelems, depends); -#else - sycl::event populate_indexed_data_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - auto const &range = sycl::range<1>(iter_nelems * axis_nelems); - - cgh.parallel_for(range, [=](sycl::id<1> id) { - std::size_t i = id[0]; - index_data[i] = static_cast(i); - }); - }); -#endif std::size_t sorted_block_size; // Sort segments of the array @@ -513,23 +499,10 @@ sycl::event topk_radix_impl(sycl::queue &exec_q, using IotaKernelName = topk_iota_krn; -#if 1 using dpctl::tensor::kernels::sort_utils_detail::iota_impl; sycl::event iota_ev = iota_impl( exec_q, workspace, 
total_nelems, depends); -#else - sycl::event iota_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - cgh.parallel_for( - sycl::range<1>(total_nelems), [=](sycl::id<1> id) { - size_t i = id[0]; - IndexTy sort_id = static_cast(i); - workspace[i] = sort_id; - }); - }); -#endif sycl::event radix_sort_ev = radix_sort_details::parallel_radix_sort_impl( From 6cf36b1760132b2f2e1d200808cfaa79858c11f2 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Wed, 25 Dec 2024 13:20:00 -0600 Subject: [PATCH 20/30] Add information displayed on failure, renamed variables Add check of computed against expected indices --- dpctl/tests/test_usm_ndarray_top_k.py | 97 +++++++++++++++++++++++---- 1 file changed, 83 insertions(+), 14 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_top_k.py b/dpctl/tests/test_usm_ndarray_top_k.py index 954c21a990..b0f3f798bd 100644 --- a/dpctl/tests/test_usm_ndarray_top_k.py +++ b/dpctl/tests/test_usm_ndarray_top_k.py @@ -20,6 +20,38 @@ from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported +def _expected_largest_inds(inp, n, shift, k): + "Computed expected top_k indices for mode='largest'" + assert k < n + ones_start_id = shift % (2 * n) + + alloc_dev = inp.device + + if ones_start_id < n: + expected_inds = dpt.arange( + ones_start_id, ones_start_id + k, dtype="i8", device=alloc_dev + ) + else: + # wrap-around + ones_end_id = (ones_start_id + n) % (2 * n) + if ones_end_id >= k: + expected_inds = dpt.arange(k, dtype="i8", device=alloc_dev) + else: + expected_inds = dpt.concat( + ( + dpt.arange(ones_end_id, dtype="i8", device=alloc_dev), + dpt.arange( + ones_start_id, + ones_start_id + k - ones_end_id, + dtype="i8", + device=alloc_dev, + ), + ) + ) + + return expected_inds + + @pytest.mark.parametrize( "dtype", [ @@ -38,23 +70,57 @@ "c16", ], ) -@pytest.mark.parametrize("n", [33, 255, 511, 1021, 8193]) -def test_topk_1d_largest(dtype, n): +@pytest.mark.parametrize("n", [33, 43, 255, 511, 1021, 8193]) +def test_top_k_1d_largest(dtype, n): q = get_queue_or_skip() skip_if_dtype_not_supported(dtype, q) + shift, k = 734, 5 o = dpt.ones(n, dtype=dtype) z = dpt.zeros(n, dtype=dtype) - zo = dpt.concat((o, z)) - inp = dpt.roll(zo, 734) - k = 5 + oz = dpt.concat((o, z)) + inp = dpt.roll(oz, shift) + + expected_inds = _expected_largest_inds(oz, n, shift, k) s = dpt.top_k(inp, k, mode="largest") assert s.values.shape == (k,) assert s.values.dtype == inp.dtype assert s.indices.shape == (k,) - assert dpt.all(s.values == dpt.ones(k, dtype=dtype)) - assert dpt.all(s.values == inp[s.indices]) + assert dpt.all(s.indices == expected_inds) + assert dpt.all(s.values == dpt.ones(k, dtype=dtype)), s.values + assert dpt.all(s.values == inp[s.indices]), s.indices + + +def _expected_smallest_inds(inp, n, shift, k): + "Computed expected top_k indices for mode='smallest'" + assert k < n + zeros_start_id = (n + shift) % (2 * n) + zeros_end_id = (shift) % (2 * n) + + alloc_dev = inp.device + + if zeros_start_id < zeros_end_id: + expected_inds = dpt.arange( + zeros_start_id, zeros_start_id + k, dtype="i8", device=alloc_dev + ) + else: + if zeros_end_id >= k: + expected_inds = dpt.arange(k, dtype="i8", device=alloc_dev) + else: + expected_inds = dpt.concat( + ( + dpt.arange(zeros_end_id, dtype="i8", device=alloc_dev), + dpt.arange( + zeros_start_id, + zeros_start_id + k - zeros_end_id, + dtype="i8", + device=alloc_dev, + ), + ) + ) + + return expected_inds @pytest.mark.parametrize( @@ -75,20 +141,23 @@ def 
test_topk_1d_largest(dtype, n): "c16", ], ) -@pytest.mark.parametrize("n", [33, 255, 257, 513, 1021, 8193]) -def test_topk_1d_smallest(dtype, n): +@pytest.mark.parametrize("n", [37, 39, 61, 255, 257, 513, 1021, 8193]) +def test_top_k_1d_smallest(dtype, n): q = get_queue_or_skip() skip_if_dtype_not_supported(dtype, q) + shift, k = 734, 5 o = dpt.ones(n, dtype=dtype) z = dpt.zeros(n, dtype=dtype) - zo = dpt.concat((o, z)) - inp = dpt.roll(zo, 734) - k = 5 + oz = dpt.concat((o, z)) + inp = dpt.roll(oz, shift) + + expected_inds = _expected_smallest_inds(oz, n, shift, k) s = dpt.top_k(inp, k, mode="smallest") assert s.values.shape == (k,) assert s.values.dtype == inp.dtype assert s.indices.shape == (k,) - assert dpt.all(s.values == dpt.zeros(k, dtype=dtype)) - assert dpt.all(s.values == inp[s.indices]) + assert dpt.all(s.indices == expected_inds) + assert dpt.all(s.values == dpt.zeros(k, dtype=dtype)), s.values + assert dpt.all(s.values == inp[s.indices]), s.indices From 4ec252e5c5a51bd24ee843ccb1a52d4bb0c81c08 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 27 Dec 2024 08:56:21 -0600 Subject: [PATCH 21/30] Add comment to explain ceil_log2 algo --- dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp index 887841acb8..335b285bbf 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -75,6 +75,8 @@ std::uint32_t ceil_log2(SizeT n) std::uint32_t exp{1}; --n; + // if n > 2^b, n = q * 2^b + r for q > 0 and 0 <= r < 2^b + // ceil_log2(q * 2^b + r) == ceil_log2(q * 2^b) == q + ceil_log2(n1) if (n >= (SizeT{1} << 32)) { n >>= 32; exp += 32; } From 1436522844d305ef09053e347c059ceaba38405e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 27 Dec 2024 08:57:01 -0600 Subject: [PATCH 22/30] Add static asserts to async_smart_free One asserts that at least one unique pointer is specified. Another that specified arguments are unique pointers with USMDeleter. 
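The intent of the two static asserts can be illustrated with a reduced, self-contained sketch: a trait that accepts only unique_ptr instances using the expected deleter, folded over the argument pack. The real code checks for the library's USMDeleter; the Deleter, valid_smart_ptr, and check_all names below are illustrative stand-ins rather than the patch's API.

#include <memory>
#include <type_traits>

// stand-in for the USM deleter used by smart_malloc_* in the patch
struct Deleter
{
    template <typename T> void operator()(T *p) const { delete p; }
};

template <typename T> struct valid_smart_ptr : std::false_type {};
template <typename T>
struct valid_smart_ptr<std::unique_ptr<T, Deleter>> : std::true_type {};
template <typename T>
struct valid_smart_ptr<std::unique_ptr<T, Deleter> &> : std::true_type {};

template <typename... Args> void check_all(Args &&...)
{
    // at least one owner must be handed over for deferred deallocation
    static_assert(sizeof...(Args) > 0, "at least one smart pointer required");
    // every argument must be a unique_ptr with the expected deleter
    static_assert((... && valid_smart_ptr<Args>::value),
                  "arguments must be unique_ptr created with the USM deleter");
}

int main()
{
    std::unique_ptr<int, Deleter> owner(new int{42});
    check_all(owner); // compiles; check_all(42) would trip the static_assert
    return 0;
}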
--- .../include/utils/sycl_alloc_utils.hpp | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp index 72440ef969..f67e1bba1f 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_alloc_utils.hpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include "sycl/sycl.hpp" @@ -140,12 +141,50 @@ smart_malloc_host(std::size_t count, return smart_malloc(count, q, sycl::usm::alloc::host, propList); } +namespace +{ +template struct valid_smart_ptr : public std::false_type +{ +}; + +template +struct valid_smart_ptr &> + : public std::is_same +{ +}; + +template +struct valid_smart_ptr> + : public std::is_same +{ +}; + +// base case +template struct all_valid_smart_ptrs +{ + static constexpr bool value = true; +}; + +template +struct all_valid_smart_ptrs +{ + static constexpr bool value = valid_smart_ptr::value && + (all_valid_smart_ptrs::value); +}; +} // namespace + template sycl::event async_smart_free(sycl::queue &exec_q, const std::vector &depends, Args &&...args) { constexpr std::size_t n = sizeof...(Args); + static_assert( + n > 0, "async_smart_free requires at least one smart pointer argument"); + + static_assert( + all_valid_smart_ptrs::value, + "async_smart_free requires unique_ptr created with smart_malloc"); std::vector ptrs; ptrs.reserve(n); From f0e3245f4466e3197a0bff12d40b57197db32a89 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 27 Dec 2024 18:08:51 -0800 Subject: [PATCH 23/30] Skip top_k test known to fail for some CPU architectures --- dpctl/tests/test_usm_ndarray_top_k.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_top_k.py b/dpctl/tests/test_usm_ndarray_top_k.py index b0f3f798bd..01a83ef293 100644 --- a/dpctl/tests/test_usm_ndarray_top_k.py +++ b/dpctl/tests/test_usm_ndarray_top_k.py @@ -55,7 +55,7 @@ def _expected_largest_inds(inp, n, shift, k): @pytest.mark.parametrize( "dtype", [ - "i1", + pytest.param("i1", marks=pytest.mark.skip(reason="CPU bug")), "u1", "i2", "u2", @@ -74,6 +74,8 @@ def _expected_largest_inds(inp, n, shift, k): def test_top_k_1d_largest(dtype, n): q = get_queue_or_skip() skip_if_dtype_not_supported(dtype, q) + if dtype == "i1": + pytest.skip() shift, k = 734, 5 o = dpt.ones(n, dtype=dtype) @@ -126,7 +128,7 @@ def _expected_smallest_inds(inp, n, shift, k): @pytest.mark.parametrize( "dtype", [ - "i1", + pytest.param("i1", marks=pytest.mark.skip(reason="CPU bug")), "u1", "i2", "u2", From 8f4dcdd05fb1e5118da91810aee08b7cb700efdf Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 31 Dec 2024 10:15:11 -0600 Subject: [PATCH 24/30] Fixed blunder in work-item id to data_id computation gid-lane_id is already a multiple of sg_size. 
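The corrected indexing can be sanity-checked on the host: in the sub-group-strided layout, the work-item with global id gid starts at (gid - lane_id) * n_wi + lane_id and steps by sg_size, so consecutive lanes touch consecutive elements and every element is visited exactly once. A standalone sketch follows (plain C++; the sg_size, n_wi, and range values are illustrative, and lane_id is modelled as gid % sg_size rather than queried from a sub-group):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

int main()
{
    constexpr std::uint32_t sg_size = 16; // modelled sub-group size
    constexpr std::uint32_t n_wi = 4;     // elements per work-item
    const std::size_t nelems = 1000;      // elements to cover
    const std::size_t n_items = 1024;     // global range, multiple of sg_size

    std::vector<int> visits(nelems, 0);
    for (std::size_t gid = 0; gid < n_items; ++gid) {
        const std::uint32_t lane_id = gid % sg_size;
        // corrected formula: gid - lane_id is already a multiple of sg_size,
        // so it is scaled by n_wi only (the previous code multiplied by
        // sg_size again, which skipped over ranges of elements)
        const std::size_t start_id = (gid - lane_id) * n_wi + lane_id;
        for (std::uint32_t i = 0; i < n_wi; ++i) {
            const std::size_t data_id = start_id + i * sg_size;
            if (data_id < nelems) {
                ++visits[data_id];
            }
        }
    }
    for (std::size_t i = 0; i < nelems; ++i) {
        assert(visits[i] == 1); // each element is visited exactly once
    }
    return 0;
}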
--- dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp index 0ff3fc4723..1828c88eb4 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/topk.hpp @@ -101,8 +101,7 @@ sycl::event write_out_impl(sycl::queue &exec_q, const std::uint32_t lane_id = sg.get_local_id()[0]; const std::uint32_t sg_size = sg.get_max_local_range()[0]; - const std::size_t start_id = - (gid - lane_id) * sg_size * n_wi + lane_id; + const std::size_t start_id = (gid - lane_id) * n_wi + lane_id; #pragma unroll for (std::uint32_t i = 0; i < n_wi; ++i) { From b6806ba880c7db1f46076484317993dedf5c7408 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Tue, 31 Dec 2024 10:16:11 -0600 Subject: [PATCH 25/30] Replace map_back_impl in sort_utils Change kernel to process few data elements in the work-item. --- .../include/kernels/sorting/sort_utils.hpp | 31 ++++++++++++++++--- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp index f62a6c3fa0..a81f528852 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp @@ -105,14 +105,35 @@ sycl::event map_back_impl(sycl::queue &exec_q, std::size_t row_size, const std::vector &dependent_events) { + constexpr std::uint32_t lws = 64; + constexpr std::uint32_t n_wi = 4; + const std::size_t n_groups = (nelems + lws * n_wi - 1) / (n_wi * lws); + + sycl::range<1> lRange{lws}; + sycl::range<1> gRange{n_groups * lws}; + sycl::nd_range<1> ndRange{gRange, lRange}; + sycl::event map_back_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(dependent_events); - cgh.parallel_for( - sycl::range<1>(nelems), [=](sycl::id<1> id) { - const IndexTy linear_index = flat_index_data[id]; - reduced_index_data[id] = (linear_index % row_size); - }); + cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { + const std::size_t gid = it.get_global_linear_id(); + const auto &sg = it.get_sub_group(); + const std::uint32_t lane_id = sg.get_local_id()[0]; + const std::uint32_t sg_size = sg.get_max_local_range()[0]; + + const std::size_t start_id = (gid - lane_id) * n_wi + lane_id; + +#pragma unroll + for (std::uint32_t i = 0; i < n_wi; ++i) { + const std::size_t data_id = start_id + i * sg_size; + + if (data_id < nelems) { + const IndexTy linear_index = flat_index_data[data_id]; + reduced_index_data[data_id] = (linear_index % row_size); + } + } + }); }); return map_back_ev; From b47d9f3ca96e31eada000ab4c2aa87adf6400f32 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 2 Jan 2025 10:40:26 -0600 Subject: [PATCH 26/30] Use get_global_linear_id instead of get_global_id and rely on implicit conversion --- dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp index a81f528852..d1f166f945 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/sort_utils.hpp @@ -58,7 +58,7 
@@ sycl::event iota_impl(sycl::queue &exec_q, sycl::event e = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(dependent_events); cgh.parallel_for(ndRange, [=](sycl::nd_item<1> it) { - const std::size_t gid = it.get_global_id(); + const std::size_t gid = it.get_global_linear_id(); const auto &sg = it.get_sub_group(); const std::uint32_t lane_id = sg.get_local_id()[0]; From 0493485a85f47dc33ba0c6d266a4e3caa89afd6e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Thu, 2 Jan 2025 10:41:03 -0600 Subject: [PATCH 27/30] Counters in one-workgroup kernel to use uint16_t from uint32_t Counters can not exceed uint16_t max, because the kernel assumes that the number of elements to sort fits into uint16_t. The change reduces the kernel SLM footprint. Also, remove use of std::move, uint16_t->std::uint16_t, etc Replace size_t->std::size_t, uint32_t->std::uint32_t Use `if constexpr` in order-preservign-cast for better readability. --- .../include/kernels/sorting/radix_sort.hpp | 159 ++++++++++-------- 1 file changed, 86 insertions(+), 73 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp index 335b285bbf..dd20e647ff 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -70,13 +70,14 @@ template = 0> std::uint32_t ceil_log2(SizeT n) { + // if n > 2^b, n = q * 2^b + r for q > 0 and 0 <= r < 2^b + // floor_log2(q * 2^b + r) == floor_log2(q * 2^b) == q + floor_log2(n1) + // ceil_log2(n) == 1 + floor_log2(n-1) if (n <= 1) return std::uint32_t{1}; std::uint32_t exp{1}; --n; - // if n > 2^b, n = q * 2^b + r for q > 0 and 0 <= r < 2^b - // ceil_log2(q * 2^b + r) == ceil_log2(q * 2^b) == q + ceil_log2(n1) if (n >= (SizeT{1} << 32)) { n >>= 32; exp += 32; @@ -137,16 +138,20 @@ template order_preserving_cast(IntT val) { using UIntT = std::make_unsigned_t; - // ascending_mask: 100..0 - constexpr UIntT ascending_mask = - (UIntT(1) << std::numeric_limits::digits); - // descending_mask: 011..1 - constexpr UIntT descending_mask = (std::numeric_limits::max() >> 1); - - constexpr UIntT mask = (is_ascending) ? ascending_mask : descending_mask; const UIntT uint_val = sycl::bit_cast(val); - return (uint_val ^ mask); + if constexpr (is_ascending) { + // ascending_mask: 100..0 + constexpr UIntT ascending_mask = + (UIntT(1) << std::numeric_limits::digits); + return (uint_val ^ ascending_mask); + } + else { + // descending_mask: 011..1 + constexpr UIntT descending_mask = + (std::numeric_limits::max() >> 1); + return (uint_val ^ descending_mask); + } } template std::uint16_t order_preserving_cast(sycl::half val) @@ -1045,10 +1050,10 @@ template class radix_sort_one_wg_krn; template + std::uint16_t req_sub_group_size = (block_size < 4 ? 
32 : 16)> struct subgroup_radix_sort { private: @@ -1062,8 +1067,8 @@ struct subgroup_radix_sort public: template sycl::event operator()(sycl::queue &exec_q, - size_t n_iters, - size_t n_to_sort, + std::size_t n_iters, + std::size_t n_to_sort, ValueT *input_ptr, OutputT *output_ptr, ProjT proj_op, @@ -1160,8 +1165,8 @@ struct subgroup_radix_sort }; static_assert(wg_size <= 1024); - static constexpr uint16_t bin_count = (1 << radix); - static constexpr uint16_t counter_buf_sz = wg_size * bin_count + 1; + static constexpr std::uint16_t bin_count = (1 << radix); + static constexpr std::uint16_t counter_buf_sz = wg_size * bin_count + 1; enum class temp_allocations { @@ -1177,7 +1182,7 @@ struct subgroup_radix_sort assert(n <= (SizeT(1) << 16)); constexpr auto req_slm_size_counters = - counter_buf_sz * sizeof(uint32_t); + counter_buf_sz * sizeof(std::uint16_t); const auto &dev = exec_q.get_device(); @@ -1212,9 +1217,9 @@ struct subgroup_radix_sort typename SLM_value_tag, typename SLM_counter_tag> sycl::event operator()(sycl::queue &exec_q, - size_t n_iters, - size_t n_batch_size, - size_t n_values, + std::size_t n_iters, + std::size_t n_batch_size, + std::size_t n_values, InputT *input_arr, OutputT *output_arr, const ProjT &proj_op, @@ -1228,7 +1233,7 @@ struct subgroup_radix_sort assert(n_values <= static_cast(block_size) * static_cast(wg_size)); - uint16_t n = static_cast(n_values); + const std::uint16_t n = static_cast(n_values); static_assert(std::is_same_v, OutputT>); using ValueT = OutputT; @@ -1237,17 +1242,18 @@ struct subgroup_radix_sort TempBuf buf_val( n_batch_size, static_cast(block_size * wg_size)); - TempBuf buf_count( + TempBuf buf_count( n_batch_size, static_cast(counter_buf_sz)); sycl::range<1> lRange{wg_size}; sycl::event sort_ev; - std::vector deps = depends; + std::vector deps{depends}; - std::size_t n_batches = (n_iters + n_batch_size - 1) / n_batch_size; + const std::size_t n_batches = + (n_iters + n_batch_size - 1) / n_batch_size; - for (size_t batch_id = 0; batch_id < n_batches; ++batch_id) { + for (std::size_t batch_id = 0; batch_id < n_batches; ++batch_id) { const std::size_t block_start = batch_id * n_batch_size; @@ -1286,46 +1292,49 @@ struct subgroup_radix_sort const std::size_t iter_exchange_offset = iter_id * exchange_acc_iter_stride; - uint16_t wi = ndit.get_local_linear_id(); - uint16_t begin_bit = 0; + std::uint16_t wi = ndit.get_local_linear_id(); + std::uint16_t begin_bit = 0; - constexpr uint16_t end_bit = + constexpr std::uint16_t end_bit = number_of_bits_in_type(); -// copy from input array into values + // copy from input array into values #pragma unroll - for (uint16_t i = 0; i < block_size; ++i) { - const uint16_t id = wi * block_size + i; - if (id < n) - values[i] = std::move( - this_input_arr[iter_val_offset + id]); + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t id = wi * block_size + i; + values[i] = + (id < n) ? 
this_input_arr[iter_val_offset + id] + : ValueT{}; } while (true) { // indices for indirect access in the "re-order" // phase - uint16_t indices[block_size]; + std::uint16_t indices[block_size]; { // pointers to bucket's counters - uint32_t *counters[block_size]; + std::uint16_t *counters[block_size]; // counting phase auto pcounter = get_accessor_pointer(counter_acc) + (wi + iter_counter_offset); -// initialize counters + // initialize counters #pragma unroll - for (uint16_t i = 0; i < bin_count; ++i) - pcounter[i * wg_size] = std::uint32_t{0}; + for (std::uint16_t i = 0; i < bin_count; ++i) + pcounter[i * wg_size] = std::uint16_t{0}; sycl::group_barrier(ndit.get_group()); if (is_ascending) { #pragma unroll - for (uint16_t i = 0; i < block_size; ++i) { - const uint16_t id = wi * block_size + i; - constexpr uint16_t bin_mask = + for (std::uint16_t i = 0; i < block_size; + ++i) + { + const std::uint16_t id = + wi * block_size + i; + constexpr std::uint16_t bin_mask = bin_count - 1; // points to the padded element, i.e. id @@ -1334,7 +1343,7 @@ struct subgroup_radix_sort default_out_of_range_bin_id = bin_mask; - const uint16_t bin = + const std::uint16_t bin = (id < n) ? get_bucket_id( order_preserving_cast< @@ -1352,9 +1361,12 @@ struct subgroup_radix_sort } else { #pragma unroll - for (uint16_t i = 0; i < block_size; ++i) { - const uint16_t id = wi * block_size + i; - constexpr uint16_t bin_mask = + for (std::uint16_t i = 0; i < block_size; + ++i) + { + const std::uint16_t id = + wi * block_size + i; + constexpr std::uint16_t bin_mask = bin_count - 1; // points to the padded element, i.e. id @@ -1363,7 +1375,7 @@ struct subgroup_radix_sort default_out_of_range_bin_id = bin_mask; - const uint16_t bin = + const std::uint16_t bin = (id < n) ? get_bucket_id( order_preserving_cast< @@ -1386,13 +1398,14 @@ struct subgroup_radix_sort { // scan contiguous numbers - uint16_t bin_sum[bin_count]; + std::uint16_t bin_sum[bin_count]; const std::size_t counter_offset0 = iter_counter_offset + wi * bin_count; bin_sum[0] = counter_acc[counter_offset0]; #pragma unroll - for (uint16_t i = 1; i < bin_count; ++i) + for (std::uint16_t i = 1; i < bin_count; + ++i) bin_sum[i] = bin_sum[i - 1] + counter_acc[counter_offset0 + i]; @@ -1400,15 +1413,16 @@ struct subgroup_radix_sort sycl::group_barrier(ndit.get_group()); // exclusive scan local sum - uint16_t sum_scan = + std::uint16_t sum_scan = sycl::exclusive_scan_over_group( ndit.get_group(), bin_sum[bin_count - 1], - sycl::plus()); + sycl::plus()); // add to local sum, generate exclusive scan result #pragma unroll - for (uint16_t i = 0; i < bin_count; ++i) + for (std::uint16_t i = 0; i < bin_count; + ++i) counter_acc[counter_offset0 + i + 1] = sum_scan + bin_sum[i]; @@ -1420,11 +1434,13 @@ struct subgroup_radix_sort } #pragma unroll - for (uint16_t i = 0; i < block_size; ++i) { + for (std::uint16_t i = 0; i < block_size; ++i) { // a global index is a local offset plus a // global base index indices[i] += *counters[i]; } + + sycl::group_barrier(ndit.get_group()); } begin_bit += radix; @@ -1432,39 +1448,36 @@ struct subgroup_radix_sort // "re-order" phase sycl::group_barrier(ndit.get_group()); if (begin_bit >= end_bit) { -// the last iteration - writing out the result + // the last iteration - writing out the result #pragma unroll - for (uint16_t i = 0; i < block_size; ++i) { - const uint16_t r = indices[i]; + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t r = indices[i]; if (r < n) { - // move the values to source range and - // destroy the 
values this_output_arr[iter_val_offset + r] = - std::move(values[i]); + values[i]; } } return; } -// data exchange + // data exchange #pragma unroll - for (uint16_t i = 0; i < block_size; ++i) { - const uint16_t r = indices[i]; + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t r = indices[i]; if (r < n) exchange_acc[iter_exchange_offset + r] = - std::move(values[i]); + values[i]; } sycl::group_barrier(ndit.get_group()); #pragma unroll - for (uint16_t i = 0; i < block_size; ++i) { - const uint16_t id = wi * block_size + i; + for (std::uint16_t i = 0; i < block_size; ++i) { + const std::uint16_t id = wi * block_size + i; if (id < n) - values[i] = std::move( - exchange_acc[iter_exchange_offset + - id]); + values[i] = + exchange_acc[iter_exchange_offset + id]; } sycl::group_barrier(ndit.get_group()); @@ -1736,10 +1749,10 @@ radix_sort_axis1_contig_impl(sycl::queue &exec_q, const bool sort_ascending, // number of sub-arrays to sort (num. of rows in a // matrix when sorting over rows) - size_t iter_nelems, + std::size_t iter_nelems, // size of each array to sort (length of rows, // i.e. number of columns) - size_t sort_nelems, + std::size_t sort_nelems, const char *arg_cp, char *res_cp, ssize_t iter_arg_offset, @@ -1775,10 +1788,10 @@ radix_argsort_axis1_contig_impl(sycl::queue &exec_q, const bool sort_ascending, // number of sub-arrays to sort (num. of // rows in a matrix when sorting over rows) - size_t iter_nelems, + std::size_t iter_nelems, // size of each array to sort (length of // rows, i.e. number of columns) - size_t sort_nelems, + std::size_t sort_nelems, const char *arg_cp, char *res_cp, ssize_t iter_arg_offset, From 5125e11eb4b4d5fe1822746e574b17630a9c4b3e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Fri, 3 Jan 2025 19:18:45 -0600 Subject: [PATCH 28/30] Apply work-around for failing tests with CPU device and short sub-groups The team developing OpenCL:CPU device runtime and compiler was notified. See CMPLRLLVM-64592 Once fixed, the work-around should be removed. 
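The gating condition relies on the sub-group size the runtime actually selects for the kernel, which is obtained by building an executable kernel_bundle for the kernel id and querying the kernel_device_specific::max_sub_group_size descriptor. A minimal sketch of that pattern is shown below; it assumes MyKernel is a kernel name class used by a parallel_for elsewhere in the program, and is not the literal code of the patch.

#include <cstdint>
#include <sycl/sycl.hpp>

class MyKernel; // kernel name type used by some parallel_for in the program

std::uint32_t queried_max_sub_group_size(sycl::queue &q)
{
    const auto kernel_id = sycl::get_kernel_id<MyKernel>();
    auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(
        q.get_context(), {q.get_device()}, {kernel_id});
    const auto krn = kb.get_kernel(kernel_id);
    // device-specific query: the sub-group size chosen for this kernel
    return krn.get_info<
        sycl::info::kernel_device_specific::max_sub_group_size>(
        q.get_device());
}

// The work-around is then applied only for CPU devices with short sub-groups,
// along the lines of:
//   bool needed = q.get_device().has(sycl::aspect::cpu)
//                 && (queried_max_sub_group_size(q) < 16);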
--- .../include/kernels/sorting/radix_sort.hpp | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp index dd20e647ff..15f22b334e 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting/radix_sort.hpp @@ -1253,6 +1253,24 @@ struct subgroup_radix_sort const std::size_t n_batches = (n_iters + n_batch_size - 1) / n_batch_size; + const auto &kernel_id = sycl::get_kernel_id(); + + auto const &ctx = exec_q.get_context(); + auto const &dev = exec_q.get_device(); + auto kb = sycl::get_kernel_bundle( + ctx, {dev}, {kernel_id}); + + const auto &krn = kb.get_kernel(kernel_id); + + const std::uint32_t krn_sg_size = krn.template get_info< + sycl::info::kernel_device_specific::max_sub_group_size>(dev); + + // due to a bug in CPU device implementation, an additional + // synchronization is necessary for short sub-group sizes + const bool work_around_needed = + exec_q.get_device().has(sycl::aspect::cpu) && + (krn_sg_size < 16); + for (std::size_t batch_id = 0; batch_id < n_batches; ++batch_id) { const std::size_t block_start = batch_id * n_batch_size; @@ -1269,6 +1287,7 @@ struct subgroup_radix_sort sort_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(deps); + cgh.use_kernel_bundle(kb); // allocation to use for value exchanges auto exchange_acc = buf_val.get_acc(cgh); @@ -1357,6 +1376,11 @@ struct subgroup_radix_sort counters[i] = &pcounter[bin * wg_size]; indices[i] = *counters[i]; *counters[i] = indices[i] + 1; + + if (work_around_needed) { + sycl::group_barrier( + ndit.get_group()); + } } } else { @@ -1389,6 +1413,11 @@ struct subgroup_radix_sort counters[i] = &pcounter[bin * wg_size]; indices[i] = *counters[i]; *counters[i] = indices[i] + 1; + + if (work_around_needed) { + sycl::group_barrier( + ndit.get_group()); + } } } From 505b64c419a42b2fe890fd523c96ab8d9e4db163 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com> Date: Sat, 28 Dec 2024 11:59:35 -0600 Subject: [PATCH 29/30] Remove skipping of tests for i1/i2 dtypes since work-around was applied in C++. 
Add tests for 2d input arrays, for axis=0 and axis=1 Add a test for non-contiguous input, 0d input, validation 100% coverage of top_k function implementation achieved --- dpctl/tests/test_usm_ndarray_top_k.py | 162 +++++++++++++++++++++++++- 1 file changed, 156 insertions(+), 6 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_top_k.py b/dpctl/tests/test_usm_ndarray_top_k.py index 01a83ef293..a27853d8c8 100644 --- a/dpctl/tests/test_usm_ndarray_top_k.py +++ b/dpctl/tests/test_usm_ndarray_top_k.py @@ -55,7 +55,7 @@ def _expected_largest_inds(inp, n, shift, k): @pytest.mark.parametrize( "dtype", [ - pytest.param("i1", marks=pytest.mark.skip(reason="CPU bug")), + "i1", "u1", "i2", "u2", @@ -74,8 +74,6 @@ def _expected_largest_inds(inp, n, shift, k): def test_top_k_1d_largest(dtype, n): q = get_queue_or_skip() skip_if_dtype_not_supported(dtype, q) - if dtype == "i1": - pytest.skip() shift, k = 734, 5 o = dpt.ones(n, dtype=dtype) @@ -89,9 +87,9 @@ def test_top_k_1d_largest(dtype, n): assert s.values.shape == (k,) assert s.values.dtype == inp.dtype assert s.indices.shape == (k,) - assert dpt.all(s.indices == expected_inds) assert dpt.all(s.values == dpt.ones(k, dtype=dtype)), s.values assert dpt.all(s.values == inp[s.indices]), s.indices + assert dpt.all(s.indices == expected_inds), (s.indices, expected_inds) def _expected_smallest_inds(inp, n, shift, k): @@ -128,7 +126,7 @@ def _expected_smallest_inds(inp, n, shift, k): @pytest.mark.parametrize( "dtype", [ - pytest.param("i1", marks=pytest.mark.skip(reason="CPU bug")), + "i1", "u1", "i2", "u2", @@ -160,6 +158,158 @@ def test_top_k_1d_smallest(dtype, n): assert s.values.shape == (k,) assert s.values.dtype == inp.dtype assert s.indices.shape == (k,) - assert dpt.all(s.indices == expected_inds) assert dpt.all(s.values == dpt.zeros(k, dtype=dtype)), s.values assert dpt.all(s.values == inp[s.indices]), s.indices + assert dpt.all(s.indices == expected_inds), (s.indices, expected_inds) + + +@pytest.mark.parametrize( + "dtype", + [ + # skip short types to ensure that m*n can be represented + # in the type + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +@pytest.mark.parametrize("n", [37, 39, 61, 255, 257, 513, 1021, 8193]) +def test_top_k_2d_largest(dtype, n): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + m, k = 8, 3 + if dtype == "f2" and m * n > 2000: + pytest.skip( + "f2 can not distinguish between large integers used in this test" + ) + + x = dpt.reshape(dpt.arange(m * n, dtype=dtype), (m, n)) + + r = dpt.top_k(x, k, axis=1) + + assert r.values.shape == (m, k) + assert r.indices.shape == (m, k) + expected_inds = dpt.reshape(dpt.arange(n, dtype=r.indices.dtype), (1, n))[ + :, -k: + ] + assert expected_inds.shape == (1, k) + assert dpt.all( + dpt.sort(r.indices, axis=1) == dpt.sort(expected_inds, axis=1) + ), (r.indices, expected_inds) + expected_vals = x[:, -k:] + assert dpt.all( + dpt.sort(r.values, axis=1) == dpt.sort(expected_vals, axis=1) + ) + + +@pytest.mark.parametrize( + "dtype", + [ + # skip short types to ensure that m*n can be represented + # in the type + "i4", + "u4", + "i8", + "u8", + "f2", + "f4", + "f8", + "c8", + "c16", + ], +) +@pytest.mark.parametrize("n", [37, 39, 61, 255, 257, 513, 1021, 8193]) +def test_top_k_2d_smallest(dtype, n): + q = get_queue_or_skip() + skip_if_dtype_not_supported(dtype, q) + + m, k = 8, 3 + if dtype == "f2" and m * n > 2000: + pytest.skip( + "f2 can not distinguish between large integers used in this test" + ) + + x = dpt.reshape(dpt.arange(m 
* n, dtype=dtype), (m, n)) + + r = dpt.top_k(x, k, axis=1, mode="smallest") + + assert r.values.shape == (m, k) + assert r.indices.shape == (m, k) + expected_inds = dpt.reshape(dpt.arange(n, dtype=r.indices.dtype), (1, n))[ + :, :k + ] + assert dpt.all( + dpt.sort(r.indices, axis=1) == dpt.sort(expected_inds, axis=1) + ) + assert dpt.all(dpt.sort(r.values, axis=1) == dpt.sort(x[:, :k], axis=1)) + + +def test_top_k_0d(): + get_queue_or_skip() + + a = dpt.ones(tuple(), dtype="i4") + assert a.ndim == 0 + assert a.size == 1 + + r = dpt.top_k(a, 1) + assert r.values == a + assert r.indices == dpt.zeros_like(a, dtype=r.indices.dtype) + + +def test_top_k_noncontig(): + get_queue_or_skip() + + a = dpt.arange(256, dtype=dpt.int32)[::2] + r = dpt.top_k(a, 3) + + assert dpt.all(dpt.sort(r.values) == dpt.asarray([250, 252, 254])), r.values + assert dpt.all( + dpt.sort(r.indices) == dpt.asarray([125, 126, 127]) + ), r.indices + + +def test_top_k_axis0(): + get_queue_or_skip() + + m, n, k = 128, 8, 3 + x = dpt.reshape(dpt.arange(m * n, dtype=dpt.int32), (m, n)) + + r = dpt.top_k(x, k, axis=0, mode="smallest") + assert r.values.shape == (k, n) + assert r.indices.shape == (k, n) + expected_inds = dpt.reshape(dpt.arange(m, dtype=r.indices.dtype), (m, 1))[ + :k, : + ] + assert dpt.all( + dpt.sort(r.indices, axis=0) == dpt.sort(expected_inds, axis=0) + ) + assert dpt.all(dpt.sort(r.values, axis=0) == dpt.sort(x[:k, :], axis=0)) + + +def test_top_k_validation(): + get_queue_or_skip() + x = dpt.ones(10, dtype=dpt.int64) + with pytest.raises(ValueError): + # k must be positive + dpt.top_k(x, -1) + with pytest.raises(TypeError): + # argument should be usm_ndarray + dpt.top_k(list(), 2) + x2 = dpt.reshape(x, (2, 5)) + with pytest.raises(ValueError): + # k must not exceed array dimension + # along specified axis + dpt.top_k(x2, 100, axis=1) + with pytest.raises(ValueError): + # for 0d arrays, k must be 1 + dpt.top_k(x[0], 2) + with pytest.raises(ValueError): + # mode must be "largest", or "smallest" + dpt.top_k(x, 2, mode="invalid") From 8c6abf5fecb3ab09b6f742a5dc764986d5476016 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Sun, 5 Jan 2025 13:08:19 -0800 Subject: [PATCH 30/30] Add entry to changelog for top_k --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index fec5e79f17..e7138a6a73 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +* Added `dpctl.tensor.top_k` per Python Array API specification: [#1921](https://github.com/IntelPython/dpctl/pull/1921) + ### Changed * Improved performance of copy-and-cast operations from `numpy.ndarray` to `tensor.usm_ndarray` for contiguous inputs [gh-1829](https://github.com/IntelPython/dpctl/pull/1829)