From 2fa244edc9b1a421e64654a4e5120518ec6e31ef Mon Sep 17 00:00:00 2001
From: aamijar <aamijar230@gmail.com>
Date: Tue, 3 Jun 2025 02:25:09 +0000
Subject: [PATCH 01/11] init umap-dice-coefficient

---
 cpp/CMakeLists.txt                            |   3 +
 cpp/src/distance/detail/distance.cuh          |  82 ++++++-
 .../distance/detail/distance_ops/all_ops.cuh  |   1 +
 cpp/src/distance/detail/distance_ops/dice.cuh |  93 ++++++++
 .../detail/pairwise_matrix/dispatch-ext.cuh   |   2 +
 .../pairwise_matrix/dispatch_00_generate.py   | 209 +++++++++---------
 .../dispatch_dice_double_double_double_int.cu |  51 +++++
 .../dispatch_dice_float_float_float_int.cu    |  51 +++++
 .../dispatch_dice_half_float_float_int.cu     |  51 +++++
 cpp/src/distance/distance-ext.cuh             |   2 +
 cpp/src/distance/distance-inl.cuh             |   3 +
 cpp/src/distance/distance.cu                  |   2 +
 12 files changed, 445 insertions(+), 105 deletions(-)
 create mode 100644 cpp/src/distance/detail/distance_ops/dice.cuh
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu
 create mode 100644 cpp/src/distance/detail/pairwise_matrix/dispatch_dice_half_float_float_int.cu
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 5ccb6765ab..29fcfcf2d8 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -344,6 +344,9 @@ if(BUILD_SHARED_LIBS)
     src/distance/detail/pairwise_matrix/dispatch_cosine_float_float_float_int.cu
     src/distance/detail/pairwise_matrix/dispatch_cosine_half_float_float_int.cu
     src/distance/detail/pairwise_matrix/dispatch_cosine_double_double_double_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_dice_half_float_float_int.cu
+    src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu
     src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_float_float_float_int.cu
     src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_half_float_float_int.cu
     src/distance/detail/pairwise_matrix/dispatch_hamming_unexpanded_double_double_double_int.cu
diff --git a/cpp/src/distance/detail/distance.cuh b/cpp/src/distance/detail/distance.cuh
index d6fd046469..9914ee31af 100644
--- a/cpp/src/distance/detail/distance.cuh
+++ b/cpp/src/distance/detail/distance.cuh
@@ -56,6 +56,7 @@ using distance_tag = std::integral_constant<DistanceType, d>;
  * - DistanceType::Canberra:
  * - DistanceType::CorrelationExpanded:
  * - DistanceType::CosineExpanded:
+ * - DistanceType::DiceExpanded:
  * - DistanceType::HammingUnexpanded:
  * - DistanceType::HellingerExpanded:
  * - DistanceType::JensenShannon:
@@ -239,6 +240,79 @@ void distance_impl(raft::resources const& handle,
     distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major);
 }
 
+template <typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT = int>
+void distance_impl(raft::resources const& handle,
+                   distance_tag<DistanceType::DiceExpanded> distance_type,
+                   const DataT* x,
+                   const DataT* y,
+                   OutT* out,
+                   IdxT m,
+                   IdxT n,
+                   IdxT k,
+                   AccT* workspace,
+                   size_t worksize,
+                   FinOpT fin_op,
+                   bool is_row_major,
+                   DataT)  // unused
+{
+  // raft distance support inputs as float/double and output as uint8_t/float/double.
+  static_assert(!((sizeof(OutT) > 1) && (sizeof(AccT) != sizeof(OutT))),
+                "OutT can be uint8_t, float, double,"
+                "if sizeof(OutT) > 1 then sizeof(AccT) == sizeof(OutT).");
+
+  ASSERT(!(worksize < (m + n) * sizeof(AccT)), "workspace size error");
+  ASSERT(workspace != nullptr, "workspace is null");
+
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  OutT* x_norm = workspace;
+  OutT* y_norm = workspace;
+  // TODO: Column major case looks to have lower accuracy for X == Y,
+  // perhaps the use of stridedSummationKernel could be causing this,
+  // need to investigate and fix.
+  if (x == y && is_row_major) {
+    raft::linalg::reduce(x_norm,
+                         x,
+                         k,
+                         std::max(m, n),
+                         (AccT)0,
+                         is_row_major,
+                         true,
+                         stream,
+                         false,
+                         raft::identity_op(),
+                         raft::add_op());
+  } else {
+    y_norm += m;
+    raft::linalg::reduce(x_norm,
+                         x,
+                         k,
+                         m,
+                         (AccT)0,
+                         is_row_major,
+                         true,
+                         stream,
+                         false,
+                         raft::identity_op(),
+                         raft::add_op());
+    raft::linalg::reduce(y_norm,
+                         y,
+                         k,
+                         n,
+                         (AccT)0,
+                         is_row_major,
+                         true,
+                         stream,
+                         false,
+                         raft::identity_op(),
+                         raft::add_op());
+  }
+
+  ops::dice_distance_op<DataT, AccT, IdxT> distance_op{};
+  pairwise_matrix_dispatch<decltype(distance_op), DataT, AccT, OutT, FinOpT, IdxT>(
+    distance_op, m, n, k, x, y, x_norm, y_norm, out, fin_op, stream, is_row_major);
+}
+
 template <typename DataT, typename AccT, typename OutT, typename FinOpT, typename IdxT = int>
 void distance_impl(raft::resources const& handle,
                    distance_tag<DistanceType::HammingUnexpanded> distance_type,
@@ -810,9 +884,11 @@ template <cuvs::distance::DistanceType distanceType,
           typename Index_ = int>
 size_t getWorkspaceSize(const InType* x, const InType* y, Index_ m, Index_ n, Index_ k)
 {
-  size_t worksize             = 0;
-  constexpr bool is_allocated = (distanceType <= cuvs::distance::DistanceType::CosineExpanded) ||
-                                (distanceType == cuvs::distance::DistanceType::CorrelationExpanded);
+  size_t worksize = 0;
+  constexpr bool is_allocated =
+    (distanceType <= cuvs::distance::DistanceType::CosineExpanded) ||
+    (distanceType == cuvs::distance::DistanceType::CorrelationExpanded) ||
+    (distanceType == cuvs::distance::DistanceType::DiceExpanded);
   constexpr int numOfBuffers =
     (distanceType == cuvs::distance::DistanceType::CorrelationExpanded) ? 2 : 1;
 
diff --git a/cpp/src/distance/detail/distance_ops/all_ops.cuh b/cpp/src/distance/detail/distance_ops/all_ops.cuh
index 534aa16fa1..ce58b8a81c 100644
--- a/cpp/src/distance/detail/distance_ops/all_ops.cuh
+++ b/cpp/src/distance/detail/distance_ops/all_ops.cuh
@@ -23,6 +23,7 @@
 #include "../distance_ops/canberra.cuh"
 #include "../distance_ops/correlation.cuh"
 #include "../distance_ops/cosine.cuh"
+#include "../distance_ops/dice.cuh"
 #include "../distance_ops/hamming.cuh"
 #include "../distance_ops/hellinger.cuh"
 #include "../distance_ops/jensen_shannon.cuh"
diff --git a/cpp/src/distance/detail/distance_ops/dice.cuh b/cpp/src/distance/detail/distance_ops/dice.cuh
new file mode 100644
index 0000000000..27ce39cd4a
--- /dev/null
+++ b/cpp/src/distance/detail/distance_ops/dice.cuh
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/util/cuda_dev_essentials.cuh>  // DI
+
+namespace cuvs::distance::detail::ops {
+
+// Epilogue operator for CUTLASS based kernel
+template <typename DataT, typename AccT>
+struct dice_cutlass_op {
+  __device__ dice_cutlass_op() noexcept {}
+  __device__ AccT operator()(AccT& aNorm, const AccT& bNorm, AccT& accVal) const noexcept
+  {
+    return static_cast<AccT>(1.0) - static_cast<AccT>(2 * accVal / (aNorm + bNorm));
+  }
+  __device__ AccT operator()(DataT aData) const noexcept { return aData; }
+};
+
+/**
+ * @brief the expanded dice distance matrix calculation
+ *
+ * It computes the following equation:
+ *
+ * d(x, y) = 1 - 2*(x ⋅ y) / ( Σ(x) + Σ(y) )
+ */
+template <typename DataType, typename AccType, typename IdxType>
+struct dice_distance_op {
+  using DataT = DataType;
+  using AccT  = AccType;
+  using IdxT  = IdxType;
+
+  // Load norms of input data
+  static constexpr bool use_norms = true;
+  // Whether the core function requires so many instructions that it makes sense
+  // to reduce loop unrolling, etc. We do this to keep compile times in check.
+  static constexpr bool expensive_inner_loop = false;
+
+  // Size of shared memory. This is normally decided by the kernel policy, but
+  // some ops such as correlation_distance_op use more.
+  template <typename Policy>
+  static constexpr size_t shared_mem_size()
+  {
+    return Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT));
+  }
+
+  DI void core(AccT& acc, DataT& x, DataT& y) const
+  {
+    if constexpr ((std::is_same_v<AccT, float> && std::is_same_v<DataT, half>)) {
+      acc += __half2float(x != DataT(0) ? DataT(1) : DataT(0)) *
+             __half2float(y != DataT(0) ? DataT(1) : DataT(0));
+    } else {
+      acc += x != DataT(0) ? DataT(1) : DataT(0) * y != DataT(0) ? DataT(1) : DataT(0);
+    }
+  };
+
+  template <typename Policy>
+  DI void epilog(AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
+                 AccT* regxn,
+                 AccT* regyn,
+                 IdxT gridStrideX,
+                 IdxT gridStrideY) const
+  {
+#pragma unroll
+    for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
+#pragma unroll
+      for (int j = 0; j < Policy::AccColsPerTh; ++j) {
+        acc[i][j] = 1.0 - (2 * acc[i][j] / (regxn[i] + regyn[j]));
+      }
+    }
+  }
+
+  constexpr dice_cutlass_op<DataT, AccT> get_cutlass_op() const
+  {
+    return dice_cutlass_op<DataT, AccT>();
+  }
+};
+
+}  // namespace cuvs::distance::detail::ops
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh b/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh
index 49497ab3a2..3c3422f2fb 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh
@@ -95,6 +95,8 @@ instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default(
   cuvs::distance::detail::ops::correlation_distance_op, int);
 instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default(
   cuvs::distance::detail::ops::cosine_distance_op, int);
+instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default(
+  cuvs::distance::detail::ops::dice_distance_op, int);
 instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default(
   cuvs::distance::detail::ops::hamming_distance_op, int);
 instantiate_cuvs_distance_detail_pairwise_matrix_dispatch_by_algo_default(
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
index d0913833f1..4c54f68662 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
@@ -86,72 +86,77 @@
 ]
 
 op_instances = [
-    dict(
-        path_prefix="canberra",
-        OpT="cuvs::distance::detail::ops::canberra_distance_op",
-        archs = [60],
-    ),
-    dict(
-        path_prefix="correlation",
-        OpT="cuvs::distance::detail::ops::correlation_distance_op",
-        archs = [60],
-    ),
-    dict(
-        path_prefix="cosine",
-        OpT="cuvs::distance::detail::ops::cosine_distance_op",
-        archs = [60, 80],
-    ),
-    dict(
-        path_prefix="hamming_unexpanded",
-        OpT="cuvs::distance::detail::ops::hamming_distance_op",
-        archs = [60],
-    ),
-    dict(
-        path_prefix="hellinger_expanded",
-        OpT="cuvs::distance::detail::ops::hellinger_distance_op",
-        archs = [60],
-    ),
-    # inner product is handled by cublas.
-    dict(
-        path_prefix="jensen_shannon",
-        OpT="cuvs::distance::detail::ops::jensen_shannon_distance_op",
-        archs = [60],
-    ),
-    dict(
-        path_prefix="kl_divergence",
-        OpT="cuvs::distance::detail::ops::kl_divergence_op",
-        archs = [60],
-    ),
-    dict(
-        path_prefix="l1",
-        OpT="cuvs::distance::detail::ops::l1_distance_op",
-        archs = [60],
-    ),
-    dict(
-        path_prefix="l2_expanded",
-        OpT="cuvs::distance::detail::ops::l2_exp_distance_op",
+    # dict(
+    #     path_prefix="canberra",
+    #     OpT="cuvs::distance::detail::ops::canberra_distance_op",
+    #     archs = [60],
+    # ),
+    # dict(
+    #     path_prefix="correlation",
+    #     OpT="cuvs::distance::detail::ops::correlation_distance_op",
+    #     archs = [60],
+    # ),
+    # dict(
+    #     path_prefix="cosine",
+    #     OpT="cuvs::distance::detail::ops::cosine_distance_op",
+    #     archs = [60, 80],
+    # ),
+    dict(
+        path_prefix="dice",
+        OpT="cuvs::distance::detail::ops::dice_distance_op",
         archs = [60, 80],
     ),
-    dict(
-        path_prefix="l2_unexpanded",
-        OpT="cuvs::distance::detail::ops::l2_unexp_distance_op",
-        archs = [60],
-    ),
-    dict(
-        path_prefix="l_inf",
-        OpT="cuvs::distance::detail::ops::l_inf_distance_op",
-        archs = [60],
-    ),
-    dict(
-        path_prefix="lp_unexpanded",
-        OpT="cuvs::distance::detail::ops::lp_unexp_distance_op",
-        archs = [60],
-    ),
-    dict(
-        path_prefix="russel_rao",
-        OpT="cuvs::distance::detail::ops::russel_rao_distance_op",
-        archs = [60],
-     ),
+    # dict(
+    #     path_prefix="hamming_unexpanded",
+    #     OpT="cuvs::distance::detail::ops::hamming_distance_op",
+    #     archs = [60],
+    # ),
+    # dict(
+    #     path_prefix="hellinger_expanded",
+    #     OpT="cuvs::distance::detail::ops::hellinger_distance_op",
+    #     archs = [60],
+    # ),
+    # # inner product is handled by cublas.
+    # dict(
+    #     path_prefix="jensen_shannon",
+    #     OpT="cuvs::distance::detail::ops::jensen_shannon_distance_op",
+    #     archs = [60],
+    # ),
+    # dict(
+    #     path_prefix="kl_divergence",
+    #     OpT="cuvs::distance::detail::ops::kl_divergence_op",
+    #     archs = [60],
+    # ),
+    # dict(
+    #     path_prefix="l1",
+    #     OpT="cuvs::distance::detail::ops::l1_distance_op",
+    #     archs = [60],
+    # ),
+    # dict(
+    #     path_prefix="l2_expanded",
+    #     OpT="cuvs::distance::detail::ops::l2_exp_distance_op",
+    #     archs = [60, 80],
+    # ),
+    # dict(
+    #     path_prefix="l2_unexpanded",
+    #     OpT="cuvs::distance::detail::ops::l2_unexp_distance_op",
+    #     archs = [60],
+    # ),
+    # dict(
+    #     path_prefix="l_inf",
+    #     OpT="cuvs::distance::detail::ops::l_inf_distance_op",
+    #     archs = [60],
+    # ),
+    # dict(
+    #     path_prefix="lp_unexpanded",
+    #     OpT="cuvs::distance::detail::ops::lp_unexp_distance_op",
+    #     archs = [60],
+    # ),
+    # dict(
+    #     path_prefix="russel_rao",
+    #     OpT="cuvs::distance::detail::ops::russel_rao_distance_op",
+    #     archs = [60],
+    #  ),
 ]
 
 def arch_headers(archs):
@@ -178,49 +183,49 @@ def arch_headers(archs):
             f.write("\n#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch\n")
         print(f"src/distance/detail/pairwise_matrix/{path}")
 
-# Dispatch kernels for with the RBF fin op.
-with open("dispatch_rbf.cu", "w") as f:
-        OpT="cuvs::distance::detail::ops::l2_unexp_distance_op"
-        archs = [60]
+# # Dispatch kernels for with the RBF fin op.
+# with open("dispatch_rbf.cu", "w") as f:
+#         OpT="cuvs::distance::detail::ops::l2_unexp_distance_op"
+#         archs = [60]
 
-        f.write(header)
-        f.write("#include \"../kernels/rbf_fin_op.cuh\" // rbf_fin_op\n")
-        f.write(arch_headers(archs))
-        f.write(macro)
+#         f.write(header)
+#         f.write("#include \"../kernels/rbf_fin_op.cuh\" // rbf_fin_op\n")
+#         f.write(arch_headers(archs))
+#         f.write(macro)
 
-        for dt in data_type_instances:
-            DataT, AccT, OutT, IdxT = (dt[k] for k in ["DataT", "AccT", "OutT", "IdxT"]);
-            IdxT = "int64_t"    # overwrite IdxT
+#         for dt in data_type_instances:
+#             DataT, AccT, OutT, IdxT = (dt[k] for k in ["DataT", "AccT", "OutT", "IdxT"]);
+#             IdxT = "int64_t"    # overwrite IdxT
 
-            FinOpT = f"cuvs::distance::kernels::detail::rbf_fin_op<{DataT}>"
-            f.write(f"\ninstantiate_raft_distance_detail_pairwise_matrix_dispatch({OpT}, {DataT}, {AccT}, {OutT}, {FinOpT}, {IdxT});\n")
+#             FinOpT = f"cuvs::distance::kernels::detail::rbf_fin_op<{DataT}>"
+#             f.write(f"\ninstantiate_raft_distance_detail_pairwise_matrix_dispatch({OpT}, {DataT}, {AccT}, {OutT}, {FinOpT}, {IdxT});\n")
 
-        f.write("\n#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch\n")
+#         f.write("\n#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch\n")
 
 
-        print("src/distance/detail/pairwise_matrix/dispatch_rbf.cu")
+#         print("src/distance/detail/pairwise_matrix/dispatch_rbf.cu")
 
 # L2 with int64_t indices for kmeans code
-int64_t_op_instances = [
-    dict(
-        path_prefix="l2_expanded",
-        OpT="cuvs::distance::detail::ops::l2_exp_distance_op",
-        archs = [60, 80],
-    )]
-
-for op in int64_t_op_instances:
-    for dt in data_type_instances:
-        DataT, AccT, OutT, IdxT = (dt[k] for k in ["DataT", "AccT", "OutT", "IdxT"]);
-
-        IdxT = "int64_t"
-        path = f"dispatch_{op['path_prefix']}_{DataT}_{AccT}_{OutT}_{IdxT}.cu"
-        with open(path, "w") as f:
-            f.write(header)
-            f.write(arch_headers(op["archs"]))
-            f.write(macro)
-
-            OpT = op['OpT']
-            FinOpT = "raft::identity_op"
-            f.write(f"\ninstantiate_raft_distance_detail_pairwise_matrix_dispatch({OpT}, {DataT}, {AccT}, {OutT}, {FinOpT}, {IdxT});\n")
-            f.write("\n#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch\n")
-        print(f"src/distance/detail/pairwise_matrix/{path}")
+# int64_t_op_instances = [
+#     dict(
+#         path_prefix="l2_expanded",
+#         OpT="cuvs::distance::detail::ops::l2_exp_distance_op",
+#         archs = [60, 80],
+#     )]
+
+# for op in int64_t_op_instances:
+#     for dt in data_type_instances:
+#         DataT, AccT, OutT, IdxT = (dt[k] for k in ["DataT", "AccT", "OutT", "IdxT"]);
+
+#         IdxT = "int64_t"
+#         path = f"dispatch_{op['path_prefix']}_{DataT}_{AccT}_{OutT}_{IdxT}.cu"
+#         with open(path, "w") as f:
+#             f.write(header)
+#             f.write(arch_headers(op["archs"]))
+#             f.write(macro)
+
+#             OpT = op['OpT']
+#             FinOpT = "raft::identity_op"
+#             f.write(f"\ninstantiate_raft_distance_detail_pairwise_matrix_dispatch({OpT}, {DataT}, {AccT}, {OutT}, {FinOpT}, {IdxT});\n")
+#             f.write("\n#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch\n")
+#         print(f"src/distance/detail/pairwise_matrix/{path}")
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu
new file mode 100644
index 0000000000..b07e49f59a
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
+#include "../distance_ops/all_ops.cuh"  // ops::*
+#include "dispatch-inl.cuh"             // dispatch
+#include "dispatch_sm60.cuh"
+#include "dispatch_sm80.cuh"
+#include <raft/core/operators.hpp>  // raft::identity_op
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void cuvs::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const OutT* x_norm,                                                              \
+      const OutT* y_norm,                                                              \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  cuvs::distance::detail::ops::dice_distance_op, double, double, double, raft::identity_op, int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu
new file mode 100644
index 0000000000..5f312cc634
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
+#include "../distance_ops/all_ops.cuh"  // ops::*
+#include "dispatch-inl.cuh"             // dispatch
+#include "dispatch_sm60.cuh"
+#include "dispatch_sm80.cuh"
+#include <raft/core/operators.hpp>  // raft::identity_op
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void cuvs::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const OutT* x_norm,                                                              \
+      const OutT* y_norm,                                                              \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  cuvs::distance::detail::ops::dice_distance_op, float, float, float, raft::identity_op, int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_half_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_half_float_float_int.cu
new file mode 100644
index 0000000000..a7abcd8f07
--- /dev/null
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_half_float_float_int.cu
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by dispatch_00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python dispatch_00_generate.py
+ *
+ */
+
+#include "../distance_ops/all_ops.cuh"  // ops::*
+#include "dispatch-inl.cuh"             // dispatch
+#include "dispatch_sm60.cuh"
+#include "dispatch_sm80.cuh"
+#include <raft/core/operators.hpp>  // raft::identity_op
+#define instantiate_raft_distance_detail_pairwise_matrix_dispatch(                     \
+  OpT, DataT, AccT, OutT, FinOpT, IdxT)                                                \
+  template void cuvs::distance::detail::                                               \
+    pairwise_matrix_dispatch<OpT<DataT, AccT, IdxT>, DataT, AccT, OutT, FinOpT, IdxT>( \
+      OpT<DataT, AccT, IdxT> distance_op,                                              \
+      IdxT m,                                                                          \
+      IdxT n,                                                                          \
+      IdxT k,                                                                          \
+      const DataT* x,                                                                  \
+      const DataT* y,                                                                  \
+      const OutT* x_norm,                                                              \
+      const OutT* y_norm,                                                              \
+      OutT* out,                                                                       \
+      FinOpT fin_op,                                                                   \
+      cudaStream_t stream,                                                             \
+      bool is_row_major)
+
+instantiate_raft_distance_detail_pairwise_matrix_dispatch(
+  cuvs::distance::detail::ops::dice_distance_op, half, float, float, raft::identity_op, int);
+
+#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch
diff --git a/cpp/src/distance/distance-ext.cuh b/cpp/src/distance/distance-ext.cuh
index a692a62a35..3582fb7182 100644
--- a/cpp/src/distance/distance-ext.cuh
+++ b/cpp/src/distance/distance-ext.cuh
@@ -223,6 +223,7 @@ void pairwise_distance(raft::resources const& handle,
 instantiate_cuvs_distance_distance_by_algo(cuvs::distance::DistanceType::Canberra);
 instantiate_cuvs_distance_distance_by_algo(cuvs::distance::DistanceType::CorrelationExpanded);
 instantiate_cuvs_distance_distance_by_algo(cuvs::distance::DistanceType::CosineExpanded);
+instantiate_cuvs_distance_distance_by_algo(cuvs::distance::DistanceType::DiceExpanded);
 instantiate_cuvs_distance_distance_by_algo(cuvs::distance::DistanceType::HammingUnexpanded);
 
 instantiate_cuvs_distance_distance_by_algo(cuvs::distance::DistanceType::HellingerExpanded);
@@ -310,6 +311,7 @@ instantiate_cuvs_distance_getWorkspaceSize_by_algo(cuvs::distance::DistanceType:
 instantiate_cuvs_distance_getWorkspaceSize_by_algo(
   cuvs::distance::DistanceType::CorrelationExpanded);
 instantiate_cuvs_distance_getWorkspaceSize_by_algo(cuvs::distance::DistanceType::CosineExpanded);
+instantiate_cuvs_distance_getWorkspaceSize_by_algo(cuvs::distance::DistanceType::DiceExpanded);
 instantiate_cuvs_distance_getWorkspaceSize_by_algo(cuvs::distance::DistanceType::HammingUnexpanded);
 
 instantiate_cuvs_distance_getWorkspaceSize_by_algo(cuvs::distance::DistanceType::HellingerExpanded);
diff --git a/cpp/src/distance/distance-inl.cuh b/cpp/src/distance/distance-inl.cuh
index e047d31448..5e6ac54085 100644
--- a/cpp/src/distance/distance-inl.cuh
+++ b/cpp/src/distance/distance-inl.cuh
@@ -307,6 +307,9 @@ void pairwise_distance(raft::resources const& handle,
     case DistanceType::RusselRaoExpanded:
       dispatch(std::integral_constant<DistanceType, DistanceType::RusselRaoExpanded>{});
       break;
+    case DistanceType::DiceExpanded:
+      dispatch(std::integral_constant<DistanceType, DistanceType::DiceExpanded>{});
+      break;
     default: THROW("Unknown or unsupported distance metric '%d'!", (int)metric);
   };
 }
diff --git a/cpp/src/distance/distance.cu b/cpp/src/distance/distance.cu
index 47e72460f0..facdca8858 100644
--- a/cpp/src/distance/distance.cu
+++ b/cpp/src/distance/distance.cu
@@ -88,6 +88,7 @@
 instantiate_cuvs_distance_distance_by_algo(cuvs::distance::DistanceType::Canberra);
 instantiate_cuvs_distance_distance_by_algo(cuvs::distance::DistanceType::CorrelationExpanded);
 instantiate_cuvs_distance_distance_by_algo(cuvs::distance::DistanceType::CosineExpanded);
+instantiate_cuvs_distance_distance_by_algo(cuvs::distance::DistanceType::DiceExpanded);
 instantiate_cuvs_distance_distance_by_algo(cuvs::distance::DistanceType::HammingUnexpanded);
 
 instantiate_cuvs_distance_distance_by_algo(cuvs::distance::DistanceType::HellingerExpanded);
@@ -176,6 +177,7 @@ instantiate_cuvs_distance_getWorkspaceSize_by_algo(cuvs::distance::DistanceType:
 instantiate_cuvs_distance_getWorkspaceSize_by_algo(
   cuvs::distance::DistanceType::CorrelationExpanded);
 instantiate_cuvs_distance_getWorkspaceSize_by_algo(cuvs::distance::DistanceType::CosineExpanded);
+instantiate_cuvs_distance_getWorkspaceSize_by_algo(cuvs::distance::DistanceType::DiceExpanded);
 instantiate_cuvs_distance_getWorkspaceSize_by_algo(cuvs::distance::DistanceType::HammingUnexpanded);
 
 instantiate_cuvs_distance_getWorkspaceSize_by_algo(cuvs::distance::DistanceType::HellingerExpanded);

From 8eab501eabd7b69f7e1fad828cf37c7dc3c69b91 Mon Sep 17 00:00:00 2001
From: aamijar <aamijar230@gmail.com>
Date: Tue, 3 Jun 2025 02:30:52 +0000
Subject: [PATCH 02/11] update dispatch generate py

---
 .../pairwise_matrix/dispatch_00_generate.py   | 208 +++++++++---------
 1 file changed, 104 insertions(+), 104 deletions(-)

diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
index 4c54f68662..68d814c528 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
@@ -86,77 +86,77 @@
 ]
 
 op_instances = [
-    # dict(
-    #     path_prefix="canberra",
-    #     OpT="cuvs::distance::detail::ops::canberra_distance_op",
-    #     archs = [60],
-    # ),
-    # dict(
-    #     path_prefix="correlation",
-    #     OpT="cuvs::distance::detail::ops::correlation_distance_op",
-    #     archs = [60],
-    # ),
-    # dict(
-    #     path_prefix="cosine",
-    #     OpT="cuvs::distance::detail::ops::cosine_distance_op",
-    #     archs = [60, 80],
-    # ),
+    dict(
+        path_prefix="canberra",
+        OpT="cuvs::distance::detail::ops::canberra_distance_op",
+        archs = [60],
+    ),
+    dict(
+        path_prefix="correlation",
+        OpT="cuvs::distance::detail::ops::correlation_distance_op",
+        archs = [60],
+    ),
+    dict(
+        path_prefix="cosine",
+        OpT="cuvs::distance::detail::ops::cosine_distance_op",
+        archs = [60, 80],
+    ),
     dict(
         path_prefix="dice",
         OpT="cuvs::distance::detail::ops::dice_distance_op",
         archs = [60, 80],
     ),
-    # dict(
-    #     path_prefix="hamming_unexpanded",
-    #     OpT="cuvs::distance::detail::ops::hamming_distance_op",
-    #     archs = [60],
-    # ),
-    # dict(
-    #     path_prefix="hellinger_expanded",
-    #     OpT="cuvs::distance::detail::ops::hellinger_distance_op",
-    #     archs = [60],
-    # ),
-    # # inner product is handled by cublas.
-    # dict(
-    #     path_prefix="jensen_shannon",
-    #     OpT="cuvs::distance::detail::ops::jensen_shannon_distance_op",
-    #     archs = [60],
-    # ),
-    # dict(
-    #     path_prefix="kl_divergence",
-    #     OpT="cuvs::distance::detail::ops::kl_divergence_op",
-    #     archs = [60],
-    # ),
-    # dict(
-    #     path_prefix="l1",
-    #     OpT="cuvs::distance::detail::ops::l1_distance_op",
-    #     archs = [60],
-    # ),
-    # dict(
-    #     path_prefix="l2_expanded",
-    #     OpT="cuvs::distance::detail::ops::l2_exp_distance_op",
-    #     archs = [60, 80],
-    # ),
-    # dict(
-    #     path_prefix="l2_unexpanded",
-    #     OpT="cuvs::distance::detail::ops::l2_unexp_distance_op",
-    #     archs = [60],
-    # ),
-    # dict(
-    #     path_prefix="l_inf",
-    #     OpT="cuvs::distance::detail::ops::l_inf_distance_op",
-    #     archs = [60],
-    # ),
-    # dict(
-    #     path_prefix="lp_unexpanded",
-    #     OpT="cuvs::distance::detail::ops::lp_unexp_distance_op",
-    #     archs = [60],
-    # ),
-    # dict(
-    #     path_prefix="russel_rao",
-    #     OpT="cuvs::distance::detail::ops::russel_rao_distance_op",
-    #     archs = [60],
-    #  ),
+    dict(
+        path_prefix="hamming_unexpanded",
+        OpT="cuvs::distance::detail::ops::hamming_distance_op",
+        archs = [60],
+    ),
+    dict(
+        path_prefix="hellinger_expanded",
+        OpT="cuvs::distance::detail::ops::hellinger_distance_op",
+        archs = [60],
+    ),
+    # inner product is handled by cublas.
+    dict(
+        path_prefix="jensen_shannon",
+        OpT="cuvs::distance::detail::ops::jensen_shannon_distance_op",
+        archs = [60],
+    ),
+    dict(
+        path_prefix="kl_divergence",
+        OpT="cuvs::distance::detail::ops::kl_divergence_op",
+        archs = [60],
+    ),
+    dict(
+        path_prefix="l1",
+        OpT="cuvs::distance::detail::ops::l1_distance_op",
+        archs = [60],
+    ),
+    dict(
+        path_prefix="l2_expanded",
+        OpT="cuvs::distance::detail::ops::l2_exp_distance_op",
+        archs = [60, 80],
+    ),
+    dict(
+        path_prefix="l2_unexpanded",
+        OpT="cuvs::distance::detail::ops::l2_unexp_distance_op",
+        archs = [60],
+    ),
+    dict(
+        path_prefix="l_inf",
+        OpT="cuvs::distance::detail::ops::l_inf_distance_op",
+        archs = [60],
+    ),
+    dict(
+        path_prefix="lp_unexpanded",
+        OpT="cuvs::distance::detail::ops::lp_unexp_distance_op",
+        archs = [60],
+    ),
+    dict(
+        path_prefix="russel_rao",
+        OpT="cuvs::distance::detail::ops::russel_rao_distance_op",
+        archs = [60],
+     ),
 ]
 
 def arch_headers(archs):
@@ -183,49 +183,49 @@ def arch_headers(archs):
             f.write("\n#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch\n")
         print(f"src/distance/detail/pairwise_matrix/{path}")
 
-# # Dispatch kernels for with the RBF fin op.
-# with open("dispatch_rbf.cu", "w") as f:
-#         OpT="cuvs::distance::detail::ops::l2_unexp_distance_op"
-#         archs = [60]
+# Dispatch kernels for with the RBF fin op.
+with open("dispatch_rbf.cu", "w") as f:
+        OpT="cuvs::distance::detail::ops::l2_unexp_distance_op"
+        archs = [60]
 
-#         f.write(header)
-#         f.write("#include \"../kernels/rbf_fin_op.cuh\" // rbf_fin_op\n")
-#         f.write(arch_headers(archs))
-#         f.write(macro)
+        f.write(header)
+        f.write("#include \"../kernels/rbf_fin_op.cuh\" // rbf_fin_op\n")
+        f.write(arch_headers(archs))
+        f.write(macro)
 
-#         for dt in data_type_instances:
-#             DataT, AccT, OutT, IdxT = (dt[k] for k in ["DataT", "AccT", "OutT", "IdxT"]);
-#             IdxT = "int64_t"    # overwrite IdxT
+        for dt in data_type_instances:
+            DataT, AccT, OutT, IdxT = (dt[k] for k in ["DataT", "AccT", "OutT", "IdxT"]);
+            IdxT = "int64_t"    # overwrite IdxT
 
-#             FinOpT = f"cuvs::distance::kernels::detail::rbf_fin_op<{DataT}>"
-#             f.write(f"\ninstantiate_raft_distance_detail_pairwise_matrix_dispatch({OpT}, {DataT}, {AccT}, {OutT}, {FinOpT}, {IdxT});\n")
+            FinOpT = f"cuvs::distance::kernels::detail::rbf_fin_op<{DataT}>"
+            f.write(f"\ninstantiate_raft_distance_detail_pairwise_matrix_dispatch({OpT}, {DataT}, {AccT}, {OutT}, {FinOpT}, {IdxT});\n")
 
-#         f.write("\n#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch\n")
+        f.write("\n#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch\n")
 
 
-#         print("src/distance/detail/pairwise_matrix/dispatch_rbf.cu")
+        print("src/distance/detail/pairwise_matrix/dispatch_rbf.cu")
 
 # L2 with int64_t indices for kmeans code
-# int64_t_op_instances = [
-#     dict(
-#         path_prefix="l2_expanded",
-#         OpT="cuvs::distance::detail::ops::l2_exp_distance_op",
-#         archs = [60, 80],
-#     )]
-
-# for op in int64_t_op_instances:
-#     for dt in data_type_instances:
-#         DataT, AccT, OutT, IdxT = (dt[k] for k in ["DataT", "AccT", "OutT", "IdxT"]);
-
-#         IdxT = "int64_t"
-#         path = f"dispatch_{op['path_prefix']}_{DataT}_{AccT}_{OutT}_{IdxT}.cu"
-#         with open(path, "w") as f:
-#             f.write(header)
-#             f.write(arch_headers(op["archs"]))
-#             f.write(macro)
-
-#             OpT = op['OpT']
-#             FinOpT = "raft::identity_op"
-#             f.write(f"\ninstantiate_raft_distance_detail_pairwise_matrix_dispatch({OpT}, {DataT}, {AccT}, {OutT}, {FinOpT}, {IdxT});\n")
-#             f.write("\n#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch\n")
-#         print(f"src/distance/detail/pairwise_matrix/{path}")
+int64_t_op_instances = [
+    dict(
+        path_prefix="l2_expanded",
+        OpT="cuvs::distance::detail::ops::l2_exp_distance_op",
+        archs = [60, 80],
+    )]
+
+for op in int64_t_op_instances:
+    for dt in data_type_instances:
+        DataT, AccT, OutT, IdxT = (dt[k] for k in ["DataT", "AccT", "OutT", "IdxT"]);
+
+        IdxT = "int64_t"
+        path = f"dispatch_{op['path_prefix']}_{DataT}_{AccT}_{OutT}_{IdxT}.cu"
+        with open(path, "w") as f:
+            f.write(header)
+            f.write(arch_headers(op["archs"]))
+            f.write(macro)
+
+            OpT = op['OpT']
+            FinOpT = "raft::identity_op"
+            f.write(f"\ninstantiate_raft_distance_detail_pairwise_matrix_dispatch({OpT}, {DataT}, {AccT}, {OutT}, {FinOpT}, {IdxT});\n")
+            f.write("\n#undef instantiate_raft_distance_detail_pairwise_matrix_dispatch\n")
+        print(f"src/distance/detail/pairwise_matrix/{path}")

From 696fdab82d5b1ec0304101e540d49970061869da Mon Sep 17 00:00:00 2001
From: aamijar <aamijar230@gmail.com>
Date: Tue, 3 Jun 2025 21:21:35 +0000
Subject: [PATCH 03/11] add gtests

---
 cpp/tests/CMakeLists.txt             |   1 +
 cpp/tests/distance/dist_dice.cu      | 112 +++++++++++++++++++++++++++
 cpp/tests/distance/distance_base.cuh |  32 ++++++++
 cpp/tests/test_utils.h               |  17 ++++
 4 files changed, 162 insertions(+)
 create mode 100644 cpp/tests/distance/dist_dice.cu

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index b49b5acd7d..e2170e594f 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -228,6 +228,7 @@ if(BUILD_TESTS)
     distance/dist_canberra.cu
     distance/dist_correlation.cu
     distance/dist_cos.cu
+    distance/dist_dice.cu
     distance/dist_hamming.cu
     distance/dist_hellinger.cu
     distance/dist_inner_product.cu
diff --git a/cpp/tests/distance/dist_dice.cu b/cpp/tests/distance/dist_dice.cu
new file mode 100644
index 0000000000..36ac62ede8
--- /dev/null
+++ b/cpp/tests/distance/dist_dice.cu
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+#include "distance_base.cuh"
+
+namespace cuvs {
+namespace distance {
+
+template <typename DataType>
+class DistanceExpDice : public DistanceTest<cuvs::distance::DistanceType::DiceExpanded, DataType> {
+};
+
+template <typename DataType>
+class DistanceExpDiceXequalY
+  : public DistanceTestSameBuffer<cuvs::distance::DistanceType::DiceExpanded, DataType> {};
+
+const std::vector<DistanceInputs<float>> inputsf = {
+  {0.001f, 128, (65536 + 128) * 128, 8, true, 1234ULL},
+  {0.001f, 1024, 1024, 32, true, 1234ULL},
+  {0.001f, 1024, 32, 1024, true, 1234ULL},
+  {0.001f, 32, 1024, 1024, true, 1234ULL},
+  {0.003f, 1024, 1024, 1024, true, 1234ULL},
+  {0.001f, (65536 + 128) * 128, 128, 8, false, 1234ULL},
+  {0.001f, 1024, 1024, 32, false, 1234ULL},
+  {0.001f, 1024, 32, 1024, false, 1234ULL},
+  {0.001f, 32, 1024, 1024, false, 1234ULL},
+  {0.003f, 1024, 1024, 1024, false, 1234ULL},
+};
+
+const std::vector<DistanceInputs<float>> inputsXeqYf = {
+  {0.01f, 1024, 1024, 32, true, 1234ULL},
+  {0.01f, 1024, 32, 1024, true, 1234ULL},
+  {0.01f, 32, 1024, 1024, true, 1234ULL},
+  {0.03f, 1024, 1024, 1024, true, 1234ULL},
+  {0.01f, 1024, 1024, 32, false, 1234ULL},
+  {0.01f, 1024, 32, 1024, false, 1234ULL},
+  {0.01f, 32, 1024, 1024, false, 1234ULL},
+  {0.03f, 1024, 1024, 1024, false, 1234ULL},
+};
+
+typedef DistanceExpDice<float> DistanceExpDiceF;
+TEST_P(DistanceExpDiceF, Result)
+{
+  int m = params.isRowMajor ? params.m : params.n;
+  int n = params.isRowMajor ? params.n : params.m;
+  ASSERT_TRUE(devArrMatch(
+    dist_ref.data(), dist.data(), m, n, cuvs::CompareApproxNaN<float>(params.tolerance), stream));
+}
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpDiceF, ::testing::ValuesIn(inputsf));
+
+typedef DistanceExpDiceXequalY<float> DistanceExpDiceXequalYF;
+TEST_P(DistanceExpDiceXequalYF, Result)
+{
+  int m = params.m;
+  int n = params.m;
+  ASSERT_TRUE(cuvs::devArrMatch(dist_ref[0].data(),
+                                dist[0].data(),
+                                m,
+                                n,
+                                cuvs::CompareApproxNaN<float>(params.tolerance),
+                                stream));
+  n = params.isRowMajor ? m : m / 2;
+  m = params.isRowMajor ? m / 2 : m;
+
+  ASSERT_TRUE(cuvs::devArrMatch(dist_ref[1].data(),
+                                dist[1].data(),
+                                m,
+                                n,
+                                cuvs::CompareApproxNaN<float>(params.tolerance),
+                                stream));
+}
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpDiceXequalYF, ::testing::ValuesIn(inputsXeqYf));
+
+const std::vector<DistanceInputs<float>> inputsd = {
+  {0.001, 1024, 1024, 32, true, 1234ULL},
+  {0.001, 1024, 32, 1024, true, 1234ULL},
+  {0.001, 32, 1024, 1024, true, 1234ULL},
+  {0.003, 1024, 1024, 1024, true, 1234ULL},
+  {0.001f, 1024, 1024, 32, false, 1234ULL},
+  {0.001f, 1024, 32, 1024, false, 1234ULL},
+  {0.001f, 32, 1024, 1024, false, 1234ULL},
+  {0.003f, 1024, 1024, 1024, false, 1234ULL},
+};
+typedef DistanceExpDice<float> DistanceExpDiceD;
+TEST_P(DistanceExpDiceD, Result)
+{
+  int m = params.isRowMajor ? params.m : params.n;
+  int n = params.isRowMajor ? params.n : params.m;
+  ASSERT_TRUE(devArrMatch(
+    dist_ref.data(), dist.data(), m, n, cuvs::CompareApproxNaN<float>(params.tolerance), stream));
+}
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpDiceD, ::testing::ValuesIn(inputsd));
+
+class BigMatrixDice : public BigMatrixDistanceTest<cuvs::distance::DistanceType::DiceExpanded> {};
+TEST_F(BigMatrixDice, Result) {}
+
+}  // end namespace distance
+}  // end namespace cuvs
diff --git a/cpp/tests/distance/distance_base.cuh b/cpp/tests/distance/distance_base.cuh
index e9a7bd286a..88e646429f 100644
--- a/cpp/tests/distance/distance_base.cuh
+++ b/cpp/tests/distance/distance_base.cuh
@@ -140,6 +140,34 @@ RAFT_KERNEL naiveCosineDistanceKernel(OutputType* dist,
   dist[outidx] = (OutputType)1.0 - acc_ab / (raft::sqrt(acc_a) * raft::sqrt(acc_b));
 }
 
+template <typename DataType, typename OutputType = DataType>
+RAFT_KERNEL naiveDiceDistanceKernel(
+  OutputType* dist, const DataType* x, const DataType* y, int m, int n, int k, bool isRowMajor)
+{
+  int midx = threadIdx.x + blockIdx.x * blockDim.x;
+  int nidx = threadIdx.y + blockIdx.y * blockDim.y;
+  if (midx >= m || nidx >= n) { return; }
+
+  OutputType acc_a  = OutputType(0);
+  OutputType acc_b  = OutputType(0);
+  OutputType acc_ab = OutputType(0);
+
+  for (int i = 0; i < k; ++i) {
+    int xidx = isRowMajor ? i + midx * k : i * m + midx;
+    int yidx = isRowMajor ? i + nidx * k : i * n + nidx;
+    auto a   = half2float(x[xidx]);
+    auto b   = half2float(y[yidx]);
+    acc_a += a;
+    acc_b += b;
+    acc_ab += a * b;
+  }
+
+  int outidx = isRowMajor ? midx * n + nidx : midx + m * nidx;
+
+  // Use 1.0 - (dice dissimilarity) to calc the distance
+  dist[outidx] = (OutputType)1.0 - (2 * acc_ab / ((acc_a) + (acc_b)));
+}
+
 template <typename DataType, typename OutputType = DataType>
 RAFT_KERNEL naiveInnerProductKernel(OutputType* dist,
                                     const DataType* x,
@@ -443,6 +471,10 @@ void naiveDistance(OutputType* dist,
       naiveCorrelationDistanceKernel<DataType, OutputType>
         <<<nblks, TPB, 0, stream>>>(dist, x, y, m, n, k, isRowMajor);
       break;
+    case cuvs::distance::DistanceType::DiceExpanded:
+      naiveDiceDistanceKernel<DataType, OutputType>
+        <<<nblks, TPB, 0, stream>>>(dist, x, y, m, n, k, isRowMajor);
+      break;
     default: FAIL() << "should be here\n";
   }
   RAFT_CUDA_TRY(cudaPeekAtLastError());
diff --git a/cpp/tests/test_utils.h b/cpp/tests/test_utils.h
index ee4e553fa3..2dffd22961 100644
--- a/cpp/tests/test_utils.h
+++ b/cpp/tests/test_utils.h
@@ -64,6 +64,23 @@ struct CompareApprox {
   T eps;
 };
 
+template <typename T>
+struct CompareApproxNaN {
+  CompareApproxNaN(T eps_) : eps(eps_) {}
+  bool operator()(const T& a, const T& b) const
+  {
+    T diff  = std::abs(a - b);
+    T m     = std::max(std::abs(a), std::abs(b));
+    T ratio = diff > eps ? diff / m : diff;
+
+    if (std::isnan(a) && std::isnan(b)) { return true; }
+    return (ratio <= eps);
+  }
+
+ private:
+  T eps;
+};
+
 template <typename Key, typename Value>
 ::std::ostream& operator<<(::std::ostream& os, const raft::KeyValuePair<Key, Value>& kv)
 {

From 5286b646aa87ecdb34cb1e6f4c3cbe5fb3da8338 Mon Sep 17 00:00:00 2001
From: aamijar <aamijar230@gmail.com>
Date: Wed, 4 Jun 2025 00:30:42 +0000
Subject: [PATCH 04/11] fixes for half types

---
 cpp/src/distance/detail/distance.cuh          |  4 +-
 cpp/src/distance/detail/distance_ops/dice.cuh |  7 +-
 cpp/tests/distance/dist_dice.cu               | 90 +++++++++++++++++--
 cpp/tests/distance/distance_base.cuh          |  6 +-
 4 files changed, 94 insertions(+), 13 deletions(-)

diff --git a/cpp/src/distance/detail/distance.cuh b/cpp/src/distance/detail/distance.cuh
index 9914ee31af..f0cd9bdbf3 100644
--- a/cpp/src/distance/detail/distance.cuh
+++ b/cpp/src/distance/detail/distance.cuh
@@ -265,8 +265,8 @@ void distance_impl(raft::resources const& handle,
 
   cudaStream_t stream = raft::resource::get_cuda_stream(handle);
 
-  OutT* x_norm = workspace;
-  OutT* y_norm = workspace;
+  AccT* x_norm = workspace;
+  AccT* y_norm = workspace;
   // TODO: Column major case looks to have lower accuracy for X == Y,
   // perhaps the use of stridedSummationKernel could be causing this,
   // need to investigate and fix.
diff --git a/cpp/src/distance/detail/distance_ops/dice.cuh b/cpp/src/distance/detail/distance_ops/dice.cuh
index 27ce39cd4a..91b0bb48fd 100644
--- a/cpp/src/distance/detail/distance_ops/dice.cuh
+++ b/cpp/src/distance/detail/distance_ops/dice.cuh
@@ -55,16 +55,15 @@ struct dice_distance_op {
   template <typename Policy>
   static constexpr size_t shared_mem_size()
   {
-    return Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(DataT));
+    return Policy::SmemSize + ((Policy::Mblk + Policy::Nblk) * sizeof(AccT));
   }
 
   DI void core(AccT& acc, DataT& x, DataT& y) const
   {
     if constexpr ((std::is_same_v<AccT, float> && std::is_same_v<DataT, half>)) {
-      acc += __half2float(x != DataT(0) ? DataT(1) : DataT(0)) *
-             __half2float(y != DataT(0) ? DataT(1) : DataT(0));
+      acc += __half2float(x) * __half2float(y);
     } else {
-      acc += x != DataT(0) ? DataT(1) : DataT(0) * y != DataT(0) ? DataT(1) : DataT(0);
+      acc += x * y;
     }
   };
 
diff --git a/cpp/tests/distance/dist_dice.cu b/cpp/tests/distance/dist_dice.cu
index 36ac62ede8..a97e00541e 100644
--- a/cpp/tests/distance/dist_dice.cu
+++ b/cpp/tests/distance/dist_dice.cu
@@ -20,13 +20,15 @@
 namespace cuvs {
 namespace distance {
 
-template <typename DataType>
-class DistanceExpDice : public DistanceTest<cuvs::distance::DistanceType::DiceExpanded, DataType> {
-};
+template <typename DataType, typename OutputType = DataType>
+class DistanceExpDice
+  : public DistanceTest<cuvs::distance::DistanceType::DiceExpanded, DataType, OutputType> {};
 
-template <typename DataType>
+template <typename DataType, typename OutputType = DataType>
 class DistanceExpDiceXequalY
-  : public DistanceTestSameBuffer<cuvs::distance::DistanceType::DiceExpanded, DataType> {};
+  : public DistanceTestSameBuffer<cuvs::distance::DistanceType::DiceExpanded,
+                                  DataType,
+                                  OutputType> {};
 
 const std::vector<DistanceInputs<float>> inputsf = {
   {0.001f, 128, (65536 + 128) * 128, 8, true, 1234ULL},
@@ -52,6 +54,17 @@ const std::vector<DistanceInputs<float>> inputsXeqYf = {
   {0.03f, 1024, 1024, 1024, false, 1234ULL},
 };
 
+const std::vector<DistanceInputs<half, float>> inputsh = {
+  {0.001f, 1024, 1024, 32, true, 1234ULL},
+  {0.001f, 1024, 32, 1024, true, 1234ULL},
+  {0.001f, 32, 1024, 1024, true, 1234ULL},
+  {0.003f, 1024, 1024, 1024, true, 1234ULL},
+  {0.001f, 1024, 1024, 32, false, 1234ULL},
+  {0.001f, 1024, 32, 1024, false, 1234ULL},
+  {0.001f, 32, 1024, 1024, false, 1234ULL},
+  {0.003f, 1024, 1024, 1024, false, 1234ULL},
+};
+
 typedef DistanceExpDice<float> DistanceExpDiceF;
 TEST_P(DistanceExpDiceF, Result)
 {
@@ -62,6 +75,16 @@ TEST_P(DistanceExpDiceF, Result)
 }
 INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpDiceF, ::testing::ValuesIn(inputsf));
 
+typedef DistanceExpDice<half, float> DistanceExpDiceH;
+TEST_P(DistanceExpDiceH, Result)
+{
+  int m = params.isRowMajor ? params.m : params.n;
+  int n = params.isRowMajor ? params.n : params.m;
+  ASSERT_TRUE(devArrMatch(
+    dist_ref.data(), dist.data(), m, n, cuvs::CompareApproxNaN<float>(params.tolerance), stream));
+}
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpDiceH, ::testing::ValuesIn(inputsh));
+
 typedef DistanceExpDiceXequalY<float> DistanceExpDiceXequalYF;
 TEST_P(DistanceExpDiceXequalYF, Result)
 {
@@ -85,6 +108,20 @@ TEST_P(DistanceExpDiceXequalYF, Result)
 }
 INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpDiceXequalYF, ::testing::ValuesIn(inputsXeqYf));
 
+typedef DistanceExpDiceXequalY<half, float> DistanceExpDiceXequalYH;
+TEST_P(DistanceExpDiceXequalYH, Result)
+{
+  int m = params.m;
+  int n = params.m;
+  ASSERT_TRUE(cuvs::devArrMatch(dist_ref[0].data(),
+                                dist[0].data(),
+                                m,
+                                n,
+                                cuvs::CompareApproxNaN<float>(params.tolerance),
+                                stream));
+}
+INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpDiceXequalYH, ::testing::ValuesIn(inputsh));
+
 const std::vector<DistanceInputs<float>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
   {0.001, 1024, 32, 1024, true, 1234ULL},
@@ -108,5 +145,48 @@ INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpDiceD, ::testing::ValuesIn(inp
 class BigMatrixDice : public BigMatrixDistanceTest<cuvs::distance::DistanceType::DiceExpanded> {};
 TEST_F(BigMatrixDice, Result) {}
 
+// Simple test to verify half precision works
+TEST(DistanceExpDiceH, SimpleHalfTest)
+{
+  raft::resources handle;
+  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
+
+  const int m = 32;
+  const int n = 32;
+  const int k = 16;
+
+  rmm::device_uvector<half> x(m * k, stream);
+  rmm::device_uvector<half> y(n * k, stream);
+  rmm::device_uvector<float> dist(m * n, stream);
+
+  // Initialize with simple values
+  raft::random::RngState r(1234ULL);
+  uniform(handle, r, x.data(), m * k, half(0.0), half(1.0));
+  uniform(handle, r, y.data(), n * k, half(0.0), half(1.0));
+
+  // Create views
+  auto x_v =
+    raft::make_device_matrix_view<half, std::int64_t, raft::layout_c_contiguous>(x.data(), m, k);
+  auto y_v =
+    raft::make_device_matrix_view<half, std::int64_t, raft::layout_c_contiguous>(y.data(), n, k);
+  auto dist_v = raft::make_device_matrix_view<float, std::int64_t, raft::layout_c_contiguous>(
+    dist.data(), m, n);
+
+  // Compute distance
+  cuvs::distance::pairwise_distance(
+    handle, x_v, y_v, dist_v, cuvs::distance::DistanceType::DiceExpanded, 2.0f);
+
+  // Just verify it doesn't crash and produces reasonable values
+  raft::resource::sync_stream(handle);
+
+  // Basic sanity check - distances should be in [0, 1] range for Dice
+  ASSERT_TRUE(devArrMatch(
+    0.0f,
+    dist.data(),
+    m * n,
+    [](float expected, float actual) { return actual >= -0.1f && actual <= 1.1f; },
+    stream));
+}
+
 }  // end namespace distance
 }  // end namespace cuvs
diff --git a/cpp/tests/distance/distance_base.cuh b/cpp/tests/distance/distance_base.cuh
index 88e646429f..576121aeea 100644
--- a/cpp/tests/distance/distance_base.cuh
+++ b/cpp/tests/distance/distance_base.cuh
@@ -573,7 +573,8 @@ class DistanceTest : public ::testing::TestWithParam<DistanceInputs<DataType, Ou
       // Hellinger works only on positive numbers
       uniform(handle, r, x.data(), m * k, DataType(0.0), DataType(1.0));
       uniform(handle, r, y.data(), n * k, DataType(0.0), DataType(1.0));
-    } else if (distanceType == cuvs::distance::DistanceType::RusselRaoExpanded) {
+    } else if (distanceType == cuvs::distance::DistanceType::RusselRaoExpanded ||
+               distanceType == cuvs::distance::DistanceType::DiceExpanded) {
       uniform(handle, r, x.data(), m * k, DataType(0.0), DataType(1.0));
       uniform(handle, r, y.data(), n * k, DataType(0.0), DataType(1.0));
       // Russel rao works on boolean values.
@@ -667,7 +668,8 @@ class DistanceTestSameBuffer
         distanceType == cuvs::distance::DistanceType::KLDivergence) {
       // Hellinger works only on positive numbers
       uniform(handle, r, x.data(), m * k, DataType(0.0), DataType(1.0));
-    } else if (distanceType == cuvs::distance::DistanceType::RusselRaoExpanded) {
+    } else if (distanceType == cuvs::distance::DistanceType::RusselRaoExpanded ||
+               distanceType == cuvs::distance::DistanceType::DiceExpanded) {
       uniform(handle, r, x.data(), m * k, DataType(0.0), DataType(1.0));
       // Russel rao works on boolean values.
       bernoulli(handle, r, x.data(), m * k, 0.5f);

From 6e2640757535990864bc952a875f88a118b421c2 Mon Sep 17 00:00:00 2001
From: aamijar <aamijar230@gmail.com>
Date: Wed, 4 Jun 2025 00:34:58 +0000
Subject: [PATCH 05/11] remove unused

---
 cpp/tests/distance/dist_dice.cu | 43 ---------------------------------
 1 file changed, 43 deletions(-)

diff --git a/cpp/tests/distance/dist_dice.cu b/cpp/tests/distance/dist_dice.cu
index a97e00541e..6f85fdc09e 100644
--- a/cpp/tests/distance/dist_dice.cu
+++ b/cpp/tests/distance/dist_dice.cu
@@ -145,48 +145,5 @@ INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpDiceD, ::testing::ValuesIn(inp
 class BigMatrixDice : public BigMatrixDistanceTest<cuvs::distance::DistanceType::DiceExpanded> {};
 TEST_F(BigMatrixDice, Result) {}
 
-// Simple test to verify half precision works
-TEST(DistanceExpDiceH, SimpleHalfTest)
-{
-  raft::resources handle;
-  cudaStream_t stream = raft::resource::get_cuda_stream(handle);
-
-  const int m = 32;
-  const int n = 32;
-  const int k = 16;
-
-  rmm::device_uvector<half> x(m * k, stream);
-  rmm::device_uvector<half> y(n * k, stream);
-  rmm::device_uvector<float> dist(m * n, stream);
-
-  // Initialize with simple values
-  raft::random::RngState r(1234ULL);
-  uniform(handle, r, x.data(), m * k, half(0.0), half(1.0));
-  uniform(handle, r, y.data(), n * k, half(0.0), half(1.0));
-
-  // Create views
-  auto x_v =
-    raft::make_device_matrix_view<half, std::int64_t, raft::layout_c_contiguous>(x.data(), m, k);
-  auto y_v =
-    raft::make_device_matrix_view<half, std::int64_t, raft::layout_c_contiguous>(y.data(), n, k);
-  auto dist_v = raft::make_device_matrix_view<float, std::int64_t, raft::layout_c_contiguous>(
-    dist.data(), m, n);
-
-  // Compute distance
-  cuvs::distance::pairwise_distance(
-    handle, x_v, y_v, dist_v, cuvs::distance::DistanceType::DiceExpanded, 2.0f);
-
-  // Just verify it doesn't crash and produces reasonable values
-  raft::resource::sync_stream(handle);
-
-  // Basic sanity check - distances should be in [0, 1] range for Dice
-  ASSERT_TRUE(devArrMatch(
-    0.0f,
-    dist.data(),
-    m * n,
-    [](float expected, float actual) { return actual >= -0.1f && actual <= 1.1f; },
-    stream));
-}
-
 }  // end namespace distance
 }  // end namespace cuvs

From 59cb596b811f8034d43c1aa597b33cf266e8f476 Mon Sep 17 00:00:00 2001
From: aamijar <aamijar230@gmail.com>
Date: Wed, 4 Jun 2025 22:00:16 +0000
Subject: [PATCH 06/11] fix double tests

---
 cpp/tests/distance/dist_dice.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/tests/distance/dist_dice.cu b/cpp/tests/distance/dist_dice.cu
index 6f85fdc09e..23cc574621 100644
--- a/cpp/tests/distance/dist_dice.cu
+++ b/cpp/tests/distance/dist_dice.cu
@@ -122,7 +122,7 @@ TEST_P(DistanceExpDiceXequalYH, Result)
 }
 INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpDiceXequalYH, ::testing::ValuesIn(inputsh));
 
-const std::vector<DistanceInputs<float>> inputsd = {
+const std::vector<DistanceInputs<double>> inputsd = {
   {0.001, 1024, 1024, 32, true, 1234ULL},
   {0.001, 1024, 32, 1024, true, 1234ULL},
   {0.001, 32, 1024, 1024, true, 1234ULL},
@@ -132,13 +132,13 @@ const std::vector<DistanceInputs<float>> inputsd = {
   {0.001f, 32, 1024, 1024, false, 1234ULL},
   {0.003f, 1024, 1024, 1024, false, 1234ULL},
 };
-typedef DistanceExpDice<float> DistanceExpDiceD;
+typedef DistanceExpDice<double> DistanceExpDiceD;
 TEST_P(DistanceExpDiceD, Result)
 {
   int m = params.isRowMajor ? params.m : params.n;
   int n = params.isRowMajor ? params.n : params.m;
   ASSERT_TRUE(devArrMatch(
-    dist_ref.data(), dist.data(), m, n, cuvs::CompareApproxNaN<float>(params.tolerance), stream));
+    dist_ref.data(), dist.data(), m, n, cuvs::CompareApproxNaN<double>(params.tolerance), stream));
 }
 INSTANTIATE_TEST_CASE_P(DistanceTests, DistanceExpDiceD, ::testing::ValuesIn(inputsd));
 

From 591c6cf2f9e1b942d83466d03ccb800f92786899 Mon Sep 17 00:00:00 2001
From: aamijar <aamijar230@gmail.com>
Date: Fri, 6 Jun 2025 21:23:36 +0000
Subject: [PATCH 07/11] update year

---
 cpp/src/distance/detail/distance.cuh                          | 2 +-
 cpp/src/distance/detail/distance_ops/all_ops.cuh              | 2 +-
 cpp/src/distance/detail/distance_ops/dice.cuh                 | 2 +-
 cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh      | 2 +-
 .../distance/detail/pairwise_matrix/dispatch_00_generate.py   | 4 ++--
 .../pairwise_matrix/dispatch_dice_double_double_double_int.cu | 2 +-
 .../pairwise_matrix/dispatch_dice_float_float_float_int.cu    | 2 +-
 .../pairwise_matrix/dispatch_dice_half_float_float_int.cu     | 2 +-
 cpp/src/distance/distance-ext.cuh                             | 2 +-
 cpp/src/distance/distance-inl.cuh                             | 2 +-
 cpp/src/distance/distance.cu                                  | 2 +-
 cpp/tests/distance/distance_base.cuh                          | 2 +-
 cpp/tests/test_utils.h                                        | 2 +-
 13 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/cpp/src/distance/detail/distance.cuh b/cpp/src/distance/detail/distance.cuh
index f0cd9bdbf3..29b93c5056 100644
--- a/cpp/src/distance/detail/distance.cuh
+++ b/cpp/src/distance/detail/distance.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/distance/detail/distance_ops/all_ops.cuh b/cpp/src/distance/detail/distance_ops/all_ops.cuh
index ce58b8a81c..049487faf1 100644
--- a/cpp/src/distance/detail/distance_ops/all_ops.cuh
+++ b/cpp/src/distance/detail/distance_ops/all_ops.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/distance/detail/distance_ops/dice.cuh b/cpp/src/distance/detail/distance_ops/dice.cuh
index 91b0bb48fd..3ed5c78025 100644
--- a/cpp/src/distance/detail/distance_ops/dice.cuh
+++ b/cpp/src/distance/detail/distance_ops/dice.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh b/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh
index 3c3422f2fb..737fe6cb68 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
index 68d814c528..62808fed87 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
 # NOTE: this template is not perfectly formatted. Use pre-commit to get
 # everything in shape again.
 header = """/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu
index b07e49f59a..6626e8ebd3 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu
index 5f312cc634..4b308b54ca 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_half_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_half_float_float_int.cu
index a7abcd8f07..45840fd6dc 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_half_float_float_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_half_float_float_int.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/distance/distance-ext.cuh b/cpp/src/distance/distance-ext.cuh
index 3582fb7182..893a755c2e 100644
--- a/cpp/src/distance/distance-ext.cuh
+++ b/cpp/src/distance/distance-ext.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/distance/distance-inl.cuh b/cpp/src/distance/distance-inl.cuh
index 5e6ac54085..2307555c13 100644
--- a/cpp/src/distance/distance-inl.cuh
+++ b/cpp/src/distance/distance-inl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/distance/distance.cu b/cpp/src/distance/distance.cu
index facdca8858..26497238c1 100644
--- a/cpp/src/distance/distance.cu
+++ b/cpp/src/distance/distance.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/tests/distance/distance_base.cuh b/cpp/tests/distance/distance_base.cuh
index 576121aeea..3d93da8c3f 100644
--- a/cpp/tests/distance/distance_base.cuh
+++ b/cpp/tests/distance/distance_base.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/tests/test_utils.h b/cpp/tests/test_utils.h
index 2dffd22961..0111397239 100644
--- a/cpp/tests/test_utils.h
+++ b/cpp/tests/test_utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From 18626b6dc8f835c07051be942053dc8c8740be27 Mon Sep 17 00:00:00 2001
From: aamijar <aamijar230@gmail.com>
Date: Thu, 12 Jun 2025 18:01:34 +0000
Subject: [PATCH 08/11] fix style

---
 cpp/CMakeLists.txt       | 2 +-
 cpp/tests/CMakeLists.txt | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 29fcfcf2d8..3e28b7bc0a 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index e2170e594f..8784e75094 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at

From 49156b342869e80362e0994209770c675b462d76 Mon Sep 17 00:00:00 2001
From: aamijar <aamijar230@gmail.com>
Date: Fri, 13 Jun 2025 04:54:40 +0000
Subject: [PATCH 09/11] fix norm

---
 cpp/src/distance/detail/distance.cuh | 46 ++++++++--------------------
 1 file changed, 13 insertions(+), 33 deletions(-)

diff --git a/cpp/src/distance/detail/distance.cuh b/cpp/src/distance/detail/distance.cuh
index 622bd041c6..adfeff7e40 100644
--- a/cpp/src/distance/detail/distance.cuh
+++ b/cpp/src/distance/detail/distance.cuh
@@ -258,41 +258,21 @@ void distance_impl(raft::resources const& handle,
   // perhaps the use of stridedSummationKernel could be causing this,
   // need to investigate and fix.
   if (x == y && is_row_major) {
-    raft::linalg::reduce(x_norm,
-                         x,
-                         k,
-                         std::max(m, n),
-                         (AccT)0,
-                         is_row_major,
-                         true,
-                         stream,
-                         false,
-                         raft::identity_op(),
-                         raft::add_op());
+    raft::linalg::reduce<true, true>(
+      x_norm, x, k, std::max(m, n), (AccT)0, stream, false, raft::identity_op(), raft::add_op());
   } else {
     y_norm += m;
-    raft::linalg::reduce(x_norm,
-                         x,
-                         k,
-                         m,
-                         (AccT)0,
-                         is_row_major,
-                         true,
-                         stream,
-                         false,
-                         raft::identity_op(),
-                         raft::add_op());
-    raft::linalg::reduce(y_norm,
-                         y,
-                         k,
-                         n,
-                         (AccT)0,
-                         is_row_major,
-                         true,
-                         stream,
-                         false,
-                         raft::identity_op(),
-                         raft::add_op());
+    if (is_row_major) {
+      raft::linalg::reduce<true, true>(
+        x_norm, x, k, m, (AccT)0, stream, false, raft::identity_op(), raft::add_op());
+      raft::linalg::reduce<true, true>(
+        y_norm, y, k, n, (AccT)0, stream, false, raft::identity_op(), raft::add_op());
+    } else {
+      raft::linalg::reduce<false, true>(
+        x_norm, x, k, m, (AccT)0, stream, false, raft::identity_op(), raft::add_op());
+      raft::linalg::reduce<false, true>(
+        y_norm, y, k, n, (AccT)0, stream, false, raft::identity_op(), raft::add_op());
+    }
   }
 
   ops::dice_distance_op<DataT, AccT, IdxT> distance_op{};

From 36ebb145ed89082a6d030970f624d98a47b4832b Mon Sep 17 00:00:00 2001
From: aamijar <aamijar230@gmail.com>
Date: Tue, 16 Dec 2025 08:02:58 +0000
Subject: [PATCH 10/11] fix headers

---
 cpp/src/distance/detail/distance_ops/all_ops.cuh  |  2 +-
 cpp/src/distance/detail/distance_ops/dice.cuh     | 15 ++-------------
 .../detail/pairwise_matrix/dispatch-ext.cuh       |  2 +-
 .../pairwise_matrix/dispatch_00_generate.py       |  2 +-
 .../dispatch_dice_double_double_double_int.cu     | 15 ++-------------
 .../dispatch_dice_float_float_float_int.cu        | 15 ++-------------
 .../dispatch_dice_half_float_float_int.cu         | 15 ++-------------
 cpp/src/distance/distance-ext.cuh                 |  2 +-
 cpp/src/distance/distance-inl.cuh                 |  2 +-
 cpp/src/distance/distance.cu                      |  2 +-
 cpp/tests/distance/dist_dice.cu                   | 15 ++-------------
 cpp/tests/distance/distance_base.cuh              |  2 +-
 cpp/tests/test_utils.h                            |  2 +-
 13 files changed, 18 insertions(+), 73 deletions(-)

diff --git a/cpp/src/distance/detail/distance_ops/all_ops.cuh b/cpp/src/distance/detail/distance_ops/all_ops.cuh
index e08c55b57b..134edca835 100644
--- a/cpp/src/distance/detail/distance_ops/all_ops.cuh
+++ b/cpp/src/distance/detail/distance_ops/all_ops.cuh
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
diff --git a/cpp/src/distance/detail/distance_ops/dice.cuh b/cpp/src/distance/detail/distance_ops/dice.cuh
index 3ed5c78025..d228c88d4e 100644
--- a/cpp/src/distance/detail/distance_ops/dice.cuh
+++ b/cpp/src/distance/detail/distance_ops/dice.cuh
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
  */
 
 #pragma once
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh b/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh
index 1f2fe9ccbb..1ade0f4af4 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch-ext.cuh
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
index 49e48192a8..b52d29fea8 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_00_generate.py
@@ -84,7 +84,7 @@
     dict(
         path_prefix="dice",
         OpT="cuvs::distance::detail::ops::dice_distance_op",
-        archs = [60, 80],
+        archs=[60, 80],
     ),
     dict(
         path_prefix="hamming_unexpanded",
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu
index 6626e8ebd3..64e75f0bab 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_double_double_double_int.cu
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
  */
 
 /*
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu
index 4b308b54ca..830c023b47 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_float_float_float_int.cu
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
  */
 
 /*
diff --git a/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_half_float_float_int.cu b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_half_float_float_int.cu
index 45840fd6dc..0aea26a846 100644
--- a/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_half_float_float_int.cu
+++ b/cpp/src/distance/detail/pairwise_matrix/dispatch_dice_half_float_float_int.cu
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
  */
 
 /*
diff --git a/cpp/src/distance/distance-ext.cuh b/cpp/src/distance/distance-ext.cuh
index 6b9fb68ca9..f0155e3efa 100644
--- a/cpp/src/distance/distance-ext.cuh
+++ b/cpp/src/distance/distance-ext.cuh
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
diff --git a/cpp/src/distance/distance-inl.cuh b/cpp/src/distance/distance-inl.cuh
index 5025dcbaad..28b908ca7d 100644
--- a/cpp/src/distance/distance-inl.cuh
+++ b/cpp/src/distance/distance-inl.cuh
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 #pragma once
diff --git a/cpp/src/distance/distance.cu b/cpp/src/distance/distance.cu
index fd9b70a934..7d88580b90 100644
--- a/cpp/src/distance/distance.cu
+++ b/cpp/src/distance/distance.cu
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
diff --git a/cpp/tests/distance/dist_dice.cu b/cpp/tests/distance/dist_dice.cu
index 23cc574621..0e0e314bc8 100644
--- a/cpp/tests/distance/dist_dice.cu
+++ b/cpp/tests/distance/dist_dice.cu
@@ -1,17 +1,6 @@
 /*
- * Copyright (c) 2025, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
+ * SPDX-License-Identifier: Apache-2.0
  */
 
 #include "../test_utils.cuh"
diff --git a/cpp/tests/distance/distance_base.cuh b/cpp/tests/distance/distance_base.cuh
index 864048fbdd..c1bd04fb7a 100644
--- a/cpp/tests/distance/distance_base.cuh
+++ b/cpp/tests/distance/distance_base.cuh
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 
diff --git a/cpp/tests/test_utils.h b/cpp/tests/test_utils.h
index 305953db88..748b5462cc 100644
--- a/cpp/tests/test_utils.h
+++ b/cpp/tests/test_utils.h
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * SPDX-FileCopyrightText: Copyright (c) 2018-2025, NVIDIA CORPORATION.
  * SPDX-License-Identifier: Apache-2.0
  */
 

From 3e730b3b0c5b73adde9db827581fc9aa791ca6d5 Mon Sep 17 00:00:00 2001
From: aamijar <aamijar230@gmail.com>
Date: Tue, 16 Dec 2025 08:27:32 +0000
Subject: [PATCH 11/11] remove cutlass op

---
 cpp/src/distance/detail/distance_ops/dice.cuh | 16 ----------------
 cpp/tests/CMakeLists.txt                      |  1 +
 2 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/cpp/src/distance/detail/distance_ops/dice.cuh b/cpp/src/distance/detail/distance_ops/dice.cuh
index d228c88d4e..b3dd8cc9bb 100644
--- a/cpp/src/distance/detail/distance_ops/dice.cuh
+++ b/cpp/src/distance/detail/distance_ops/dice.cuh
@@ -9,17 +9,6 @@
 
 namespace cuvs::distance::detail::ops {
 
-// Epilogue operator for CUTLASS based kernel
-template <typename DataT, typename AccT>
-struct dice_cutlass_op {
-  __device__ dice_cutlass_op() noexcept {}
-  __device__ AccT operator()(AccT& aNorm, const AccT& bNorm, AccT& accVal) const noexcept
-  {
-    return static_cast<AccT>(1.0) - static_cast<AccT>(2 * accVal / (aNorm + bNorm));
-  }
-  __device__ AccT operator()(DataT aData) const noexcept { return aData; }
-};
-
 /**
  * @brief the expanded dice distance matrix calculation
  *
@@ -71,11 +60,6 @@ struct dice_distance_op {
       }
     }
   }
-
-  constexpr dice_cutlass_op<DataT, AccT> get_cutlass_op() const
-  {
-    return dice_cutlass_op<DataT, AccT>();
-  }
 };
 
 }  // namespace cuvs::distance::detail::ops
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 198dd9b70c..67d61a4435 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -295,6 +295,7 @@ ConfigureTest(
   PATH distance/dist_canberra.cu
        distance/dist_correlation.cu
        distance/dist_cos.cu
+       distance/dist_dice.cu
        distance/dist_hamming.cu
        distance/dist_hellinger.cu
        distance/dist_inner_product.cu