From 99bd96459eca917f3258d89b5e7aa302a0415387 Mon Sep 17 00:00:00 2001 From: Lizhen You Date: Wed, 16 Jul 2025 03:05:47 +0000 Subject: [PATCH 1/3] Add option to enable "sve" optimization level on armv9 Signed-off-by: Lizhen You --- cpp/cmake/modules/FindSVE.cmake | 45 ++++++++++++++++++++++++++++ cpp/cmake/thirdparty/get_faiss.cmake | 17 +++++++++-- 2 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 cpp/cmake/modules/FindSVE.cmake diff --git a/cpp/cmake/modules/FindSVE.cmake b/cpp/cmake/modules/FindSVE.cmake new file mode 100644 index 0000000000..6533c7f069 --- /dev/null +++ b/cpp/cmake/modules/FindSVE.cmake @@ -0,0 +1,45 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +# ============================================================================= +# FindSVE.cmake This module finds the SVE (Scalable Vector Extension) support in the compiler. +# ============================================================================= + +INCLUDE(CheckCXXSourceRuns) + +set(SVE_CODE + " + #include + + int main() + { + svfloat32_t a = svdup_f32(0); + return 0; + } +" +) + +# Check for SVE support +message("Checking for SVE support") +set(CMAKE_REQUIRED_FLAGS "-march=armv9-a+sve") +check_cxx_source_runs("${SVE_CODE}" CXX_HAS_SVE) + +if(CXX_HAS_SVE) + set(CXX_SVE_FOUND + TRUE + CACHE BOOL "SVE support found" + ) + set(CXX_SVE_FLAGS + "-march=armv9-a+sve" + CACHE STRING "Flags for SVE support" + ) +else() + set(CXX_SVE_FOUND + FALSE + CACHE BOOL "SVE support not found" + ) + set(CXX_SVE_FLAGS + "" + CACHE STRING "Flags for SVE support" + ) +endif() + +mark_as_advanced(CXX_SVE_FOUND CXX_SVE_FLAGS) diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake index 26965273e2..3680aedbce 100644 --- a/cpp/cmake/thirdparty/get_faiss.cmake +++ b/cpp/cmake/thirdparty/get_faiss.cmake @@ -1,5 +1,5 @@ #============================================================================= -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ #============================================================================= function(find_and_configure_faiss) + option(CUVS_CPU_ONLY_FAISS_ENABLE_SVE "Enable SVE support for CPU ONLY FAISS" OFF) set(oneValueArgs VERSION REPOSITORY PINNED_TAG BUILD_STATIC_LIBS EXCLUDE_FROM_ALL ENABLE_GPU) cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} ) @@ -40,14 +41,18 @@ function(find_and_configure_faiss) endif() include(cmake/modules/FindAVX) + include(cmake/modules/FindSVE) # Link against AVX CPU lib if it exists set(CUVS_FAISS_OPT_LEVEL "generic") if(CXX_AVX2_FOUND) set(CUVS_FAISS_OPT_LEVEL "avx2") endif() + if(CXX_SVE_FOUND AND CUVS_CPU_ONLY_FAISS_ENABLE_SVE) + set(CUVS_FAISS_OPT_LEVEL "sve") + endif() rapids_cpm_find(faiss ${version} ${build_patch_only} - GLOBAL_TARGETS faiss faiss_avx2 faiss_gpu_objs faiss::faiss faiss::faiss_avx2 + GLOBAL_TARGETS faiss faiss_avx2 faiss_sve faiss_gpu_objs faiss::faiss faiss::faiss_avx2 faiss::faiss_sve CPM_ARGS GIT_REPOSITORY ${repository} GIT_TAG ${tag} @@ -79,6 +84,12 @@ function(find_and_configure_faiss) # faiss will have the conda includes/link dirs target_link_libraries(faiss_avx2 PRIVATE $) endif() + if(TARGET faiss_sve AND NOT TARGET faiss::faiss_sve) + add_library(faiss::faiss_sve ALIAS faiss_sve) + # We need to ensure that faiss has all the conda information. So we use this approach so that + # faiss will have the conda includes/link dirs + target_link_libraries(faiss_sve PRIVATE $) + endif() if(TARGET faiss_gpu_objs AND NOT TARGET faiss::faiss_gpu_objs) add_library(faiss::faiss_gpu_objs ALIAS faiss_gpu_objs) # We need to ensure that faiss has all the conda information. So we use this approach so that @@ -107,6 +118,8 @@ function(find_and_configure_faiss) set(CUVS_FAISS_TARGETS "$,faiss::faiss>" PARENT_SCOPE) elseif(CXX_AVX2_FOUND) set(CUVS_FAISS_TARGETS faiss::faiss_avx2 PARENT_SCOPE) + elseif(CXX_SVE_FOUND AND CUVS_CPU_ONLY_FAISS_ENABLE_SVE) + set(CUVS_FAISS_TARGETS faiss::faiss_sve PARENT_SCOPE) else() set(CUVS_FAISS_TARGETS faiss::faiss PARENT_SCOPE) endif() From 123642e17618a9a0210178328c2cfae4ed242ce2 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Tue, 22 Jul 2025 17:16:27 -0400 Subject: [PATCH 2/3] Fix binary error --- .../preprocessing/quantize/detail/binary.cuh | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/cpp/src/preprocessing/quantize/detail/binary.cuh b/cpp/src/preprocessing/quantize/detail/binary.cuh index 7852a3f52b..7d4f6c56de 100644 --- a/cpp/src/preprocessing/quantize/detail/binary.cuh +++ b/cpp/src/preprocessing/quantize/detail/binary.cuh @@ -276,8 +276,8 @@ auto train(raft::resources const& res, const size_t dataset_size = dataset.extent(0); quantizer.threshold = raft::make_device_vector(res, dataset_dim); - using compute_t = std::conditional_t, float, T>; - std::vector host_threshold_vec(dataset_dim); + using T = std::conditional_t, float, T>; + std::vector host_threshold_vec(dataset_dim); auto threshold_ptr = host_threshold_vec.data(); if (params.threshold == cuvs::preprocessing::quantize::binary::bit_threshold::mean) { @@ -287,7 +287,7 @@ auto train(raft::resources const& res, #pragma omp parallel for reduction(+ : threshold_ptr[ : dataset_dim]) for (size_t i = 0; i < dataset_size; i++) { for (uint32_t j = 0; j < dataset_dim; j++) { - threshold_ptr[j] += static_cast(dataset.data_handle()[i * dataset_dim + j]); + threshold_ptr[j] += static_cast(dataset.data_handle()[i * dataset_dim + j]); } } for (uint32_t j = 0; j < dataset_dim; j++) { @@ -315,13 +315,13 @@ auto train(raft::resources const& res, const auto stride = stride_prime_list[prime_i]; // Transposed - auto sampled_dataset = raft::make_host_matrix(dataset_dim, num_samples); + auto sampled_dataset = raft::make_host_matrix(dataset_dim, num_samples); #pragma omp parallel for for (size_t out_i = 0; out_i < num_samples; out_i++) { const auto in_i = (out_i * stride) % dataset_size; for (uint32_t j = 0; j < dataset_dim; j++) { sampled_dataset.data_handle()[j * num_samples + out_i] = - static_cast(dataset.data_handle()[in_i * dataset_dim + j]); + static_cast(dataset.data_handle()[in_i * dataset_dim + j]); } } @@ -333,15 +333,15 @@ auto train(raft::resources const& res, } } - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { raft::copy(quantizer.threshold.data_handle(), host_threshold_vec.data(), dataset_dim, raft::resource::get_cuda_stream(res)); } else { - auto mr = raft::resource::get_workspace_resource(res); - auto casted_vec = raft::make_device_mdarray( - res, mr, raft::make_extents(dataset_dim)); + auto mr = raft::resource::get_workspace_resource(res); + auto casted_vec = + raft::make_device_mdarray(res, mr, raft::make_extents(dataset_dim)); raft::copy(casted_vec.data_handle(), host_threshold_vec.data(), dataset_dim, @@ -420,15 +420,15 @@ void transform(raft::resources const& res, out_dataset_size); // Use `float` for computation when T == half because a runtime error occurs with CUDA 11.8 - using compute_t = std::conditional_t, float, T>; - auto threshold_vec = raft::make_host_vector(0); - compute_t* threshold_ptr = nullptr; + using T = std::conditional_t, float, T>; + auto threshold_vec = raft::make_host_vector(0); + T* threshold_ptr = nullptr; if (quantizer.threshold.size() != 0) { - threshold_vec = raft::make_host_vector(dataset_dim); + threshold_vec = raft::make_host_vector(dataset_dim); threshold_ptr = threshold_vec.data_handle(); - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { raft::copy(threshold_ptr, quantizer.threshold.data_handle(), dataset_dim, @@ -439,7 +439,7 @@ void transform(raft::resources const& res, res, mr, raft::make_extents(dataset_dim)); raft::linalg::map(res, casted_vec.view(), - raft::cast_op{}, + raft::cast_op{}, raft::make_const_mdspan(quantizer.threshold.view())); raft::copy( threshold_ptr, casted_vec.data_handle(), dataset_dim, raft::resource::get_cuda_stream(res)); @@ -456,7 +456,7 @@ void transform(raft::resources const& res, if (threshold_ptr == nullptr) { if (is_positive(dataset(i, in_j))) { pack |= (1u << pack_j); } } else { - if (is_positive(static_cast(dataset(i, in_j)) - threshold_ptr[in_j])) { + if (is_positive(static_cast(dataset(i, in_j)) - threshold_ptr[in_j])) { pack |= (1u << pack_j); } } From 13ae755186226ec843336149494766c6c0a1e7b0 Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 6 Aug 2025 14:32:50 -0400 Subject: [PATCH 3/3] Update binary.cuh --- .../preprocessing/quantize/detail/binary.cuh | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/src/preprocessing/quantize/detail/binary.cuh b/cpp/src/preprocessing/quantize/detail/binary.cuh index 9435227e4b..26c9bf5481 100644 --- a/cpp/src/preprocessing/quantize/detail/binary.cuh +++ b/cpp/src/preprocessing/quantize/detail/binary.cuh @@ -276,8 +276,8 @@ auto train(raft::resources const& res, const size_t dataset_size = dataset.extent(0); quantizer.threshold = raft::make_device_vector(res, dataset_dim); - using T = std::conditional_t, float, T>; - std::vector host_threshold_vec(dataset_dim); + using compute_t = std::conditional_t, float, T>; + std::vector host_threshold_vec(dataset_dim); auto threshold_ptr = host_threshold_vec.data(); if (params.threshold == cuvs::preprocessing::quantize::binary::bit_threshold::mean) { @@ -287,7 +287,7 @@ auto train(raft::resources const& res, #pragma omp parallel for reduction(+ : threshold_ptr[ : dataset_dim]) for (size_t i = 0; i < dataset_size; i++) { for (uint32_t j = 0; j < dataset_dim; j++) { - threshold_ptr[j] += static_cast(dataset.data_handle()[i * dataset_dim + j]); + threshold_ptr[j] += static_cast(dataset.data_handle()[i * dataset_dim + j]); } } for (uint32_t j = 0; j < dataset_dim; j++) { @@ -315,13 +315,13 @@ auto train(raft::resources const& res, const auto stride = stride_prime_list[prime_i]; // Transposed - auto sampled_dataset = raft::make_host_matrix(dataset_dim, num_samples); + auto sampled_dataset = raft::make_host_matrix(dataset_dim, num_samples); #pragma omp parallel for for (size_t out_i = 0; out_i < num_samples; out_i++) { const auto in_i = (out_i * stride) % dataset_size; for (uint32_t j = 0; j < dataset_dim; j++) { sampled_dataset.data_handle()[j * num_samples + out_i] = - static_cast(dataset.data_handle()[in_i * dataset_dim + j]); + static_cast(dataset.data_handle()[in_i * dataset_dim + j]); } } @@ -333,15 +333,15 @@ auto train(raft::resources const& res, } } - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { raft::copy(quantizer.threshold.data_handle(), host_threshold_vec.data(), dataset_dim, raft::resource::get_cuda_stream(res)); } else { - auto mr = raft::resource::get_workspace_resource(res); - auto casted_vec = - raft::make_device_mdarray(res, mr, raft::make_extents(dataset_dim)); + auto mr = raft::resource::get_workspace_resource(res); + auto casted_vec = raft::make_device_mdarray( + res, mr, raft::make_extents(dataset_dim)); raft::copy(casted_vec.data_handle(), host_threshold_vec.data(), dataset_dim, @@ -424,10 +424,10 @@ void transform(raft::resources const& res, compute_t* threshold_ptr = nullptr; if (quantizer.threshold.size() != 0) { - threshold_vec = raft::make_host_vector(dataset_dim); + threshold_vec = raft::make_host_vector(dataset_dim); threshold_ptr = threshold_vec.data_handle(); - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { raft::copy(threshold_ptr, quantizer.threshold.data_handle(), dataset_dim, @@ -438,7 +438,7 @@ void transform(raft::resources const& res, res, mr, raft::make_extents(dataset_dim)); raft::linalg::map(res, casted_vec.view(), - raft::cast_op{}, + raft::cast_op{}, raft::make_const_mdspan(quantizer.threshold.view())); raft::copy( threshold_ptr, casted_vec.data_handle(), dataset_dim, raft::resource::get_cuda_stream(res)); @@ -455,7 +455,7 @@ void transform(raft::resources const& res, if (threshold_ptr == nullptr) { if (is_positive(dataset(i, in_j))) { pack |= (1u << pack_j); } } else { - if (is_positive(static_cast(dataset(i, in_j)) - threshold_ptr[in_j])) { + if (is_positive(static_cast(dataset(i, in_j)) - threshold_ptr[in_j])) { pack |= (1u << pack_j); } }