diff --git a/CMakeLists.txt b/CMakeLists.txt index a4d2c003..fa71a403 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,18 +12,6 @@ set(Torch_USE_CUDA OFF CACHE BOOL "Force disable CUDA in Torch") set(Torch_NO_CUDA ON CACHE BOOL "Force disable CUDA in Torch") set(USE_CUDA OFF CACHE BOOL "Force disable CUDA globally") -# QUAKE_ENABLE_GPU: Enable GPU support for Faiss -# Default: OFF -if(QUAKE_ENABLE_GPU) - set(FAISS_ENABLE_GPU ON) -else() - set(FAISS_ENABLE_GPU OFF) -endif() - -if(QUAKE_ENABLE_GPU) - add_compile_definitions(FAISS_ENABLE_GPU) -endif() - if(QUAKE_USE_NUMA) add_compile_definitions(QUAKE_USE_NUMA) endif() @@ -64,6 +52,25 @@ set(project_BINDINGS_DIR ${CPP_SOURCE}/bindings) set(project_THIRD_PARTY_DIR ${CPP_SOURCE}/third_party) set(project_TEST_DIR test/cpp) +if(QUAKE_ENABLE_GPU) + find_package(CuVS REQUIRED) + include(${project_THIRD_PARTY_DIR}/cmake/fetch_rapids.cmake) + include(rapids-cmake) + include(rapids-cpm) + include(rapids-cuda) + include(rapids-export) + include(rapids-find) + + rapids_cuda_init_architectures(quake_c) + + rapids_cpm_init() + set(BUILD_CUVS_C_LIBRARY OFF) + include(${project_THIRD_PARTY_DIR}/cmake/get_cuvs.cmake) + + add_compile_definitions(QUAKE_ENABLE_GPU) +endif() + +set(FAISS_ENABLE_GPU OFF) # --------------------------------------------------------------- # Print out Compiler and Path Information # --------------------------------------------------------------- @@ -81,7 +88,6 @@ message(STATUS "QUAKE_USE_NUMA: ${QUAKE_USE_NUMA}") # Apple-specific adjustments if(APPLE) include_directories("/opt/homebrew/opt/openblas/include") - set(FAISS_ENABLE_GPU OFF) endif() # Compiler options and definitions @@ -149,6 +155,10 @@ elseif(UNIX) if(QUAKE_USE_NUMA) list(APPEND LINK_LIBS -lnuma) endif() + + if (QUAKE_ENABLE_GPU) + list(APPEND LINK_LIBS cuvs::cuvs) + endif() else() # unsupported platform message(FATAL_ERROR "Unsupported platform") @@ -211,7 +221,7 @@ endif() # 
--------------------------------------------------------------- message(STATUS "--------- Final Configuration Summary ---------") message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") -message(STATUS "GPU Enabled: ${FAISS_ENABLE_GPU}") +message(STATUS "GPU Enabled: ${QUAKE_ENABLE_GPU}") message(STATUS "NUMA Enabled: ${QUAKE_USE_NUMA}") message(STATUS "Python used: ${Python3_EXECUTABLE}") message(STATUS "Torch Path: ${TorchPath}") @@ -221,4 +231,4 @@ message(STATUS "MKL_LINK: ${MKL_LINK}") message(STATUS "MKL_INTERFACE_FULL: ${MKL_INTERFACE_FULL}") message(STATUS "MKL_THREADING: ${MKL_THREADING}") message(STATUS "MKL_MPI: ${MKL_MPI}") -message(STATUS "------------------------------------------------") \ No newline at end of file +message(STATUS "------------------------------------------------") diff --git a/environments/ubuntu-cuda/Dockerfile b/environments/ubuntu-cuda118/Dockerfile similarity index 100% rename from environments/ubuntu-cuda/Dockerfile rename to environments/ubuntu-cuda118/Dockerfile diff --git a/environments/ubuntu-cuda/conda.yaml b/environments/ubuntu-cuda118/conda.yaml similarity index 82% rename from environments/ubuntu-cuda/conda.yaml rename to environments/ubuntu-cuda118/conda.yaml index 715744f2..99265b48 100644 --- a/environments/ubuntu-cuda/conda.yaml +++ b/environments/ubuntu-cuda118/conda.yaml @@ -3,6 +3,8 @@ channels: - pytorch - defaults - conda-forge + - nvidia + - libcuvs dependencies: - python=3.11 - numpy @@ -10,6 +12,8 @@ dependencies: - faiss-gpu - matplotlib - pytest + - libcuvs + - cuda-version=12.8 - pip - pip: - sphinx diff --git a/environments/ubuntu-cuda124/Dockerfile b/environments/ubuntu-cuda124/Dockerfile new file mode 100644 index 00000000..0462c5c9 --- /dev/null +++ b/environments/ubuntu-cuda124/Dockerfile @@ -0,0 +1,64 @@ +# Use a CUDA-enabled Ubuntu base image +FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 + +# ----------------------------- +# Set up environment variables +# ----------------------------- +ENV 
CONDA_DIR=/opt/miniconda +ENV PATH="${CONDA_DIR}/bin:${PATH}" +ENV DEBIAN_FRONTEND=noninteractive + +# ----------------------------- +# Install system dependencies +# ----------------------------- +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + wget \ + curl \ + build-essential \ + ca-certificates \ + swig \ + git \ + libomp5 \ + libomp-dev \ + graphviz \ + libnuma-dev \ + && rm -rf /var/lib/apt/lists/* + + +# Install CMake 3.30.4 +RUN wget -qO /tmp/cmake.sh https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.sh && \ + chmod +x /tmp/cmake.sh && \ + /tmp/cmake.sh --skip-license --prefix=/usr/local && \ + rm /tmp/cmake.sh + + +# ----------------------------- +# Install Miniconda +# ----------------------------- +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh && \ + bash /tmp/miniconda.sh -b -p $CONDA_DIR && \ + rm /tmp/miniconda.sh + +# ----------------------------- +# Copy in your conda environment YAML +# ----------------------------- +COPY environments/ubuntu-cuda124/conda.yaml /tmp/conda.yaml + +# Create quake-env +RUN conda env create -f /tmp/conda.yaml && conda clean -afy + +# ----------------------------- +# Install GPU-enabled PyTorch +# ----------------------------- +RUN conda run -n quake-env pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cu124 + +# ----------------------------- +# Debug: show conda information +# ----------------------------- +RUN echo "===== DEBUG: which conda =====" && which conda +RUN echo "===== DEBUG: conda info =====" && conda info +RUN echo "===== DEBUG: conda env list =====" && conda env list +RUN echo "===== DEBUG: quake-env check =====" && conda run -n quake-env python -c "import sys; print('OK in quake-env; python:', sys.executable)" + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/environments/ubuntu-cuda124/conda.yaml b/environments/ubuntu-cuda124/conda.yaml new file mode
100644 index 00000000..f895341b --- /dev/null +++ b/environments/ubuntu-cuda124/conda.yaml @@ -0,0 +1,24 @@ +name: quake-env +channels: + - rapidsai + - pytorch + - defaults + - conda-forge + - nvidia +dependencies: + - python=3.11 + - numpy + - pandas + - faiss-gpu + - matplotlib + - pytest + - libcuvs + - cuda-nvrtc-dev=12.4 + - cuda-version=12.4 + - pip + - pip: + - sphinx + - sphinx_rtd_theme + - sphinxcontrib-mermaid + - graphviz + - pyyaml \ No newline at end of file diff --git a/src/cpp/include/clustering.h b/src/cpp/include/clustering.h index e4b00229..d83d563b 100644 --- a/src/cpp/include/clustering.h +++ b/src/cpp/include/clustering.h @@ -11,10 +11,41 @@ class IndexPartition; +/** + * @brief Clusters vectors into partitions using faiss::Clustering + * + * + * @param vectors The vectors to cluster. + * @param ids The IDs of the vectors. + * @param n_clusters The number of clusters to create. + * @param metric_type The metric type to use for clustering. + * @param niter The number of iterations to run k-means. + * @param initial_centroids The initial centroids to use for k-means. + */ +shared_ptr kmeans_cpu(Tensor vectors, + Tensor ids, + shared_ptr build_params, + Tensor initial_centroids = Tensor()); + +/** + * @brief Clusters vectors into partitions using CuVS k-means. + * + * + * @param vectors The vectors to cluster. + * @param ids The IDs of the vectors. + * @param n_clusters The number of clusters to create. + * @param metric_type The metric type to use for clustering. + * @param niter The number of iterations to run k-means. + * @param initial_centroids The initial centroids to use for k-means. + */ +#ifdef QUAKE_ENABLE_GPU +shared_ptr kmeans_cuvs_sample_and_predict( + Tensor vectors, Tensor ids, shared_ptr build_params); +#endif + /** * @brief Clusters vectors into partitions using k-means. * - * Uses the faiss::Clustering class to cluster vectors into n_clusters partitions. * * @param vectors The vectors to cluster. 
* @param ids The IDs of the vectors. @@ -25,10 +56,7 @@ class IndexPartition; */ shared_ptr kmeans(Tensor vectors, Tensor ids, - int n_clusters, - MetricType metric_type, - int niter = 5, - bool use_gpu = false, + shared_ptr build_params, Tensor initial_centroids = Tensor()); diff --git a/src/cpp/include/common.h b/src/cpp/include/common.h index 0938591c..b9ac90ed 100644 --- a/src/cpp/include/common.h +++ b/src/cpp/include/common.h @@ -38,14 +38,6 @@ #include #endif -#ifdef FAISS_ENABLE_GPU -#include -#include -#include -#include -#include -#endif - using torch::Tensor; using std::vector; using std::unordered_map; @@ -71,6 +63,8 @@ constexpr int DEFAULT_NLIST = 0; ///< Default number of cluste constexpr int DEFAULT_NITER = 5; ///< Default number of k-means iterations used during clustering. constexpr const char* DEFAULT_METRIC = "l2"; ///< Default distance metric (either "l2" for Euclidean or "ip" for inner product). constexpr int DEFAULT_NUM_WORKERS = 0; ///< Default number of workers (0 means single-threaded). +constexpr int DEFAULT_GPU_BATCH_SIZE = 100000; ///< Default batch size for GPU index building. +constexpr int DEFAULT_GPU_SAMPLE_SIZE = 1000000; ///< Default sample size for GPU index building. // Default constants for search parameters constexpr int DEFAULT_K = 1; ///< Default number of neighbors to return. 
@@ -132,11 +126,15 @@ struct IndexBuildParams { bool use_adaptive_nprobe = false; bool use_numa = false; - bool use_gpu = false; bool verify_numa = false; bool same_core = true; bool verbose = false; + // gpu index build params + bool use_gpu = false; + int gpu_batch_size = DEFAULT_GPU_BATCH_SIZE; + int gpu_sample_size = DEFAULT_GPU_SAMPLE_SIZE; + shared_ptr parent_params = nullptr; IndexBuildParams() = default; diff --git a/src/cpp/src/clustering.cpp b/src/cpp/src/clustering.cpp index ebbc88e0..903996ea 100644 --- a/src/cpp/src/clustering.cpp +++ b/src/cpp/src/clustering.cpp @@ -10,17 +10,189 @@ #include "index_partition.h" #include -shared_ptr kmeans(Tensor vectors, +#ifdef QUAKE_ENABLE_GPU +#include +#include // RAFT resources (handle) +#include // RAFT device view (make_device_matrix_view, etc.) +#include // cuVS k-means API + +shared_ptr kmeans_cuvs_sample_and_predict( + Tensor vectors, Tensor ids, + shared_ptr build_params) { + + int num_clusters = build_params->nlist; + int niter = build_params->niter; + int gpu_batch_size = build_params->gpu_batch_size; + int gpu_sample_size = build_params->gpu_sample_size; + MetricType metric = str_to_metric_type(build_params->metric); + + + TORCH_CHECK(vectors.dim() == 2, "vectors must be [N,D]"); + TORCH_CHECK(ids.dim() == 1, "ids must be [N]"); + int64_t N = vectors.size(0), D = vectors.size(1); + + gpu_sample_size = std::min(gpu_sample_size, (int) N); + gpu_batch_size = std::min(gpu_batch_size, (int) N); + + TORCH_CHECK(gpu_sample_size > 0 && gpu_sample_size <= N, + "invalid sample_size"); + + // 1) pin + normalize if needed + Tensor cpu_pts = vectors.contiguous().pin_memory(); + if (metric == faiss::METRIC_INNER_PRODUCT) { + auto norms = cpu_pts.norm(2, 1, true); + cpu_pts = cpu_pts.div(norms); + } + + // 2) choose a random sample of indices + auto perm = torch::randperm(N, torch::kLong); + auto samp_idx = perm.slice(0, 0, gpu_sample_size); + Tensor samp_pts = cpu_pts.index_select(0, samp_idx); + + // 3) move sample 
to GPU + Tensor samp_gpu = samp_pts.to(torch::kCUDA, /*non_blocking=*/true).contiguous(); + + // 4) prepare RAFT handle & cuVS params + raft::resources handle; + cudaStream_t stream = c10::cuda::getCurrentCUDAStream(); + raft::resource::set_cuda_stream(handle, stream); + + cuvs::cluster::kmeans::params params; + params.n_clusters = num_clusters; + params.init = cuvs::cluster::kmeans::params::InitMethod::Random; + params.max_iter = niter; + + // 5) allocate centroids on GPU + Tensor cent_gpu = torch::empty({num_clusters, D}, + torch::TensorOptions() + .dtype(torch::kFloat32) + .device(torch::kCUDA)) + .contiguous(); + + // 6) run fit on just the sample + { + // host scalars + float inertia = 0.0f; + int actual_iter= 0; + auto host_inertia = raft::make_host_scalar_view(&inertia); + auto host_iter = raft::make_host_scalar_view(&actual_iter); + + auto X_view = raft::make_device_matrix_view( + samp_gpu.data_ptr(), + (int) gpu_sample_size, (int)D); + auto C_view = raft::make_device_matrix_view( + cent_gpu.data_ptr(), + num_clusters, (int)D); + + cuvs::cluster::kmeans::fit( + handle, params, + X_view, + std::nullopt, + C_view, + host_inertia, + host_iter + ); + } + + Tensor all_labels = torch::empty({N}, torch::kLong); + + auto predict_fn = [&](Tensor batch_cpu, int64_t off) { + int64_t bs = batch_cpu.size(0); + + // allocate exactly bs labels on the GPU + Tensor labels32 = torch::empty( + {bs}, + torch::TensorOptions() + .dtype(torch::kInt32) + .device(torch::kCUDA)); + + Tensor batch_gpu = batch_cpu.to(torch::kCUDA, /*NB=*/true) + .contiguous(); + + auto Xv = raft::make_device_matrix_view( + batch_gpu.data_ptr(), + (int)bs, (int)D); + auto Lv = raft::make_device_vector_view( + labels32.data_ptr(), (int)bs); + + float pred_inertia = 0.0f; + auto host_pred = raft::make_host_scalar_view(&pred_inertia); + + cuvs::cluster::kmeans::predict( + handle, params, + Xv, + std::nullopt, + raft::make_device_matrix_view( + cent_gpu.data_ptr(), + num_clusters, (int)D), + Lv, + false, + 
host_pred + ); + + // now safe to copy back exactly bs elements + all_labels.narrow(0, off, bs) + .copy_( + labels32.to(torch::kLong) + .to(torch::kCPU) + ); + }; + + // predict the sample slice + predict_fn(samp_pts, /*off=*/0); + + // predict the rest + for (int64_t off = 0; off < N; off += gpu_batch_size) { + int64_t bs = std::min(gpu_batch_size, N - off); + if (off < gpu_sample_size) { + if (off + bs <= gpu_sample_size) { + continue; + } else { + int64_t overlap = gpu_sample_size - off; + Tensor rest_chunk = cpu_pts.slice(0, off + overlap, off + bs); + predict_fn(rest_chunk, /*off=*/off + overlap); + continue; + } + } + Tensor rest_chunk = cpu_pts.slice(0, off, off + bs); + predict_fn(rest_chunk, /*off=*/off); + } + // 8) group on CPU + Tensor sorted_lbl, sorted_idx; + std::tie(sorted_lbl, sorted_idx) = torch::sort(all_labels); + Tensor sorted_vecs = vectors.index_select(0, sorted_idx); + Tensor sorted_ids = ids.index_select(0, sorted_idx); + + Tensor counts = torch::bincount(sorted_lbl, /*weights=*/{}, num_clusters); + auto cnt_cpu = counts.to(torch::kCPU); + std::vector split_sizes( + cnt_cpu.data_ptr(), + cnt_cpu.data_ptr() + num_clusters + ); + + auto cluster_vecs = torch::split(sorted_vecs, split_sizes, 0); + auto cluster_ids = torch::split(sorted_ids, split_sizes, 0); + + auto out = std::make_shared(); + out->centroids = cent_gpu.cpu().contiguous(); + out->partition_ids = torch::arange(num_clusters, torch::kLong); + out->vectors = std::move(cluster_vecs); + out->vector_ids = std::move(cluster_ids); + + return out; +} +#endif + +shared_ptr kmeans_cpu(Tensor vectors, Tensor ids, - int n_clusters, - MetricType metric_type, - int niter, - bool use_gpu /*=false*/, + shared_ptr build_params, Tensor /* initial_centroids */) { // Ensure enough vectors are available and sizes match. 
- assert(vectors.size(0) >= n_clusters * 2); + assert(vectors.size(0) >= build_params->nlist * 2); assert(vectors.size(0) == ids.size(0)); + MetricType metric_type = str_to_metric_type(build_params->metric); + // Normalize vectors for inner product if (metric_type == faiss::METRIC_INNER_PRODUCT) vectors = vectors / vectors.norm(2, 1).unsqueeze(1); @@ -29,33 +201,19 @@ shared_ptr kmeans(Tensor vectors, int d = vectors.size(1); faiss::Index* index_ptr = nullptr; - - if (use_gpu) { - // Check if GPU resources are available. - #ifdef FAISS_ENABLE_GPU - faiss::gpu::StandardGpuResources gpu_res; - if (metric_type == faiss::METRIC_INNER_PRODUCT) - index_ptr = new faiss::gpu::GpuIndexFlatIP(&gpu_res, d); - else - index_ptr = new faiss::gpu::GpuIndexFlatL2(&gpu_res, d); - #else - throw std::runtime_error("GPU resources are not available. Please compile with FAISS_ENABLE_GPU."); - #endif - } else { - if (metric_type == faiss::METRIC_INNER_PRODUCT) - index_ptr = new faiss::IndexFlatIP(d); - else - index_ptr = new faiss::IndexFlatL2(d); - } + if (metric_type == faiss::METRIC_INNER_PRODUCT) + index_ptr = new faiss::IndexFlatIP(d); + else + index_ptr = new faiss::IndexFlatL2(d); faiss::ClusteringParameters cp; - cp.niter = niter; + cp.niter = build_params->niter; - faiss::Clustering clus(d, n_clusters, cp); + faiss::Clustering clus(d, build_params->nlist, cp); clus.train(n, vectors.data_ptr(), *index_ptr); // Retrieve centroids as a torch Tensor. - Tensor centroids = torch::from_blob(clus.centroids.data(), {n_clusters, d}, torch::kFloat32).clone(); + Tensor centroids = torch::from_blob(clus.centroids.data(), {build_params->nlist, d}, torch::kFloat32).clone(); if (metric_type == faiss::METRIC_INNER_PRODUCT) centroids = centroids / centroids.norm(2, 1).unsqueeze(1); @@ -72,7 +230,7 @@ shared_ptr kmeans(Tensor vectors, Tensor sorted_ids = ids.index_select(0, sorted_indices); // Compute counts per cluster using bincount. 
- Tensor counts_tensor = torch::bincount(sorted_assignments, /*weights=*/{}, n_clusters); + Tensor counts_tensor = torch::bincount(sorted_assignments, /*weights=*/{}, build_params->nlist); // Ensure counts are on CPU to extract split sizes. counts_tensor = counts_tensor.to(torch::kCPU); // Convert counts tensor to std::vector @@ -83,7 +241,7 @@ shared_ptr kmeans(Tensor vectors, vector cluster_vectors = torch::split(sorted_vectors, counts_vector, 0); vector cluster_ids = torch::split(sorted_ids, counts_vector, 0); - Tensor partition_ids = torch::arange(n_clusters, torch::kInt64); + Tensor partition_ids = torch::arange(build_params->nlist, torch::kInt64); shared_ptr clustering = std::make_shared(); clustering->centroids = centroids; @@ -96,6 +254,24 @@ shared_ptr kmeans(Tensor vectors, return clustering; } +shared_ptr kmeans(Tensor vectors, + Tensor ids, + shared_ptr build_params, + Tensor /* initial_centroids */) { + if (build_params->use_gpu) { + #ifdef QUAKE_ENABLE_GPU + return kmeans_cuvs_sample_and_predict( + vectors, + ids, + build_params); + #else + throw std::runtime_error("GPU support is not enabled. 
Please compile with QUAKE_ENABLE_GPU."); + #endif + } else { + return kmeans_cpu(vectors, ids, build_params); + } +} + tuple >> kmeans_refine_partitions( Tensor centroids, vector> partitions, diff --git a/src/cpp/src/partition_manager.cpp b/src/cpp/src/partition_manager.cpp index 3b738856..c838541b 100644 --- a/src/cpp/src/partition_manager.cpp +++ b/src/cpp/src/partition_manager.cpp @@ -409,14 +409,16 @@ shared_ptr PartitionManager::split_partitions(const Tensor &partitio shared_ptr clustering = select_partitions(partition_ids); + shared_ptr build_params = make_shared(); + build_params->nlist = num_splits; + build_params->metric = metric_type_to_str(parent_->metric_); for (int64_t i = 0; i < partition_ids.size(0); ++i) { // Ensure enough vectors to split assert(clustering->cluster_size(i) >= 4 && "Partition must have at least 8 vectors to split."); shared_ptr curr_split_clustering = kmeans( clustering->vectors[i], clustering->vector_ids[i], - num_splits, - parent_->metric_ + build_params ); for (size_t j = 0; j < curr_split_clustering->nlist(); ++j) { diff --git a/src/cpp/src/quake_index.cpp b/src/cpp/src/quake_index.cpp index 1f7cc853..9fb6f5ff 100644 --- a/src/cpp/src/quake_index.cpp +++ b/src/cpp/src/quake_index.cpp @@ -44,10 +44,7 @@ shared_ptr QuakeIndex::build(Tensor x, Tensor ids, shared_ptr clustering = kmeans( x, ids, - build_params_->nlist, - metric_, - build_params_->niter, - build_params_->use_gpu + build_params_ ); auto e1 = std::chrono::high_resolution_clock::now(); timing_info->train_time_us = std::chrono::duration_cast(e1 - s1).count(); diff --git a/src/cpp/third_party/cmake/fetch_rapids.cmake b/src/cpp/third_party/cmake/fetch_rapids.cmake new file mode 100644 index 00000000..0ec5fa04 --- /dev/null +++ b/src/cpp/third_party/cmake/fetch_rapids.cmake @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + +# Use this variable to update RAPIDS and cuVS versions +set(RAPIDS_VERSION "25.06") +set(rapids-cmake-version ${RAPIDS_VERSION}) + +if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUVS_RAPIDS.cmake) + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake + ${CMAKE_CURRENT_BINARY_DIR}/CUVS_RAPIDS.cmake) +endif() +include(${CMAKE_CURRENT_BINARY_DIR}/CUVS_RAPIDS.cmake) diff --git a/src/cpp/third_party/cmake/get_cuvs.cmake b/src/cpp/third_party/cmake/get_cuvs.cmake new file mode 100644 index 00000000..a2a4e884 --- /dev/null +++ b/src/cpp/third_party/cmake/get_cuvs.cmake @@ -0,0 +1,56 @@ +# ============================================================================= +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+ +# Use RAPIDS_VERSION from cmake/thirdparty/fetch_rapids.cmake +set(CUVS_VERSION "${RAPIDS_VERSION}") +set(CUVS_FORK "rapidsai") +set(CUVS_PINNED_TAG "branch-${RAPIDS_VERSION}") + +function(find_and_configure_cuvs) + set(oneValueArgs VERSION FORK PINNED_TAG ENABLE_NVTX BUILD_CUVS_C_LIBRARY) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + + set(CUVS_COMPONENTS "") + if(PKG_BUILD_CUVS_C_LIBRARY) + string(APPEND CUVS_COMPONENTS " c_api") + endif() + #----------------------------------------------------- + # Invoke CPM find_package() + #----------------------------------------------------- + rapids_cpm_find(cuvs ${PKG_VERSION} + GLOBAL_TARGETS cuvs::cuvs + BUILD_EXPORT_SET cuvs-examples-exports + INSTALL_EXPORT_SET cuvs-examples-exports + COMPONENTS ${CUVS_COMPONENTS} + CPM_ARGS + GIT_REPOSITORY https://github.com/${PKG_FORK}/cuvs.git + GIT_TAG ${PKG_PINNED_TAG} + SOURCE_SUBDIR cpp + OPTIONS + "BUILD_C_LIBRARY ${PKG_BUILD_CUVS_C_LIBRARY}" + "BUILD_TESTS OFF" + "CUVS_NVTX ${PKG_ENABLE_NVTX}" + ) +endfunction() + +# Change pinned tag here to test a commit in CI +# To use a different CUVS locally, set the CMake variable +# CPM_cuvs_SOURCE=/path/to/local/cuvs +find_and_configure_cuvs(VERSION ${CUVS_VERSION}.00 + FORK ${CUVS_FORK} + PINNED_TAG ${CUVS_PINNED_TAG} + ENABLE_NVTX OFF + BUILD_CUVS_C_LIBRARY ${BUILD_CUVS_C_LIBRARY} +) diff --git a/test/cpp/clustering.cpp b/test/cpp/clustering.cpp new file mode 100644 index 00000000..bd373ba0 --- /dev/null +++ b/test/cpp/clustering.cpp @@ -0,0 +1,141 @@ +#include +#include +#include "clustering.h" + +// Helpers to generate random data and sequential ids +static torch::Tensor generate_random_data(int64_t N, int64_t D) { + return torch::randn({N, D}, torch::kFloat32).contiguous(); +} +static torch::Tensor generate_sequential_ids(int64_t N, int64_t start = 0) { + return torch::arange(start, start + N, torch::kInt64).contiguous(); +} + +// Compute mean squared error for clustering (for 
CPU sanity) +static double compute_mse(const torch::Tensor& centroids, + const std::vector& clusters) { + double total_err = 0.0; + int64_t count = 0; + auto C = centroids.to(torch::kCPU); + for (size_t i = 0; i < clusters.size(); ++i) { + auto cl = clusters[i].to(torch::kCPU); + if (cl.size(0) == 0) continue; + auto diff = cl - C[i].unsqueeze(0); + total_err += diff.pow(2).sum().item(); + count += cl.size(0); + } + return count>0 ? total_err / count : 0.0; +} + +// Fixture +class ClusteringTest : public ::testing::Test { + protected: + const int64_t num_vectors = 5000; + const int64_t dim = 64; + const int num_clusters= 20; + + torch::Tensor vectors_cpu, ids_cpu; +#ifdef QUAKE_ENABLE_GPU + torch::Tensor vectors_cuda, ids_cuda; +#endif + + void SetUp() override { + vectors_cpu = generate_random_data(num_vectors, dim); + ids_cpu = generate_sequential_ids(num_vectors); + +#ifdef QUAKE_ENABLE_GPU + if (!torch::cuda::is_available()) { + GTEST_SKIP() << "CUDA not available"; + } + vectors_cuda = vectors_cpu.to(torch::kCUDA).contiguous(); + ids_cuda = ids_cpu.to(torch::kCUDA).contiguous(); +#endif + } +}; + +// Test existing CPU kmeans +TEST_F(ClusteringTest, KMeansCPU_L2) { + shared_ptr build_params = std::make_shared(); + build_params->nlist = num_clusters; + build_params->metric = "l2"; + build_params->niter = 10; + auto cl = kmeans_cpu(vectors_cpu, ids_cpu, build_params, torch::Tensor()); + ASSERT_EQ(cl->centroids.sizes(), (std::vector{num_clusters, dim})); + int64_t tot=0; + for (int i=0;ivectors[i].size(0), cl->vector_ids[i].size(0)); + tot += cl->vectors[i].size(0); + } + ASSERT_EQ(tot, num_vectors); +} + +// Compare CPU vs CPU wrapper +TEST_F(ClusteringTest, KMeansWrapper_CPU) { + shared_ptr build_params = std::make_shared(); + build_params->nlist = num_clusters; + build_params->metric = "l2"; + build_params->niter = 10; + build_params->use_gpu = false; + auto cl = kmeans(vectors_cpu, ids_cpu, build_params, torch::Tensor()); + ASSERT_EQ(cl->centroids.sizes(), 
(std::vector{num_clusters, dim})); + int64_t tot=0; + for (int i=0;ivectors[i].size(0); + } + ASSERT_EQ(tot, num_vectors); +} + +#ifdef QUAKE_ENABLE_GPU +TEST_F(ClusteringTest, SampleAndPredict_GPU_L2) { + shared_ptr build_params = std::make_shared(); + build_params->nlist = num_clusters; + build_params->metric = "l2"; + build_params->niter = 10; + build_params->use_gpu = true; + build_params->gpu_sample_size = 2000; + build_params->gpu_batch_size = 100; + + auto cl = kmeans_cuvs_sample_and_predict( + vectors_cpu, ids_cpu, build_params); + + // centroids must live on CPU and have correct shape + ASSERT_EQ(cl->centroids.device().type(), torch::kCPU); + ASSERT_EQ(cl->centroids.sizes(), (std::vector{num_clusters, dim})); + + // all vectors accounted for + int64_t tot=0; + for (int i=0;ivectors[i]; + ASSERT_EQ(part.device().type(), torch::kCPU); + ASSERT_EQ(part.size(0), cl->vector_ids[i].size(0)); + tot += part.size(0); + } + ASSERT_EQ(tot, num_vectors); + + build_params->use_gpu = false; + + // Optional quality check: rough MSE vs CPU run + auto cl_cpu = kmeans_cpu(vectors_cpu, ids_cpu, build_params, torch::Tensor()); + double mse_cpu = compute_mse(cl_cpu->centroids, cl_cpu->vectors); + double mse_gpu = compute_mse(cl->centroids, cl->vectors); + ASSERT_NEAR(mse_cpu, mse_gpu, mse_cpu * 0.30); +} + +// Full wrapper test for GPU +TEST_F(ClusteringTest, KMeansWrapper_GPU) { + shared_ptr build_params = std::make_shared(); + build_params->nlist = num_clusters; + build_params->metric = "l2"; + build_params->niter = 10; + build_params->use_gpu = true; + build_params->gpu_sample_size = 2000; + build_params->gpu_batch_size = 100; + + auto cl = kmeans(vectors_cpu, ids_cpu, build_params, + torch::Tensor()); + ASSERT_EQ(cl->centroids.device().type(), torch::kCPU); + ASSERT_EQ(cl->vectors.size(), size_t(num_clusters)); + int64_t tot=0; + for (auto &p : cl->vectors) tot += p.size(0); + ASSERT_EQ(tot, num_vectors); +} +#endif // QUAKE_ENABLE_GPU diff --git a/test/cpp/quake_index.cpp 
b/test/cpp/quake_index.cpp index 0521ea02..b8f0199c 100644 --- a/test/cpp/quake_index.cpp +++ b/test/cpp/quake_index.cpp @@ -283,17 +283,19 @@ TEST(QuakeIndexStressTest, LargeBuildTest) { << " vectors took " << build_duration_ms << " ms.\n"; } -#ifdef FAISS_ENABLE_GPU +#ifdef QUAKE_ENABLE_GPU TEST(QuakeIndexStressTestGPU, LargeBuildTest) { // Attempt to build an index with a large number of vectors. // Adjust these numbers based on your available memory/compute. int64_t dimension = 128; // Medium-high dimension int64_t num_vectors = 1e6; // 1 million vectors - auto data_vectors = generate_random_data(num_vectors, dimension); - auto data_ids = generate_sequential_ids(num_vectors, 0); + auto data_vectors = generate_random_data(num_vectors, dimension).contiguous(); + auto data_ids = generate_sequential_ids(num_vectors, 0).contiguous(); QuakeIndex index; + std::cout << "generated\n"; + auto build_params = std::make_shared(); build_params->nlist = 512; build_params->metric = "l2"; @@ -527,12 +529,12 @@ TEST(QuakeIndexStressTest, SearchAddRemoveMaintenanceTest) { } // Define the GPU related test only if FAISS GPU support is enabled -#ifdef FAISS_ENABLE_GPU +#ifdef QUAKE_ENABLE_GPU // Test build with GPU enabled TEST(QuakeIndexGPUTest, BuildWithGPUTest) { int64_t dimension = 32; - int64_t num_vectors = 200; - int64_t nlist = 5; + int64_t num_vectors = 10000; + int64_t nlist = 10; torch::Tensor data_vectors = generate_random_data(num_vectors, dimension); torch::Tensor data_ids = generate_sequential_ids(num_vectors, 0); @@ -554,4 +556,4 @@ TEST(QuakeIndexGPUTest, BuildWithGPUTest) { EXPECT_EQ(timing_info->n_vectors, data_vectors.size(0)); EXPECT_EQ(timing_info->d, data_vectors.size(1)); } -#endif \ No newline at end of file +#endif