diff --git a/CMakeLists.txt b/CMakeLists.txt index a4d2c003..fa71a403 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,18 +12,6 @@ set(Torch_USE_CUDA OFF CACHE BOOL "Force disable CUDA in Torch") set(Torch_NO_CUDA ON CACHE BOOL "Force disable CUDA in Torch") set(USE_CUDA OFF CACHE BOOL "Force disable CUDA globally") -# QUAKE_ENABLE_GPU: Enable GPU support for Faiss -# Default: OFF -if(QUAKE_ENABLE_GPU) - set(FAISS_ENABLE_GPU ON) -else() - set(FAISS_ENABLE_GPU OFF) -endif() - -if(QUAKE_ENABLE_GPU) - add_compile_definitions(FAISS_ENABLE_GPU) -endif() - if(QUAKE_USE_NUMA) add_compile_definitions(QUAKE_USE_NUMA) endif() @@ -64,6 +52,25 @@ set(project_BINDINGS_DIR ${CPP_SOURCE}/bindings) set(project_THIRD_PARTY_DIR ${CPP_SOURCE}/third_party) set(project_TEST_DIR test/cpp) +if(QUAKE_ENABLE_GPU) + find_package(CuVS REQUIRED) + include(${project_THIRD_PARTY_DIR}/cmake/fetch_rapids.cmake) + include(rapids-cmake) + include(rapids-cpm) + include(rapids-cuda) + include(rapids-export) + include(rapids-find) + + rapids_cuda_init_architectures(quake_c) + + rapids_cpm_init() + set(BUILD_CUVS_C_LIBRARY OFF) + include(${project_THIRD_PARTY_DIR}/cmake/get_cuvs.cmake) + + add_compile_definitions(QUAKE_ENABLE_GPU) +endif() + +set(FAISS_ENABLE_GPU OFF) # --------------------------------------------------------------- # Print out Compiler and Path Information # --------------------------------------------------------------- @@ -81,7 +88,6 @@ message(STATUS "QUAKE_USE_NUMA: ${QUAKE_USE_NUMA}") # Apple-specific adjustments if(APPLE) include_directories("/opt/homebrew/opt/openblas/include") - set(FAISS_ENABLE_GPU OFF) endif() # Compiler options and definitions @@ -149,6 +155,10 @@ elseif(UNIX) if(QUAKE_USE_NUMA) list(APPEND LINK_LIBS -lnuma) endif() + + if (QUAKE_ENABLE_GPU) + list(APPEND LINK_LIBS cuvs::cuvs) + endif() else() # unsupported platform message(FATAL_ERROR "Unsupported platform") @@ -211,7 +221,7 @@ endif() # 
--------------------------------------------------------------- message(STATUS "--------- Final Configuration Summary ---------") message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") -message(STATUS "GPU Enabled: ${FAISS_ENABLE_GPU}") +message(STATUS "GPU Enabled: ${QUAKE_ENABLE_GPU}") message(STATUS "NUMA Enabled: ${QUAKE_USE_NUMA}") message(STATUS "Python used: ${Python3_EXECUTABLE}") message(STATUS "Torch Path: ${TorchPath}") @@ -221,4 +231,4 @@ message(STATUS "MKL_LINK: ${MKL_LINK}") message(STATUS "MKL_INTERFACE_FULL: ${MKL_INTERFACE_FULL}") message(STATUS "MKL_THREADING: ${MKL_THREADING}") message(STATUS "MKL_MPI: ${MKL_MPI}") -message(STATUS "------------------------------------------------") \ No newline at end of file +message(STATUS "------------------------------------------------") diff --git a/environments/ubuntu-cuda/Dockerfile b/environments/ubuntu-cuda118/Dockerfile similarity index 100% rename from environments/ubuntu-cuda/Dockerfile rename to environments/ubuntu-cuda118/Dockerfile diff --git a/environments/ubuntu-cuda/conda.yaml b/environments/ubuntu-cuda118/conda.yaml similarity index 82% rename from environments/ubuntu-cuda/conda.yaml rename to environments/ubuntu-cuda118/conda.yaml index 715744f2..99265b48 100644 --- a/environments/ubuntu-cuda/conda.yaml +++ b/environments/ubuntu-cuda118/conda.yaml @@ -3,6 +3,8 @@ channels: - pytorch - defaults - conda-forge + - nvidia + - libcuvs dependencies: - python=3.11 - numpy @@ -10,6 +12,8 @@ dependencies: - faiss-gpu - matplotlib - pytest + - libcuvs + - cuda-version=12.8 - pip - pip: - sphinx diff --git a/environments/ubuntu-cuda124/Dockerfile b/environments/ubuntu-cuda124/Dockerfile new file mode 100644 index 00000000..0462c5c9 --- /dev/null +++ b/environments/ubuntu-cuda124/Dockerfile @@ -0,0 +1,64 @@ +# Use a CUDA-enabled Ubuntu base image +FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04 + +# ----------------------------- +# Set up environment variables +# ----------------------------- +ENV 
CONDA_DIR=/opt/miniconda +ENV PATH="${CONDA_DIR}/bin:${PATH}" +ENV DEBIAN_FRONTEND=noninteractive + +# ----------------------------- +# Install system dependencies +# ----------------------------- +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + wget \ + curl \ + build-essential \ + ca-certificates \ + swig \ + git \ + libomp5 \ + libomp-dev \ + graphviz \ + libnuma-dev \ + && rm -rf /var/lib/apt/lists/* + + +# Install CMake 3.30.4 +RUN wget -qO /tmp/cmake.sh https://github.com/Kitware/CMake/releases/download/v3.30.4/cmake-3.30.4-linux-x86_64.sh && \ + chmod +x /tmp/cmake.sh && \ + /tmp/cmake.sh --skip-license --prefix=/usr/local && \ + rm /tmp/cmake.sh + + +# ----------------------------- +# Install Miniconda +# ----------------------------- +RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh && \ + bash /tmp/miniconda.sh -b -p $CONDA_DIR && \ + rm /tmp/miniconda.sh + +# ----------------------------- +# Copy in your conda environment YAML +# ----------------------------- +COPY environments/ubuntu-cuda124/conda.yaml /tmp/conda.yaml + +# Create quake-env +RUN conda env create -f /tmp/conda.yaml && conda clean -afy + +# ----------------------------- +# Install GPU-enabled PyTorch +# ----------------------------- +RUN conda run -n quake-env pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cu124 + +# ----------------------------- +# Debug: show conda information +# ----------------------------- +RUN echo "===== DEBUG: which conda =====" && which conda +RUN echo "===== DEBUG: conda info =====" && conda info +RUN echo "===== DEBUG: conda env list =====" && conda env list +RUN echo "===== DEBUG: quake-env check =====" && conda run -n quake-env python -c "import sys; print('OK in quake-env; python:', sys.executable)" + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/environments/ubuntu-cuda124/conda.yaml b/environments/ubuntu-cuda124/conda.yaml new file mode
100644 index 00000000..f895341b --- /dev/null +++ b/environments/ubuntu-cuda124/conda.yaml @@ -0,0 +1,24 @@ +name: quake-env +channels: + - rapidsai + - pytorch + - defaults + - conda-forge + - nvidia +dependencies: + - python=3.11 + - numpy + - pandas + - faiss-gpu + - matplotlib + - pytest + - libcuvs + - cuda-nvrtc-dev=12.4 + - cuda-version=12.4 + - pip + - pip: + - sphinx + - sphinx_rtd_theme + - sphinxcontrib-mermaid + - graphviz + - pyyaml \ No newline at end of file diff --git a/src/cpp/include/clustering.h b/src/cpp/include/clustering.h index e4b00229..d83d563b 100644 --- a/src/cpp/include/clustering.h +++ b/src/cpp/include/clustering.h @@ -11,10 +11,41 @@ class IndexPartition; +/** + * @brief Clusters vectors into partitions using faiss::Clustering + * + * + * @param vectors The vectors to cluster. + * @param ids The IDs of the vectors. + * @param n_clusters The number of clusters to create. + * @param metric_type The metric type to use for clustering. + * @param niter The number of iterations to run k-means. + * @param initial_centroids The initial centroids to use for k-means. + */ +shared_ptr kmeans_cpu(Tensor vectors, + Tensor ids, + shared_ptr build_params, + Tensor initial_centroids = Tensor()); + +/** + * @brief Clusters vectors into partitions using CuVS k-means. + * + * + * @param vectors The vectors to cluster. + * @param ids The IDs of the vectors. + * @param n_clusters The number of clusters to create. + * @param metric_type The metric type to use for clustering. + * @param niter The number of iterations to run k-means. + * @param initial_centroids The initial centroids to use for k-means. + */ +#ifdef QUAKE_ENABLE_GPU +shared_ptr kmeans_cuvs_sample_and_predict( + Tensor vectors, Tensor ids, shared_ptr build_params); +#endif + /** * @brief Clusters vectors into partitions using k-means. * - * Uses the faiss::Clustering class to cluster vectors into n_clusters partitions. * * @param vectors The vectors to cluster. 
* @param ids The IDs of the vectors. @@ -25,10 +56,7 @@ class IndexPartition; */ shared_ptr kmeans(Tensor vectors, Tensor ids, - int n_clusters, - MetricType metric_type, - int niter = 5, - bool use_gpu = false, + shared_ptr build_params, Tensor initial_centroids = Tensor()); diff --git a/src/cpp/include/common.h b/src/cpp/include/common.h index 0938591c..b9ac90ed 100644 --- a/src/cpp/include/common.h +++ b/src/cpp/include/common.h @@ -38,14 +38,6 @@ #include #endif -#ifdef FAISS_ENABLE_GPU -#include -#include -#include -#include -#include -#endif - using torch::Tensor; using std::vector; using std::unordered_map; @@ -71,6 +63,8 @@ constexpr int DEFAULT_NLIST = 0; ///< Default number of cluste constexpr int DEFAULT_NITER = 5; ///< Default number of k-means iterations used during clustering. constexpr const char* DEFAULT_METRIC = "l2"; ///< Default distance metric (either "l2" for Euclidean or "ip" for inner product). constexpr int DEFAULT_NUM_WORKERS = 0; ///< Default number of workers (0 means single-threaded). +constexpr int DEFAULT_GPU_BATCH_SIZE = 100000; ///< Default batch size for GPU index building. +constexpr int DEFAULT_GPU_SAMPLE_SIZE = 1000000; ///< Default sample size for GPU index building. // Default constants for search parameters constexpr int DEFAULT_K = 1; ///< Default number of neighbors to return. 
@@ -132,11 +126,15 @@ struct IndexBuildParams { bool use_adaptive_nprobe = false; bool use_numa = false; - bool use_gpu = false; bool verify_numa = false; bool same_core = true; bool verbose = false; + // gpu index build params + bool use_gpu = false; + int gpu_batch_size = DEFAULT_GPU_BATCH_SIZE; + int gpu_sample_size = DEFAULT_GPU_SAMPLE_SIZE; + shared_ptr parent_params = nullptr; IndexBuildParams() = default; diff --git a/src/cpp/src/clustering.cpp b/src/cpp/src/clustering.cpp index ebbc88e0..903996ea 100644 --- a/src/cpp/src/clustering.cpp +++ b/src/cpp/src/clustering.cpp @@ -10,17 +10,189 @@ #include "index_partition.h" #include -shared_ptr kmeans(Tensor vectors, +#ifdef QUAKE_ENABLE_GPU +#include +#include // RAFT resources (handle) +#include // RAFT device view (make_device_matrix_view, etc.) +#include // cuVS k-means API + +shared_ptr kmeans_cuvs_sample_and_predict( + Tensor vectors, Tensor ids, + shared_ptr build_params) { + + int num_clusters = build_params->nlist; + int niter = build_params->niter; + int gpu_batch_size = build_params->gpu_batch_size; + int gpu_sample_size = build_params->gpu_sample_size; + MetricType metric = str_to_metric_type(build_params->metric); + + + TORCH_CHECK(vectors.dim() == 2, "vectors must be [N,D]"); + TORCH_CHECK(ids.dim() == 1, "ids must be [N]"); + int64_t N = vectors.size(0), D = vectors.size(1); + + gpu_sample_size = std::min(gpu_sample_size, (int) N); + gpu_batch_size = std::min(gpu_batch_size, (int) N); + + TORCH_CHECK(gpu_sample_size > 0 && gpu_sample_size <= N, + "invalid sample_size"); + + // 1) pin + normalize if needed + Tensor cpu_pts = vectors.contiguous().pin_memory(); + if (metric == faiss::METRIC_INNER_PRODUCT) { + auto norms = cpu_pts.norm(2, 1, true); + cpu_pts = cpu_pts.div(norms); + } + + // 2) choose a random sample of indices + auto perm = torch::randperm(N, torch::kLong); + auto samp_idx = perm.slice(0, 0, gpu_sample_size); + Tensor samp_pts = cpu_pts.index_select(0, samp_idx); + + // 3) move sample 
to GPU + Tensor samp_gpu = samp_pts.to(torch::kCUDA, /*non_blocking=*/true).contiguous(); + + // 4) prepare RAFT handle & cuVS params + raft::resources handle; + cudaStream_t stream = c10::cuda::getCurrentCUDAStream(); + raft::resource::set_cuda_stream(handle, stream); + + cuvs::cluster::kmeans::params params; + params.n_clusters = num_clusters; + params.init = cuvs::cluster::kmeans::params::InitMethod::Random; + params.max_iter = niter; + + // 5) allocate centroids on GPU + Tensor cent_gpu = torch::empty({num_clusters, D}, + torch::TensorOptions() + .dtype(torch::kFloat32) + .device(torch::kCUDA)) + .contiguous(); + + // 6) run fit on just the sample + { + // host scalars + float inertia = 0.0f; + int actual_iter= 0; + auto host_inertia = raft::make_host_scalar_view(&inertia); + auto host_iter = raft::make_host_scalar_view(&actual_iter); + + auto X_view = raft::make_device_matrix_view( + samp_gpu.data_ptr(), + (int) gpu_sample_size, (int)D); + auto C_view = raft::make_device_matrix_view( + cent_gpu.data_ptr(), + num_clusters, (int)D); + + cuvs::cluster::kmeans::fit( + handle, params, + X_view, + std::nullopt, + C_view, + host_inertia, + host_iter + ); + } + + Tensor all_labels = torch::empty({N}, torch::kLong); + + auto predict_fn = [&](Tensor batch_cpu, int64_t off) { + int64_t bs = batch_cpu.size(0); + + // allocate exactly bs labels on the GPU + Tensor labels32 = torch::empty( + {bs}, + torch::TensorOptions() + .dtype(torch::kInt32) + .device(torch::kCUDA)); + + Tensor batch_gpu = batch_cpu.to(torch::kCUDA, /*NB=*/true) + .contiguous(); + + auto Xv = raft::make_device_matrix_view( + batch_gpu.data_ptr(), + (int)bs, (int)D); + auto Lv = raft::make_device_vector_view( + labels32.data_ptr(), (int)bs); + + float pred_inertia = 0.0f; + auto host_pred = raft::make_host_scalar_view(&pred_inertia); + + cuvs::cluster::kmeans::predict( + handle, params, + Xv, + std::nullopt, + raft::make_device_matrix_view( + cent_gpu.data_ptr(), + num_clusters, (int)D), + Lv, + false, + 
host_pred + ); + + // now safe to copy back exactly bs elements + all_labels.narrow(0, off, bs) + .copy_( + labels32.to(torch::kLong) + .to(torch::kCPU) + ); + }; + + // predict the sample slice + predict_fn(samp_pts, /*off=*/0); + + // predict the rest + for (int64_t off = 0; off < N; off += gpu_batch_size) { + int64_t bs = std::min(gpu_batch_size, N - off); + if (off < gpu_sample_size) { + if (off + bs <= gpu_sample_size) { + continue; + } else { + int64_t overlap = gpu_sample_size - off; + Tensor rest_chunk = cpu_pts.slice(0, off + overlap, off + bs); + predict_fn(rest_chunk, /*off=*/off + overlap); + continue; + } + } + Tensor rest_chunk = cpu_pts.slice(0, off, off + bs); + predict_fn(rest_chunk, /*off=*/off); + } + // 8) group on CPU + Tensor sorted_lbl, sorted_idx; + std::tie(sorted_lbl, sorted_idx) = torch::sort(all_labels); + Tensor sorted_vecs = vectors.index_select(0, sorted_idx); + Tensor sorted_ids = ids.index_select(0, sorted_idx); + + Tensor counts = torch::bincount(sorted_lbl, /*weights=*/{}, num_clusters); + auto cnt_cpu = counts.to(torch::kCPU); + std::vector split_sizes( + cnt_cpu.data_ptr(), + cnt_cpu.data_ptr() + num_clusters + ); + + auto cluster_vecs = torch::split(sorted_vecs, split_sizes, 0); + auto cluster_ids = torch::split(sorted_ids, split_sizes, 0); + + auto out = std::make_shared(); + out->centroids = cent_gpu.cpu().contiguous(); + out->partition_ids = torch::arange(num_clusters, torch::kLong); + out->vectors = std::move(cluster_vecs); + out->vector_ids = std::move(cluster_ids); + + return out; +} +#endif + +shared_ptr kmeans_cpu(Tensor vectors, Tensor ids, - int n_clusters, - MetricType metric_type, - int niter, - bool use_gpu /*=false*/, + shared_ptr build_params, Tensor /* initial_centroids */) { // Ensure enough vectors are available and sizes match. 
- assert(vectors.size(0) >= n_clusters * 2); + assert(vectors.size(0) >= build_params->nlist * 2); assert(vectors.size(0) == ids.size(0)); + MetricType metric_type = str_to_metric_type(build_params->metric); + // Normalize vectors for inner product if (metric_type == faiss::METRIC_INNER_PRODUCT) vectors = vectors / vectors.norm(2, 1).unsqueeze(1); @@ -29,33 +201,19 @@ shared_ptr kmeans(Tensor vectors, int d = vectors.size(1); faiss::Index* index_ptr = nullptr; - - if (use_gpu) { - // Check if GPU resources are available. - #ifdef FAISS_ENABLE_GPU - faiss::gpu::StandardGpuResources gpu_res; - if (metric_type == faiss::METRIC_INNER_PRODUCT) - index_ptr = new faiss::gpu::GpuIndexFlatIP(&gpu_res, d); - else - index_ptr = new faiss::gpu::GpuIndexFlatL2(&gpu_res, d); - #else - throw std::runtime_error("GPU resources are not available. Please compile with FAISS_ENABLE_GPU."); - #endif - } else { - if (metric_type == faiss::METRIC_INNER_PRODUCT) - index_ptr = new faiss::IndexFlatIP(d); - else - index_ptr = new faiss::IndexFlatL2(d); - } + if (metric_type == faiss::METRIC_INNER_PRODUCT) + index_ptr = new faiss::IndexFlatIP(d); + else + index_ptr = new faiss::IndexFlatL2(d); faiss::ClusteringParameters cp; - cp.niter = niter; + cp.niter = build_params->niter; - faiss::Clustering clus(d, n_clusters, cp); + faiss::Clustering clus(d, build_params->nlist, cp); clus.train(n, vectors.data_ptr(), *index_ptr); // Retrieve centroids as a torch Tensor. - Tensor centroids = torch::from_blob(clus.centroids.data(), {n_clusters, d}, torch::kFloat32).clone(); + Tensor centroids = torch::from_blob(clus.centroids.data(), {build_params->nlist, d}, torch::kFloat32).clone(); if (metric_type == faiss::METRIC_INNER_PRODUCT) centroids = centroids / centroids.norm(2, 1).unsqueeze(1); @@ -72,7 +230,7 @@ shared_ptr kmeans(Tensor vectors, Tensor sorted_ids = ids.index_select(0, sorted_indices); // Compute counts per cluster using bincount. 
- Tensor counts_tensor = torch::bincount(sorted_assignments, /*weights=*/{}, n_clusters); + Tensor counts_tensor = torch::bincount(sorted_assignments, /*weights=*/{}, build_params->nlist); // Ensure counts are on CPU to extract split sizes. counts_tensor = counts_tensor.to(torch::kCPU); // Convert counts tensor to std::vector @@ -83,7 +241,7 @@ shared_ptr kmeans(Tensor vectors, vector cluster_vectors = torch::split(sorted_vectors, counts_vector, 0); vector cluster_ids = torch::split(sorted_ids, counts_vector, 0); - Tensor partition_ids = torch::arange(n_clusters, torch::kInt64); + Tensor partition_ids = torch::arange(build_params->nlist, torch::kInt64); shared_ptr clustering = std::make_shared(); clustering->centroids = centroids; @@ -96,6 +254,24 @@ shared_ptr kmeans(Tensor vectors, return clustering; } +shared_ptr kmeans(Tensor vectors, + Tensor ids, + shared_ptr build_params, + Tensor /* initial_centroids */) { + if (build_params->use_gpu) { + #ifdef QUAKE_ENABLE_GPU + return kmeans_cuvs_sample_and_predict( + vectors, + ids, + build_params); + #else + throw std::runtime_error("GPU support is not enabled. 
Please compile with QUAKE_ENABLE_GPU."); + #endif + } else { + return kmeans_cpu(vectors, ids, build_params); + } +} + tuple >> kmeans_refine_partitions( Tensor centroids, vector> partitions, diff --git a/src/cpp/src/partition_manager.cpp b/src/cpp/src/partition_manager.cpp index 3b738856..c838541b 100644 --- a/src/cpp/src/partition_manager.cpp +++ b/src/cpp/src/partition_manager.cpp @@ -409,14 +409,16 @@ shared_ptr PartitionManager::split_partitions(const Tensor &partitio shared_ptr clustering = select_partitions(partition_ids); + shared_ptr build_params = make_shared(); + build_params->nlist = num_splits; + build_params->metric = metric_type_to_str(parent_->metric_); for (int64_t i = 0; i < partition_ids.size(0); ++i) { // Ensure enough vectors to split assert(clustering->cluster_size(i) >= 4 && "Partition must have at least 8 vectors to split."); shared_ptr curr_split_clustering = kmeans( clustering->vectors[i], clustering->vector_ids[i], - num_splits, - parent_->metric_ + build_params ); for (size_t j = 0; j < curr_split_clustering->nlist(); ++j) { diff --git a/src/cpp/src/quake_index.cpp b/src/cpp/src/quake_index.cpp index 1f7cc853..9fb6f5ff 100644 --- a/src/cpp/src/quake_index.cpp +++ b/src/cpp/src/quake_index.cpp @@ -44,10 +44,7 @@ shared_ptr QuakeIndex::build(Tensor x, Tensor ids, shared_ptr clustering = kmeans( x, ids, - build_params_->nlist, - metric_, - build_params_->niter, - build_params_->use_gpu + build_params_ ); auto e1 = std::chrono::high_resolution_clock::now(); timing_info->train_time_us = std::chrono::duration_cast(e1 - s1).count(); diff --git a/src/cpp/third_party/cmake/fetch_rapids.cmake b/src/cpp/third_party/cmake/fetch_rapids.cmake new file mode 100644 index 00000000..0ec5fa04 --- /dev/null +++ b/src/cpp/third_party/cmake/fetch_rapids.cmake @@ -0,0 +1,22 @@ +# ============================================================================= +# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. + +# Use this variable to update RAPIDS and cuVS versions +set(RAPIDS_VERSION "25.06") +set(rapids-cmake-version ${RAPIDS_VERSION}) + +if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUVS_RAPIDS.cmake) + file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake + ${CMAKE_CURRENT_BINARY_DIR}/CUVS_RAPIDS.cmake) +endif() +include(${CMAKE_CURRENT_BINARY_DIR}/CUVS_RAPIDS.cmake) diff --git a/src/cpp/third_party/cmake/get_cuvs.cmake b/src/cpp/third_party/cmake/get_cuvs.cmake new file mode 100644 index 00000000..a2a4e884 --- /dev/null +++ b/src/cpp/third_party/cmake/get_cuvs.cmake @@ -0,0 +1,56 @@ +# ============================================================================= +# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+ +# Use RAPIDS_VERSION from cmake/thirdparty/fetch_rapids.cmake +set(CUVS_VERSION "${RAPIDS_VERSION}") +set(CUVS_FORK "rapidsai") +set(CUVS_PINNED_TAG "branch-${RAPIDS_VERSION}") + +function(find_and_configure_cuvs) + set(oneValueArgs VERSION FORK PINNED_TAG ENABLE_NVTX BUILD_CUVS_C_LIBRARY) + cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN} ) + + + set(CUVS_COMPONENTS "") + if(PKG_BUILD_CUVS_C_LIBRARY) + string(APPEND CUVS_COMPONENTS " c_api") + endif() + #----------------------------------------------------- + # Invoke CPM find_package() + #----------------------------------------------------- + rapids_cpm_find(cuvs ${PKG_VERSION} + GLOBAL_TARGETS cuvs::cuvs + BUILD_EXPORT_SET cuvs-examples-exports + INSTALL_EXPORT_SET cuvs-examples-exports + COMPONENTS ${CUVS_COMPONENTS} + CPM_ARGS + GIT_REPOSITORY https://github.com/${PKG_FORK}/cuvs.git + GIT_TAG ${PKG_PINNED_TAG} + SOURCE_SUBDIR cpp + OPTIONS + "BUILD_C_LIBRARY ${PKG_BUILD_CUVS_C_LIBRARY}" + "BUILD_TESTS OFF" + "CUVS_NVTX ${PKG_ENABLE_NVTX}" + ) +endfunction() + +# Change pinned tag here to test a commit in CI +# To use a different CUVS locally, set the CMake variable +# CPM_cuvs_SOURCE=/path/to/local/cuvs +find_and_configure_cuvs(VERSION ${CUVS_VERSION}.00 + FORK ${CUVS_FORK} + PINNED_TAG ${CUVS_PINNED_TAG} + ENABLE_NVTX OFF + BUILD_CUVS_C_LIBRARY ${BUILD_CUVS_C_LIBRARY} +) diff --git a/test/cpp/clustering.cpp b/test/cpp/clustering.cpp new file mode 100644 index 00000000..bd373ba0 --- /dev/null +++ b/test/cpp/clustering.cpp @@ -0,0 +1,141 @@ +#include +#include +#include "clustering.h" + +// Helpers to generate random data and sequential ids +static torch::Tensor generate_random_data(int64_t N, int64_t D) { + return torch::randn({N, D}, torch::kFloat32).contiguous(); +} +static torch::Tensor generate_sequential_ids(int64_t N, int64_t start = 0) { + return torch::arange(start, start + N, torch::kInt64).contiguous(); +} + +// Compute mean squared error for clustering (for 
CPU sanity) +static double compute_mse(const torch::Tensor& centroids, + const std::vector& clusters) { + double total_err = 0.0; + int64_t count = 0; + auto C = centroids.to(torch::kCPU); + for (size_t i = 0; i < clusters.size(); ++i) { + auto cl = clusters[i].to(torch::kCPU); + if (cl.size(0) == 0) continue; + auto diff = cl - C[i].unsqueeze(0); + total_err += diff.pow(2).sum().item(); + count += cl.size(0); + } + return count>0 ? total_err / count : 0.0; +} + +// Fixture +class ClusteringTest : public ::testing::Test { + protected: + const int64_t num_vectors = 5000; + const int64_t dim = 64; + const int num_clusters= 20; + + torch::Tensor vectors_cpu, ids_cpu; +#ifdef QUAKE_ENABLE_GPU + torch::Tensor vectors_cuda, ids_cuda; +#endif + + void SetUp() override { + vectors_cpu = generate_random_data(num_vectors, dim); + ids_cpu = generate_sequential_ids(num_vectors); + +#ifdef QUAKE_ENABLE_GPU + if (!torch::cuda::is_available()) { + GTEST_SKIP() << "CUDA not available"; + } + vectors_cuda = vectors_cpu.to(torch::kCUDA).contiguous(); + ids_cuda = ids_cpu.to(torch::kCUDA).contiguous(); +#endif + } +}; + +// Test existing CPU kmeans +TEST_F(ClusteringTest, KMeansCPU_L2) { + shared_ptr build_params = std::make_shared(); + build_params->nlist = num_clusters; + build_params->metric = "l2"; + build_params->niter = 10; + auto cl = kmeans_cpu(vectors_cpu, ids_cpu, build_params, torch::Tensor()); + ASSERT_EQ(cl->centroids.sizes(), (std::vector{num_clusters, dim})); + int64_t tot=0; + for (int i=0;ivectors[i].size(0), cl->vector_ids[i].size(0)); + tot += cl->vectors[i].size(0); + } + ASSERT_EQ(tot, num_vectors); +} + +// Compare CPU vs CPU wrapper +TEST_F(ClusteringTest, KMeansWrapper_CPU) { + shared_ptr build_params = std::make_shared(); + build_params->nlist = num_clusters; + build_params->metric = "l2"; + build_params->niter = 10; + build_params->use_gpu = false; + auto cl = kmeans(vectors_cpu, ids_cpu, build_params, torch::Tensor()); + ASSERT_EQ(cl->centroids.sizes(), 
(std::vector{num_clusters, dim})); + int64_t tot=0; + for (int i=0;ivectors[i].size(0); + } + ASSERT_EQ(tot, num_vectors); +} + +#ifdef QUAKE_ENABLE_GPU +TEST_F(ClusteringTest, SampleAndPredict_GPU_L2) { + shared_ptr build_params = std::make_shared(); + build_params->nlist = num_clusters; + build_params->metric = "l2"; + build_params->niter = 10; + build_params->use_gpu = true; + build_params->gpu_sample_size = 2000; + build_params->gpu_batch_size = 100; + + auto cl = kmeans_cuvs_sample_and_predict( + vectors_cpu, ids_cpu, build_params); + + // centroids must live on CPU and have correct shape + ASSERT_EQ(cl->centroids.device().type(), torch::kCPU); + ASSERT_EQ(cl->centroids.sizes(), (std::vector{num_clusters, dim})); + + // all vectors accounted for + int64_t tot=0; + for (int i=0;ivectors[i]; + ASSERT_EQ(part.device().type(), torch::kCPU); + ASSERT_EQ(part.size(0), cl->vector_ids[i].size(0)); + tot += part.size(0); + } + ASSERT_EQ(tot, num_vectors); + + build_params->use_gpu = false; + + // Optional quality check: rough MSE vs CPU run + auto cl_cpu = kmeans_cpu(vectors_cpu, ids_cpu, build_params, torch::Tensor()); + double mse_cpu = compute_mse(cl_cpu->centroids, cl_cpu->vectors); + double mse_gpu = compute_mse(cl->centroids, cl->vectors); + ASSERT_NEAR(mse_cpu, mse_gpu, mse_cpu * 0.30); +} + +// Full wrapper test for GPU +TEST_F(ClusteringTest, KMeansWrapper_GPU) { + shared_ptr build_params = std::make_shared(); + build_params->nlist = num_clusters; + build_params->metric = "l2"; + build_params->niter = 10; + build_params->use_gpu = true; + build_params->gpu_sample_size = 2000; + build_params->gpu_batch_size = 100; + + auto cl = kmeans(vectors_cpu, ids_cpu, build_params, + torch::Tensor()); + ASSERT_EQ(cl->centroids.device().type(), torch::kCPU); + ASSERT_EQ(cl->vectors.size(), size_t(num_clusters)); + int64_t tot=0; + for (auto &p : cl->vectors) tot += p.size(0); + ASSERT_EQ(tot, num_vectors); +} +#endif // QUAKE_ENABLE_GPU diff --git a/test/cpp/quake_index.cpp 
b/test/cpp/quake_index.cpp index 0521ea02..b8f0199c 100644 --- a/test/cpp/quake_index.cpp +++ b/test/cpp/quake_index.cpp @@ -283,17 +283,19 @@ TEST(QuakeIndexStressTest, LargeBuildTest) { << " vectors took " << build_duration_ms << " ms.\n"; } -#ifdef FAISS_ENABLE_GPU +#ifdef QUAKE_ENABLE_GPU TEST(QuakeIndexStressTestGPU, LargeBuildTest) { // Attempt to build an index with a large number of vectors. // Adjust these numbers based on your available memory/compute. int64_t dimension = 128; // Medium-high dimension int64_t num_vectors = 1e6; // 1 million vectors - auto data_vectors = generate_random_data(num_vectors, dimension); - auto data_ids = generate_sequential_ids(num_vectors, 0); + auto data_vectors = generate_random_data(num_vectors, dimension).contiguous(); + auto data_ids = generate_sequential_ids(num_vectors, 0).contiguous(); QuakeIndex index; + std::cout << "generated\n"; + auto build_params = std::make_shared(); build_params->nlist = 512; build_params->metric = "l2"; @@ -527,12 +529,12 @@ TEST(QuakeIndexStressTest, SearchAddRemoveMaintenanceTest) { } // Define the GPU related test only if FAISS GPU support is enabled -#ifdef FAISS_ENABLE_GPU +#ifdef QUAKE_ENABLE_GPU // Test build with GPU enabled TEST(QuakeIndexGPUTest, BuildWithGPUTest) { int64_t dimension = 32; - int64_t num_vectors = 200; - int64_t nlist = 5; + int64_t num_vectors = 10000; + int64_t nlist = 10; torch::Tensor data_vectors = generate_random_data(num_vectors, dimension); torch::Tensor data_ids = generate_sequential_ids(num_vectors, 0); @@ -554,4 +556,4 @@ TEST(QuakeIndexGPUTest, BuildWithGPUTest) { EXPECT_EQ(timing_info->n_vectors, data_vectors.size(0)); EXPECT_EQ(timing_info->d, data_vectors.size(1)); } -#endif \ No newline at end of file +#endif