Merge branch 'branch-24.02' into improve_parallelism_of_refine_host
anaruse authored Dec 14, 2023
2 parents c9bfc9a + 80a48ca commit a439b44
Showing 35 changed files with 440 additions and 340 deletions.
8 changes: 4 additions & 4 deletions build.sh
@@ -386,6 +386,8 @@ SKBUILD_EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS}"
 if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_RAFT_CPP"* ]]; then
     SKBUILD_EXTRA_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS} -DFIND_RAFT_CPP=ON"
 fi
+# Replace spaces with semicolons in SKBUILD_EXTRA_CMAKE_ARGS
+SKBUILD_EXTRA_CMAKE_ARGS=$(echo ${SKBUILD_EXTRA_CMAKE_ARGS} | sed 's/ /;/g')
 
 # If clean given, run it prior to any other steps
 if (( ${CLEAN} == 1 )); then
@@ -493,15 +495,13 @@ fi
 
 # Build and (optionally) install the pylibraft Python package
 if (( ${NUMARGS} == 0 )) || hasArg pylibraft; then
-    SKBUILD_CONFIGURE_OPTIONS="${SKBUILD_EXTRA_CMAKE_ARGS}" \
-        SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL}" \
+    SKBUILD_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS}" \
         python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/pylibraft
 fi
 
 # Build and (optionally) install the raft-dask Python package
 if (( ${NUMARGS} == 0 )) || hasArg raft-dask; then
-    SKBUILD_CONFIGURE_OPTIONS="${SKBUILD_EXTRA_CMAKE_ARGS}" \
-        SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL}" \
+    SKBUILD_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS}" \
         python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/raft-dask
 fi
 
2 changes: 1 addition & 1 deletion ci/build_wheel_pylibraft.sh
@@ -4,6 +4,6 @@
 set -euo pipefail
 
 # Set up skbuild options. Enable sccache in skbuild config options
-export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF"
+export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_RAFT_CPP=OFF"
 
 ci/build_wheel.sh pylibraft python/pylibraft
2 changes: 1 addition & 1 deletion ci/build_wheel_raft_dask.sh
@@ -4,6 +4,6 @@
 set -euo pipefail
 
 # Set up skbuild options. Enable sccache in skbuild config options
-export SKBUILD_CONFIGURE_OPTIONS="-DDETECT_CONDA_ENV=OFF -DFIND_RAFT_CPP=OFF"
+export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_RAFT_CPP=OFF"
 
 ci/build_wheel.sh raft-dask python/raft-dask
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -49,7 +49,7 @@ dependencies:
 - rapids-dask-dependency==24.2.*
 - recommonmark
 - rmm==24.2.*
-- scikit-build>=0.13.1
+- scikit-build-core>=0.7.0
 - scikit-learn
 - scipy
 - sphinx-copybutton
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -49,7 +49,7 @@ dependencies:
 - rapids-dask-dependency==24.2.*
 - recommonmark
 - rmm==24.2.*
-- scikit-build>=0.13.1
+- scikit-build-core>=0.7.0
 - scikit-learn
 - scipy
 - sphinx-copybutton
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-120_arch-aarch64.yaml
@@ -45,7 +45,7 @@ dependencies:
 - rapids-dask-dependency==24.2.*
 - recommonmark
 - rmm==24.2.*
-- scikit-build>=0.13.1
+- scikit-build-core>=0.7.0
 - scikit-learn
 - scipy
 - sphinx-copybutton
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -45,7 +45,7 @@ dependencies:
 - rapids-dask-dependency==24.2.*
 - recommonmark
 - rmm==24.2.*
-- scikit-build>=0.13.1
+- scikit-build-core>=0.7.0
 - scikit-learn
 - scipy
 - sphinx-copybutton
2 changes: 1 addition & 1 deletion conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -39,6 +39,6 @@ dependencies:
 - pandas
 - pyyaml
 - rmm==24.2.*
-- scikit-build>=0.13.1
+- scikit-build-core>=0.7.0
 - sysroot_linux-aarch64==2.17
 name: bench_ann_cuda-118_arch-aarch64
2 changes: 1 addition & 1 deletion conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -39,6 +39,6 @@ dependencies:
 - pandas
 - pyyaml
 - rmm==24.2.*
-- scikit-build>=0.13.1
+- scikit-build-core>=0.7.0
 - sysroot_linux-64==2.17
 name: bench_ann_cuda-118_arch-x86_64
2 changes: 1 addition & 1 deletion conda/environments/bench_ann_cuda-120_arch-aarch64.yaml
@@ -35,6 +35,6 @@ dependencies:
 - pandas
 - pyyaml
 - rmm==24.2.*
-- scikit-build>=0.13.1
+- scikit-build-core>=0.7.0
 - sysroot_linux-aarch64==2.17
 name: bench_ann_cuda-120_arch-aarch64
2 changes: 1 addition & 1 deletion conda/environments/bench_ann_cuda-120_arch-x86_64.yaml
@@ -35,6 +35,6 @@ dependencies:
 - pandas
 - pyyaml
 - rmm==24.2.*
-- scikit-build>=0.13.1
+- scikit-build-core>=0.7.0
 - sysroot_linux-64==2.17
 name: bench_ann_cuda-120_arch-x86_64
2 changes: 1 addition & 1 deletion conda/recipes/pylibraft/meta.yaml
@@ -50,7 +50,7 @@ requirements:
     - libraft-headers {{ version }}
     - python x.x
     - rmm ={{ minor_version }}
-    - scikit-build >=0.13.1
+    - scikit-build-core >=0.7.0
     - setuptools
   run:
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
2 changes: 1 addition & 1 deletion conda/recipes/raft-dask/meta.yaml
@@ -50,7 +50,7 @@ requirements:
     - pylibraft {{ version }}
     - python x.x
     - rmm ={{ minor_version }}
-    - scikit-build >=0.13.1
+    - scikit-build-core >=0.7.0
     - setuptools
     - ucx {{ ucx_version }}
     - ucx-proc=*=gpu
45 changes: 33 additions & 12 deletions cpp/bench/ann/CMakeLists.txt
@@ -116,6 +116,21 @@ if(RAFT_ANN_BENCH_USE_FAISS)
   include(cmake/thirdparty/get_faiss.cmake)
 endif()
 
+# ##################################################################################################
+# * Enable NVTX if available
+
+# Note: ANN_BENCH wrappers have extra NVTX code not related to raft::nvtx. They track gbench
+# benchmark cases and iterations. This is to make limited NVTX available to all algos, not just
+# raft.
+if(TARGET CUDA::nvtx3)
+  set(_CMAKE_REQUIRED_INCLUDES_ORIG ${CMAKE_REQUIRED_INCLUDES})
+  get_target_property(CMAKE_REQUIRED_INCLUDES CUDA::nvtx3 INTERFACE_INCLUDE_DIRECTORIES)
+  unset(NVTX3_HEADERS_FOUND CACHE)
+  # Check the headers explicitly to make sure the cpu-only build succeeds
+  CHECK_INCLUDE_FILE_CXX(nvtx3/nvToolsExt.h NVTX3_HEADERS_FOUND)
+  set(CMAKE_REQUIRED_INCLUDES ${_CMAKE_REQUIRED_INCLUDES_ORIG})
+endif()
+
 # ##################################################################################################
 # * Configure tests function-------------------------------------------------------------
 
@@ -141,8 +156,13 @@ function(ConfigureAnnBench)
     add_dependencies(${BENCH_NAME} ANN_BENCH)
   else()
     add_executable(${BENCH_NAME} ${ConfigureAnnBench_PATH})
-    target_compile_definitions(${BENCH_NAME} PRIVATE ANN_BENCH_BUILD_MAIN)
-    target_link_libraries(${BENCH_NAME} PRIVATE benchmark::benchmark)
+    target_compile_definitions(
+      ${BENCH_NAME} PRIVATE ANN_BENCH_BUILD_MAIN
+      $<$<BOOL:${NVTX3_HEADERS_FOUND}>:ANN_BENCH_NVTX3_HEADERS_FOUND>
+    )
+    target_link_libraries(
+      ${BENCH_NAME} PRIVATE benchmark::benchmark $<$<BOOL:${NVTX3_HEADERS_FOUND}>:CUDA::nvtx3>
+    )
   endif()
 
   target_link_libraries(
@@ -340,8 +360,16 @@ if(RAFT_ANN_BENCH_SINGLE_EXE)
   target_include_directories(ANN_BENCH PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
 
   target_link_libraries(
-    ANN_BENCH PRIVATE nlohmann_json::nlohmann_json benchmark_static dl -static-libgcc
-    -static-libstdc++ CUDA::nvtx3
+    ANN_BENCH
+    PRIVATE raft::raft
+            nlohmann_json::nlohmann_json
+            benchmark_static
+            dl
+            -static-libgcc
+            fmt::fmt-header-only
+            spdlog::spdlog_header_only
+            -static-libstdc++
+            $<$<BOOL:${NVTX3_HEADERS_FOUND}>:CUDA::nvtx3>
   )
   set_target_properties(
     ANN_BENCH
@@ -355,17 +383,10 @@ if(RAFT_ANN_BENCH_SINGLE_EXE)
               BUILD_RPATH "\$ORIGIN"
               INSTALL_RPATH "\$ORIGIN"
   )
-
-  # Disable NVTX when the nvtx3 headers are missing
-  set(_CMAKE_REQUIRED_INCLUDES_ORIG ${CMAKE_REQUIRED_INCLUDES})
-  get_target_property(CMAKE_REQUIRED_INCLUDES ANN_BENCH INCLUDE_DIRECTORIES)
-  CHECK_INCLUDE_FILE_CXX(nvtx3/nvToolsExt.h NVTX3_HEADERS_FOUND)
-  set(CMAKE_REQUIRED_INCLUDES ${_CMAKE_REQUIRED_INCLUDES_ORIG})
   target_compile_definitions(
     ANN_BENCH
     PRIVATE
-      $<$<BOOL:${CUDAToolkit_FOUND}>:ANN_BENCH_LINK_CUDART="libcudart.so.${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}.${CUDAToolkit_VERSION_PATCH}
-      ">
+      $<$<BOOL:${CUDAToolkit_FOUND}>:ANN_BENCH_LINK_CUDART="libcudart.so.${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}.${CUDAToolkit_VERSION_PATCH}">
       $<$<BOOL:${NVTX3_HEADERS_FOUND}>:ANN_BENCH_NVTX3_HEADERS_FOUND>
   )
 
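The CMake changes above move the nvtx3 header probe up front and gate both the compile definition and the link dependency on NVTX3_HEADERS_FOUND through generator expressions, so a CPU-only build configures and compiles cleanly. As a minimal sketch (not part of this commit; the annotate_* helper names are hypothetical), a benchmark source might consume the resulting ANN_BENCH_NVTX3_HEADERS_FOUND definition like this:

// Hypothetical consumer of the ANN_BENCH_NVTX3_HEADERS_FOUND definition.
#ifdef ANN_BENCH_NVTX3_HEADERS_FOUND
#include <nvtx3/nvToolsExt.h>
// Headers available: annotations map to real NVTX ranges.
inline void annotate_begin(const char* name) { nvtxRangePushA(name); }
inline void annotate_end() { nvtxRangePop(); }
#else
// CPU-only build: headers absent, annotations compile to no-ops.
inline void annotate_begin(const char*) {}
inline void annotate_end() {}
#endif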
15 changes: 7 additions & 8 deletions cpp/bench/ann/src/common/ann_types.hpp
@@ -18,6 +18,7 @@
 
 #include "cuda_stub.hpp" // cudaStream_t
 
+#include <memory>
 #include <stdexcept>
 #include <string>
 #include <vector>
@@ -64,17 +65,10 @@ inline auto parse_memory_type(const std::string& memory_type) -> MemoryType
   }
 }
 
-class AlgoProperty {
- public:
-  inline AlgoProperty() {}
-  inline AlgoProperty(MemoryType dataset_memory_type_, MemoryType query_memory_type_)
-    : dataset_memory_type(dataset_memory_type_), query_memory_type(query_memory_type_)
-  {
-  }
+struct AlgoProperty {
   MemoryType dataset_memory_type;
   // neighbors/distances should have same memory type as queries
   MemoryType query_memory_type;
-  virtual ~AlgoProperty() = default;
 };
 
 class AnnBase {
@@ -125,6 +119,11 @@ class ANN : public AnnBase {
   // The client code should call set_search_dataset() before searching,
   // and should not release dataset before searching is finished.
   virtual void set_search_dataset(const T* /*dataset*/, size_t /*nrow*/){};
+
+  /**
+   * Make a shallow copy of the ANN wrapper that shares the resources and ensures thread-safe
+   * access to them. */
+  virtual auto copy() -> std::unique_ptr<ANN<T>> = 0;
 };
 
 }  // namespace raft::bench::ann
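The new pure-virtual copy() is the core of this change: each wrapper must be able to hand a benchmark thread its own cheap handle over shared, read-only resources. A self-contained miniature of that shallow-copy pattern (not from this commit; Algo and Index are illustrative stand-ins, not RAFT types):

#include <memory>

struct Index {  // stand-in for an algorithm's immutable built index
  int data;
};

class Algo {
 public:
  explicit Algo(int d) : index_(std::make_shared<Index>(Index{d})) {}

  // Shallow copy: duplicates the handle, shares the underlying index.
  auto copy() const -> std::unique_ptr<Algo> { return std::make_unique<Algo>(*this); }

  // Read-only use of the shared state is safe from many threads at once.
  auto search(int query) const -> int { return query + index_->data; }

 private:
  std::shared_ptr<Index> index_;
};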
16 changes: 7 additions & 9 deletions cpp/bench/ann/src/common/benchmark.hpp
@@ -45,7 +45,7 @@ std::condition_variable cond_var;
 std::atomic_int processed_threads{0};
 
 static inline std::unique_ptr<AnnBase> current_algo{nullptr};
-static inline std::shared_ptr<AlgoProperty> current_algo_props{nullptr};
+static inline std::unique_ptr<AlgoProperty> current_algo_props{nullptr};
 
 using kv_series = std::vector<std::tuple<std::string, std::vector<nlohmann::json>>>;
 
@@ -241,9 +241,8 @@ void bench_search(::benchmark::State& state,
     return;
   }
 
-  auto algo_property = parse_algo_property(algo->get_preference(), sp_json);
-  current_algo_props = std::make_shared<AlgoProperty>(algo_property.dataset_memory_type,
-                                                      algo_property.query_memory_type);
+  current_algo_props = std::make_unique<AlgoProperty>(
+    std::move(parse_algo_property(algo->get_preference(), sp_json)));
 
   if (search_param->needs_dataset()) {
     try {
@@ -277,23 +276,22 @@ void bench_search(::benchmark::State& state,
     // We are accessing shared variables (like current_algo, current_algo_probs) before the
     // benchmark loop, therefore the synchronization here is necessary.
   }
-  const auto algo_property = *current_algo_props;
-  query_set                = dataset->query_set(algo_property.query_memory_type);
+  query_set = dataset->query_set(current_algo_props->query_memory_type);
 
   /**
    * Each thread will manage its own outputs
    */
   std::shared_ptr<buf<float>> distances =
-    std::make_shared<buf<float>>(algo_property.query_memory_type, k * query_set_size);
+    std::make_shared<buf<float>>(current_algo_props->query_memory_type, k * query_set_size);
   std::shared_ptr<buf<std::size_t>> neighbors =
-    std::make_shared<buf<std::size_t>>(algo_property.query_memory_type, k * query_set_size);
+    std::make_shared<buf<std::size_t>>(current_algo_props->query_memory_type, k * query_set_size);
 
   cuda_timer gpu_timer;
   auto start = std::chrono::high_resolution_clock::now();
   {
     nvtx_case nvtx{state.name()};
 
-    ANN<T>* algo = dynamic_cast<ANN<T>*>(current_algo.get());
+    auto algo = dynamic_cast<ANN<T>*>(current_algo.get())->copy();
     for (auto _ : state) {
       [[maybe_unused]] auto ntx_lap = nvtx.lap();
       [[maybe_unused]] auto gpu_lap = gpu_timer.lap();
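Together with the unique_ptr swap above, bench_search no longer shares one mutable wrapper across threads: each thread downcasts the global current_algo once and then iterates on its own copy(). A sketch of why that removes the race (illustrative only, reusing the hypothetical Algo from the ann_types.hpp note above):

#include <thread>
#include <vector>

// Each worker takes its own shallow copy of the shared wrapper and runs its
// search loop on that copy; only the immutable index is shared between threads.
void run_search_threads(const Algo& shared_algo, int num_threads)
{
  std::vector<std::thread> workers;
  for (int t = 0; t < num_threads; ++t) {
    workers.emplace_back([&shared_algo, t] {
      auto algo = shared_algo.copy();  // thread-local handle, shared index
      for (int i = 0; i < 100; ++i) {
        (void)algo->search(t + i);  // no writes to shared state in the hot loop
      }
    });
  }
  for (auto& w : workers) {
    w.join();
  }
}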
