Merge branch 'branch-23.12' into 23.10-ann-deletion

lowener · web-flow · commit 1f8b4215496f · 2023-11-06T13:37:50.000+01:00
diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
@@ -23,6 +23,7 @@
 #include <benchmark/benchmark.h>
 
 #include <algorithm>
+#include <atomic>
 #include <chrono>
 #include <cmath>
 #include <condition_variable>
@@ -39,6 +40,7 @@ namespace raft::bench::ann {
 
 std::mutex init_mutex;
 std::condition_variable cond_var;
+std::atomic_int processed_threads{0};
 
 static inline std::unique_ptr<AnnBase> current_algo{nullptr};
 static inline std::shared_ptr<AlgoProperty> current_algo_props{nullptr};
@@ -198,7 +200,8 @@ void bench_search(::benchmark::State& state,
    * Make sure the first thread loads the algo and dataset
    */
   if (state.thread_index() == 0) {
-    std::lock_guard lk(init_mutex);
+    std::unique_lock lk(init_mutex);
+    cond_var.wait(lk, [] { return processed_threads.load(std::memory_order_acquire) == 0; });
     // algo is static to cache it between close search runs to save time on index loading
     static std::string index_file = "";
     if (index.file != index_file) {
@@ -247,11 +250,14 @@ void bench_search(::benchmark::State& state,
     }
 
     query_set = dataset->query_set(current_algo_props->query_memory_type);
+    processed_threads.store(state.threads(), std::memory_order_acq_rel);
     cond_var.notify_all();
   } else {
-    // All other threads will wait for the first thread to initialize the algo.
     std::unique_lock lk(init_mutex);
-    cond_var.wait(lk, [] { return current_algo_props.get() != nullptr; });
+    // All other threads will wait for the first thread to initialize the algo.
+    cond_var.wait(lk, [&state] {
+      return processed_threads.load(std::memory_order_acquire) == state.threads();
+    });
     // gbench ensures that all threads are synchronized at the start of the benchmark loop.
     // We are accessing shared variables (like current_algo, current_algo_probs) before the
     // benchmark loop, therefore the synchronization here is necessary.
@@ -292,6 +298,7 @@ void bench_search(::benchmark::State& state,
 
       // advance to the next batch
       batch_offset = (batch_offset + n_queries) % query_set_size;
+
       queries_processed += n_queries;
     }
   }
@@ -312,6 +319,10 @@ void bench_search(::benchmark::State& state,
 
   if (state.skipped()) { return; }
 
+  // assume thread has finished processing successfully at this point
+  // last thread to finish processing notifies all
+  if (processed_threads-- == 0) { cond_var.notify_all(); }
+
   // Use the last thread as a sanity check that all the threads are working.
   if (state.thread_index() == state.threads() - 1) {
     // evaluate recall
@@ -410,7 +421,6 @@ void register_search(std::shared_ptr<const Dataset<T>> dataset,
       auto* b = ::benchmark::RegisterBenchmark(
                   index.name + suf, bench_search<T>, index, i, dataset, metric_objective)
                   ->Unit(benchmark::kMillisecond)
-                  ->ThreadRange(threads[0], threads[1])
                   /**
                    * The following are important for getting accuracy QPS measurements on both CPU
                    * and GPU These make sure that
@@ -420,6 +430,8 @@ void register_search(std::shared_ptr<const Dataset<T>> dataset,
                    */
                   ->MeasureProcessCPUTime()
                   ->UseRealTime();
+
+      if (metric_objective == Objective::THROUGHPUT) { b->ThreadRange(threads[0], threads[1]); }
     }
   }
 }
diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
@@ -147,7 +147,6 @@ void HnswLib<T>::build(const T* dataset, size_t nrow, cudaStream_t)
         char buf[20];
         std::time_t now = std::time(nullptr);
         std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", std::localtime(&now));
-
         printf("%s building %zu / %zu\n", buf, i, items_per_thread);
         fflush(stdout);
       }
@@ -163,13 +162,11 @@ void HnswLib<T>::set_search_param(const AnnSearchParam& param_)
   auto param        = dynamic_cast<const SearchParam&>(param_);
   appr_alg_->ef_    = param.ef;
   metric_objective_ = param.metric_objective;
+  num_threads_      = param.num_threads;
 
-  bool use_pool = (metric_objective_ == Objective::LATENCY && param.num_threads > 1) &&
-                  (!thread_pool_ || num_threads_ != param.num_threads);
-  if (use_pool) {
-    num_threads_ = param.num_threads;
-    thread_pool_ = std::make_unique<FixedThreadPool>(num_threads_);
-  }
+  // Create a pool if multiple query threads have been set and the pool hasn't been created already
+  bool create_pool = (metric_objective_ == Objective::LATENCY && num_threads_ > 1 && !thread_pool_);
+  if (create_pool) { thread_pool_ = std::make_unique<FixedThreadPool>(num_threads_); }
 }
 
 template <typename T>
@@ -180,7 +177,7 @@ void HnswLib<T>::search(
     // hnsw can only handle a single vector at a time.
     get_search_knn_results_(query + i * dim_, k, indices + i * k, distances + i * k);
   };
-  if (metric_objective_ == Objective::LATENCY) {
+  if (metric_objective_ == Objective::LATENCY && num_threads_ > 1) {
     thread_pool_->submit(f, batch_size);
   } else {
     for (int i = 0; i < batch_size; i++) {
diff --git a/cpp/include/raft/core/detail/copy.hpp b/cpp/include/raft/core/detail/copy.hpp
@@ -16,7 +16,6 @@
 
 #pragma once
 #include <cstdio>
-#include <execution>
 #include <raft/core/cuda_support.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/error.hpp>
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/datasets.yaml b/python/raft-ann-bench/src/raft-ann-bench/run/conf/datasets.yaml
@@ -39,36 +39,42 @@
   dims: 960
   base_file: gist-960-euclidean/base.fbin
   query_file: gist-960-euclidean/query.fbin
+  groundtruth_neighbors_file: gist-960-euclidean/groundtruth.neighbors.ibin
   distance: euclidean
 
 - name: glove-50-angular
   dims: 50
   base_file: glove-50-angular/base.fbin
   query_file: glove-50-angular/query.fbin
+  groundtruth_neighbors_file: glove-50-angular/groundtruth.neighbors.ibin
   distance: euclidean
 
 - name: glove-50-inner
   dims: 50
   base_file: glove-50-inner/base.fbin
   query_file: glove-50-inner/query.fbin
+  groundtruth_neighbors_file: glove-50-inner/groundtruth.neighbors.ibin
   distance: euclidean
 
 - name: glove-100-angular
   dims: 100
   base_file: glove-100-angular/base.fbin
   query_file: glove-100-angular/query.fbin
+  groundtruth_neighbors_file: glove-100-angular/groundtruth.neighbors.ibin
   distance: euclidean
 
 - name: glove-100-inner
   dims: 100
   base_file: glove-100-inner/base.fbin
   query_file: glove-100-inner/query.fbin
+  groundtruth_neighbors_file: glove-100-inner/groundtruth.neighbors.ibin
   distance: euclidean
 
 - name: lastfm-65-angular
   dims: 65
   base_file: lastfm-65-angular/base.fbin
   query_file: lastfm-65-angular/query.fbin
+  groundtruth_neighbors_file: lastfm-65-angular/groundtruth.neighbors.ibin
   distance: euclidean
 
 - name: mnist-784-euclidean