Merge branch 'branch-24.02' into CAGRA-remove-max_dim-template-param

rapidsai · Dec 6, 2023 · 0b21643 · 0b21643
2 parents bbe9104 + ecd292b
commit 0b21643
Show file tree

Hide file tree

Showing 4 changed files with 38 additions and 72 deletions.
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
@@ -6,6 +6,6 @@ jobs:
  triage:
    runs-on: ubuntu-latest
    steps:
-   - uses: actions/labeler@main
+   - uses: actions/labeler@v4
      with:
        repo-token: "${{ secrets.GITHUB_TOKEN }}"
diff --git a/cpp/include/raft/core/device_resources_manager.hpp b/cpp/include/raft/core/device_resources_manager.hpp
@@ -254,12 +254,6 @@ struct device_resources_manager {
   // Container for underlying device resources to be re-used across host
   // threads for each device
   std::vector<resource_components> per_device_components_;
-  // Container for device_resources objects shared among threads. The index
-  // of the outer vector is the thread id of the thread requesting resources
-  // modulo the total number of resources managed by this object. The inner
-  // vector contains all resources associated with that id across devices
-  // in any order.
-  std::vector<std::vector<raft::device_resources>> resources_{};
 
   // Return a lock for accessing shared data
   [[nodiscard]] auto get_lock() const { return std::unique_lock{manager_mutex_}; }
@@ -271,72 +265,44 @@ struct device_resources_manager {
   // all host threads.
   auto const& get_device_resources_(int device_id)
   {
-    // Each thread maintains an independent list of devices it has
-    // accessed. If it has not marked a device as initialized, it
-    // acquires a lock to initialize it exactly once. This means that each
-    // thread will lock once for a particular device and not proceed until
-    // some thread has actually generated the corresponding device
-    // components
-    thread_local auto initialized_devices = std::vector<int>{};
-    auto res_iter                         = decltype(std::end(resources_[0])){};
-    if (std::find(std::begin(initialized_devices), std::end(initialized_devices), device_id) ==
-        std::end(initialized_devices)) {
+    thread_local auto thread_resources = std::vector<std::optional<raft::device_resources>>([]() {
+      auto result = 0;
+      RAFT_CUDA_TRY(cudaGetDeviceCount(&result));
+      RAFT_EXPECTS(result != 0, "No CUDA devices found");
+      return result;
+    }());
+    if (!thread_resources[device_id]) {
       // Only lock if we have not previously accessed this device on this
       // thread
       auto lock = get_lock();
-      initialized_devices.push_back(device_id);
       // If we are building components, do not allow any further changes to
       // resource parameters.
       params_finalized_ = true;
 
-      if (resources_.empty()) {
-        // We will potentially need as many device_resources objects as there are combinations of
-        // streams and pools on a given device.
-        resources_.resize(std::max(params_.stream_count.value_or(1), std::size_t{1}) *
-                          std::max(params_.pool_count, std::size_t{1}));
-      }
-
-      auto res_idx = get_thread_id() % resources_.size();
-      // Check to see if we have constructed device_resources for the
-      // requested device at the index assigned to this thread
-      res_iter = std::find_if(std::begin(resources_[res_idx]),
-                              std::end(resources_[res_idx]),
-                              [device_id](auto&& res) { return res.get_device() == device_id; });
+      // Even if we have not yet built device_resources for the current
+      // device, we may have already built the underlying components, since
+      // multiple device_resources may point to the same components.
+      auto component_iter = std::find_if(
+        std::begin(per_device_components_),
+        std::end(per_device_components_),
+        [device_id](auto&& components) { return components.get_device_id() == device_id; });
 
-      if (res_iter == std::end(resources_[res_idx])) {
-        // Even if we have not yet built device_resources for the current
-        // device, we may have already built the underlying components, since
-        // multiple device_resources may point to the same components.
-        auto component_iter = std::find_if(
-          std::begin(per_device_components_),
-          std::end(per_device_components_),
-          [device_id](auto&& components) { return components.get_device_id() == device_id; });
-        if (component_iter == std::end(per_device_components_)) {
-          // Build components for this device if we have not yet done so on
-          // another thread
-          per_device_components_.emplace_back(device_id, params_);
-          component_iter = std::prev(std::end(per_device_components_));
-        }
-        auto scoped_device = device_setter(device_id);
-        // Build the device_resources object for this thread out of shared
-        // components
-        resources_[res_idx].emplace_back(component_iter->get_stream(),
-                                         component_iter->get_pool(),
-                                         component_iter->get_workspace_memory_resource(),
-                                         component_iter->get_workspace_allocation_limit());
-        res_iter = std::prev(std::end(resources_[res_idx]));
+      if (component_iter == std::end(per_device_components_)) {
+        // Build components for this device if we have not yet done so on
+        // another thread
+        per_device_components_.emplace_back(device_id, params_);
+        component_iter = std::prev(std::end(per_device_components_));
       }
-    } else {
-      auto res_idx = get_thread_id() % resources_.size();
-      // If we have previously accessed this device on this thread, we do not
-      // need to lock. We know that this thread already initialized the
-      // resources it requires for this device if no other thread had already done so, so we simply
-      // retrieve the previously-generated resources.
-      res_iter = std::find_if(std::begin(resources_[res_idx]),
-                              std::end(resources_[res_idx]),
-                              [device_id](auto&& res) { return res.get_device() == device_id; });
+      auto scoped_device = device_setter(device_id);
+      // Build the device_resources object for this thread out of shared
+      // components
+      thread_resources[device_id].emplace(component_iter->get_stream(),
+                                          component_iter->get_pool(),
+                                          component_iter->get_workspace_memory_resource(),
+                                          component_iter->get_workspace_allocation_limit());
     }
-    return *res_iter;
+
+    return thread_resources[device_id].value();
   }
 
   // Thread-safe setter for the number of streams

diff --git a/docs/source/ann_benchmarks_param_tuning.md b/docs/source/ann_benchmarks_param_tuning.md
@@ -20,7 +20,7 @@ IVF-flat is a simple algorithm which won't save any space, but it provides compe
 | `nlist`              | `build_param`    | Y        | Positive Integer >0        |          | Number of clusters to partition the vectors into. Larger values will put less points into each cluster but this will impact index build time as more clusters need to be trained. |
 | `niter`              | `build_param`    | N        | Positive Integer >0        | 20       | Number of k-means iterations to use when training the clusters. |
 | `ratio`              | `build_param`    | N        | Positive Integer >0        | 2        | `1/ratio` is the number of training points which should be used to train the clusters.                                                                                            |
-| `dataset_memory_type` | `build_param` | N | ["device", "host", "mmap"] | "device" | What memory type should the dataset reside?                                                                                                                                       |
+| `dataset_memory_type` | `build_param` | N | ["device", "host", "mmap"] | "mmap" | What memory type should the dataset reside?                                                                                                                                       |
 | `query_memory_type`  | `search_params` | N | ["device", "host", "mmap"] | "device" | What memory type should the queries reside? |
 | `nprobe`             | `search_params`  | Y        | Positive Integer >0        |          | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index.                                       |
 
@@ -37,12 +37,12 @@ IVF-pq is an inverted-file index, which partitions the vectors into a series of
 | `pq_dim`               | `build_param`  | N | Positive Integer. Multiple of 8. | 0       | Dimensionality of the vector after product quantization. When 0, a heuristic is used to select this value. `pq_dim` * `pq_bits` must be a multiple of 8.                        |
 | `pq_bits`              | `build_param`  | N | Positive Integer. [4-8]          | 8       | Bit length of the vector element after quantization.                                                                                                                            |
 | `codebook_kind`        | `build_param`  | N | ["cluster", "subspace"]          | "subspace" | Type of codebook. See the [API docs](https://docs.rapids.ai/api/raft/nightly/cpp_api/neighbors_ivf_pq/#_CPPv412codebook_gen) for more detail                                 |
-| `dataset_memory_type`  | `build_param` | N | ["device", "host", "mmap"]       | "device" | What memory type should the dataset reside?                                                                                                                                       |
+| `dataset_memory_type`  | `build_param` | N | ["device", "host", "mmap"]       | "host" | What memory type should the dataset reside?                                                                                                                                       |
 | `query_memory_type`    | `search_params` | N | ["device", "host", "mmap"]       | "device" | What memory type should the queries reside? |
 | `nprobe`               | `search_params` | Y | Positive Integer >0              |         | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index.                                     |
 | `internalDistanceDtype` | `search_params` | N | [`float`, `half`]                | `half`  | The precision to use for the distance computations. Lower precision can increase performance at the cost of accuracy.                                                           |
 | `smemLutDtype`         | `search_params` | N | [`float`, `half`, `fp8`]         | `half`  | The precision to use for the lookup table in shared memory. Lower precision can increase performance at the cost of accuracy.                                                   |
-| `refine_ratio`         | `search_params` | N| Positive Number >=0              | 0       | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors.           |
+| `refine_ratio`         | `search_params` | N| Positive Number >=1              | 1       | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors.           |
 
 
 ### `raft_cagra`
@@ -53,7 +53,7 @@ IVF-pq is an inverted-file index, which partitions the vectors into a series of
 | `graph_degree`              | `build_param`  | N        | Positive Integer >0        | 64 | Degree of the final kNN graph index. |
 | `intermediate_graph_degree` | `build_param`  | N        | Positive Integer >0        | 128 | Degree of the intermediate kNN graph. |
 | `graph_build_algo`          | `build_param`  | N | ["IVF_PQ", "NN_DESCENT"]   | "IVF_PQ" | Algorithm to use for building the kNN graph |
-| `dataset_memory_type`       | `build_param`  | N | ["device", "host", "mmap"] | "device" | What memory type should the dataset reside while constructing the index?                                                                                                                                       |
+| `dataset_memory_type`       | `build_param`  | N | ["device", "host", "mmap"] | "mmap" | What memory type should the dataset reside while constructing the index?                                                                                                                                       |
 | `query_memory_type`         | `search_params` | N | ["device", "host", "mmap"] | "device" | What memory type should the queries reside? |
 | `itopk`                     | `search_param`  | N        | Positive Integer >0        | 64 | Number of intermediate search results retained during the search. Higher values improve search accuracy at the cost of speed. |
 | `search_width`              | `search_param`  | N        | Positive Integer >0        | 1 | Number of graph nodes to select as the starting point for the search in each iteration. |
@@ -77,7 +77,7 @@ To fine tune CAGRA index building we can customize IVF-PQ index builder options
 | `ivf_pq_search_nprobe`               | `build_params` | N | Positive Integer >0              | min(2*dim, nlist)        | The closest number of clusters to search for each query vector.                                    |
 | `ivf_pq_search_internalDistanceDtype` | `build_params` | N | [`float`, `half`]                | `fp8`  | The precision to use for the distance computations. Lower precision can increase performance at the cost of accuracy.                                                           |
 | `ivf_pq_search_smemLutDtype`         | `build_params` | N | [`float`, `half`, `fp8`]         | `half`  | The precision to use for the lookup table in shared memory. Lower precision can increase performance at the cost of accuracy.                                                   |
-| `ivf_pq_search_refine_ratio`         | `build_params` | N| Positive Number >=0              | 2       | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors.           |
+| `ivf_pq_search_refine_ratio`         | `build_params` | N| Positive Number >=1              | 2       | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors.           |
 
 Alternatively, if `graph_build_algo == "NN_DESCENT"`, then we can customize the following parameters
 
@@ -125,7 +125,7 @@ IVF-pq is an inverted-file index, which partitions the vectors into a series of
 | `usePrecomputed` | `build_param`  | N        | Boolean. Default=`false`         | `false` | Use pre-computed lookup tables to speed up search at the cost of increased memory usage.                                                                                          |
 | `useFloat16`     | `build_param`  | N        | Boolean. Default=`false`         | `false`  | Use half-precision floats for clustering step.                                                                                                                                    |
 | `nprobe`         | `search_params` | Y        | Positive Integer >0              |         | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index.                                       |
-| `refine_ratio`   | `search_params` | N| Positive Number >=0          | 0       | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors.             |
+| `refine_ratio`   | `search_params` | N| Positive Number >=1          | 1       | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors.             |
 
 ### `faiss_cpu_flat`
 
@@ -159,7 +159,7 @@ Use FAISS IVF-PQ index on CPU
 | `usePrecomputed` | `build_param`  | N        | Boolean. Default=`false`           | `false` | Use pre-computed lookup tables to speed up search at the cost of increased memory usage.                                                                                      |
 | `bitsPerCode`    | `build_param`  | N        | Positive Integer [4-8]             | 8       | Number of bits to use for each code.                                                                                                                                          |
 | `nprobe`         | `search_params` | Y        | Positive Integer >0                |         | The closest number of clusters to search for each query vector. Larger values will improve recall but will search more points in the index.                                   |
-| `refine_ratio`   | `search_params` | N| Positive Number >=0                | 0       | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors.         |
+| `refine_ratio`   | `search_params` | N| Positive Number >=1                | 1       | `refine_ratio * k` nearest neighbors are queried from the index initially and an additional refinement step improves recall by selecting only the best `k` neighbors.         |
 | `numThreads`     | `search_params` | N        | Positive Integer >0                  | 1       | Number of threads to use for queries.                                                                                                                                                                                                                                                             |
 
 

diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
@@ -574,8 +574,8 @@ def add_algo_group(group_list):
                         index["search_params"].append(search_dict)
                 executables_to_run[executable]["index"].append(index)
 
-            if len(index["search_params"]) == 0:
-                print("No search parameters were added to configuration")
+                if len(index["search_params"]) == 0:
+                    print("No search parameters were added to configuration")
 
     run_build_and_search(
         conf_file,