From 753a10960fd3361bc82feffa6335d8a2afc97abd Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Thu, 5 Oct 2023 14:46:56 -0700
Subject: [PATCH 001/148] start integration of cagra

---
 faiss/gpu/CMakeLists.txt     |   4 +-
 faiss/gpu/impl/RaftCagra.cu  | 134 +++++++++++++++++++++++++++++++++++
 faiss/gpu/impl/RaftCagra.cuh |  98 +++++++++++++++++++++++++
 3 files changed, 235 insertions(+), 1 deletion(-)
 create mode 100644 faiss/gpu/impl/RaftCagra.cu
 create mode 100644 faiss/gpu/impl/RaftCagra.cuh
diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
index ad7d2103fa..b18209ca64 100644
--- a/faiss/gpu/CMakeLists.txt
+++ b/faiss/gpu/CMakeLists.txt
@@ -238,11 +238,13 @@ generate_ivf_interleaved_code()
 
 if(FAISS_ENABLE_RAFT)
   list(APPEND FAISS_GPU_HEADERS
+          impl/RaftCagra.cuh
           impl/RaftIVFFlat.cuh
           impl/RaftFlatIndex.cuh)
   list(APPEND FAISS_GPU_SRC
+          impl/RaftCagra.cu
           impl/RaftFlatIndex.cu
-	  impl/RaftIVFFlat.cu)
+	        impl/RaftIVFFlat.cu)
 
   target_compile_definitions(faiss PUBLIC USE_NVIDIA_RAFT=1)
   target_compile_definitions(faiss_avx2 PUBLIC USE_NVIDIA_RAFT=1)
diff --git a/faiss/gpu/impl/RaftCagra.cu b/faiss/gpu/impl/RaftCagra.cu
new file mode 100644
index 0000000000..0520209b17
--- /dev/null
+++ b/faiss/gpu/impl/RaftCagra.cu
@@ -0,0 +1,134 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <faiss/gpu/utils/DeviceUtils.h>
+#include <faiss/gpu/impl/RaftCagra.cuh>
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+
+namespace faiss {
+namespace gpu {
+
+RaftCagra::RaftCagra(
+        GpuResources* resources,
+        int dim,
+        idx_t intermediate_graph_degree,
+        idx_t graph_degree,
+        faiss::cagra_build_algo graph_build_algo,
+        faiss::MetricType metric,
+        float metricArg)
+        : resources_(resources),
+          dim_(dim),
+          metric_(metric),
+          metricArg_(metricArg),
+          index_pams_() {
+    FAISS_THROW_IF_NOT_MSG(
+            metric == faiss::METRIC_L2,
+            "CAGRA currently only supports L2 metric.");
+
+    index_pams_.intermediate_graph_degree = intermediate_graph_degree;
+    index_pams_.graph_degree = graph_degree;
+    index_pams_.build_algo =
+            static_cast<raft::neighbors::cagra::graph_build_algo>(
+                    graph_build_algo);
+}
+
+void RaftCagra::train(idx_t n, const float* x) {
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+    if (getDeviceForAddress(x) >= 0) {
+        raft_knn_index = raft::neighbors::cagra::build<float, idx_t>(
+                raft_handle,
+                index_pams_,
+                raft::make_device_matrix_view<const float, idx_t>(x, n, dim_));
+    } else {
+        raft_knn_index = raft::neighbors::cagra::build<float, idx_t>(
+                raft_handle,
+                index_pams_,
+                raft::make_host_matrix_view<const float, idx_t>(x, n, dim_));
+    }
+}
+
+void RaftCagra::search(
+        Tensor<float, 2, true>& queries,
+        int k,
+        Tensor<float, 2, true>& outDistances,
+        Tensor<idx_t, 2, true>& outIndices,
+        idx_t max_queries,
+        idx_t itopk_size,
+        idx_t max_iterations,
+        faiss::cagra_search_algo graph_search_algo,
+        idx_t team_size,
+        idx_t search_width,
+        idx_t min_iterations,
+        idx_t thread_block_size,
+        faiss::cagra_hash_mode hash_mode,
+        idx_t hashmap_min_bitlen,
+        float hashmap_max_fill_rate,
+        idx_t num_random_samplings,
+        idx_t rand_xor_mask) {
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+    idx_t numQueries = queries.getSize(0);
+    idx_t cols = queries.getSize(1);
+    idx_t k_ = k;
+
+    FAISS_ASSERT(raft_knn_index.has_value());
+    FAISS_ASSERT(numQueries > 0);
+    FAISS_ASSERT(cols == dim_);
+
+    auto queries_view = raft::make_device_matrix_view<const float, idx_t>(
+            queries.data(), numQueries, cols);
+    auto distances_view = raft::make_device_matrix_view<float, idx_t>(
+            outDistances.data(), numQueries, k_);
+    auto indices_view = raft::make_device_matrix_view<idx_t, idx_t>(
+            outIndices.data(), numQueries, k_);
+
+    raft::neighbors::cagra::search_params search_pams;
+    search_pams.max_queries = max_queries;
+    search_pams.itopk_size = itopk_size;
+    search_pams.max_iterations = max_iterations;
+    search_pams.algo =
+            static_cast<raft::neighbors::cagra::search_algo>(graph_search_algo);
+    search_pams.team_size = team_size;
+    search_pams.search_width = search_width;
+    search_pams.min_iterations = min_iterations;
+    search_pams.thread_block_size = thread_block_size;
+    search_pams.hashmap_mode =
+            static_cast<raft::neighbors::cagra::hash_mode>(hash_mode);
+    search_pams.hashmap_min_bitlen = hashmap_min_bitlen;
+    search_pams.hashmap_max_fill_rate = hashmap_max_fill_rate;
+    search_pams.num_random_samplings = num_random_samplings;
+    search_pams.rand_xor_mask = rand_xor_mask;
+
+    raft::neighbors::cagra::search(
+            raft_handle,
+            search_pams,
+            raft_knn_index.value(),
+            queries_view,
+            indices_view,
+            distances_view);
+}
+
+} // namespace gpu
+} // namespace faiss
\ No newline at end of file
diff --git a/faiss/gpu/impl/RaftCagra.cuh b/faiss/gpu/impl/RaftCagra.cuh
new file mode 100644
index 0000000000..ccdffa28f0
--- /dev/null
+++ b/faiss/gpu/impl/RaftCagra.cuh
@@ -0,0 +1,98 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <faiss/gpu/GpuResources.h>
+#include <faiss/gpu/utils/Tensor.cuh>
+
+#include <faiss/MetricType.h>
+
+#include <raft/neighbors/cagra.cuh>
+
+namespace faiss {
+
+enum class cagra_build_algo { IVF_PQ, NN_DESCENT };
+/// Value-cast to raft::neighbors::cagra::search_algo in RaftCagra::search().
+enum class cagra_search_algo { SINGLE_CTA, MULTI_CTA, MULTI_KERNEL, AUTO };
+
+enum class cagra_hash_mode { HASH, SMALL, AUTO };
+
+namespace gpu {
+
+class RaftCagra {
+   public:
+    RaftCagra(
+            GpuResources* resources,
+            int dim,
+            idx_t intermediate_graph_degree,
+            idx_t graph_degree,
+            faiss::cagra_build_algo graph_build_algo,
+            faiss::MetricType metric,
+            float metricArg);
+
+    ~RaftCagra() = default;
+
+    void train(idx_t n, const float* x);
+
+    void search(
+            Tensor<float, 2, true>& queries,
+            int k,
+            Tensor<float, 2, true>& outDistances,
+            Tensor<idx_t, 2, true>& outIndices,
+            idx_t max_queries,
+            idx_t itopk_size,
+            idx_t max_iterations,
+            faiss::cagra_search_algo graph_search_algo,
+            idx_t team_size,
+            idx_t search_width,
+            idx_t min_iterations,
+            idx_t thread_block_size,
+            faiss::cagra_hash_mode hash_mode,
+            idx_t hashmap_min_bitlen,
+            float hashmap_max_fill_rate,
+            idx_t num_random_samplings,
+            idx_t rand_xor_mask);
+
+   private:
+    /// Collection of GPU resources that we use
+    GpuResources* resources_;
+
+    /// Expected dimensionality of the vectors
+    const int dim_;
+
+    /// Metric type of the index
+    faiss::MetricType metric_;
+
+    /// Metric arg
+    float metricArg_;
+
+    /// Parameters to build RAFT CAGRA index
+    raft::neighbors::cagra::index_params index_pams_;
+
+    /// Instance of trained RAFT CAGRA index
+    std::optional<raft::neighbors::cagra::index<float, idx_t>> raft_knn_index{
+            std::nullopt};
+};
+
+} // namespace gpu
+} // namespace faiss
\ No newline at end of file

From f21c1f1c89ffc947249aae58ffddf52e78b0ea6f Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Tue, 30 Jan 2024 15:11:43 -0800
Subject: [PATCH 002/148] add public API layer

---
 faiss/gpu/CMakeLists.txt     |   2 +
 faiss/gpu/GpuIndexCagra.cu   |  84 ++++++++++++++++++++++++
 faiss/gpu/GpuIndexCagra.h    | 124 +++++++++++++++++++++++++++++++++++
 faiss/gpu/impl/RaftCagra.cu  |   4 ++
 faiss/gpu/impl/RaftCagra.cuh |   4 +-
 5 files changed, 217 insertions(+), 1 deletion(-)
 create mode 100644 faiss/gpu/GpuIndexCagra.cu
 create mode 100644 faiss/gpu/GpuIndexCagra.h

diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
index 2cdc7e8a19..b76a3a0fb3 100644
--- a/faiss/gpu/CMakeLists.txt
+++ b/faiss/gpu/CMakeLists.txt
@@ -29,6 +29,7 @@ set(FAISS_GPU_SRC
   GpuIndexIVFFlat.cu
   GpuIndexIVFPQ.cu
   GpuIndexIVFScalarQuantizer.cu
+  GpuIndexCagra.cu
   GpuResources.cpp
   StandardGpuResources.cpp
   impl/BinaryDistance.cu
@@ -91,6 +92,7 @@ set(FAISS_GPU_HEADERS
   GpuFaissAssert.h
   GpuIndex.h
   GpuIndexBinaryFlat.h
+  GpuIndexCagra.h
   GpuIndexFlat.h
   GpuIndexIVF.h
   GpuIndexIVFFlat.h
diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
new file mode 100644
index 0000000000..814ab3e9d7
--- /dev/null
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -0,0 +1,84 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <faiss/gpu/GpuIndexCagra.h>
+#include <faiss/gpu/impl/RaftCagra.cuh>
+#include "GpuIndexCagra.h"
+
+namespace faiss {
+namespace gpu {
+
+GpuIndexCagra::GpuIndexCagra(
+        GpuResourcesProvider* provider,
+        int dims,
+        faiss::MetricType metric,
+        GpuIndexCagraConfig config)
+        : GpuIndex(provider->getResources(), dims, metric, 0.0f, config),
+          cagraConfig_(config) {}
+
+void GpuIndexCagra::train(idx_t n, const float* x) {
+    if (this->is_trained) {
+        FAISS_ASSERT(index_);
+        return;
+    }
+
+    FAISS_ASSERT(!index_);
+
+    index_ = std::make_shared<RaftCagra>(
+            this->resources_.get(),
+            this->d,
+            cagraConfig_.intermediate_graph_degree,
+            cagraConfig_.graph_degree,
+            static_cast<faiss::cagra_build_algo>(cagraConfig_.build_algo),
+            cagraConfig_.nn_descent_niter,
+            this->metric_type,
+            this->metric_arg);
+
+    index_->train(n, x);
+
+    this->is_trained = true;
+    this->ntotal += n;
+}
+
+void GpuIndexCagra::searchImpl_(
+        idx_t n,
+        const float* x,
+        int k,
+        float* distances,
+        idx_t* labels,
+        const SearchParameters* search_params) const {
+    FAISS_ASSERT(this->is_trained && index_);
+    FAISS_ASSERT(n > 0);
+
+    Tensor<float, 2, true> queries(const_cast<float*>(x), {n, this->d});
+    Tensor<float, 2, true> outDistances(distances, {n, k});
+    Tensor<idx_t, 2, true> outLabels(const_cast<idx_t*>(labels), {n, k});
+
+    auto params = dynamic_cast<const SearchParametersCagra*>(search_params);
+
+    index_->search(
+            queries,
+            k,
+            outDistances,
+            outLabels,
+            params->max_queries,
+            params->itopk_size,
+            params->max_iterations,
+            static_cast<faiss::cagra_search_algo>(params->algo),
+            params->team_size,
+            params->search_width,
+            params->min_iterations,
+            params->thread_block_size,
+            static_cast<faiss::cagra_hash_mode>(params->hashmap_mode),
+            params->hashmap_min_bitlen,
+            params->hashmap_max_fill_rate,
+            params->num_random_samplings,
+            params->rand_xor_mask);
+}
+
+} // namespace gpu
+} // namespace faiss
\ No newline at end of file
diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
new file mode 100644
index 0000000000..0ce5090674
--- /dev/null
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -0,0 +1,124 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <faiss/gpu/GpuIndex.h>
+
+namespace faiss {
+namespace gpu {
+
+class RaftCagra;
+
+enum class graph_build_algo {
+    /* Use IVF-PQ to build all-neighbors knn graph */
+    IVF_PQ,
+    /* Experimental, use NN-Descent to build all-neighbors knn graph */
+    NN_DESCENT
+};
+
+struct GpuIndexCagraConfig : public GpuIndexConfig {
+    /** Degree of input graph for pruning. */
+    size_t intermediate_graph_degree = 128;
+    /** Degree of output graph. */
+    size_t graph_degree = 64;
+    /** ANN algorithm to build knn graph. */
+    graph_build_algo build_algo = graph_build_algo::IVF_PQ;
+    /** Number of Iterations to run if building with NN_DESCENT */
+    size_t nn_descent_niter = 20;
+};
+
+enum class search_algo {
+    /** For large batch sizes. */
+    SINGLE_CTA,
+    /** For small batch sizes. */
+    MULTI_CTA,
+    MULTI_KERNEL,
+    AUTO
+};
+
+enum class hash_mode { HASH, SMALL, AUTO };
+
+struct SearchParametersCagra : SearchParameters {
+    /** Maximum number of queries to search at the same time (batch size). Auto
+     * select when 0.*/
+    size_t max_queries = 0;
+
+    /** Number of intermediate search results retained during the search.
+     *
+     *  This is the main knob to adjust trade off between accuracy and search
+     * speed. Higher values improve the search accuracy.
+     */
+    size_t itopk_size = 64;
+
+    /** Upper limit of search iterations. Auto select when 0.*/
+    size_t max_iterations = 0;
+
+    // In the following we list additional search parameters for fine tuning.
+    // Reasonable default values are automatically chosen.
+
+    /** Which search implementation to use. */
+    search_algo algo = search_algo::AUTO;
+
+    /** Number of threads used to calculate a single distance. 4, 8, 16, or 32.
+     */
+    size_t team_size = 0;
+
+    /** Number of graph nodes to select as the starting point for the search in
+     * each iteration (i.e. the search width). */
+    size_t search_width = 1;
+    /** Lower limit of search iterations. */
+    size_t min_iterations = 0;
+
+    /** Thread block size. 0, 64, 128, 256, 512, 1024. Auto selection when 0. */
+    size_t thread_block_size = 0;
+    /** Hashmap type. Auto selection when AUTO. */
+    hash_mode hashmap_mode = hash_mode::AUTO;
+    /** Lower limit of hashmap bit length. More than 8. */
+    size_t hashmap_min_bitlen = 0;
+    /** Upper limit of hashmap fill rate. More than 0.1, less than 0.9.*/
+    float hashmap_max_fill_rate = 0.5;
+
+    /** Number of iterations of initial random seed node selection. 1 or more.
+     */
+    uint32_t num_random_samplings = 1;
+    /** Bit mask used for initial random seed node selection. */
+    uint64_t rand_xor_mask = 0x128394;
+};
+
+struct GpuIndexCagra : public GpuIndex {
+   public:
+    GpuIndexCagra(
+            GpuResourcesProvider* provider,
+            int dims,
+            faiss::MetricType metric = faiss::METRIC_L2,
+            GpuIndexCagraConfig config = GpuIndexCagraConfig());
+
+    ~GpuIndexCagra();
+
+    /// Trains CAGRA based on the given vector data
+    void train(idx_t n, const float* x) override;
+
+   protected:
+    /// Called from GpuIndex for search
+    void searchImpl_(
+            idx_t n,
+            const float* x,
+            int k,
+            float* distances,
+            idx_t* labels,
+            const SearchParameters* search_params) const override;
+
+    /// Our configuration options
+    const GpuIndexCagraConfig cagraConfig_;
+
+    /// Instance that we own; wraps the RAFT CAGRA index
+    std::shared_ptr<RaftCagra> index_;
+};
+
+} // namespace gpu
+} // namespace faiss
\ No newline at end of file
diff --git a/faiss/gpu/impl/RaftCagra.cu b/faiss/gpu/impl/RaftCagra.cu
index 0520209b17..972c95c576 100644
--- a/faiss/gpu/impl/RaftCagra.cu
+++ b/faiss/gpu/impl/RaftCagra.cu
@@ -21,10 +21,12 @@
  */
 
 #include <faiss/gpu/utils/DeviceUtils.h>
+#include <cstddef>
 #include <faiss/gpu/impl/RaftCagra.cuh>
 
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/device_resources.hpp>
+#include <raft/neighbors/cagra.cuh>
 
 namespace faiss {
 namespace gpu {
@@ -35,6 +37,7 @@ RaftCagra::RaftCagra(
         idx_t intermediate_graph_degree,
         idx_t graph_degree,
         faiss::cagra_build_algo graph_build_algo,
+        size_t nn_descent_niter,
         faiss::MetricType metric,
         float metricArg)
         : resources_(resources),
@@ -51,6 +54,7 @@ RaftCagra::RaftCagra(
     index_pams_.build_algo =
             static_cast<raft::neighbors::cagra::graph_build_algo>(
                     graph_build_algo);
+    index_pams_.nn_descent_niter = nn_descent_niter;
 }
 
 void RaftCagra::train(idx_t n, const float* x) {
diff --git a/faiss/gpu/impl/RaftCagra.cuh b/faiss/gpu/impl/RaftCagra.cuh
index ccdffa28f0..cbe9fde857 100644
--- a/faiss/gpu/impl/RaftCagra.cuh
+++ b/faiss/gpu/impl/RaftCagra.cuh
@@ -23,11 +23,12 @@
 #pragma once
 
 #include <faiss/gpu/GpuResources.h>
+#include <cstddef>
 #include <faiss/gpu/utils/Tensor.cuh>
 
 #include <faiss/MetricType.h>
 
-#include <raft/neighbors/cagra.cuh>
+#include <raft/neighbors/cagra_types.hpp>
 
 namespace faiss {
 
@@ -47,6 +48,7 @@ class RaftCagra {
             idx_t intermediate_graph_degree,
             idx_t graph_degree,
             faiss::cagra_build_algo graph_build_algo,
+            size_t nn_descent_niter,
             faiss::MetricType metric,
             float metricArg);
 

From 656f493f45b702121bc658d5fb10bb628bb9668c Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 31 Jan 2024 18:49:34 -0800
Subject: [PATCH 003/148] write tests, figure out a way to compare

---
 faiss/gpu/GpuIndexCagra.cu           |  36 ++++++++-
 faiss/gpu/GpuIndexCagra.h            |  24 +++++-
 faiss/gpu/impl/RaftCagra.cu          |  10 ++-
 faiss/gpu/impl/RaftCagra.cuh         |   8 +-
 faiss/gpu/test/CMakeLists.txt        |   4 +-
 faiss/gpu/test/TestGpuIndexCagra.cpp | 115 +++++++++++++++++++++++++++
 6 files changed, 184 insertions(+), 13 deletions(-)
 create mode 100644 faiss/gpu/test/TestGpuIndexCagra.cpp

diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index 814ab3e9d7..db266ddbed 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -4,6 +4,21 @@
  * This source code is licensed under the MIT license found in the
  * LICENSE file in the root directory of this source tree.
  */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 
 #include <faiss/gpu/GpuIndexCagra.h>
 #include <faiss/gpu/impl/RaftCagra.cuh>
@@ -18,7 +33,9 @@ GpuIndexCagra::GpuIndexCagra(
         faiss::MetricType metric,
         GpuIndexCagraConfig config)
         : GpuIndex(provider->getResources(), dims, metric, 0.0f, config),
-          cagraConfig_(config) {}
+          cagraConfig_(config) {
+            this->is_trained = false;
+          }
 
 void GpuIndexCagra::train(idx_t n, const float* x) {
     if (this->is_trained) {
@@ -36,7 +53,8 @@ void GpuIndexCagra::train(idx_t n, const float* x) {
             static_cast<faiss::cagra_build_algo>(cagraConfig_.build_algo),
             cagraConfig_.nn_descent_niter,
             this->metric_type,
-            this->metric_arg);
+            this->metric_arg,
+            faiss::gpu::INDICES_64_BIT);
 
     index_->train(n, x);
 
@@ -58,7 +76,13 @@ void GpuIndexCagra::searchImpl_(
     Tensor<float, 2, true> outDistances(distances, {n, k});
     Tensor<idx_t, 2, true> outLabels(const_cast<idx_t*>(labels), {n, k});
 
-    auto params = dynamic_cast<const SearchParametersCagra*>(search_params);
+    // Default parameters live on the stack: no new/delete pair to leak when
+    // the search throws. A non-CAGRA parameter object is rejected, not deref'd.
+    SearchParametersCagra defaultParams;
+    auto params = search_params
+            ? dynamic_cast<const SearchParametersCagra*>(search_params)
+            : &defaultParams;
+    FAISS_THROW_IF_NOT_MSG(params, "search_params must be SearchParametersCagra");
 
     index_->search(
             queries,
@@ -78,7 +102,7 @@ void GpuIndexCagra::searchImpl_(
             params->hashmap_max_fill_rate,
             params->num_random_samplings,
             params->rand_xor_mask);
 }
 
 } // namespace gpu
-} // namespace faiss
\ No newline at end of file
+} // namespace faiss
diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
index 0ce5090674..c17183635f 100644
--- a/faiss/gpu/GpuIndexCagra.h
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -4,6 +4,21 @@
  * This source code is licensed under the MIT license found in the
  * LICENSE file in the root directory of this source tree.
  */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 
 #pragma once
 
@@ -98,12 +113,17 @@ struct GpuIndexCagra : public GpuIndex {
             faiss::MetricType metric = faiss::METRIC_L2,
             GpuIndexCagraConfig config = GpuIndexCagraConfig());
 
-    ~GpuIndexCagra();
+    ~GpuIndexCagra() {}
 
     /// Trains CAGRA based on the given vector data
     void train(idx_t n, const float* x) override;
 
+    void reset() {}
+
    protected:
+   /// Stubs: vectors are ingested only by train(), so ids are never needed.
+   bool addImplRequiresIDs_() const { return false; }
+   void addImpl_(idx_t n, const float* x, const idx_t* ids) {}
     /// Called from GpuIndex for search
     void searchImpl_(
             idx_t n,
@@ -121,4 +141,4 @@ struct GpuIndexCagra : public GpuIndex {
 };
 
 } // namespace gpu
-} // namespace faiss
\ No newline at end of file
+} // namespace faiss
diff --git a/faiss/gpu/impl/RaftCagra.cu b/faiss/gpu/impl/RaftCagra.cu
index 972c95c576..6253213fde 100644
--- a/faiss/gpu/impl/RaftCagra.cu
+++ b/faiss/gpu/impl/RaftCagra.cu
@@ -5,7 +5,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -39,7 +39,8 @@ RaftCagra::RaftCagra(
         faiss::cagra_build_algo graph_build_algo,
         size_t nn_descent_niter,
         faiss::MetricType metric,
-        float metricArg)
+        float metricArg,
+        IndicesOptions indicesOptions)
         : resources_(resources),
           dim_(dim),
           metric_(metric),
@@ -48,6 +49,9 @@ RaftCagra::RaftCagra(
     FAISS_THROW_IF_NOT_MSG(
             metric == faiss::METRIC_L2,
             "CAGRA currently only supports L2 metric.");
+    FAISS_THROW_IF_NOT_MSG(
+            indicesOptions == faiss::gpu::INDICES_64_BIT,
+            "only INDICES_64_BIT is supported for RAFT CAGRA index");
 
     index_pams_.intermediate_graph_degree = intermediate_graph_degree;
     index_pams_.graph_degree = graph_degree;
@@ -135,4 +139,4 @@ void RaftCagra::search(
 }
 
 } // namespace gpu
-} // namespace faiss
\ No newline at end of file
+} // namespace faiss
diff --git a/faiss/gpu/impl/RaftCagra.cuh b/faiss/gpu/impl/RaftCagra.cuh
index cbe9fde857..f92d04e38d 100644
--- a/faiss/gpu/impl/RaftCagra.cuh
+++ b/faiss/gpu/impl/RaftCagra.cuh
@@ -5,7 +5,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 #include <faiss/gpu/GpuResources.h>
 #include <cstddef>
 #include <faiss/gpu/utils/Tensor.cuh>
+#include <faiss/gpu/GpuIndicesOptions.h>
 
 #include <faiss/MetricType.h>
 
@@ -50,7 +51,8 @@ class RaftCagra {
             faiss::cagra_build_algo graph_build_algo,
             size_t nn_descent_niter,
             faiss::MetricType metric,
-            float metricArg);
+            float metricArg,
+            IndicesOptions indicesOptions);
 
     ~RaftCagra() = default;
 
@@ -97,4 +99,4 @@ class RaftCagra {
 };
 
 } // namespace gpu
-} // namespace faiss
\ No newline at end of file
+} // namespace faiss
diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt
index 9300deead9..4b654f534d 100644
--- a/faiss/gpu/test/CMakeLists.txt
+++ b/faiss/gpu/test/CMakeLists.txt
@@ -21,7 +21,6 @@ find_package(CUDAToolkit REQUIRED)
 
 # Defines `gtest_discover_tests()`.
 include(GoogleTest)
-
 add_library(faiss_gpu_test_helper TestUtils.cpp)
 target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled>)
 
@@ -42,6 +41,9 @@ faiss_gpu_test(TestGpuIndexIVFPQ.cpp)
 faiss_gpu_test(TestGpuIndexIVFScalarQuantizer.cpp)
 faiss_gpu_test(TestGpuDistance.cu)
 faiss_gpu_test(TestGpuSelect.cu)
+if(FAISS_ENABLE_RAFT)
+  faiss_gpu_test(TestGpuIndexCagra.cpp)
+endif()
 
 add_executable(demo_ivfpq_indexing_gpu EXCLUDE_FROM_ALL
   demo_ivfpq_indexing_gpu.cpp)
diff --git a/faiss/gpu/test/TestGpuIndexCagra.cpp b/faiss/gpu/test/TestGpuIndexCagra.cpp
new file mode 100644
index 0000000000..1179d3a3cb
--- /dev/null
+++ b/faiss/gpu/test/TestGpuIndexCagra.cpp
@@ -0,0 +1,115 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <faiss/IndexHNSW.h>
+#include <faiss/gpu/GpuIndexCagra.h>
+#include <faiss/gpu/StandardGpuResources.h>
+#include <faiss/gpu/test/TestUtils.h>
+#include <cstddef>
+#include "faiss/MetricType.h"
+
+struct Options {
+    Options() {
+        numTrain = 2 * faiss::gpu::randVal(2000, 5000);
+        dim = faiss::gpu::randVal(64, 200);
+
+        graphDegree = faiss::gpu::randSelect({16, 32});
+        intermediateGraphDegree = faiss::gpu::randSelect({32, 64});
+        buildAlgo = faiss::gpu::randSelect(
+                {faiss::gpu::graph_build_algo::IVF_PQ, faiss::gpu::graph_build_algo::NN_DESCENT});
+
+        numQuery = faiss::gpu::randVal(32, 100);
+        k = faiss::gpu::randVal(10, 30);
+
+        device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
+    }
+
+    /// One-line summary of the sampled options, passed to compareIndices().
+    std::string toString() const {
+        std::stringstream str;
+        str << "CAGRA device " << device << " numVecs " << numTrain << " dim "
+            << dim << " graphDegree " << graphDegree << " intermediateGraphDegree " << intermediateGraphDegree
+            << " buildAlgo " << static_cast<int>(buildAlgo)
+            << " numQuery " << numQuery << " k " << k;
+        return str.str();
+    }
+
+    int numTrain;
+    int dim;
+    size_t graphDegree;
+    size_t intermediateGraphDegree;
+    faiss::gpu::graph_build_algo buildAlgo;
+    int numQuery;
+    int k;
+    int device;
+};
+
+void queryTest() {
+    for (int tries = 0; tries < 2; ++tries) {
+        Options opt;
+
+        std::vector<float> trainVecs =
+                faiss::gpu::randVecs(opt.numTrain, opt.dim);
+
+        faiss::IndexHNSWFlat cpuIndex(
+            opt.dim, opt.graphDegree / 2);
+        cpuIndex.train(opt.numTrain, trainVecs.data());
+        cpuIndex.add(opt.numTrain, trainVecs.data());
+
+        faiss::gpu::StandardGpuResources res;
+        res.noTempMemory();
+
+        faiss::gpu::GpuIndexCagraConfig config;
+        config.device = opt.device;
+        config.graph_degree = opt.graphDegree;
+        config.intermediate_graph_degree = opt.intermediateGraphDegree;
+        config.build_algo = opt.buildAlgo;
+
+        faiss::gpu::GpuIndexCagra gpuIndex(
+                &res, cpuIndex.d, faiss::METRIC_L2, config);
+        gpuIndex.train(opt.numTrain, trainVecs.data());
+
+        faiss::gpu::compareIndices(
+                cpuIndex,
+                gpuIndex,
+                opt.numQuery,
+                opt.dim,
+                opt.k,
+                opt.toString(),
+                0.15f,
+                1.0f,
+                0.15f);
+    }
+}
+
+TEST(TestGpuIndexCagra, Float32_Query_L2) {
+    queryTest();
+}
+
+int main(int argc, char** argv) {
+    testing::InitGoogleTest(&argc, argv);
+
+    // just run with a fixed test seed
+    faiss::gpu::setTestSeed(100);
+
+    return RUN_ALL_TESTS();
+}

From ed32954e13b754e56c9d040f28fa91282b4d8835 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 7 Feb 2024 14:01:41 -0800
Subject: [PATCH 004/148] passing tests

---
 faiss/gpu/test/CMakeLists.txt       |   2 +-
 faiss/gpu/test/TestGpuIndexCagra.cu | 159 ++++++++++++++++++++++++++++
 2 files changed, 160 insertions(+), 1 deletion(-)
 create mode 100644 faiss/gpu/test/TestGpuIndexCagra.cu

diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt
index 4b654f534d..60f78ef74f 100644
--- a/faiss/gpu/test/CMakeLists.txt
+++ b/faiss/gpu/test/CMakeLists.txt
@@ -42,7 +42,7 @@ faiss_gpu_test(TestGpuIndexIVFScalarQuantizer.cpp)
 faiss_gpu_test(TestGpuDistance.cu)
 faiss_gpu_test(TestGpuSelect.cu)
 if(FAISS_ENABLE_RAFT)
-  faiss_gpu_test(TestGpuIndexCagra.cpp)
+  faiss_gpu_test(TestGpuIndexCagra.cu)
 endif()
 
 add_executable(demo_ivfpq_indexing_gpu EXCLUDE_FROM_ALL
diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu
new file mode 100644
index 0000000000..90215c07f3
--- /dev/null
+++ b/faiss/gpu/test/TestGpuIndexCagra.cu
@@ -0,0 +1,159 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <faiss/IndexHNSW.h>
+#include <faiss/gpu/GpuIndexCagra.h>
+#include <faiss/gpu/GpuResources.h>
+#include <faiss/gpu/StandardGpuResources.h>
+#include <faiss/gpu/test/TestUtils.h>
+#include <faiss/gpu/utils/CopyUtils.cuh>
+#include <faiss/gpu/utils/DeviceTensor.cuh>
+#include <cstddef>
+#include <faiss/MetricType.h>
+#include <optional>
+#include <vector>
+
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/stats/neighborhood_recall.cuh>
+
+struct Options {
+    Options() {
+        numTrain = 2 * faiss::gpu::randVal(2000, 5000);
+        dim = faiss::gpu::randVal(4, 10);
+
+        graphDegree = faiss::gpu::randSelect({32, 64});
+        intermediateGraphDegree = faiss::gpu::randSelect({64, 98});
+        buildAlgo = faiss::gpu::randSelect(
+                {faiss::gpu::graph_build_algo::IVF_PQ, faiss::gpu::graph_build_algo::NN_DESCENT});
+
+        numQuery = faiss::gpu::randVal(32, 100);
+        k = faiss::gpu::randVal(10, 30);
+
+        device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
+    }
+
+    std::string toString() const {
+        std::stringstream str;
+        str << "CAGRA device " << device << " numVecs " << numTrain << " dim "
+            << dim << " graphDegree " << graphDegree << " intermediateGraphDegree " << intermediateGraphDegree
+            << "buildAlgo " << static_cast<int>(buildAlgo)
+            << " numQuery " << numQuery << " k " << k;
+
+        return str.str();
+    }
+
+    int numTrain;
+    int dim;
+    size_t graphDegree;
+    size_t intermediateGraphDegree;
+    faiss::gpu::graph_build_algo buildAlgo;
+    int numQuery;
+    int k;
+    int device;
+};
+
+void queryTest() {
+    for (int tries = 0; tries < 2; ++tries) {
+        Options opt;
+
+        std::vector<float> trainVecs =
+                faiss::gpu::randVecs(opt.numTrain, opt.dim);
+
+        faiss::IndexHNSWFlat cpuIndex(
+            opt.dim, opt.graphDegree / 2);
+        cpuIndex.hnsw.efConstruction = opt.k * 2;
+        cpuIndex.train(opt.numTrain, trainVecs.data());
+        cpuIndex.add(opt.numTrain, trainVecs.data());
+
+        faiss::gpu::StandardGpuResources res;
+        res.noTempMemory();
+
+        faiss::gpu::GpuIndexCagraConfig config;
+        config.device = opt.device;
+        config.graph_degree = opt.graphDegree;
+        config.intermediate_graph_degree = opt.intermediateGraphDegree;
+        config.build_algo = opt.buildAlgo;
+
+        faiss::gpu::GpuIndexCagra gpuIndex(
+                &res, cpuIndex.d, faiss::METRIC_L2, config);
+        gpuIndex.train(opt.numTrain, trainVecs.data());
+
+        auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim);
+
+        std::vector<float> refDistance(opt.numQuery * opt.k, 0);
+        std::vector<faiss::idx_t> refIndices(opt.numQuery * opt.k, -1);
+        faiss::SearchParametersHNSW cpuSearchParams;
+        cpuSearchParams.efSearch = opt.k * 2;
+        cpuIndex.search(
+                opt.numQuery,
+                queryVecs.data(),
+                opt.k,
+                refDistance.data(),
+                refIndices.data(),
+                &cpuSearchParams);
+
+        auto gpuRes = res.getResources();
+        auto devAlloc = faiss::gpu::makeDevAlloc(faiss::gpu::AllocType::FlatData, gpuRes->getDefaultStreamCurrentDevice());
+        faiss::gpu::DeviceTensor<float, 2, true> testDistance(gpuRes.get(), devAlloc, {opt.numQuery, opt.k});
+        faiss::gpu::DeviceTensor<faiss::idx_t, 2, true> testIndices(gpuRes.get(), devAlloc, {opt.numQuery, opt.k});
+        gpuIndex.search(
+                opt.numQuery,
+                queryVecs.data(),
+                opt.k,
+                testDistance.data(),
+                testIndices.data());
+
+        auto refDistanceDev = faiss::gpu::toDeviceTemporary(gpuRes.get(), refDistance, gpuRes->getDefaultStreamCurrentDevice());
+        auto refIndicesDev = faiss::gpu::toDeviceTemporary(gpuRes.get(), refIndices, gpuRes->getDefaultStreamCurrentDevice());
+
+        auto raft_handle = gpuRes->getRaftHandleCurrentDevice();
+
+        auto ref_dis_mds = raft::make_device_matrix_view<const float, int>(refDistanceDev.data(), opt.numQuery, opt.k);
+        auto ref_dis_mds_opt = std::optional<raft::device_matrix_view<const float, int>>(ref_dis_mds);
+        auto ref_ind_mds = raft::make_device_matrix_view<const faiss::idx_t, int>(refIndicesDev.data(), opt.numQuery, opt.k);
+
+        auto test_dis_mds = raft::make_device_matrix_view<const float, int>(testDistance.data(), opt.numQuery, opt.k);
+                auto test_dis_mds_opt = std::optional<raft::device_matrix_view<const float, int>>(test_dis_mds);
+
+        auto test_ind_mds = raft::make_device_matrix_view<const faiss::idx_t, int>(testIndices.data(), opt.numQuery, opt.k);
+
+        double scalar_init = 0;
+        auto recall_score = raft::make_host_scalar(scalar_init);
+
+        raft::stats::neighborhood_recall(raft_handle, test_ind_mds, ref_ind_mds, recall_score.view(), test_dis_mds_opt, ref_dis_mds_opt);
+        ASSERT_TRUE(*recall_score.data_handle() > 0.98);
+
+    }
+}
+
+TEST(TestGpuIndexCagra, Float32_Query_L2) {
+    queryTest();
+}
+
+int main(int argc, char** argv) {
+    testing::InitGoogleTest(&argc, argv);
+
+    // just run with a fixed test seed
+    faiss::gpu::setTestSeed(100);
+
+    return RUN_ALL_TESTS();
+}

From 42ca86227937d2e5f3add800c745f8483f939dec Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 7 Feb 2024 14:04:14 -0800
Subject: [PATCH 005/148] remove cpp test file

---
 faiss/gpu/test/TestGpuIndexCagra.cpp | 115 ---------------------------
 1 file changed, 115 deletions(-)
 delete mode 100644 faiss/gpu/test/TestGpuIndexCagra.cpp

diff --git a/faiss/gpu/test/TestGpuIndexCagra.cpp b/faiss/gpu/test/TestGpuIndexCagra.cpp
deleted file mode 100644
index 1179d3a3cb..0000000000
--- a/faiss/gpu/test/TestGpuIndexCagra.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-/**
- * Copyright (c) Facebook, Inc. and its affiliates.
- *
- * This source code is licensed under the MIT license found in the
- * LICENSE file in the root directory of this source tree.
- */
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <faiss/IndexHNSW.h>
-#include <faiss/gpu/GpuIndexCagra.h>
-#include <faiss/gpu/StandardGpuResources.h>
-#include <faiss/gpu/test/TestUtils.h>
-#include <cstddef>
-#include "faiss/MetricType.h"
-
-struct Options {
-    Options() {
-        numTrain = 2 * faiss::gpu::randVal(2000, 5000);
-        dim = faiss::gpu::randVal(64, 200);
-
-        graphDegree = faiss::gpu::randSelect({16, 32});
-        intermediateGraphDegree = faiss::gpu::randSelect({32, 64});
-        buildAlgo = faiss::gpu::randSelect(
-                {faiss::gpu::graph_build_algo::IVF_PQ, faiss::gpu::graph_build_algo::NN_DESCENT});
-
-        numQuery = faiss::gpu::randVal(32, 100);
-        k = faiss::gpu::randVal(10, 30);
-
-        device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
-    }
-
-    std::string toString() const {
-        std::stringstream str;
-        str << "CAGRA device " << device << " numVecs " << numTrain << " dim "
-            << dim << " graphDegree " << graphDegree << " intermediateGraphDegree " << intermediateGraphDegree
-            << "buildAlgo " << static_cast<int>(buildAlgo)
-            << " numQuery " << numQuery << " k " << k;
-
-        return str.str();
-    }
-
-    int numTrain;
-    int dim;
-    size_t graphDegree;
-    size_t intermediateGraphDegree;
-    faiss::gpu::graph_build_algo buildAlgo;
-    int numQuery;
-    int k;
-    int device;
-};
-
-void queryTest() {
-    for (int tries = 0; tries < 2; ++tries) {
-        Options opt;
-
-        std::vector<float> trainVecs =
-                faiss::gpu::randVecs(opt.numTrain, opt.dim);
-
-        faiss::IndexHNSWFlat cpuIndex(
-            opt.dim, opt.graphDegree / 2);
-        cpuIndex.train(opt.numTrain, trainVecs.data());
-        cpuIndex.add(opt.numTrain, trainVecs.data());
-
-        faiss::gpu::StandardGpuResources res;
-        res.noTempMemory();
-
-        faiss::gpu::GpuIndexCagraConfig config;
-        config.device = opt.device;
-        config.graph_degree = opt.graphDegree;
-        config.intermediate_graph_degree = opt.intermediateGraphDegree;
-        config.build_algo = opt.buildAlgo;
-
-        faiss::gpu::GpuIndexCagra gpuIndex(
-                &res, cpuIndex.d, faiss::METRIC_L2, config);
-        gpuIndex.train(opt.numTrain, trainVecs.data());
-
-        faiss::gpu::compareIndices(
-                cpuIndex,
-                gpuIndex,
-                opt.numQuery,
-                opt.dim,
-                opt.k,
-                opt.toString(),
-                0.15f,
-                1.0f,
-                0.15f);
-    }
-}
-
-TEST(TestGpuIndexCagra, Float32_Query_L2) {
-    queryTest();
-}
-
-int main(int argc, char** argv) {
-    testing::InitGoogleTest(&argc, argv);
-
-    // just run with a fixed test seed
-    faiss::gpu::setTestSeed(100);
-
-    return RUN_ALL_TESTS();
-}

From 2c9e965d8ae88bd06b898afde4e1e056555698eb Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 7 Feb 2024 14:24:20 -0800
Subject: [PATCH 006/148] style check

---
 faiss/gpu/GpuIndexCagra.cu          | 10 ++--
 faiss/gpu/GpuIndexCagra.h           |  4 +-
 faiss/gpu/impl/RaftCagra.cuh        |  2 +-
 faiss/gpu/test/TestGpuIndexCagra.cu | 72 ++++++++++++++++++++---------
 4 files changed, 57 insertions(+), 31 deletions(-)

diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index db266ddbed..b0a60268d3 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -34,8 +34,8 @@ GpuIndexCagra::GpuIndexCagra(
         GpuIndexCagraConfig config)
         : GpuIndex(provider->getResources(), dims, metric, 0.0f, config),
           cagraConfig_(config) {
-            this->is_trained = false;
-          }
+    this->is_trained = false;
+}
 
 void GpuIndexCagra::train(idx_t n, const float* x) {
     if (this->is_trained) {
@@ -78,9 +78,9 @@ void GpuIndexCagra::searchImpl_(
 
     SearchParametersCagra* params;
     if (search_params) {
-        params = dynamic_cast<SearchParametersCagra*>(const_cast<SearchParameters*>(search_params));
-    }
-    else {
+        params = dynamic_cast<SearchParametersCagra*>(
+                const_cast<SearchParameters*>(search_params));
+    } else {
         params = new SearchParametersCagra{};
     }
 
diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
index c17183635f..a812ebafee 100644
--- a/faiss/gpu/GpuIndexCagra.h
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -121,9 +121,9 @@ struct GpuIndexCagra : public GpuIndex {
     void reset() {}
 
    protected:
-   bool addImplRequiresIDs_() const {}
+    bool addImplRequiresIDs_() const {}
 
-   void addImpl_(idx_t n, const float* x, const idx_t* ids) {}
+    void addImpl_(idx_t n, const float* x, const idx_t* ids) {}
     /// Called from GpuIndex for search
     void searchImpl_(
             idx_t n,
diff --git a/faiss/gpu/impl/RaftCagra.cuh b/faiss/gpu/impl/RaftCagra.cuh
index f92d04e38d..5783cbf706 100644
--- a/faiss/gpu/impl/RaftCagra.cuh
+++ b/faiss/gpu/impl/RaftCagra.cuh
@@ -22,10 +22,10 @@
 
 #pragma once
 
+#include <faiss/gpu/GpuIndicesOptions.h>
 #include <faiss/gpu/GpuResources.h>
 #include <cstddef>
 #include <faiss/gpu/utils/Tensor.cuh>
-#include <faiss/gpu/GpuIndicesOptions.h>
 
 #include <faiss/MetricType.h>
 
diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu
index 90215c07f3..3a99ba35f0 100644
--- a/faiss/gpu/test/TestGpuIndexCagra.cu
+++ b/faiss/gpu/test/TestGpuIndexCagra.cu
@@ -21,14 +21,14 @@
  */
 
 #include <faiss/IndexHNSW.h>
+#include <faiss/MetricType.h>
 #include <faiss/gpu/GpuIndexCagra.h>
 #include <faiss/gpu/GpuResources.h>
 #include <faiss/gpu/StandardGpuResources.h>
 #include <faiss/gpu/test/TestUtils.h>
+#include <cstddef>
 #include <faiss/gpu/utils/CopyUtils.cuh>
 #include <faiss/gpu/utils/DeviceTensor.cuh>
-#include <cstddef>
-#include <faiss/MetricType.h>
 #include <optional>
 #include <vector>
 
@@ -43,7 +43,8 @@ struct Options {
         graphDegree = faiss::gpu::randSelect({32, 64});
         intermediateGraphDegree = faiss::gpu::randSelect({64, 98});
         buildAlgo = faiss::gpu::randSelect(
-                {faiss::gpu::graph_build_algo::IVF_PQ, faiss::gpu::graph_build_algo::NN_DESCENT});
+                {faiss::gpu::graph_build_algo::IVF_PQ,
+                 faiss::gpu::graph_build_algo::NN_DESCENT});
 
         numQuery = faiss::gpu::randVal(32, 100);
         k = faiss::gpu::randVal(10, 30);
@@ -54,9 +55,10 @@ struct Options {
     std::string toString() const {
         std::stringstream str;
         str << "CAGRA device " << device << " numVecs " << numTrain << " dim "
-            << dim << " graphDegree " << graphDegree << " intermediateGraphDegree " << intermediateGraphDegree
-            << "buildAlgo " << static_cast<int>(buildAlgo)
-            << " numQuery " << numQuery << " k " << k;
+            << dim << " graphDegree " << graphDegree
+            << " intermediateGraphDegree " << intermediateGraphDegree
+            << "buildAlgo " << static_cast<int>(buildAlgo) << " numQuery "
+            << numQuery << " k " << k;
 
         return str.str();
     }
@@ -78,8 +80,7 @@ void queryTest() {
         std::vector<float> trainVecs =
                 faiss::gpu::randVecs(opt.numTrain, opt.dim);
 
-        faiss::IndexHNSWFlat cpuIndex(
-            opt.dim, opt.graphDegree / 2);
+        faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2);
         cpuIndex.hnsw.efConstruction = opt.k * 2;
         cpuIndex.train(opt.numTrain, trainVecs.data());
         cpuIndex.add(opt.numTrain, trainVecs.data());
@@ -112,9 +113,13 @@ void queryTest() {
                 &cpuSearchParams);
 
         auto gpuRes = res.getResources();
-        auto devAlloc = faiss::gpu::makeDevAlloc(faiss::gpu::AllocType::FlatData, gpuRes->getDefaultStreamCurrentDevice());
-        faiss::gpu::DeviceTensor<float, 2, true> testDistance(gpuRes.get(), devAlloc, {opt.numQuery, opt.k});
-        faiss::gpu::DeviceTensor<faiss::idx_t, 2, true> testIndices(gpuRes.get(), devAlloc, {opt.numQuery, opt.k});
+        auto devAlloc = faiss::gpu::makeDevAlloc(
+                faiss::gpu::AllocType::FlatData,
+                gpuRes->getDefaultStreamCurrentDevice());
+        faiss::gpu::DeviceTensor<float, 2, true> testDistance(
+                gpuRes.get(), devAlloc, {opt.numQuery, opt.k});
+        faiss::gpu::DeviceTensor<faiss::idx_t, 2, true> testIndices(
+                gpuRes.get(), devAlloc, {opt.numQuery, opt.k});
         gpuIndex.search(
                 opt.numQuery,
                 queryVecs.data(),
@@ -122,26 +127,47 @@ void queryTest() {
                 testDistance.data(),
                 testIndices.data());
 
-        auto refDistanceDev = faiss::gpu::toDeviceTemporary(gpuRes.get(), refDistance, gpuRes->getDefaultStreamCurrentDevice());
-        auto refIndicesDev = faiss::gpu::toDeviceTemporary(gpuRes.get(), refIndices, gpuRes->getDefaultStreamCurrentDevice());
+        auto refDistanceDev = faiss::gpu::toDeviceTemporary(
+                gpuRes.get(),
+                refDistance,
+                gpuRes->getDefaultStreamCurrentDevice());
+        auto refIndicesDev = faiss::gpu::toDeviceTemporary(
+                gpuRes.get(),
+                refIndices,
+                gpuRes->getDefaultStreamCurrentDevice());
 
         auto raft_handle = gpuRes->getRaftHandleCurrentDevice();
 
-        auto ref_dis_mds = raft::make_device_matrix_view<const float, int>(refDistanceDev.data(), opt.numQuery, opt.k);
-        auto ref_dis_mds_opt = std::optional<raft::device_matrix_view<const float, int>>(ref_dis_mds);
-        auto ref_ind_mds = raft::make_device_matrix_view<const faiss::idx_t, int>(refIndicesDev.data(), opt.numQuery, opt.k);
-
-        auto test_dis_mds = raft::make_device_matrix_view<const float, int>(testDistance.data(), opt.numQuery, opt.k);
-                auto test_dis_mds_opt = std::optional<raft::device_matrix_view<const float, int>>(test_dis_mds);
-
-        auto test_ind_mds = raft::make_device_matrix_view<const faiss::idx_t, int>(testIndices.data(), opt.numQuery, opt.k);
+        auto ref_dis_mds = raft::make_device_matrix_view<const float, int>(
+                refDistanceDev.data(), opt.numQuery, opt.k);
+        auto ref_dis_mds_opt =
+                std::optional<raft::device_matrix_view<const float, int>>(
+                        ref_dis_mds);
+        auto ref_ind_mds =
+                raft::make_device_matrix_view<const faiss::idx_t, int>(
+                        refIndicesDev.data(), opt.numQuery, opt.k);
+
+        auto test_dis_mds = raft::make_device_matrix_view<const float, int>(
+                testDistance.data(), opt.numQuery, opt.k);
+        auto test_dis_mds_opt =
+                std::optional<raft::device_matrix_view<const float, int>>(
+                        test_dis_mds);
+
+        auto test_ind_mds =
+                raft::make_device_matrix_view<const faiss::idx_t, int>(
+                        testIndices.data(), opt.numQuery, opt.k);
 
         double scalar_init = 0;
         auto recall_score = raft::make_host_scalar(scalar_init);
 
-        raft::stats::neighborhood_recall(raft_handle, test_ind_mds, ref_ind_mds, recall_score.view(), test_dis_mds_opt, ref_dis_mds_opt);
+        raft::stats::neighborhood_recall(
+                raft_handle,
+                test_ind_mds,
+                ref_ind_mds,
+                recall_score.view(),
+                test_dis_mds_opt,
+                ref_dis_mds_opt);
         ASSERT_TRUE(*recall_score.data_handle() > 0.98);
-
     }
 }
 

From 2e434feb1f5af848591d5792d1f56ca4ebb1ceea Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 7 Feb 2024 15:54:07 -0800
Subject: [PATCH 007/148] add required methods

---
 faiss/gpu/GpuIndexCagra.cu          | 21 ++++++++++++++++++++-
 faiss/gpu/GpuIndexCagra.h           |  9 +++++----
 faiss/gpu/impl/RaftCagra.cu         |  6 ++++++
 faiss/gpu/impl/RaftCagra.cuh        |  2 ++
 faiss/gpu/test/TestGpuIndexCagra.cu |  1 +
 5 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index b0a60268d3..1a8eb382c0 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -59,9 +59,17 @@ void GpuIndexCagra::train(idx_t n, const float* x) {
     index_->train(n, x);
 
     this->is_trained = true;
-    this->ntotal += n;
+    this->ntotal = n;
 }
 
+bool GpuIndexCagra::addImplRequiresIDs_() const {
+    return false;
+};
+
+void GpuIndexCagra::addImpl_(idx_t n, const float* x, const idx_t* ids) {
+    FAISS_THROW_MSG("adding vectors is not supported by GpuIndexCagra.");
+};
+
 void GpuIndexCagra::searchImpl_(
         idx_t n,
         const float* x,
@@ -108,5 +116,16 @@ void GpuIndexCagra::searchImpl_(
     }
 }
 
+void GpuIndexCagra::reset() {
+    DeviceScope scope(config_.device);
+
+    if (index_) {
+        index_->reset();
+        this->ntotal = 0;
+    } else {
+        FAISS_ASSERT(this->ntotal == 0);
+    }
+}
+
 } // namespace gpu
 } // namespace faiss
diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
index a812ebafee..2c31ab9f59 100644
--- a/faiss/gpu/GpuIndexCagra.h
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -113,17 +113,18 @@ struct GpuIndexCagra : public GpuIndex {
             faiss::MetricType metric = faiss::METRIC_L2,
             GpuIndexCagraConfig config = GpuIndexCagraConfig());
 
-    ~GpuIndexCagra() {}
+    ~GpuIndexCagra() override = default;
 
     /// Trains CAGRA based on the given vector data
     void train(idx_t n, const float* x) override;
 
-    void reset() {}
+    void reset() override;
 
    protected:
-    bool addImplRequiresIDs_() const {}
+    bool addImplRequiresIDs_() const override;
+
+    void addImpl_(idx_t n, const float* x, const idx_t* ids) override;
 
-    void addImpl_(idx_t n, const float* x, const idx_t* ids) {}
     /// Called from GpuIndex for search
     void searchImpl_(
             idx_t n,
diff --git a/faiss/gpu/impl/RaftCagra.cu b/faiss/gpu/impl/RaftCagra.cu
index 6253213fde..c0f7bbba69 100644
--- a/faiss/gpu/impl/RaftCagra.cu
+++ b/faiss/gpu/impl/RaftCagra.cu
@@ -59,6 +59,8 @@ RaftCagra::RaftCagra(
             static_cast<raft::neighbors::cagra::graph_build_algo>(
                     graph_build_algo);
     index_pams_.nn_descent_niter = nn_descent_niter;
+
+    reset();
 }
 
 void RaftCagra::train(idx_t n, const float* x) {
@@ -138,5 +140,9 @@ void RaftCagra::search(
             distances_view);
 }
 
+void RaftCagra::reset() {
+    raft_knn_index.reset();
+}
+
 } // namespace gpu
 } // namespace faiss
diff --git a/faiss/gpu/impl/RaftCagra.cuh b/faiss/gpu/impl/RaftCagra.cuh
index 5783cbf706..7f2b8b485c 100644
--- a/faiss/gpu/impl/RaftCagra.cuh
+++ b/faiss/gpu/impl/RaftCagra.cuh
@@ -77,6 +77,8 @@ class RaftCagra {
             idx_t num_random_samplings,
             idx_t rand_xor_mask);
 
+    void reset();
+
    private:
     /// Collection of GPU resources that we use
     GpuResources* resources_;
diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu
index 3a99ba35f0..8ba11e63ac 100644
--- a/faiss/gpu/test/TestGpuIndexCagra.cu
+++ b/faiss/gpu/test/TestGpuIndexCagra.cu
@@ -82,6 +82,7 @@ void queryTest() {
 
         faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2);
         cpuIndex.hnsw.efConstruction = opt.k * 2;
+        // Training IndexHNSW is a no-op
         cpuIndex.train(opt.numTrain, trainVecs.data());
         cpuIndex.add(opt.numTrain, trainVecs.data());
 

From 382c178cde27b13d8d7ccfe0af29ca713fe9295f Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Thu, 8 Feb 2024 12:11:44 -0800
Subject: [PATCH 008/148] conditionally compile cagra

---
 faiss/gpu/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
index 1bc08826ee..2efa622fe3 100644
--- a/faiss/gpu/CMakeLists.txt
+++ b/faiss/gpu/CMakeLists.txt
@@ -29,7 +29,7 @@ set(FAISS_GPU_SRC
   GpuIndexIVFFlat.cu
   GpuIndexIVFPQ.cu
   GpuIndexIVFScalarQuantizer.cu
-  GpuIndexCagra.cu
+  $<$<BOOL:${FAISS_ENABLE_RAFT}>:GpuIndexCagra.cu>
   GpuResources.cpp
   StandardGpuResources.cpp
   impl/BinaryDistance.cu
@@ -92,7 +92,7 @@ set(FAISS_GPU_HEADERS
   GpuFaissAssert.h
   GpuIndex.h
   GpuIndexBinaryFlat.h
-  GpuIndexCagra.h
+  $<$<BOOL:${FAISS_ENABLE_RAFT}>:GpuIndexCagra.h>
   GpuIndexFlat.h
   GpuIndexIVF.h
   GpuIndexIVFFlat.h

From 867597429a5686c5ad3e124f5fac468255c2db5f Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 14 Feb 2024 13:39:26 -0800
Subject: [PATCH 009/148] copyTo and copyFrom

---
 faiss/IndexHNSW.cpp                 |  27 +++-
 faiss/IndexHNSW.h                   |   8 +
 faiss/gpu/GpuIndexCagra.cu          |  80 +++++++++-
 faiss/gpu/GpuIndexCagra.h           |  14 ++
 faiss/gpu/impl/RaftCagra.cu         | 106 +++++++++++++
 faiss/gpu/impl/RaftCagra.cuh        |  17 ++
 faiss/gpu/test/TestGpuIndexCagra.cu | 235 +++++++++++++++++++++++++++-
 faiss/impl/HNSW.cpp                 |  56 +++++--
 faiss/impl/HNSW.h                   |   9 +-
 9 files changed, 527 insertions(+), 25 deletions(-)

diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp
index 9a67332d67..8305764df8 100644
--- a/faiss/IndexHNSW.cpp
+++ b/faiss/IndexHNSW.cpp
@@ -192,7 +192,7 @@ void hnsw_add_vertices(
 
         int i1 = n;
 
-        for (int pt_level = hist.size() - 1; pt_level >= 0; pt_level--) {
+        for (int pt_level = hist.size() - 1; pt_level >= !index_hnsw.init_level0; pt_level--) {
             int i0 = i1 - hist[pt_level];
 
             if (verbose) {
@@ -228,7 +228,7 @@ void hnsw_add_vertices(
                         continue;
                     }
 
-                    hnsw.add_with_locks(*dis, pt_level, pt_id, locks, vt);
+                    hnsw.add_with_locks(*dis, pt_level, pt_id, locks, vt, index_hnsw.keep_max_size_level0);
 
                     if (prev_display >= 0 && i - i0 > prev_display + 10000) {
                         prev_display = i - i0;
@@ -248,7 +248,12 @@ void hnsw_add_vertices(
             }
             i1 = i0;
         }
-        FAISS_ASSERT(i1 == 0);
+        if (index_hnsw.init_level0) {
+            FAISS_ASSERT(i1 == 0);
+        }
+        else {
+            FAISS_ASSERT((i1 - hist[0]) == 0);
+        }
     }
     if (verbose) {
         printf("Done in %.3f ms\n", getmillisecs() - t0);
@@ -914,4 +919,20 @@ void IndexHNSW2Level::flip_to_ivf() {
     delete storage2l;
 }
 
+/**************************************************************
+ * IndexHNSWCagra implementation
+ **************************************************************/
+
+IndexHNSWCagra::IndexHNSWCagra() {
+    is_trained = true;
+}
+
+IndexHNSWCagra::IndexHNSWCagra(int d, int M)
+        : IndexHNSW(new IndexFlatL2(d), M) {
+    own_fields = true;
+    is_trained = true;
+    init_level0 = true;
+    keep_max_size_level0 = true;
+}
+
 } // namespace faiss
diff --git a/faiss/IndexHNSW.h b/faiss/IndexHNSW.h
index e0b65fca9d..3d3162e423 100644
--- a/faiss/IndexHNSW.h
+++ b/faiss/IndexHNSW.h
@@ -34,6 +34,9 @@ struct IndexHNSW : Index {
     bool own_fields = false;
     Index* storage = nullptr;
 
+    bool init_level0 = true;
+    bool keep_max_size_level0 = false;
+
     explicit IndexHNSW(int d = 0, int M = 32, MetricType metric = METRIC_L2);
     explicit IndexHNSW(Index* storage, int M = 32);
 
@@ -148,4 +151,9 @@ struct IndexHNSW2Level : IndexHNSW {
             const SearchParameters* params = nullptr) const override;
 };
 
+struct IndexHNSWCagra : IndexHNSW {
+    IndexHNSWCagra();
+    IndexHNSWCagra(int d, int M);
+};
+
 } // namespace faiss
diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index 1a8eb382c0..e20dbc8663 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -20,9 +20,10 @@
  * limitations under the License.
  */
 
+#include <faiss/IndexHNSW.h>
 #include <faiss/gpu/GpuIndexCagra.h>
+#include <cstddef>
 #include <faiss/gpu/impl/RaftCagra.cuh>
-#include "GpuIndexCagra.h"
 
 namespace faiss {
 namespace gpu {
@@ -54,7 +55,7 @@ void GpuIndexCagra::train(idx_t n, const float* x) {
             cagraConfig_.nn_descent_niter,
             this->metric_type,
             this->metric_arg,
-            faiss::gpu::INDICES_64_BIT);
+            INDICES_64_BIT);
 
     index_->train(n, x);
 
@@ -116,16 +117,91 @@ void GpuIndexCagra::searchImpl_(
     }
 }
 
+void GpuIndexCagra::copyFrom(const faiss::IndexHNSWCagra* index) {
+    FAISS_ASSERT(index);
+
+    auto base_index = index->storage;
+    auto l2_index = dynamic_cast<IndexFlatL2*>(base_index);
+    FAISS_ASSERT(l2_index);
+    auto distances = l2_index->get_xb();
+
+    auto hnsw = index->hnsw;
+    // copy level 0 to a dense knn graph matrix
+    std::vector<idx_t> knn_graph;
+    knn_graph.reserve(index->ntotal * hnsw.nb_neighbors(0));
+
+#pragma omp parallel for
+    for (size_t i = 0; i < index->ntotal; ++i) {
+        size_t begin, end;
+        hnsw.neighbor_range(i, 0, &begin, &end);
+        for (size_t j = begin; j < end; j++) {
+            // knn_graph.push_back(hnsw.neighbors[j]);
+            knn_graph[i * hnsw.nb_neighbors(0) + (j - begin)] =
+                    hnsw.neighbors[j];
+        }
+    }
+
+    index_ = std::make_shared<RaftCagra>(
+            this->resources_.get(),
+            this->d,
+            index->ntotal,
+            hnsw.nb_neighbors(0),
+            distances,
+            knn_graph.data(),
+            this->metric_type,
+            this->metric_arg,
+            INDICES_64_BIT);
+
+    this->is_trained = true;
+}
+
+void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const {
+    FAISS_ASSERT(index_ && this->is_trained && index);
+
+    auto graph_degree = index_->get_knngraph_degree();
+    FAISS_THROW_IF_NOT_MSG(
+            (index->hnsw.nb_neighbors(0)) == graph_degree,
+            "IndexHNSWCagra.hnsw.nb_neighbors(0) should be equal to GpuIndexCagraConfig.graph_degree");
+
+    auto n_train = this->ntotal;
+    auto train_dataset = index_->get_training_dataset();
+
+    // turn off as level 0 is copied from CAGRA graph
+    index->init_level0 = false;
+    index->add(n_train, train_dataset.data());
+
+    auto graph = get_knngraph();
+
+#pragma omp parallel for
+    for (idx_t i = 0; i < n_train; i++) {
+        size_t begin, end;
+        index->hnsw.neighbor_range(i, 0, &begin, &end);
+        for (size_t j = begin; j < end; j++) {
+            index->hnsw.neighbors[j] = graph[i * graph_degree + (j - begin)];
+        }
+    }
+
+    // turn back on to allow new vectors to be added to level 0
+    index->init_level0 = true;
+}
+
 void GpuIndexCagra::reset() {
     DeviceScope scope(config_.device);
 
     if (index_) {
         index_->reset();
         this->ntotal = 0;
+        this->is_trained = false;
     } else {
         FAISS_ASSERT(this->ntotal == 0);
     }
 }
 
+std::vector<idx_t> GpuIndexCagra::get_knngraph() const {
+    FAISS_ASSERT(index_ && this->is_trained);
+
+    return index_->get_knngraph();
+}
+
 } // namespace gpu
 } // namespace faiss
diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
index 2c31ab9f59..902a0d34e7 100644
--- a/faiss/gpu/GpuIndexCagra.h
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -24,6 +24,10 @@
 
 #include <faiss/gpu/GpuIndex.h>
 
+namespace faiss {
+struct IndexHNSWCagra;
+}
+
 namespace faiss {
 namespace gpu {
 
@@ -118,8 +122,18 @@ struct GpuIndexCagra : public GpuIndex {
     /// Trains CAGRA based on the given vector data
     void train(idx_t n, const float* x) override;
 
+    /// Initialize ourselves from the given CPU index; will overwrite
+    /// all data in ourselves
+    void copyFrom(const faiss::IndexHNSWCagra* index);
+
+    /// Copy ourselves to the given CPU index; will overwrite all data
+    /// in the index instance
+    void copyTo(faiss::IndexHNSWCagra* index) const;
+
     void reset() override;
 
+    std::vector<idx_t> get_knngraph() const;
+
    protected:
     bool addImplRequiresIDs_() const override;
 
diff --git a/faiss/gpu/impl/RaftCagra.cu b/faiss/gpu/impl/RaftCagra.cu
index c0f7bbba69..0a55901a1b 100644
--- a/faiss/gpu/impl/RaftCagra.cu
+++ b/faiss/gpu/impl/RaftCagra.cu
@@ -63,6 +63,71 @@ RaftCagra::RaftCagra(
     reset();
 }
 
+/// Creates the RAFT CAGRA index directly from a caller-supplied dataset and
+/// k-NN graph.
+///
+/// `distances` is, despite its name, viewed as the n x dim matrix of vectors
+/// and passed to the CAGRA index as its dataset; `knn_graph` is viewed as
+/// the n x graph_degree matrix of neighbor ids. Both buffers must be on the
+/// same side (both host or both device). Only METRIC_L2 and INDICES_64_BIT
+/// are accepted.
+RaftCagra::RaftCagra(
+        GpuResources* resources,
+        int dim,
+        idx_t n,
+        int graph_degree,
+        const float* distances,
+        const idx_t* knn_graph,
+        faiss::MetricType metric,
+        float metricArg,
+        IndicesOptions indicesOptions)
+        : resources_(resources),
+          dim_(dim),
+          metric_(metric),
+          metricArg_(metricArg) {
+    FAISS_THROW_IF_NOT_MSG(
+            metric == faiss::METRIC_L2,
+            "CAGRA currently only supports L2 metric.");
+    FAISS_THROW_IF_NOT_MSG(
+            indicesOptions == faiss::gpu::INDICES_64_BIT,
+            "only INDICES_64_BIT is supported for RAFT CAGRA index");
+
+    auto distances_on_gpu = getDeviceForAddress(distances) >= 0;
+    auto knn_graph_on_gpu = getDeviceForAddress(knn_graph) >= 0;
+
+    // Mixed host/device inputs are not supported.
+    FAISS_ASSERT(distances_on_gpu == knn_graph_on_gpu);
+
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+    // The assert above requires both flags to agree, so one is enough here.
+    if (distances_on_gpu) {
+        auto dataset_mds =
+                raft::make_device_matrix_view<const float, int64_t>(
+                        distances, n, dim);
+        auto knn_graph_mds =
+                raft::make_device_matrix_view<const idx_t, int64_t>(
+                        knn_graph, n, graph_degree);
+
+        raft_knn_index = raft::neighbors::cagra::index<float, idx_t>(
+                raft_handle,
+                raft::distance::DistanceType::L2Expanded,
+                dataset_mds,
+                knn_graph_mds);
+    } else {
+        auto dataset_mds = raft::make_host_matrix_view<const float, int64_t>(
+                distances, n, dim);
+        auto knn_graph_mds = raft::make_host_matrix_view<const idx_t, int64_t>(
+                knn_graph, n, graph_degree);
+
+        raft_knn_index = raft::neighbors::cagra::index<float, idx_t>(
+                raft_handle,
+                raft::distance::DistanceType::L2Expanded,
+                dataset_mds,
+                knn_graph_mds);
+    }
+}
+
 void RaftCagra::train(idx_t n, const float* x) {
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
@@ -144,5 +199,66 @@ void RaftCagra::reset() {
     raft_knn_index.reset();
 }
 
+/// Returns the graph degree reported by the underlying RAFT index.
+/// Asserts that the RAFT index exists.
+idx_t RaftCagra::get_knngraph_degree() const {
+    FAISS_ASSERT(raft_knn_index.has_value());
+    return static_cast<idx_t>(raft_knn_index.value().graph_degree());
+}
+
+/// Copies the k-NN graph of the underlying RAFT index into a host vector of
+/// graph.extent(0) * graph.extent(1) entries. Blocks until the copy is done.
+std::vector<idx_t> RaftCagra::get_knngraph() const {
+    FAISS_ASSERT(raft_knn_index.has_value());
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+    auto stream = raft_handle.get_stream();
+
+    auto device_graph = raft_knn_index.value().graph();
+
+    std::vector<idx_t> host_graph(
+            device_graph.extent(0) * device_graph.extent(1));
+
+    raft::update_host(
+            host_graph.data(),
+            device_graph.data_handle(),
+            host_graph.size(),
+            stream);
+    raft_handle.sync_stream();
+
+    return host_graph;
+}
+
+/// Copies the dataset held by the underlying RAFT index into a densely
+/// packed host vector. Blocks until the copy is done.
+std::vector<float> RaftCagra::get_training_dataset() const {
+    FAISS_ASSERT(raft_knn_index.has_value());
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+    auto stream = raft_handle.get_stream();
+
+    auto device_dataset = raft_knn_index.value().dataset();
+    const size_t n_rows = device_dataset.extent(0);
+    const size_t n_cols = device_dataset.extent(1);
+
+    std::vector<float> host_dataset(n_rows * n_cols);
+
+    // The source pitch comes from stride(0), which allows for padded device
+    // rows, hence the 2D copy. The row width is taken from the view so that
+    // it always matches the size of host_dataset.
+    RAFT_CUDA_TRY(cudaMemcpy2DAsync(
+            host_dataset.data(),
+            sizeof(float) * n_cols,
+            device_dataset.data_handle(),
+            sizeof(float) * device_dataset.stride(0),
+            sizeof(float) * n_cols,
+            n_rows,
+            cudaMemcpyDefault,
+            stream));
+    raft_handle.sync_stream();
+
+    return host_dataset;
+}
+
 } // namespace gpu
 } // namespace faiss
diff --git a/faiss/gpu/impl/RaftCagra.cuh b/faiss/gpu/impl/RaftCagra.cuh
index 7f2b8b485c..0fddb3b39f 100644
--- a/faiss/gpu/impl/RaftCagra.cuh
+++ b/faiss/gpu/impl/RaftCagra.cuh
@@ -54,6 +54,17 @@ class RaftCagra {
             float metricArg,
             IndicesOptions indicesOptions);
 
+    RaftCagra(
+            GpuResources* resources,
+            int dim,
+            idx_t n,
+            int graph_degree,
+            const float* distances,
+            const idx_t* knn_graph,
+            faiss::MetricType metric,
+            float metricArg,
+            IndicesOptions indicesOptions);
+
     ~RaftCagra() = default;
 
     void train(idx_t n, const float* x);
@@ -79,6 +90,12 @@ class RaftCagra {
 
     void reset();
 
+    idx_t get_knngraph_degree() const;
+
+    std::vector<idx_t> get_knngraph() const;
+
+    std::vector<float> get_training_dataset() const;
+
    private:
     /// Collection of GPU resources that we use
     GpuResources* resources_;
diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu
index 8ba11e63ac..658ec8858f 100644
--- a/faiss/gpu/test/TestGpuIndexCagra.cu
+++ b/faiss/gpu/test/TestGpuIndexCagra.cu
@@ -39,6 +39,7 @@ struct Options {
     Options() {
         numTrain = 2 * faiss::gpu::randVal(2000, 5000);
         dim = faiss::gpu::randVal(4, 10);
+        numAdd = faiss::gpu::randVal(1000, 3000);
 
         graphDegree = faiss::gpu::randSelect({32, 64});
         intermediateGraphDegree = faiss::gpu::randSelect({64, 98});
@@ -64,6 +65,7 @@ struct Options {
     }
 
     int numTrain;
+    int numAdd;
     int dim;
     size_t graphDegree;
     size_t intermediateGraphDegree;
@@ -74,18 +76,18 @@ struct Options {
 };
 
 void queryTest() {
-    for (int tries = 0; tries < 2; ++tries) {
+    for (int tries = 0; tries < 5; ++tries) {
         Options opt;
 
         std::vector<float> trainVecs =
                 faiss::gpu::randVecs(opt.numTrain, opt.dim);
 
+        // train cpu index
         faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2);
         cpuIndex.hnsw.efConstruction = opt.k * 2;
-        // Training IndexHNSW is a no-op
-        cpuIndex.train(opt.numTrain, trainVecs.data());
         cpuIndex.add(opt.numTrain, trainVecs.data());
 
+        // train gpu index
         faiss::gpu::StandardGpuResources res;
         res.noTempMemory();
 
@@ -99,6 +101,7 @@ void queryTest() {
                 &res, cpuIndex.d, faiss::METRIC_L2, config);
         gpuIndex.train(opt.numTrain, trainVecs.data());
 
+        // query
         auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim);
 
         std::vector<float> refDistance(opt.numQuery * opt.k, 0);
@@ -113,6 +116,7 @@ void queryTest() {
                 refIndices.data(),
                 &cpuSearchParams);
 
+        // test quality of searches
         auto gpuRes = res.getResources();
         auto devAlloc = faiss::gpu::makeDevAlloc(
                 faiss::gpu::AllocType::FlatData,
@@ -176,6 +180,231 @@ TEST(TestGpuIndexCagra, Float32_Query_L2) {
     queryTest();
 }
 
+void copyToTest() {
+    for (int tries = 0; tries < 5; ++tries) {
+        Options opt;
+
+        std::vector<float> trainVecs =
+                faiss::gpu::randVecs(opt.numTrain, opt.dim);
+        std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
+
+        faiss::gpu::StandardGpuResources res;
+        res.noTempMemory();
+
+        // train gpu index and copy to cpu index
+        faiss::gpu::GpuIndexCagraConfig config;
+        config.device = opt.device;
+        config.graph_degree = opt.graphDegree;
+        config.intermediate_graph_degree = opt.intermediateGraphDegree;
+        config.build_algo = opt.buildAlgo;
+
+        faiss::gpu::GpuIndexCagra gpuIndex(
+                &res, opt.dim, faiss::METRIC_L2, config);
+        gpuIndex.train(opt.numTrain, trainVecs.data());
+
+        faiss::IndexHNSWCagra copiedCpuIndex(opt.dim, opt.graphDegree / 2);
+        copiedCpuIndex.hnsw.efConstruction = opt.k * 2;
+        gpuIndex.copyTo(&copiedCpuIndex);
+
+        // add more vecs to copied cpu index
+        copiedCpuIndex.add(opt.numAdd, addVecs.data());
+
+        // train cpu index
+        faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2);
+        cpuIndex.hnsw.efConstruction = opt.k * 2;
+        cpuIndex.add(opt.numTrain, trainVecs.data());
+
+        // add more vecs to cpu index
+        cpuIndex.add(opt.numAdd, addVecs.data());
+
+        // query indexes
+        auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim);
+
+        std::vector<float> refDistance(opt.numQuery * opt.k, 0);
+        std::vector<faiss::idx_t> refIndices(opt.numQuery * opt.k, -1);
+        faiss::SearchParametersHNSW cpuSearchParams;
+        cpuSearchParams.efSearch = opt.k * 2;
+        cpuIndex.search(
+                opt.numQuery,
+                queryVecs.data(),
+                opt.k,
+                refDistance.data(),
+                refIndices.data(),
+                &cpuSearchParams);
+
+        std::vector<float> copyRefDistance(opt.numQuery * opt.k, 0);
+        std::vector<faiss::idx_t> copyRefIndices(opt.numQuery * opt.k, -1);
+        faiss::SearchParametersHNSW cpuSearchParamstwo;
+        cpuSearchParamstwo.efSearch = opt.k * 2;
+        copiedCpuIndex.search(
+                opt.numQuery,
+                queryVecs.data(),
+                opt.k,
+                copyRefDistance.data(),
+                copyRefIndices.data(),
+                &cpuSearchParamstwo);
+
+        // test quality of search
+        auto gpuRes = res.getResources();
+
+        auto refDistanceDev = faiss::gpu::toDeviceTemporary(
+                gpuRes.get(),
+                refDistance,
+                gpuRes->getDefaultStreamCurrentDevice());
+        auto refIndicesDev = faiss::gpu::toDeviceTemporary(
+                gpuRes.get(),
+                refIndices,
+                gpuRes->getDefaultStreamCurrentDevice());
+
+        auto copyRefDistanceDev = faiss::gpu::toDeviceTemporary(
+                gpuRes.get(),
+                copyRefDistance,
+                gpuRes->getDefaultStreamCurrentDevice());
+        auto copyRefIndicesDev = faiss::gpu::toDeviceTemporary(
+                gpuRes.get(),
+                copyRefIndices,
+                gpuRes->getDefaultStreamCurrentDevice());
+
+        auto raft_handle = gpuRes->getRaftHandleCurrentDevice();
+
+        auto ref_dis_mds = raft::make_device_matrix_view<const float, int>(
+                refDistanceDev.data(), opt.numQuery, opt.k);
+        auto ref_dis_mds_opt =
+                std::optional<raft::device_matrix_view<const float, int>>(
+                        ref_dis_mds);
+        auto ref_ind_mds =
+                raft::make_device_matrix_view<const faiss::idx_t, int>(
+                        refIndicesDev.data(), opt.numQuery, opt.k);
+
+        auto copy_ref_dis_mds = raft::make_device_matrix_view<const float, int>(
+                copyRefDistanceDev.data(), opt.numQuery, opt.k);
+        auto copy_ref_dis_mds_opt =
+                std::optional<raft::device_matrix_view<const float, int>>(
+                        copy_ref_dis_mds);
+        auto copy_ref_ind_mds =
+                raft::make_device_matrix_view<const faiss::idx_t, int>(
+                        copyRefIndicesDev.data(), opt.numQuery, opt.k);
+
+        double scalar_init = 0;
+        auto recall_score = raft::make_host_scalar(scalar_init);
+
+        raft::stats::neighborhood_recall(
+                raft_handle,
+                copy_ref_ind_mds,
+                ref_ind_mds,
+                recall_score.view(),
+                copy_ref_dis_mds_opt,
+                ref_dis_mds_opt);
+        ASSERT_TRUE(*recall_score.data_handle() > 0.99);
+    }
+}
+
+TEST(TestGpuIndexCagra, Float32_CopyTo_L2) {
+    copyToTest();
+}
+
+void copyFromTest() {
+    for (int tries = 0; tries < 5; ++tries) {
+        Options opt;
+
+        std::vector<float> trainVecs =
+                faiss::gpu::randVecs(opt.numTrain, opt.dim);
+
+        // train cpu index
+        faiss::IndexHNSWCagra cpuIndex(opt.dim, opt.graphDegree / 2);
+        cpuIndex.hnsw.efConstruction = opt.k * 2;
+        cpuIndex.add(opt.numTrain, trainVecs.data());
+
+        faiss::gpu::StandardGpuResources res;
+        res.noTempMemory();
+
+        // convert to gpu index
+        faiss::gpu::GpuIndexCagra copiedGpuIndex(
+                &res, cpuIndex.d, faiss::METRIC_L2);
+        copiedGpuIndex.copyFrom(&cpuIndex);
+
+        // train gpu index
+        faiss::gpu::GpuIndexCagraConfig config;
+        config.device = opt.device;
+        config.graph_degree = opt.graphDegree;
+        config.intermediate_graph_degree = opt.intermediateGraphDegree;
+        config.build_algo = opt.buildAlgo;
+
+        faiss::gpu::GpuIndexCagra gpuIndex(
+                &res, opt.dim, faiss::METRIC_L2, config);
+        gpuIndex.train(opt.numTrain, trainVecs.data());
+
+        // query
+        auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim);
+
+        auto gpuRes = res.getResources();
+        auto devAlloc = faiss::gpu::makeDevAlloc(
+                faiss::gpu::AllocType::FlatData,
+                gpuRes->getDefaultStreamCurrentDevice());
+        faiss::gpu::DeviceTensor<float, 2, true> copyTestDistance(
+                gpuRes.get(), devAlloc, {opt.numQuery, opt.k});
+        faiss::gpu::DeviceTensor<faiss::idx_t, 2, true> copyTestIndices(
+                gpuRes.get(), devAlloc, {opt.numQuery, opt.k});
+        copiedGpuIndex.search(
+                opt.numQuery,
+                queryVecs.data(),
+                opt.k,
+                copyTestDistance.data(),
+                copyTestIndices.data());
+
+        faiss::gpu::DeviceTensor<float, 2, true> testDistance(
+                gpuRes.get(), devAlloc, {opt.numQuery, opt.k});
+        faiss::gpu::DeviceTensor<faiss::idx_t, 2, true> testIndices(
+                gpuRes.get(), devAlloc, {opt.numQuery, opt.k});
+        gpuIndex.search(
+                opt.numQuery,
+                queryVecs.data(),
+                opt.k,
+                testDistance.data(),
+                testIndices.data());
+
+        // test quality of searches
+        auto raft_handle = gpuRes->getRaftHandleCurrentDevice();
+
+        auto test_dis_mds = raft::make_device_matrix_view<const float, int>(
+                testDistance.data(), opt.numQuery, opt.k);
+        auto test_dis_mds_opt =
+                std::optional<raft::device_matrix_view<const float, int>>(
+                        test_dis_mds);
+
+        auto test_ind_mds =
+                raft::make_device_matrix_view<const faiss::idx_t, int>(
+                        testIndices.data(), opt.numQuery, opt.k);
+
+        auto copy_test_dis_mds =
+                raft::make_device_matrix_view<const float, int>(
+                        copyTestDistance.data(), opt.numQuery, opt.k);
+        auto copy_test_dis_mds_opt =
+                std::optional<raft::device_matrix_view<const float, int>>(
+                        copy_test_dis_mds);
+
+        auto copy_test_ind_mds =
+                raft::make_device_matrix_view<const faiss::idx_t, int>(
+                        copyTestIndices.data(), opt.numQuery, opt.k);
+
+        double scalar_init = 0;
+        auto recall_score = raft::make_host_scalar(scalar_init);
+
+        raft::stats::neighborhood_recall(
+                raft_handle,
+                copy_test_ind_mds,
+                test_ind_mds,
+                recall_score.view(),
+                copy_test_dis_mds_opt,
+                test_dis_mds_opt);
+        ASSERT_TRUE(*recall_score.data_handle() > 0.99);
+    }
+}
+
+TEST(TestGpuIndexCagra, Float32_CopyFrom_L2) {
+    copyFromTest();
+}
+
 int main(int argc, char** argv) {
     testing::InitGoogleTest(&argc, argv);
 
diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp
index a9fb9daf5b..c886f7d5df 100644
--- a/faiss/impl/HNSW.cpp
+++ b/faiss/impl/HNSW.cpp
@@ -7,6 +7,7 @@
 
 #include <faiss/impl/HNSW.h>
 
+#include <cstddef>
 #include <string>
 
 #include <faiss/impl/AuxIndexStructures.h>
@@ -110,8 +111,8 @@ void HNSW::print_neighbor_stats(int level) const {
            level,
            nb_neighbors(level));
     size_t tot_neigh = 0, tot_common = 0, tot_reciprocal = 0, n_node = 0;
-#pragma omp parallel for reduction(+: tot_neigh) reduction(+: tot_common) \
-  reduction(+: tot_reciprocal) reduction(+: n_node)
+#pragma omp parallel for reduction(+ : tot_neigh) reduction(+ : tot_common) \
+        reduction(+ : tot_reciprocal) reduction(+ : n_node)
     for (int i = 0; i < levels.size(); i++) {
         if (levels[i] > level) {
             n_node++;
@@ -215,8 +216,8 @@ int HNSW::prepare_level_tab(size_t n, bool preset_levels) {
         if (pt_level > max_level)
             max_level = pt_level;
         offsets.push_back(offsets.back() + cum_nb_neighbors(pt_level + 1));
-        neighbors.resize(offsets.back(), -1);
     }
+    neighbors.resize(offsets.back(), -1);
 
     return max_level;
 }
@@ -229,7 +230,14 @@ void HNSW::shrink_neighbor_list(
         DistanceComputer& qdis,
         std::priority_queue<NodeDistFarther>& input,
         std::vector<NodeDistFarther>& output,
-        int max_size) {
+        int max_size,
+        bool keep_max_size_level0) {
+    // This prevents number of neighbors at
+    // level 0 from being shrunk to less than 2 * M.
+    // This is essential in making sure
+    // `faiss::gpu::GpuIndexCagra::copyFrom(IndexHNSWCagra*)` is functional
+    std::vector<NodeDistFarther> outsiders;
+
     while (input.size() > 0) {
         NodeDistFarther v1 = input.top();
         input.pop();
@@ -250,8 +258,19 @@ void HNSW::shrink_neighbor_list(
             if (output.size() >= max_size) {
                 return;
             }
+        } else if (keep_max_size_level0) {
+            outsiders.push_back(v1);
         }
     }
+    // Refill with the rejected candidates, in the order they were rejected,
+    // until max_size is reached. Stop when the outsiders run out: a caller
+    // may pass fewer than max_size candidates, and indexing past the end of
+    // the vector would be undefined behavior.
+    size_t idx = 0;
+    while (keep_max_size_level0 && output.size() < max_size &&
+           idx < outsiders.size()) {
+        output.push_back(outsiders[idx++]);
+    }
 }
 
 namespace {
@@ -268,7 +282,8 @@ using NodeDistFarther = HNSW::NodeDistFarther;
 void shrink_neighbor_list(
         DistanceComputer& qdis,
         std::priority_queue<NodeDistCloser>& resultSet1,
-        int max_size) {
+        int max_size,
+        bool keep_max_size_level0 = false) {
     if (resultSet1.size() < max_size) {
         return;
     }
@@ -280,7 +295,8 @@ void shrink_neighbor_list(
         resultSet1.pop();
     }
 
-    HNSW::shrink_neighbor_list(qdis, resultSet, returnlist, max_size);
+    HNSW::shrink_neighbor_list(
+            qdis, resultSet, returnlist, max_size, keep_max_size_level0);
 
     for (NodeDistFarther curen2 : returnlist) {
         resultSet1.emplace(curen2.d, curen2.id);
@@ -294,7 +310,8 @@ void add_link(
         DistanceComputer& qdis,
         storage_idx_t src,
         storage_idx_t dest,
-        int level) {
+        int level,
+        bool keep_max_size_level0 = false) {
     size_t begin, end;
     hnsw.neighbor_range(src, level, &begin, &end);
     if (hnsw.neighbors[end - 1] == -1) {
@@ -319,7 +336,8 @@ void add_link(
         resultSet.emplace(qdis.symmetric_dis(src, neigh), neigh);
     }
 
-    shrink_neighbor_list(qdis, resultSet, end - begin);
+    shrink_neighbor_list(
+            qdis, resultSet, end - begin, keep_max_size_level0 && level == 0);
 
     // ...and back
     size_t i = begin;
@@ -429,7 +447,8 @@ void HNSW::add_links_starting_from(
         float d_nearest,
         int level,
         omp_lock_t* locks,
-        VisitedTable& vt) {
+        VisitedTable& vt,
+        bool keep_max_size_level0) {
     std::priority_queue<NodeDistCloser> link_targets;
 
     search_neighbors_to_add(
@@ -438,13 +457,14 @@ void HNSW::add_links_starting_from(
     // but we can afford only this many neighbors
     int M = nb_neighbors(level);
 
-    ::faiss::shrink_neighbor_list(ptdis, link_targets, M);
+    ::faiss::shrink_neighbor_list(
+            ptdis, link_targets, M, keep_max_size_level0 && level == 0);
 
     std::vector<storage_idx_t> neighbors;
     neighbors.reserve(link_targets.size());
     while (!link_targets.empty()) {
         storage_idx_t other_id = link_targets.top().id;
-        add_link(*this, ptdis, pt_id, other_id, level);
+        add_link(*this, ptdis, pt_id, other_id, level, keep_max_size_level0);
         neighbors.push_back(other_id);
         link_targets.pop();
     }
@@ -452,7 +472,7 @@ void HNSW::add_links_starting_from(
     omp_unset_lock(&locks[pt_id]);
     for (storage_idx_t other_id : neighbors) {
         omp_set_lock(&locks[other_id]);
-        add_link(*this, ptdis, other_id, pt_id, level);
+        add_link(*this, ptdis, other_id, pt_id, level, keep_max_size_level0);
         omp_unset_lock(&locks[other_id]);
     }
     omp_set_lock(&locks[pt_id]);
@@ -467,7 +487,8 @@ void HNSW::add_with_locks(
         int pt_level,
         int pt_id,
         std::vector<omp_lock_t>& locks,
-        VisitedTable& vt) {
+        VisitedTable& vt,
+        bool keep_max_size_level0) {
     //  greedy search on upper levels
 
     storage_idx_t nearest;
@@ -496,7 +517,14 @@ void HNSW::add_with_locks(
 
     for (; level >= 0; level--) {
         add_links_starting_from(
-                ptdis, pt_id, nearest, d_nearest, level, locks.data(), vt);
+                ptdis,
+                pt_id,
+                nearest,
+                d_nearest,
+                level,
+                locks.data(),
+                vt,
+                keep_max_size_level0);
     }
 
     omp_unset_lock(&locks[pt_id]);
diff --git a/faiss/impl/HNSW.h b/faiss/impl/HNSW.h
index cb6b422c3d..d1b9a955a6 100644
--- a/faiss/impl/HNSW.h
+++ b/faiss/impl/HNSW.h
@@ -184,7 +184,8 @@ struct HNSW {
             float d_nearest,
             int level,
             omp_lock_t* locks,
-            VisitedTable& vt);
+            VisitedTable& vt,
+            bool keep_max_size_level0 = false);
 
     /** add point pt_id on all levels <= pt_level and build the link
      * structure for them. */
@@ -193,7 +194,8 @@ struct HNSW {
             int pt_level,
             int pt_id,
             std::vector<omp_lock_t>& locks,
-            VisitedTable& vt);
+            VisitedTable& vt,
+            bool keep_max_size_level0 = false);
 
     /// search interface for 1 point, single thread
     HNSWStats search(
@@ -224,7 +226,8 @@ struct HNSW {
             DistanceComputer& qdis,
             std::priority_queue<NodeDistFarther>& input,
             std::vector<NodeDistFarther>& output,
-            int max_size);
+            int max_size,
+            bool keep_max_size_level0 = false);
 
     void permute_entries(const idx_t* map);
 };

From c7fcf4a030bd300d41668b97093eb151b0a9a890 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 14 Feb 2024 13:56:26 -0800
Subject: [PATCH 010/148] style check

---
 faiss/IndexHNSW.cpp | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp
index 8305764df8..1589b10a6c 100644
--- a/faiss/IndexHNSW.cpp
+++ b/faiss/IndexHNSW.cpp
@@ -192,7 +192,9 @@ void hnsw_add_vertices(
 
         int i1 = n;
 
-        for (int pt_level = hist.size() - 1; pt_level >= !index_hnsw.init_level0; pt_level--) {
+        for (int pt_level = hist.size() - 1;
+             pt_level >= !index_hnsw.init_level0;
+             pt_level--) {
             int i0 = i1 - hist[pt_level];
 
             if (verbose) {
@@ -228,7 +230,13 @@ void hnsw_add_vertices(
                         continue;
                     }
 
-                    hnsw.add_with_locks(*dis, pt_level, pt_id, locks, vt, index_hnsw.keep_max_size_level0);
+                    hnsw.add_with_locks(
+                            *dis,
+                            pt_level,
+                            pt_id,
+                            locks,
+                            vt,
+                            index_hnsw.keep_max_size_level0);
 
                     if (prev_display >= 0 && i - i0 > prev_display + 10000) {
                         prev_display = i - i0;
@@ -250,8 +258,7 @@ void hnsw_add_vertices(
         }
         if (index_hnsw.init_level0) {
             FAISS_ASSERT(i1 == 0);
-        }
-        else {
+        } else {
             FAISS_ASSERT((i1 - hist[0]) == 0);
         }
     }

From 065f912d6daa23fbf7e1c345982f7c5858ba7988 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Tue, 20 Feb 2024 13:57:09 -0800
Subject: [PATCH 011/148] add read/write

---
 faiss/impl/index_read.cpp  | 7 +++++++
 faiss/impl/index_write.cpp | 6 ++++++
 2 files changed, 13 insertions(+)

diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp
index ac62e0269e..8622b99c06 100644
--- a/faiss/impl/index_read.cpp
+++ b/faiss/impl/index_read.cpp
@@ -958,7 +958,14 @@ Index* read_index(IOReader* f, int io_flags) {
             idxhnsw = new IndexHNSWSQ();
         if (h == fourcc("IHN2"))
             idxhnsw = new IndexHNSW2Level();
+        if (h == fourcc("IHNc"))
+            idxhnsw = new IndexHNSWCagra();
         read_index_header(idxhnsw, f);
+        // keep_max_size_level0 is only stored for the CAGRA variant, so the
+        // layout of the other HNSW index types stays readable.
+        if (h == fourcc("IHNc")) {
+            READ1(idxhnsw->keep_max_size_level0);
+        }
         read_HNSW(&idxhnsw->hnsw, f);
         idxhnsw->storage = read_index(f, io_flags);
         idxhnsw->own_fields = true;
diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp
index b2808d7170..1f27a68451 100644
--- a/faiss/impl/index_write.cpp
+++ b/faiss/impl/index_write.cpp
@@ -760,10 +760,16 @@ void write_index(const Index* idx, IOWriter* f) {
                 : dynamic_cast<const IndexHNSWPQ*>(idx)      ? fourcc("IHNp")
                 : dynamic_cast<const IndexHNSWSQ*>(idx)      ? fourcc("IHNs")
                 : dynamic_cast<const IndexHNSW2Level*>(idx)  ? fourcc("IHN2")
+                : dynamic_cast<const IndexHNSWCagra*>(idx)   ? fourcc("IHNc")
                                                              : 0;
         FAISS_THROW_IF_NOT(h != 0);
         WRITE1(h);
         write_index_header(idxhnsw, f);
+        // keep_max_size_level0 is only stored for the CAGRA variant; must
+        // mirror the conditional read in read_index().
+        if (h == fourcc("IHNc")) {
+            WRITE1(idxhnsw->keep_max_size_level0);
+        }
         write_HNSW(&idxhnsw->hnsw, f);
         write_index(idxhnsw->storage, f);
     } else if (const IndexNSG* idxnsg = dynamic_cast<const IndexNSG*>(idx)) {

From 2b0ea76e2b581f74eb47c460a80e6ef56208f12a Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Tue, 20 Feb 2024 14:43:49 -0800
Subject: [PATCH 012/148] add destructor

---
 faiss/gpu/GpuIndexCagra.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
index 902a0d34e7..324ef9c089 100644
--- a/faiss/gpu/GpuIndexCagra.h
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -117,7 +117,7 @@ struct GpuIndexCagra : public GpuIndex {
             faiss::MetricType metric = faiss::METRIC_L2,
             GpuIndexCagraConfig config = GpuIndexCagraConfig());
 
-    ~GpuIndexCagra() override = default;
+    ~GpuIndexCagra() override {};
 
     /// Trains CAGRA based on the given vector data
     void train(idx_t n, const float* x) override;

From 8c83bd23bae8d7bb68a455e4c259a17b94f417b2 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 21 Feb 2024 08:50:38 -0800
Subject: [PATCH 013/148] destructor body, copyto reset

---
 faiss/gpu/GpuIndexCagra.cu | 9 ++++++---
 faiss/gpu/GpuIndexCagra.h  | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index e20dbc8663..81c7b79d00 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -24,6 +24,7 @@
 #include <faiss/gpu/GpuIndexCagra.h>
 #include <cstddef>
 #include <faiss/gpu/impl/RaftCagra.cuh>
+#include "GpuIndexCagra.h"
 
 namespace faiss {
 namespace gpu {
@@ -38,6 +39,8 @@ GpuIndexCagra::GpuIndexCagra(
     this->is_trained = false;
 }
 
+GpuIndexCagra::~GpuIndexCagra() {}
+
 void GpuIndexCagra::train(idx_t n, const float* x) {
     if (this->is_trained) {
         FAISS_ASSERT(index_);
@@ -158,10 +161,10 @@ void GpuIndexCagra::copyFrom(const faiss::IndexHNSWCagra* index) {
 void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const {
     FAISS_ASSERT(index_ && this->is_trained && index);
 
+    index->reset();
+
     auto graph_degree = index_->get_knngraph_degree();
-    FAISS_THROW_IF_NOT_MSG(
-            (index->hnsw.nb_neighbors(0)) == graph_degree,
-            "IndexHNSWCagra.hnsw.nb_neighbors(0) should be equal to GpuIndexCagraConfig.graph_degree");
+    index->hnsw.M = graph_degree / 2;
 
     auto n_train = this->ntotal;
     auto train_dataset = index_->get_training_dataset();
diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
index 324ef9c089..35783a848c 100644
--- a/faiss/gpu/GpuIndexCagra.h
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -117,7 +117,7 @@ struct GpuIndexCagra : public GpuIndex {
             faiss::MetricType metric = faiss::METRIC_L2,
             GpuIndexCagraConfig config = GpuIndexCagraConfig());
 
-    ~GpuIndexCagra() override {};
+    ~GpuIndexCagra() override;
 
     /// Trains CAGRA based on the given vector data
     void train(idx_t n, const float* x) override;

From 39fb35a9d49295b191a9153e4bc76f5c25dc696b Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 21 Feb 2024 09:14:42 -0800
Subject: [PATCH 014/148] remove destructor

---
 faiss/gpu/GpuIndexCagra.cu | 6 +++---
 faiss/gpu/GpuIndexCagra.h  | 2 --
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index 81c7b79d00..f94b5cbcf5 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -39,8 +39,6 @@ GpuIndexCagra::GpuIndexCagra(
     this->is_trained = false;
 }
 
-GpuIndexCagra::~GpuIndexCagra() {}
-
 void GpuIndexCagra::train(idx_t n, const float* x) {
     if (this->is_trained) {
         FAISS_ASSERT(index_);
@@ -164,7 +162,9 @@ void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const {
     index->reset();
 
     auto graph_degree = index_->get_knngraph_degree();
-    index->hnsw.M = graph_degree / 2;
+    auto M = graph_degree / 2;
+    index->hnsw.set_default_probas(M, 1.0 / log(M));
+    index->hnsw.offsets.push_back(0);
 
     auto n_train = this->ntotal;
     auto train_dataset = index_->get_training_dataset();
diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
index 35783a848c..62c0b489fb 100644
--- a/faiss/gpu/GpuIndexCagra.h
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -117,8 +117,6 @@ struct GpuIndexCagra : public GpuIndex {
             faiss::MetricType metric = faiss::METRIC_L2,
             GpuIndexCagraConfig config = GpuIndexCagraConfig());
 
-    ~GpuIndexCagra() override;
-
     /// Trains CAGRA based on the given vector data
     void train(idx_t n, const float* x) override;
 

From 49e261018618b1e2c8bb71f7ba8766e162df9564 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 21 Feb 2024 09:38:08 -0800
Subject: [PATCH 015/148] move cmake sources around

---
 faiss/gpu/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
index 2efa622fe3..b060d2efe1 100644
--- a/faiss/gpu/CMakeLists.txt
+++ b/faiss/gpu/CMakeLists.txt
@@ -29,7 +29,6 @@ set(FAISS_GPU_SRC
   GpuIndexIVFFlat.cu
   GpuIndexIVFPQ.cu
   GpuIndexIVFScalarQuantizer.cu
-  $<$<BOOL:${FAISS_ENABLE_RAFT}>:GpuIndexCagra.cu>
   GpuResources.cpp
   StandardGpuResources.cpp
   impl/BinaryDistance.cu
@@ -92,7 +91,6 @@ set(FAISS_GPU_HEADERS
   GpuFaissAssert.h
   GpuIndex.h
   GpuIndexBinaryFlat.h
-  $<$<BOOL:${FAISS_ENABLE_RAFT}>:GpuIndexCagra.h>
   GpuIndexFlat.h
   GpuIndexIVF.h
   GpuIndexIVFFlat.h
@@ -240,11 +238,13 @@ generate_ivf_interleaved_code()
 
 if(FAISS_ENABLE_RAFT)
   list(APPEND FAISS_GPU_HEADERS
+          GpuIndexCagra.h
           impl/RaftCagra.cuh
           impl/RaftUtils.h
           impl/RaftIVFFlat.cuh
           impl/RaftFlatIndex.cuh)
   list(APPEND FAISS_GPU_SRC
+          GpuIndexCagra.cu
           impl/RaftCagra.cu
           impl/RaftFlatIndex.cu
           impl/RaftIVFFlat.cu)

From d4434bb45532e4651c8b28a41e0fefa42f3305a1 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 21 Feb 2024 14:41:51 -0800
Subject: [PATCH 016/148] more protections for copying

---
 faiss/gpu/GpuIndexCagra.cu          | 13 +++++++++++--
 faiss/gpu/test/TestGpuIndexCagra.cu |  2 +-
 faiss/impl/HNSW.cpp                 |  2 ++
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index f94b5cbcf5..d69958d0be 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -121,6 +121,10 @@ void GpuIndexCagra::searchImpl_(
 void GpuIndexCagra::copyFrom(const faiss::IndexHNSWCagra* index) {
     FAISS_ASSERT(index);
 
+    DeviceScope scope(config_.device);
+
+    GpuIndex::copyFrom(index);
+
     auto base_index = index->storage;
     auto l2_index = dynamic_cast<IndexFlatL2*>(base_index);
     FAISS_ASSERT(l2_index);
@@ -159,12 +163,17 @@ void GpuIndexCagra::copyFrom(const faiss::IndexHNSWCagra* index) {
 void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const {
     FAISS_ASSERT(index_ && this->is_trained && index);
 
-    index->reset();
+    DeviceScope scope(config_.device);
+
+    //
+    // Index information
+    //
+    GpuIndex::copyTo(index);
 
     auto graph_degree = index_->get_knngraph_degree();
     auto M = graph_degree / 2;
+    index->reset();
     index->hnsw.set_default_probas(M, 1.0 / log(M));
-    index->hnsw.offsets.push_back(0);
 
     auto n_train = this->ntotal;
     auto train_dataset = index_->get_training_dataset();
diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu
index 658ec8858f..aac7dcc3f0 100644
--- a/faiss/gpu/test/TestGpuIndexCagra.cu
+++ b/faiss/gpu/test/TestGpuIndexCagra.cu
@@ -203,8 +203,8 @@ void copyToTest() {
         gpuIndex.train(opt.numTrain, trainVecs.data());
 
         faiss::IndexHNSWCagra copiedCpuIndex(opt.dim, opt.graphDegree / 2);
-        copiedCpuIndex.hnsw.efConstruction = opt.k * 2;
         gpuIndex.copyTo(&copiedCpuIndex);
+        copiedCpuIndex.hnsw.efConstruction = opt.k * 2;
 
         // add more vecs to copied cpu index
         copiedCpuIndex.add(opt.numAdd, addVecs.data());
diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp
index c886f7d5df..f1b00fd3e0 100644
--- a/faiss/impl/HNSW.cpp
+++ b/faiss/impl/HNSW.cpp
@@ -99,6 +99,8 @@ void HNSW::clear_neighbor_tables(int level) {
 void HNSW::reset() {
     max_level = -1;
     entry_point = -1;
+    assign_probas.clear();
+    cum_nneighbor_per_level.clear();
     offsets.clear();
     offsets.push_back(0);
     levels.clear();

From ac65c2d14b609305a2e3d548ff5cdc780dbc99c2 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 21 Feb 2024 16:34:36 -0800
Subject: [PATCH 017/148] support default constructed IndexHnswCagra in copyTo

---
 faiss/gpu/GpuIndexCagra.cu          | 11 ++++++++++-
 faiss/gpu/test/TestGpuIndexCagra.cu |  2 +-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index d69958d0be..b70ae78f79 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -169,10 +169,19 @@ void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const {
     // Index information
     //
     GpuIndex::copyTo(index);
+    // This needs to be zeroed out as this implementation adds vectors to the
+    // cpuIndex instead of copying fields
+    index->ntotal = 0;
 
     auto graph_degree = index_->get_knngraph_degree();
     auto M = graph_degree / 2;
-    index->reset();
+    if (index->storage and index->own_fields) {
+        delete index->storage;
+    }
+    index->storage = new IndexFlatL2(index->d);
+    index->own_fields = true;
+    index->keep_max_size_level0 = true;
+    index->hnsw.reset();
     index->hnsw.set_default_probas(M, 1.0 / log(M));
 
     auto n_train = this->ntotal;
diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu
index aac7dcc3f0..54987cd5f5 100644
--- a/faiss/gpu/test/TestGpuIndexCagra.cu
+++ b/faiss/gpu/test/TestGpuIndexCagra.cu
@@ -202,7 +202,7 @@ void copyToTest() {
                 &res, opt.dim, faiss::METRIC_L2, config);
         gpuIndex.train(opt.numTrain, trainVecs.data());
 
-        faiss::IndexHNSWCagra copiedCpuIndex(opt.dim, opt.graphDegree / 2);
+        faiss::IndexHNSWCagra copiedCpuIndex;
         gpuIndex.copyTo(&copiedCpuIndex);
         copiedCpuIndex.hnsw.efConstruction = opt.k * 2;
 

From 619c37662b0ec8148f060f395a45dfa75ac152bf Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Thu, 22 Feb 2024 14:58:11 -0800
Subject: [PATCH 018/148] fix failing binary hnsw tests

---
 faiss/gpu/GpuIndexCagra.cu | 2 ++
 faiss/impl/HNSW.cpp        | 2 --
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index b70ae78f79..541a1caa75 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -182,6 +182,8 @@ void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const {
     index->own_fields = true;
     index->keep_max_size_level0 = true;
     index->hnsw.reset();
+    index->hnsw.assign_probas.clear();
+    index->hnsw.cum_nneighbor_per_level.clear();
     index->hnsw.set_default_probas(M, 1.0 / log(M));
 
     auto n_train = this->ntotal;
diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp
index f1b00fd3e0..c886f7d5df 100644
--- a/faiss/impl/HNSW.cpp
+++ b/faiss/impl/HNSW.cpp
@@ -99,8 +99,6 @@ void HNSW::clear_neighbor_tables(int level) {
 void HNSW::reset() {
     max_level = -1;
     entry_point = -1;
-    assign_probas.clear();
-    cum_nneighbor_per_level.clear();
     offsets.clear();
     offsets.push_back(0);
     levels.clear();

From e25f8a4883271c0825f6372a9822455ec7b4b631 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Fri, 23 Feb 2024 09:48:04 -0800
Subject: [PATCH 019/148] link faiss_gpu target to OpenMP

---
 faiss/gpu/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
index d9e4775e02..ec72c48d7c 100644
--- a/faiss/gpu/CMakeLists.txt
+++ b/faiss/gpu/CMakeLists.txt
@@ -320,5 +320,5 @@ __nv_relfatbin : { *(__nv_relfatbin) }
 target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
 
 find_package(CUDAToolkit REQUIRED)
-target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass>)
+target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass> OpenMP::OpenMP_CXX)
 target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr>)

From e8351503c9d104ce81a4c48db1426bf35adf2fc0 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Fri, 23 Feb 2024 10:14:21 -0800
Subject: [PATCH 020/148] raft still can't find openmp

---
 faiss/gpu/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
index ec72c48d7c..f48a86d8e3 100644
--- a/faiss/gpu/CMakeLists.txt
+++ b/faiss/gpu/CMakeLists.txt
@@ -320,5 +320,5 @@ __nv_relfatbin : { *(__nv_relfatbin) }
 target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
 
 find_package(CUDAToolkit REQUIRED)
-target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass> OpenMP::OpenMP_CXX)
-target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr>)
+target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass> $<$<BOOL:${FAISS_ENABLE_RAFT}>:OpenMP::OpenMP_CXX>)
+target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr> $<$<BOOL:${FAISS_ENABLE_RAFT}>:-fopenmp>)

From aeabe122b5035e99b8be5d177a82e710704ec2f2 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Mon, 26 Feb 2024 15:10:51 -0800
Subject: [PATCH 021/148] openmp flags and uint32 IndexType

---
 faiss/gpu/CMakeLists.txt     |  2 +-
 faiss/gpu/impl/RaftCagra.cu  | 64 ++++++++++++++++++++++++++----------
 faiss/gpu/impl/RaftCagra.cuh |  4 +--
 3 files changed, 49 insertions(+), 21 deletions(-)

diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
index f48a86d8e3..d20f3b7f8e 100644
--- a/faiss/gpu/CMakeLists.txt
+++ b/faiss/gpu/CMakeLists.txt
@@ -321,4 +321,4 @@ target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
 
 find_package(CUDAToolkit REQUIRED)
 target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass> $<$<BOOL:${FAISS_ENABLE_RAFT}>:OpenMP::OpenMP_CXX>)
-target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr> $<$<BOOL:${FAISS_ENABLE_RAFT}>:-fopenmp>)
+target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$<BOOL:${FAISS_ENABLE_RAFT}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
diff --git a/faiss/gpu/impl/RaftCagra.cu b/faiss/gpu/impl/RaftCagra.cu
index 0a55901a1b..858bc4fe88 100644
--- a/faiss/gpu/impl/RaftCagra.cu
+++ b/faiss/gpu/impl/RaftCagra.cu
@@ -22,10 +22,12 @@
 
 #include <faiss/gpu/utils/DeviceUtils.h>
 #include <cstddef>
+#include <cstdint>
 #include <faiss/gpu/impl/RaftCagra.cuh>
 
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/device_resources.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
 #include <raft/neighbors/cagra.cuh>
 
 namespace faiss {
@@ -91,30 +93,47 @@ RaftCagra::RaftCagra(
 
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
+
     if (distances_on_gpu && knn_graph_on_gpu) {
+        raft_handle.sync_stream();
+        // Copying to host so that raft::neighbors::cagra::index
+        // creates an owning copy of the knn graph on device
+        auto knn_graph_copy =
+                raft::make_host_matrix<uint32_t, int64_t>(n, graph_degree);
+        thrust::copy(
+                thrust::device_ptr<const idx_t>(knn_graph),
+                thrust::device_ptr<const idx_t>(knn_graph + (n * graph_degree)),
+                knn_graph_copy.data_handle());
+
         auto distances_mds =
                 raft::make_device_matrix_view<const float, int64_t>(
                         distances, n, dim);
-        auto knn_graph_mds =
-                raft::make_device_matrix_view<const idx_t, int64_t>(
-                        knn_graph, n, graph_degree);
 
-        raft_knn_index = raft::neighbors::cagra::index<float, idx_t>(
+        raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
                 raft_handle,
                 raft::distance::DistanceType::L2Expanded,
                 distances_mds,
-                knn_graph_mds);
-    } else {
+                raft::make_const_mdspan(knn_graph_copy.view()));
+    } else if (!distances_on_gpu && !knn_graph_on_gpu) {
+        // copy idx_t (int64_t) host knn_graph to uint32_t host knn_graph
+        auto knn_graph_copy =
+                raft::make_host_matrix<uint32_t, int64_t>(n, graph_degree);
+        std::copy(
+                knn_graph,
+                knn_graph + (n * graph_degree),
+                knn_graph_copy.data_handle());
+
         auto distances_mds = raft::make_host_matrix_view<const float, int64_t>(
                 distances, n, dim);
-        auto knn_graph_mds = raft::make_host_matrix_view<const idx_t, int64_t>(
-                knn_graph, n, graph_degree);
 
-        raft_knn_index = raft::neighbors::cagra::index<float, idx_t>(
+        raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
                 raft_handle,
                 raft::distance::DistanceType::L2Expanded,
                 distances_mds,
-                knn_graph_mds);
+                raft::make_const_mdspan(knn_graph_copy.view()));
+    } else {
+        FAISS_THROW_MSG(
+                "distances and knn_graph must both be in device or host memory");
     }
 }
 
@@ -122,12 +141,12 @@ void RaftCagra::train(idx_t n, const float* x) {
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
     if (getDeviceForAddress(x) >= 0) {
-        raft_knn_index = raft::neighbors::cagra::build<float, idx_t>(
+        raft_knn_index = raft::neighbors::cagra::build<float, uint32_t>(
                 raft_handle,
                 index_pams_,
                 raft::make_device_matrix_view<const float, idx_t>(x, n, dim_));
     } else {
-        raft_knn_index = raft::neighbors::cagra::build<float, idx_t>(
+        raft_knn_index = raft::neighbors::cagra::build<float, uint32_t>(
                 raft_handle,
                 index_pams_,
                 raft::make_host_matrix_view<const float, idx_t>(x, n, dim_));
@@ -186,13 +205,21 @@ void RaftCagra::search(
     search_pams.num_random_samplings = num_random_samplings;
     search_pams.rand_xor_mask = rand_xor_mask;
 
+    auto indices_copy = raft::make_device_matrix<uint32_t, idx_t>(
+            raft_handle, numQueries, k_);
+
     raft::neighbors::cagra::search(
             raft_handle,
             search_pams,
             raft_knn_index.value(),
             queries_view,
-            indices_view,
+            indices_copy.view(),
             distances_view);
+    thrust::copy(
+            raft::resource::get_thrust_policy(raft_handle),
+            indices_copy.data_handle(),
+            indices_copy.data_handle() + indices_copy.size(),
+            indices_view.data_handle());
 }
 
 void RaftCagra::reset() {
@@ -215,13 +242,14 @@ std::vector<idx_t> RaftCagra::get_knngraph() const {
     std::vector<idx_t> host_graph(
             device_graph.extent(0) * device_graph.extent(1));
 
-    raft::update_host(
-            host_graph.data(),
-            device_graph.data_handle(),
-            host_graph.size(),
-            stream);
     raft_handle.sync_stream();
 
+    thrust::copy(
+            thrust::device_ptr<const uint32_t>(device_graph.data_handle()),
+            thrust::device_ptr<const uint32_t>(
+                    device_graph.data_handle() + device_graph.size()),
+            host_graph.data());
+
     return host_graph;
 }
 
diff --git a/faiss/gpu/impl/RaftCagra.cuh b/faiss/gpu/impl/RaftCagra.cuh
index 0fddb3b39f..6d0bf69c17 100644
--- a/faiss/gpu/impl/RaftCagra.cuh
+++ b/faiss/gpu/impl/RaftCagra.cuh
@@ -113,8 +113,8 @@ class RaftCagra {
     raft::neighbors::cagra::index_params index_pams_;
 
     /// Instance of trained RAFT CAGRA index
-    std::optional<raft::neighbors::cagra::index<float, idx_t>> raft_knn_index{
-            std::nullopt};
+    std::optional<raft::neighbors::cagra::index<float, uint32_t>>
+            raft_knn_index{std::nullopt};
 };
 
 } // namespace gpu

From 4e80586fd17c83f3d86d68bc96b0c5fab0495b01 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Mon, 26 Feb 2024 15:34:12 -0800
Subject: [PATCH 022/148] forgot conditional check in index_read

---
 faiss/impl/index_read.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp
index 8622b99c06..1b84f4a453 100644
--- a/faiss/impl/index_read.cpp
+++ b/faiss/impl/index_read.cpp
@@ -948,7 +948,7 @@ Index* read_index(IOReader* f, int io_flags) {
         idx = idxp;
     } else if (
             h == fourcc("IHNf") || h == fourcc("IHNp") || h == fourcc("IHNs") ||
-            h == fourcc("IHN2")) {
+            h == fourcc("IHN2") || h == fourcc("IHNc")) {
         IndexHNSW* idxhnsw = nullptr;
         if (h == fourcc("IHNf"))
             idxhnsw = new IndexHNSWFlat();

From c4bcabae3ba57eef938c215077d98f39a0b4926d Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Thu, 7 Mar 2024 09:33:24 -0800
Subject: [PATCH 023/148] minor changes

---
 faiss/IndexHNSW.cpp | 5 ++++-
 faiss/impl/HNSW.cpp | 8 +++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp
index 1589b10a6c..5fdc774b92 100644
--- a/faiss/IndexHNSW.cpp
+++ b/faiss/IndexHNSW.cpp
@@ -7,6 +7,8 @@
 
 // -*- c++ -*-
 
+#include <iostream>
+
 #include <faiss/IndexHNSW.h>
 
 #include <omp.h>
@@ -196,6 +198,7 @@ void hnsw_add_vertices(
              pt_level >= !index_hnsw.init_level0;
              pt_level--) {
             int i0 = i1 - hist[pt_level];
+            // std::cout << "level: " << pt_level << "points: " << hist[pt_level] << std::endl;
 
             if (verbose) {
                 printf("Adding %d elements at level %d\n", i1 - i0, pt_level);
@@ -236,7 +239,7 @@ void hnsw_add_vertices(
                             pt_id,
                             locks,
                             vt,
-                            index_hnsw.keep_max_size_level0);
+                            index_hnsw.keep_max_size_level0 && (pt_level == 0));
 
                     if (prev_display >= 0 && i - i0 > prev_display + 10000) {
                         prev_display = i - i0;
diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp
index c886f7d5df..f449c8446a 100644
--- a/faiss/impl/HNSW.cpp
+++ b/faiss/impl/HNSW.cpp
@@ -5,6 +5,8 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#include <iostream>
+
 #include <faiss/impl/HNSW.h>
 
 #include <cstddef>
@@ -263,7 +265,7 @@ void HNSW::shrink_neighbor_list(
         }
     }
     size_t idx = 0;
-    while (keep_max_size_level0 && output.size() < max_size) {
+    while (keep_max_size_level0 && (output.size() < max_size) && (idx < outsiders.size())) {
         output.push_back(outsiders[idx++]);
     }
 }
@@ -337,7 +339,7 @@ void add_link(
     }
 
     shrink_neighbor_list(
-            qdis, resultSet, end - begin, keep_max_size_level0 && level == 0);
+            qdis, resultSet, end - begin, keep_max_size_level0);
 
     // ...and back
     size_t i = begin;
@@ -458,7 +460,7 @@ void HNSW::add_links_starting_from(
     int M = nb_neighbors(level);
 
     ::faiss::shrink_neighbor_list(
-            ptdis, link_targets, M, keep_max_size_level0 && level == 0);
+            ptdis, link_targets, M, keep_max_size_level0);
 
     std::vector<storage_idx_t> neighbors;
     neighbors.reserve(link_targets.size());

From 341a3fcef1a07bb284c96cf564674900526f5f62 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Thu, 7 Mar 2024 09:35:32 -0800
Subject: [PATCH 024/148] api change

---
 faiss/gpu/impl/RaftIVFFlat.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu
index 1e310723d0..0906a60f46 100644
--- a/faiss/gpu/impl/RaftIVFFlat.cu
+++ b/faiss/gpu/impl/RaftIVFFlat.cu
@@ -403,7 +403,8 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) {
     }
 
     // Update the pointers and the sizes
-    raft_knn_index.value().recompute_internal_state(raft_handle);
+    raft::neighbors::ivf_flat::helpers::recompute_internal_state(
+            raft_handle, &(raft_knn_index.value()));
 
     for (size_t i = 0; i < nlist; ++i) {
         size_t listSize = ivf->list_size(i);

From 172aa6570e0b1a2a5b91cb60e98ce47f39ae6b9e Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 20 Mar 2024 16:06:08 -0700
Subject: [PATCH 025/148] working python

---
 faiss/python/CMakeLists.txt |  5 +++++
 faiss/python/swigfaiss.swig | 13 +++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt
index 8bca710f5f..a2a3fdddf7 100644
--- a/faiss/python/CMakeLists.txt
+++ b/faiss/python/CMakeLists.txt
@@ -38,6 +38,11 @@ macro(configure_swigfaiss source)
     set_source_files_properties(${source} PROPERTIES
       COMPILE_DEFINITIONS GPU_WRAPPER
     )
+    if (FAISS_ENABLE_RAFT)
+      set_source_files_properties(${source} PROPERTIES
+        COMPILE_DEFINITIONS FAISS_ENABLE_RAFT
+      )
+    endif()
   endif()
 endmacro()
 
diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig
index fb7f50dd2e..ab069002fa 100644
--- a/faiss/python/swigfaiss.swig
+++ b/faiss/python/swigfaiss.swig
@@ -304,6 +304,11 @@ void gpu_sync_all_devices();
 #include <faiss/gpu/GpuIndicesOptions.h>
 #include <faiss/gpu/GpuClonerOptions.h>
 #include <faiss/gpu/GpuIndex.h>
+
+#ifdef FAISS_ENABLE_RAFT
+#include <faiss/gpu/GpuIndexCagra.h>
+#endif
+
 #include <faiss/gpu/GpuIndexFlat.h>
 #include <faiss/gpu/GpuIndexIVF.h>
 #include <faiss/gpu/GpuIndexIVFPQ.h>
@@ -557,6 +562,11 @@ struct faiss::simd16uint16 {};
 %include  <faiss/gpu/GpuIndicesOptions.h>
 %include  <faiss/gpu/GpuClonerOptions.h>
 %include  <faiss/gpu/GpuIndex.h>
+
+#ifdef FAISS_ENABLE_RAFT
+%include  <faiss/gpu/GpuIndexCagra.h>
+#endif
+
 %include  <faiss/gpu/GpuIndexFlat.h>
 %include  <faiss/gpu/GpuIndexIVF.h>
 %include  <faiss/gpu/GpuIndexIVFPQ.h>
@@ -673,6 +683,9 @@ struct faiss::simd16uint16 {};
     DOWNCAST ( IndexRowwiseMinMax )
     DOWNCAST ( IndexRowwiseMinMaxFP16 )
 #ifdef GPU_WRAPPER
+#ifdef FAISS_ENABLE_RAFT
+    DOWNCAST_GPU ( GpuIndexCagra )
+#endif
     DOWNCAST_GPU ( GpuIndexIVFPQ )
     DOWNCAST_GPU ( GpuIndexIVFFlat )
     DOWNCAST_GPU ( GpuIndexIVFScalarQuantizer )

From 0cd684e19601d67351411c75bbe89c8205f11dce Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 20 Mar 2024 20:07:45 -0700
Subject: [PATCH 026/148] compile option to swig

---
 CMakeLists.txt              | 2 ++
 faiss/python/CMakeLists.txt | 2 +-
 faiss/python/swigfaiss.swig | 6 ------
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6cdc37c46f..39b5e18325 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,6 +46,8 @@ project(faiss
   LANGUAGES ${FAISS_LANGUAGES})
 include(GNUInstallDirs)
 
+set(CMAKE_INSTALL_PREFIX "$ENV{CONDA_PREFIX}")
+
 set(CMAKE_CXX_STANDARD 17)
 
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt
index a2a3fdddf7..9dc14e9837 100644
--- a/faiss/python/CMakeLists.txt
+++ b/faiss/python/CMakeLists.txt
@@ -39,7 +39,7 @@ macro(configure_swigfaiss source)
       COMPILE_DEFINITIONS GPU_WRAPPER
     )
     if (FAISS_ENABLE_RAFT)
-      set_source_files_properties(${source} PROPERTIES
+      set_property(SOURCE ${source} APPEND PROPERTY
         COMPILE_DEFINITIONS FAISS_ENABLE_RAFT
       )
     endif()
diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig
index ab069002fa..9bf5e83aee 100644
--- a/faiss/python/swigfaiss.swig
+++ b/faiss/python/swigfaiss.swig
@@ -304,11 +304,7 @@ void gpu_sync_all_devices();
 #include <faiss/gpu/GpuIndicesOptions.h>
 #include <faiss/gpu/GpuClonerOptions.h>
 #include <faiss/gpu/GpuIndex.h>
-
-#ifdef FAISS_ENABLE_RAFT
 #include <faiss/gpu/GpuIndexCagra.h>
-#endif
-
 #include <faiss/gpu/GpuIndexFlat.h>
 #include <faiss/gpu/GpuIndexIVF.h>
 #include <faiss/gpu/GpuIndexIVFPQ.h>
@@ -562,11 +558,9 @@ struct faiss::simd16uint16 {};
 %include  <faiss/gpu/GpuIndicesOptions.h>
 %include  <faiss/gpu/GpuClonerOptions.h>
 %include  <faiss/gpu/GpuIndex.h>
-
 #ifdef FAISS_ENABLE_RAFT
 %include  <faiss/gpu/GpuIndexCagra.h>
 #endif
-
 %include  <faiss/gpu/GpuIndexFlat.h>
 %include  <faiss/gpu/GpuIndexIVF.h>
 %include  <faiss/gpu/GpuIndexIVFPQ.h>

From 7ff8b3b6c1c93545bc8d4c0f7e6da1aaa87e69dd Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 3 Apr 2024 15:15:34 -0700
Subject: [PATCH 027/148] expose ivf pq params

---
 faiss/gpu/GpuIndexCagra.cu   |  37 +++++++++++-
 faiss/gpu/GpuIndexCagra.h    | 114 +++++++++++++++++++++++++++++++++++
 faiss/gpu/impl/RaftCagra.cu  | 104 +++++++++++++++++++++++++++-----
 faiss/gpu/impl/RaftCagra.cuh |  12 +++-
 4 files changed, 250 insertions(+), 17 deletions(-)

diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index 541a1caa75..916f774bc1 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -24,6 +24,7 @@
 #include <faiss/gpu/GpuIndexCagra.h>
 #include <cstddef>
 #include <faiss/gpu/impl/RaftCagra.cuh>
+#include <optional>
 #include "GpuIndexCagra.h"
 
 namespace faiss {
@@ -47,6 +48,38 @@ void GpuIndexCagra::train(idx_t n, const float* x) {
 
     FAISS_ASSERT(!index_);
 
+    std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_params =
+            std::nullopt;
+    std::optional<raft::neighbors::ivf_pq::search_params> ivf_pq_search_params =
+            std::nullopt;
+    if (cagraConfig_.ivf_pq_params != nullptr) {
+        ivf_pq_params =
+                std::make_optional<raft::neighbors::ivf_pq::index_params>();
+        ivf_pq_params->n_lists = cagraConfig_.ivf_pq_params->n_lists;
+        ivf_pq_params->kmeans_n_iters =
+                cagraConfig_.ivf_pq_params->kmeans_n_iters;
+        ivf_pq_params->kmeans_trainset_fraction =
+                cagraConfig_.ivf_pq_params->kmeans_trainset_fraction;
+        ivf_pq_params->pq_bits = cagraConfig_.ivf_pq_params->pq_bits;
+        ivf_pq_params->pq_dim = cagraConfig_.ivf_pq_params->pq_dim;
+        ivf_pq_params->codebook_kind =
+                static_cast<raft::neighbors::ivf_pq::codebook_gen>(
+                        cagraConfig_.ivf_pq_params->codebook_kind);
+        ivf_pq_params->force_random_rotation =
+                cagraConfig_.ivf_pq_params->force_random_rotation;
+        ivf_pq_params->conservative_memory_allocation =
+                cagraConfig_.ivf_pq_params->conservative_memory_allocation;
+    }
+    if (cagraConfig_.ivf_pq_search_params != nullptr) {
+        ivf_pq_search_params =
+                std::make_optional<raft::neighbors::ivf_pq::search_params>();
+        ivf_pq_search_params->n_probes =
+                cagraConfig_.ivf_pq_search_params->n_probes;
+        ivf_pq_search_params->lut_dtype =
+                cagraConfig_.ivf_pq_search_params->lut_dtype;
+        ivf_pq_search_params->preferred_shmem_carveout =
+                cagraConfig_.ivf_pq_search_params->preferred_shmem_carveout;
+    }
     index_ = std::make_shared<RaftCagra>(
             this->resources_.get(),
             this->d,
@@ -56,7 +89,9 @@ void GpuIndexCagra::train(idx_t n, const float* x) {
             cagraConfig_.nn_descent_niter,
             this->metric_type,
             this->metric_arg,
-            INDICES_64_BIT);
+            INDICES_64_BIT,
+            ivf_pq_params,
+            ivf_pq_search_params);
 
     index_->train(n, x);
 
diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
index 62c0b489fb..5c04259092 100644
--- a/faiss/gpu/GpuIndexCagra.h
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -23,6 +23,9 @@
 #pragma once
 
 #include <faiss/gpu/GpuIndex.h>
+#include <faiss/gpu/GpuIndexIVFPQ.h>
+#include <faiss/IndexIVF.h>
+#include "GpuIndexIVFPQ.h"
 
 namespace faiss {
 struct IndexHNSWCagra;
@@ -40,6 +43,114 @@ enum class graph_build_algo {
     NN_DESCENT
 };
 
+/** A type for specifying how PQ codebooks are created. */
+enum class codebook_gen {  // NOLINT
+  PER_SUBSPACE = 0,        // NOLINT
+  PER_CLUSTER  = 1,        // NOLINT
+};
+
+struct IVFPQBuildCagraConfig {
+  /**
+   * The number of inverted lists (clusters)
+   *
+   * Hint: the number of vectors per cluster (`n_rows/n_lists`) should be approximately 1,000 to
+   * 10,000.
+   */
+  uint32_t n_lists = 1024;
+  /** The number of iterations searching for kmeans centers (index building). */
+  uint32_t kmeans_n_iters = 20;
+  /** The fraction of data to use during iterative kmeans building. */
+  double kmeans_trainset_fraction = 0.5;
+  /**
+   * The bit length of the vector element after compression by PQ.
+   *
+   * Possible values: [4, 5, 6, 7, 8].
+   *
+   * Hint: the smaller the 'pq_bits', the smaller the index size and the better the search
+   * performance, but the lower the recall.
+   */
+  uint32_t pq_bits = 8;
+  /**
+   * The dimensionality of the vector after compression by PQ. When zero, an optimal value is
+   * selected using a heuristic.
+   *
+   * NB: `pq_dim * pq_bits` must be a multiple of 8.
+   *
+   * Hint: a smaller 'pq_dim' results in a smaller index size and better search performance, but
+   * lower recall. If 'pq_bits' is 8, 'pq_dim' can be set to any number, but multiples of 8 are
+   * desirable for good performance. If 'pq_bits' is not 8, 'pq_dim' should be a multiple of 8.
+   * For good performance, it is desirable that 'pq_dim' is a multiple of 32. Ideally, 'pq_dim'
+   * should be also a divisor of the dataset dim.
+   */
+  uint32_t pq_dim = 0;
+  /** How PQ codebooks are created. */
+  codebook_gen codebook_kind = codebook_gen::PER_SUBSPACE;
+  /**
+   * Apply a random rotation matrix on the input data and queries even if `dim % pq_dim == 0`.
+   *
+   * Note: if `dim` is not multiple of `pq_dim`, a random rotation is always applied to the input
+   * data and queries to transform the working space from `dim` to `rot_dim`, which may be slightly
+   * larger than the original space and is a multiple of `pq_dim` (`rot_dim % pq_dim == 0`).
+   * However, this transform is not necessary when `dim` is multiple of `pq_dim`
+   *   (`dim == rot_dim`, hence no need in adding "extra" data columns / features).
+   *
+   * By default, if `dim == rot_dim`, the rotation transform is initialized with the identity
+   * matrix. When `force_random_rotation == true`, a random orthogonal transform matrix is generated
+   * regardless of the values of `dim` and `pq_dim`.
+   */
+  bool force_random_rotation = false;
+  /**
+   * By default, the algorithm allocates more space than necessary for individual clusters
+   * (`list_data`). This allows to amortize the cost of memory allocation and reduce the number of
+   * data copies during repeated calls to `extend` (extending the database).
+   *
+   * The alternative is the conservative allocation behavior; when enabled, the algorithm always
+   * allocates the minimum amount of memory required to store the given number of records. Set this
+   * flag to `true` if you prefer to use as little GPU memory for the database as possible.
+   */
+  bool conservative_memory_allocation = false;
+};
+
+struct IVFPQSearchCagraConfig {
+  /** The number of clusters to search. */
+  uint32_t n_probes = 20;
+  /**
+   * Data type of look up table to be created dynamically at search time.
+   *
+   * Possible values: [CUDA_R_32F, CUDA_R_16F, CUDA_R_8U]
+   *
+   * The use of low-precision types reduces the amount of shared memory required at search time, so
+   * fast shared memory kernels can be used even for datasets with large dimensionality. Note that
+   * the recall is slightly degraded when low-precision type is selected.
+   */
+  cudaDataType_t lut_dtype = CUDA_R_32F;
+  /**
+   * Storage data type for distance/similarity computed at search time.
+   *
+   * Possible values: [CUDA_R_16F, CUDA_R_32F]
+   *
+   * If the performance limiter at search time is device memory access, selecting FP16 will improve
+   * performance slightly.
+   */
+  cudaDataType_t internal_distance_dtype = CUDA_R_32F;
+  /**
+   * Preferred fraction of SM's unified memory / L1 cache to be used as shared memory.
+   *
+   * Possible values: [0.0 - 1.0] as a fraction of the `sharedMemPerMultiprocessor`.
+   *
+   * One wants to increase the carveout to ensure good GPU occupancy for the main search
+   * kernel, but not to keep it too high to leave some memory to be used as L1 cache. Note, this
+   * value is interpreted only as a hint. Moreover, a GPU usually allows only a fixed set of cache
+   * configurations, so the provided value is rounded up to the nearest configuration. Refer to the
+   * NVIDIA tuning guide for the target GPU architecture.
+   *
+   * Note, this is a low-level tuning parameter that can have drastic negative effects on the search
+   * performance if tweaked incorrectly.
+   */
+  double preferred_shmem_carveout = 1.0;
+};
+
+
 struct GpuIndexCagraConfig : public GpuIndexConfig {
     /** Degree of input graph for pruning. */
     size_t intermediate_graph_degree = 128;
@@ -49,6 +160,9 @@ struct GpuIndexCagraConfig : public GpuIndexConfig {
     graph_build_algo build_algo = graph_build_algo::IVF_PQ;
     /** Number of Iterations to run if building with NN_DESCENT */
     size_t nn_descent_niter = 20;
+
+    IVFPQBuildCagraConfig *ivf_pq_params = nullptr;
+    IVFPQSearchCagraConfig *ivf_pq_search_params = nullptr;
 };
 
 enum class search_algo {
diff --git a/faiss/gpu/impl/RaftCagra.cu b/faiss/gpu/impl/RaftCagra.cu
index 858bc4fe88..c2657103f2 100644
--- a/faiss/gpu/impl/RaftCagra.cu
+++ b/faiss/gpu/impl/RaftCagra.cu
@@ -28,6 +28,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/device_resources.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
+#include <optional>
 #include <raft/neighbors/cagra.cuh>
 
 namespace faiss {
@@ -42,12 +43,17 @@ RaftCagra::RaftCagra(
         size_t nn_descent_niter,
         faiss::MetricType metric,
         float metricArg,
-        IndicesOptions indicesOptions)
+        IndicesOptions indicesOptions,
+        std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_params,
+        std::optional<raft::neighbors::ivf_pq::search_params>
+                ivf_pq_search_params)
         : resources_(resources),
           dim_(dim),
           metric_(metric),
           metricArg_(metricArg),
-          index_pams_() {
+          index_pams_(),
+          ivf_pq_params_(ivf_pq_params),
+          ivf_pq_search_params_(ivf_pq_search_params) {
     FAISS_THROW_IF_NOT_MSG(
             metric == faiss::METRIC_L2,
             "CAGRA currently only supports L2 metric.");
@@ -62,6 +68,15 @@ RaftCagra::RaftCagra(
                     graph_build_algo);
     index_pams_.nn_descent_niter = nn_descent_niter;
 
+    if (!ivf_pq_params_) {
+        ivf_pq_params_ =
+                std::make_optional<raft::neighbors::ivf_pq::index_params>();
+    }
+    if (!ivf_pq_search_params_) {
+        ivf_pq_search_params_ =
+                std::make_optional<raft::neighbors::ivf_pq::search_params>();
+    }
+
     reset();
 }
 
@@ -140,16 +155,75 @@ RaftCagra::RaftCagra(
 void RaftCagra::train(idx_t n, const float* x) {
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
-    if (getDeviceForAddress(x) >= 0) {
-        raft_knn_index = raft::neighbors::cagra::build<float, uint32_t>(
-                raft_handle,
-                index_pams_,
-                raft::make_device_matrix_view<const float, idx_t>(x, n, dim_));
+    if (index_pams_.build_algo ==
+        raft::neighbors::cagra::graph_build_algo::IVF_PQ) {
+        std::optional<raft::host_matrix<uint32_t, int64_t>> knn_graph(
+                raft::make_host_matrix<uint32_t, int64_t>(
+                        n, index_pams_.intermediate_graph_degree));
+        if (getDeviceForAddress(x) >= 0) {
+            auto dataset_d =
+                    raft::make_device_matrix_view<const float, int64_t>(
+                            x, n, dim_);
+            raft::neighbors::cagra::build_knn_graph(
+                    raft_handle,
+                    dataset_d,
+                    knn_graph->view(),
+                    1.0f,
+                    ivf_pq_params_,
+                    ivf_pq_search_params_);
+        } else {
+            auto dataset_h = raft::make_host_matrix_view<const float, int64_t>(
+                    x, n, dim_);
+            raft::neighbors::cagra::build_knn_graph(
+                    raft_handle,
+                    dataset_h,
+                    knn_graph->view(),
+                    1.0f,
+                    ivf_pq_params_,
+                    ivf_pq_search_params_);
+        }
+        auto cagra_graph = raft::make_host_matrix<uint32_t, int64_t>(
+                n, index_pams_.graph_degree);
+
+        raft::neighbors::cagra::optimize<uint32_t>(
+                raft_handle, knn_graph->view(), cagra_graph.view());
+
+        // free intermediate graph before trying to create the index
+        knn_graph.reset();
+
+        if (getDeviceForAddress(x) >= 0) {
+            auto dataset_d =
+                    raft::make_device_matrix_view<const float, int64_t>(
+                            x, n, dim_);
+            raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
+                    raft_handle,
+                    index_pams_.metric,
+                    dataset_d,
+                    raft::make_const_mdspan(cagra_graph.view()));
+        } else {
+            auto dataset_h = raft::make_host_matrix_view<const float, int64_t>(
+                    x, n, dim_);
+            raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
+                    raft_handle,
+                    index_pams_.metric,
+                    dataset_h,
+                    raft::make_const_mdspan(cagra_graph.view()));
+        }
+
     } else {
-        raft_knn_index = raft::neighbors::cagra::build<float, uint32_t>(
-                raft_handle,
-                index_pams_,
-                raft::make_host_matrix_view<const float, idx_t>(x, n, dim_));
+        if (getDeviceForAddress(x) >= 0) {
+            raft_knn_index = raft::neighbors::cagra::build<float, uint32_t>(
+                    raft_handle,
+                    index_pams_,
+                    raft::make_device_matrix_view<const float, int64_t>(
+                            x, n, dim_));
+        } else {
+            raft_knn_index = raft::neighbors::cagra::build<float, uint32_t>(
+                    raft_handle,
+                    index_pams_,
+                    raft::make_host_matrix_view<const float, int64_t>(
+                            x, n, dim_));
+        }
     }
 }
 
@@ -181,11 +255,11 @@ void RaftCagra::search(
     FAISS_ASSERT(numQueries > 0);
     FAISS_ASSERT(cols == dim_);
 
-    auto queries_view = raft::make_device_matrix_view<const float, idx_t>(
+    auto queries_view = raft::make_device_matrix_view<const float, int64_t>(
             queries.data(), numQueries, cols);
-    auto distances_view = raft::make_device_matrix_view<float, idx_t>(
+    auto distances_view = raft::make_device_matrix_view<float, int64_t>(
             outDistances.data(), numQueries, k_);
-    auto indices_view = raft::make_device_matrix_view<idx_t, idx_t>(
+    auto indices_view = raft::make_device_matrix_view<idx_t, int64_t>(
             outIndices.data(), numQueries, k_);
 
     raft::neighbors::cagra::search_params search_pams;
@@ -205,7 +279,7 @@ void RaftCagra::search(
     search_pams.num_random_samplings = num_random_samplings;
     search_pams.rand_xor_mask = rand_xor_mask;
 
-    auto indices_copy = raft::make_device_matrix<uint32_t, idx_t>(
+    auto indices_copy = raft::make_device_matrix<uint32_t, int64_t>(
             raft_handle, numQueries, k_);
 
     raft::neighbors::cagra::search(
diff --git a/faiss/gpu/impl/RaftCagra.cuh b/faiss/gpu/impl/RaftCagra.cuh
index 6d0bf69c17..d75ca29fc1 100644
--- a/faiss/gpu/impl/RaftCagra.cuh
+++ b/faiss/gpu/impl/RaftCagra.cuh
@@ -26,10 +26,12 @@
 #include <faiss/gpu/GpuResources.h>
 #include <cstddef>
 #include <faiss/gpu/utils/Tensor.cuh>
+#include <optional>
 
 #include <faiss/MetricType.h>
 
 #include <raft/neighbors/cagra_types.hpp>
+#include <raft/neighbors/ivf_pq_types.hpp>
 
 namespace faiss {
 
@@ -52,7 +54,11 @@ class RaftCagra {
             size_t nn_descent_niter,
             faiss::MetricType metric,
             float metricArg,
-            IndicesOptions indicesOptions);
+            IndicesOptions indicesOptions,
+            std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_params =
+                    std::nullopt,
+            std::optional<raft::neighbors::ivf_pq::search_params>
+                    ivf_pq_search_params = std::nullopt);
 
     RaftCagra(
             GpuResources* resources,
@@ -112,6 +118,10 @@ class RaftCagra {
     /// Parameters to build RAFT CAGRA index
     raft::neighbors::cagra::index_params index_pams_;
 
+    /// Parameters to build CAGRA graph using IVF PQ
+    std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_params_;
+    std::optional<raft::neighbors::ivf_pq::search_params> ivf_pq_search_params_;
+
     /// Instance of trained RAFT CAGRA index
     std::optional<raft::neighbors::cagra::index<float, uint32_t>>
             raft_knn_index{std::nullopt};

From 66d236f52f64707cc29012ecd57ae7a481c34959 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Mon, 22 Apr 2024 13:53:01 -0700
Subject: [PATCH 028/148] update comments style

---
 faiss/gpu/GpuIndexCagra.h    | 230 +++++++++++++++++------------------
 faiss/gpu/impl/RaftCagra.cuh |   1 +
 2 files changed, 116 insertions(+), 115 deletions(-)

diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
index 5c04259092..0bccc27562 100644
--- a/faiss/gpu/GpuIndexCagra.h
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -37,138 +37,138 @@ namespace gpu {
 class RaftCagra;
 
 enum class graph_build_algo {
-    /* Use IVF-PQ to build all-neighbors knn graph */
+    /// Use IVF-PQ to build all-neighbors knn graph 
     IVF_PQ,
-    /* Experimental, use NN-Descent to build all-neighbors knn graph */
+    /// Experimental, use NN-Descent to build all-neighbors knn graph 
     NN_DESCENT
 };
 
-/** A type for specifying how PQ codebooks are created. */
+/// A type for specifying how PQ codebooks are created. 
 enum class codebook_gen {  // NOLINT
   PER_SUBSPACE = 0,        // NOLINT
   PER_CLUSTER  = 1,        // NOLINT
 };
 
 struct IVFPQBuildCagraConfig {
-  /**
-   * The number of inverted lists (clusters)
-   *
-   * Hint: the number of vectors per cluster (`n_rows/n_lists`) should be approximately 1,000 to
-   * 10,000.
-   */
+  ///
+  /// The number of inverted lists (clusters)
+  ///
+  /// Hint: the number of vectors per cluster (`n_rows/n_lists`) should be approximately 1,000 to
+  /// 10,000.
+   
   uint32_t n_lists = 1024;
-  /** The number of iterations searching for kmeans centers (index building). */
+  /// The number of iterations searching for kmeans centers (index building). 
   uint32_t kmeans_n_iters = 20;
-  /** The fraction of data to use during iterative kmeans building. */
+  /// The fraction of data to use during iterative kmeans building. 
   double kmeans_trainset_fraction = 0.5;
-  /**
-   * The bit length of the vector element after compression by PQ.
-   *
-   * Possible values: [4, 5, 6, 7, 8].
-   *
-   * Hint: the smaller the 'pq_bits', the smaller the index size and the better the search
-   * performance, but the lower the recall.
-   */
+  ///
+  /// The bit length of the vector element after compression by PQ.
+  ///
+  /// Possible values: [4, 5, 6, 7, 8].
+  ///
+  /// Hint: the smaller the 'pq_bits', the smaller the index size and the better the search
+  /// performance, but the lower the recall.
+   
   uint32_t pq_bits = 8;
-  /**
-   * The dimensionality of the vector after compression by PQ. When zero, an optimal value is
-   * selected using a heuristic.
-   *
-   * NB: `pq_dim * pq_bits` must be a multiple of 8.
-   *
-   * Hint: a smaller 'pq_dim' results in a smaller index size and better search performance, but
-   * lower recall. If 'pq_bits' is 8, 'pq_dim' can be set to any number, but multiple of 8 are
-   * desirable for good performance. If 'pq_bits' is not 8, 'pq_dim' should be a multiple of 8.
-   * For good performance, it is desirable that 'pq_dim' is a multiple of 32. Ideally, 'pq_dim'
-   * should be also a divisor of the dataset dim.
-   */
+  ///
+  /// The dimensionality of the vector after compression by PQ. When zero, an optimal value is
+  /// selected using a heuristic.
+  ///
+  /// NB: `pq_dim/// pq_bits` must be a multiple of 8.
+  ///
+  /// Hint: a smaller 'pq_dim' results in a smaller index size and better search performance, but
+  /// lower recall. If 'pq_bits' is 8, 'pq_dim' can be set to any number, but multiple of 8 are
+  /// desirable for good performance. If 'pq_bits' is not 8, 'pq_dim' should be a multiple of 8.
+  /// For good performance, it is desirable that 'pq_dim' is a multiple of 32. Ideally, 'pq_dim'
+  /// should be also a divisor of the dataset dim.
+   
   uint32_t pq_dim = 0;
-  /** How PQ codebooks are created. */
+  /// How PQ codebooks are created. 
   codebook_gen codebook_kind = codebook_gen::PER_SUBSPACE;
-  /**
-   * Apply a random rotation matrix on the input data and queries even if `dim % pq_dim == 0`.
-   *
-   * Note: if `dim` is not multiple of `pq_dim`, a random rotation is always applied to the input
-   * data and queries to transform the working space from `dim` to `rot_dim`, which may be slightly
-   * larger than the original space and and is a multiple of `pq_dim` (`rot_dim % pq_dim == 0`).
-   * However, this transform is not necessary when `dim` is multiple of `pq_dim`
-   *   (`dim == rot_dim`, hence no need in adding "extra" data columns / features).
-   *
-   * By default, if `dim == rot_dim`, the rotation transform is initialized with the identity
-   * matrix. When `force_random_rotation == true`, a random orthogonal transform matrix is generated
-   * regardless of the values of `dim` and `pq_dim`.
-   */
+  ///
+  /// Apply a random rotation matrix on the input data and queries even if `dim % pq_dim == 0`.
+  ///
+  /// Note: if `dim` is not multiple of `pq_dim`, a random rotation is always applied to the input
+  /// data and queries to transform the working space from `dim` to `rot_dim`, which may be slightly
+  /// larger than the original space and and is a multiple of `pq_dim` (`rot_dim % pq_dim == 0`).
+  /// However, this transform is not necessary when `dim` is multiple of `pq_dim`
+  ///   (`dim == rot_dim`, hence no need in adding "extra" data columns / features).
+  ///
+  /// By default, if `dim == rot_dim`, the rotation transform is initialized with the identity
+  /// matrix. When `force_random_rotation == true`, a random orthogonal transform matrix is generated
+  /// regardless of the values of `dim` and `pq_dim`.
+   
   bool force_random_rotation = false;
-  /**
-   * By default, the algorithm allocates more space than necessary for individual clusters
-   * (`list_data`). This allows to amortize the cost of memory allocation and reduce the number of
-   * data copies during repeated calls to `extend` (extending the database).
-   *
-   * The alternative is the conservative allocation behavior; when enabled, the algorithm always
-   * allocates the minimum amount of memory required to store the given number of records. Set this
-   * flag to `true` if you prefer to use as little GPU memory for the database as possible.
-   */
+  ///
+  /// By default, the algorithm allocates more space than necessary for individual clusters
+  /// (`list_data`). This allows to amortize the cost of memory allocation and reduce the number of
+  /// data copies during repeated calls to `extend` (extending the database).
+  ///
+  /// The alternative is the conservative allocation behavior; when enabled, the algorithm always
+  /// allocates the minimum amount of memory required to store the given number of records. Set this
+  /// flag to `true` if you prefer to use as little GPU memory for the database as possible.
+   
   bool conservative_memory_allocation = false;
 };
 
 struct IVFPQSearchCagraConfig {
-  /** The number of clusters to search. */
+  /// The number of clusters to search. 
   uint32_t n_probes = 20;
-  /**
-   * Data type of look up table to be created dynamically at search time.
-   *
-   * Possible values: [CUDA_R_32F, CUDA_R_16F, CUDA_R_8U]
-   *
-   * The use of low-precision types reduces the amount of shared memory required at search time, so
-   * fast shared memory kernels can be used even for datasets with large dimansionality. Note that
-   * the recall is slightly degraded when low-precision type is selected.
-   */
+  ///
+  /// Data type of look up table to be created dynamically at search time.
+  ///
+  /// Possible values: [CUDA_R_32F, CUDA_R_16F, CUDA_R_8U]
+  ///
+  /// The use of low-precision types reduces the amount of shared memory required at search time, so
+  /// fast shared memory kernels can be used even for datasets with large dimansionality. Note that
+  /// the recall is slightly degraded when low-precision type is selected.
+   
   cudaDataType_t lut_dtype = CUDA_R_32F;
-  /**
-   * Storage data type for distance/similarity computed at search time.
-   *
-   * Possible values: [CUDA_R_16F, CUDA_R_32F]
-   *
-   * If the performance limiter at search time is device memory access, selecting FP16 will improve
-   * performance slightly.
-   */
+  ///
+  /// Storage data type for distance/similarity computed at search time.
+  ///
+  /// Possible values: [CUDA_R_16F, CUDA_R_32F]
+  ///
+  /// If the performance limiter at search time is device memory access, selecting FP16 will improve
+  /// performance slightly.
+   
   cudaDataType_t internal_distance_dtype = CUDA_R_32F;
-  /**
-   * Preferred fraction of SM's unified memory / L1 cache to be used as shared memory.
-   *
-   * Possible values: [0.0 - 1.0] as a fraction of the `sharedMemPerMultiprocessor`.
-   *
-   * One wants to increase the carveout to make sure a good GPU occupancy for the main search
-   * kernel, but not to keep it too high to leave some memory to be used as L1 cache. Note, this
-   * value is interpreted only as a hint. Moreover, a GPU usually allows only a fixed set of cache
-   * configurations, so the provided value is rounded up to the nearest configuration. Refer to the
-   * NVIDIA tuning guide for the target GPU architecture.
-   *
-   * Note, this is a low-level tuning parameter that can have drastic negative effects on the search
-   * performance if tweaked incorrectly.
-   */
+  ///
+  /// Preferred fraction of SM's unified memory / L1 cache to be used as shared memory.
+  ///
+  /// Possible values: [0.0 - 1.0] as a fraction of the `sharedMemPerMultiprocessor`.
+  ///
+  /// One wants to increase the carveout to make sure a good GPU occupancy for the main search
+  /// kernel, but not to keep it too high to leave some memory to be used as L1 cache. Note, this
+  /// value is interpreted only as a hint. Moreover, a GPU usually allows only a fixed set of cache
+  /// configurations, so the provided value is rounded up to the nearest configuration. Refer to the
+  /// NVIDIA tuning guide for the target GPU architecture.
+  ///
+  /// Note, this is a low-level tuning parameter that can have drastic negative effects on the search
+  /// performance if tweaked incorrectly.
+   
   double preferred_shmem_carveout = 1.0;
 };
 
 
 struct GpuIndexCagraConfig : public GpuIndexConfig {
-    /** Degree of input graph for pruning. */
+    /// Degree of input graph for pruning. 
     size_t intermediate_graph_degree = 128;
-    /** Degree of output graph. */
+    /// Degree of output graph. 
     size_t graph_degree = 64;
-    /** ANN algorithm to build knn graph. */
+    /// ANN algorithm to build knn graph. 
     graph_build_algo build_algo = graph_build_algo::IVF_PQ;
-    /** Number of Iterations to run if building with NN_DESCENT */
+    /// Number of Iterations to run if building with NN_DESCENT 
     size_t nn_descent_niter = 20;
 
-    IVFPQBuildCagraConfig *ivf_pq_params = nullptr;
-    IVFPQSearchCagraConfig *ivf_pq_search_params = nullptr;
+    IVFPQBuildCagraConfig///ivf_pq_params = nullptr;
+    IVFPQSearchCagraConfig///ivf_pq_search_params = nullptr;
 };
 
 enum class search_algo {
-    /** For large batch sizes. */
+    /// For large batch sizes. 
     SINGLE_CTA,
-    /** For small batch sizes. */
+    /// For small batch sizes. 
     MULTI_CTA,
     MULTI_KERNEL,
     AUTO
@@ -177,49 +177,49 @@ enum class search_algo {
 enum class hash_mode { HASH, SMALL, AUTO };
 
 struct SearchParametersCagra : SearchParameters {
-    /** Maximum number of queries to search at the same time (batch size). Auto
-     * select when 0.*/
+    /// Maximum number of queries to search at the same time (batch size). Auto
+    /// select when 0.
     size_t max_queries = 0;
 
-    /** Number of intermediate search results retained during the search.
-     *
-     *  This is the main knob to adjust trade off between accuracy and search
-     * speed. Higher values improve the search accuracy.
-     */
+    /// Number of intermediate search results retained during the search.
+    ///
+    ///  This is the main knob to adjust trade off between accuracy and search
+    /// speed. Higher values improve the search accuracy.
+     
     size_t itopk_size = 64;
 
-    /** Upper limit of search iterations. Auto select when 0.*/
+    /// Upper limit of search iterations. Auto select when 0.
     size_t max_iterations = 0;
 
     // In the following we list additional search parameters for fine tuning.
     // Reasonable default values are automatically chosen.
 
-    /** Which search implementation to use. */
+    /// Which search implementation to use. 
     search_algo algo = search_algo::AUTO;
 
-    /** Number of threads used to calculate a single distance. 4, 8, 16, or 32.
-     */
+    /// Number of threads used to calculate a single distance. 4, 8, 16, or 32.
+     
     size_t team_size = 0;
 
-    /** Number of graph nodes to select as the starting point for the search in
-     * each iteration. aka search width?*/
+    /// Number of graph nodes to select as the starting point for the search in
+    /// each iteration. aka search width?
     size_t search_width = 1;
-    /** Lower limit of search iterations. */
+    /// Lower limit of search iterations. 
     size_t min_iterations = 0;
 
-    /** Thread block size. 0, 64, 128, 256, 512, 1024. Auto selection when 0. */
+    /// Thread block size. 0, 64, 128, 256, 512, 1024. Auto selection when 0. 
     size_t thread_block_size = 0;
-    /** Hashmap type. Auto selection when AUTO. */
+    /// Hashmap type. Auto selection when AUTO. 
     hash_mode hashmap_mode = hash_mode::AUTO;
-    /** Lower limit of hashmap bit length. More than 8. */
+    /// Lower limit of hashmap bit length. More than 8. 
     size_t hashmap_min_bitlen = 0;
-    /** Upper limit of hashmap fill rate. More than 0.1, less than 0.9.*/
+    /// Upper limit of hashmap fill rate. More than 0.1, less than 0.9.
     float hashmap_max_fill_rate = 0.5;
 
-    /** Number of iterations of initial random seed node selection. 1 or more.
-     */
+    /// Number of iterations of initial random seed node selection. 1 or more.
+     
     uint32_t num_random_samplings = 1;
-    /** Bit mask used for initial random seed node selection. */
+    /// Bit mask used for initial random seed node selection. 
     uint64_t rand_xor_mask = 0x128394;
 };
 
diff --git a/faiss/gpu/impl/RaftCagra.cuh b/faiss/gpu/impl/RaftCagra.cuh
index d75ca29fc1..878198d609 100644
--- a/faiss/gpu/impl/RaftCagra.cuh
+++ b/faiss/gpu/impl/RaftCagra.cuh
@@ -35,6 +35,7 @@
 
 namespace faiss {
 
+/// Algorithm used to build underlying CAGRA graph
 enum class cagra_build_algo { IVF_PQ, NN_DESCENT };
 
 enum class cagra_search_algo { SINGLE_CTA, MULTI_CTA };

From 1d6e6b1674a1219483e4195eda6dd4f310bab099 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Mon, 22 Apr 2024 14:26:40 -0700
Subject: [PATCH 029/148] use raft::runtime where possible

---
 faiss/gpu/GpuIndexCagra.h   | 4 ++--
 faiss/gpu/impl/RaftCagra.cu | 7 ++++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
index 0bccc27562..cf34ec1900 100644
--- a/faiss/gpu/GpuIndexCagra.h
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -161,8 +161,8 @@ struct GpuIndexCagraConfig : public GpuIndexConfig {
     /// Number of Iterations to run if building with NN_DESCENT 
     size_t nn_descent_niter = 20;
 
-    IVFPQBuildCagraConfig///ivf_pq_params = nullptr;
-    IVFPQSearchCagraConfig///ivf_pq_search_params = nullptr;
+    IVFPQBuildCagraConfig *ivf_pq_params = nullptr;
+    IVFPQSearchCagraConfig *ivf_pq_search_params = nullptr;
 };
 
 enum class search_algo {
diff --git a/faiss/gpu/impl/RaftCagra.cu b/faiss/gpu/impl/RaftCagra.cu
index c2657103f2..2f51b6b35a 100644
--- a/faiss/gpu/impl/RaftCagra.cu
+++ b/faiss/gpu/impl/RaftCagra.cu
@@ -28,6 +28,7 @@
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/device_resources.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
+#include <raft_runtime/neighbors/cagra.hpp>
 #include <optional>
 #include <raft/neighbors/cagra.cuh>
 
@@ -212,13 +213,13 @@ void RaftCagra::train(idx_t n, const float* x) {
 
     } else {
         if (getDeviceForAddress(x) >= 0) {
-            raft_knn_index = raft::neighbors::cagra::build<float, uint32_t>(
+            raft_knn_index = raft::runtime::neighbors::cagra::build(
                     raft_handle,
                     index_pams_,
                     raft::make_device_matrix_view<const float, int64_t>(
                             x, n, dim_));
         } else {
-            raft_knn_index = raft::neighbors::cagra::build<float, uint32_t>(
+            raft_knn_index = raft::runtime::neighbors::cagra::build(
                     raft_handle,
                     index_pams_,
                     raft::make_host_matrix_view<const float, int64_t>(
@@ -282,7 +283,7 @@ void RaftCagra::search(
     auto indices_copy = raft::make_device_matrix<uint32_t, int64_t>(
             raft_handle, numQueries, k_);
 
-    raft::neighbors::cagra::search(
+    raft::runtime::neighbors::cagra::search(
             raft_handle,
             search_pams,
             raft_knn_index.value(),

From 4a01ad4a4cbdf1c3da8d9d2e3693d67b2e15fcd1 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Mon, 22 Apr 2024 14:39:10 -0700
Subject: [PATCH 030/148] format

---
 faiss/IndexHNSW.cpp        |   1 -
 faiss/gpu/GpuIndexCagra.h  | 180 ++++++++++++++++++-------------------
 faiss/impl/HNSW.cpp        |   9 +-
 faiss/impl/index_write.cpp |   2 +-
 4 files changed, 95 insertions(+), 97 deletions(-)

diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp
index 5c542f3e89..f6a2ca9587 100644
--- a/faiss/IndexHNSW.cpp
+++ b/faiss/IndexHNSW.cpp
@@ -198,7 +198,6 @@ void hnsw_add_vertices(
              pt_level >= !index_hnsw.init_level0;
              pt_level--) {
             int i0 = i1 - hist[pt_level];
-            // std::cout << "level: " << pt_level << "points: " << hist[pt_level] << std::endl;
 
             if (verbose) {
                 printf("Adding %d elements at level %d\n", i1 - i0, pt_level);
diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
index cf34ec1900..563208857c 100644
--- a/faiss/gpu/GpuIndexCagra.h
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -22,9 +22,9 @@
 
 #pragma once
 
+#include <faiss/IndexIVF.h>
 #include <faiss/gpu/GpuIndex.h>
 #include <faiss/gpu/GpuIndexIVFPQ.h>
-#include <faiss/IndexIVF.h>
 #include "GpuIndexIVFPQ.h"
 
 namespace faiss {
@@ -37,128 +37,128 @@ namespace gpu {
 class RaftCagra;
 
 enum class graph_build_algo {
-    /// Use IVF-PQ to build all-neighbors knn graph 
+    /// Use IVF-PQ to build all-neighbors knn graph
     IVF_PQ,
-    /// Experimental, use NN-Descent to build all-neighbors knn graph 
+    /// Experimental, use NN-Descent to build all-neighbors knn graph
     NN_DESCENT
 };
 
-/// A type for specifying how PQ codebooks are created. 
+/// A type for specifying how PQ codebooks are created.
 enum class codebook_gen {  // NOLINT
   PER_SUBSPACE = 0,        // NOLINT
   PER_CLUSTER  = 1,        // NOLINT
 };
 
 struct IVFPQBuildCagraConfig {
-  ///
-  /// The number of inverted lists (clusters)
-  ///
-  /// Hint: the number of vectors per cluster (`n_rows/n_lists`) should be approximately 1,000 to
-  /// 10,000.
+    ///
+    /// The number of inverted lists (clusters)
+    ///
+    /// Hint: the number of vectors per cluster (`n_rows/n_lists`) should be approximately 1,000 to
+    /// 10,000.
    
   uint32_t n_lists = 1024;
-  /// The number of iterations searching for kmeans centers (index building). 
+    /// The number of iterations searching for kmeans centers (index building). 
   uint32_t kmeans_n_iters = 20;
-  /// The fraction of data to use during iterative kmeans building. 
+    /// The fraction of data to use during iterative kmeans building. 
   double kmeans_trainset_fraction = 0.5;
-  ///
-  /// The bit length of the vector element after compression by PQ.
-  ///
-  /// Possible values: [4, 5, 6, 7, 8].
-  ///
-  /// Hint: the smaller the 'pq_bits', the smaller the index size and the better the search
-  /// performance, but the lower the recall.
+    ///
+    /// The bit length of the vector element after compression by PQ.
+    ///
+    /// Possible values: [4, 5, 6, 7, 8].
+    ///
+    /// Hint: the smaller the 'pq_bits', the smaller the index size and the better the search
+    /// performance, but the lower the recall.
    
   uint32_t pq_bits = 8;
-  ///
-  /// The dimensionality of the vector after compression by PQ. When zero, an optimal value is
-  /// selected using a heuristic.
-  ///
-  /// NB: `pq_dim/// pq_bits` must be a multiple of 8.
-  ///
-  /// Hint: a smaller 'pq_dim' results in a smaller index size and better search performance, but
-  /// lower recall. If 'pq_bits' is 8, 'pq_dim' can be set to any number, but multiple of 8 are
-  /// desirable for good performance. If 'pq_bits' is not 8, 'pq_dim' should be a multiple of 8.
-  /// For good performance, it is desirable that 'pq_dim' is a multiple of 32. Ideally, 'pq_dim'
-  /// should be also a divisor of the dataset dim.
+    ///
+    /// The dimensionality of the vector after compression by PQ. When zero, an optimal value is
+    /// selected using a heuristic.
+    ///
+    /// NB: `pq_dim  /// pq_bits` must be a multiple of 8.
+    ///
+    /// Hint: a smaller 'pq_dim' results in a smaller index size and better search performance, but
+    /// lower recall. If 'pq_bits' is 8, 'pq_dim' can be set to any number, but multiple of 8 are
+    /// desirable for good performance. If 'pq_bits' is not 8, 'pq_dim' should be a multiple of 8.
+    /// For good performance, it is desirable that 'pq_dim' is a multiple of 32. Ideally, 'pq_dim'
+    /// should be also a divisor of the dataset dim.
    
   uint32_t pq_dim = 0;
-  /// How PQ codebooks are created. 
+    /// How PQ codebooks are created. 
   codebook_gen codebook_kind = codebook_gen::PER_SUBSPACE;
-  ///
-  /// Apply a random rotation matrix on the input data and queries even if `dim % pq_dim == 0`.
-  ///
-  /// Note: if `dim` is not multiple of `pq_dim`, a random rotation is always applied to the input
-  /// data and queries to transform the working space from `dim` to `rot_dim`, which may be slightly
-  /// larger than the original space and and is a multiple of `pq_dim` (`rot_dim % pq_dim == 0`).
-  /// However, this transform is not necessary when `dim` is multiple of `pq_dim`
-  ///   (`dim == rot_dim`, hence no need in adding "extra" data columns / features).
-  ///
-  /// By default, if `dim == rot_dim`, the rotation transform is initialized with the identity
-  /// matrix. When `force_random_rotation == true`, a random orthogonal transform matrix is generated
-  /// regardless of the values of `dim` and `pq_dim`.
+    ///
+    /// Apply a random rotation matrix on the input data and queries even if `dim % pq_dim == 0`.
+    ///
+    /// Note: if `dim` is not multiple of `pq_dim`, a random rotation is always applied to the input
+    /// data and queries to transform the working space from `dim` to `rot_dim`, which may be slightly
+    /// larger than the original space and and is a multiple of `pq_dim` (`rot_dim % pq_dim == 0`).
+    /// However, this transform is not necessary when `dim` is multiple of `pq_dim`
+    ///   (`dim == rot_dim`, hence no need in adding "extra" data columns / features).
+    ///
+    /// By default, if `dim == rot_dim`, the rotation transform is initialized with the identity
+    /// matrix. When `force_random_rotation == true`, a random orthogonal transform matrix is generated
+    /// regardless of the values of `dim` and `pq_dim`.
    
   bool force_random_rotation = false;
-  ///
-  /// By default, the algorithm allocates more space than necessary for individual clusters
-  /// (`list_data`). This allows to amortize the cost of memory allocation and reduce the number of
-  /// data copies during repeated calls to `extend` (extending the database).
-  ///
-  /// The alternative is the conservative allocation behavior; when enabled, the algorithm always
-  /// allocates the minimum amount of memory required to store the given number of records. Set this
-  /// flag to `true` if you prefer to use as little GPU memory for the database as possible.
+    ///
+    /// By default, the algorithm allocates more space than necessary for individual clusters
+    /// (`list_data`). This allows to amortize the cost of memory allocation and reduce the number of
+    /// data copies during repeated calls to `extend` (extending the database).
+    ///
+    /// The alternative is the conservative allocation behavior; when enabled, the algorithm always
+    /// allocates the minimum amount of memory required to store the given number of records. Set this
+    /// flag to `true` if you prefer to use as little GPU memory for the database as possible.
    
   bool conservative_memory_allocation = false;
 };
 
 struct IVFPQSearchCagraConfig {
-  /// The number of clusters to search. 
+    /// The number of clusters to search. 
   uint32_t n_probes = 20;
-  ///
-  /// Data type of look up table to be created dynamically at search time.
-  ///
-  /// Possible values: [CUDA_R_32F, CUDA_R_16F, CUDA_R_8U]
-  ///
-  /// The use of low-precision types reduces the amount of shared memory required at search time, so
-  /// fast shared memory kernels can be used even for datasets with large dimansionality. Note that
-  /// the recall is slightly degraded when low-precision type is selected.
+    ///
+    /// Data type of look up table to be created dynamically at search time.
+    ///
+    /// Possible values: [CUDA_R_32F, CUDA_R_16F, CUDA_R_8U]
+    ///
+    /// The use of low-precision types reduces the amount of shared memory required at search time, so
+    /// fast shared memory kernels can be used even for datasets with large dimansionality. Note that
+    /// the recall is slightly degraded when low-precision type is selected.
    
   cudaDataType_t lut_dtype = CUDA_R_32F;
-  ///
-  /// Storage data type for distance/similarity computed at search time.
-  ///
-  /// Possible values: [CUDA_R_16F, CUDA_R_32F]
-  ///
-  /// If the performance limiter at search time is device memory access, selecting FP16 will improve
-  /// performance slightly.
+    ///
+    /// Storage data type for distance/similarity computed at search time.
+    ///
+    /// Possible values: [CUDA_R_16F, CUDA_R_32F]
+    ///
+    /// If the performance limiter at search time is device memory access, selecting FP16 will improve
+    /// performance slightly.
    
   cudaDataType_t internal_distance_dtype = CUDA_R_32F;
-  ///
-  /// Preferred fraction of SM's unified memory / L1 cache to be used as shared memory.
-  ///
-  /// Possible values: [0.0 - 1.0] as a fraction of the `sharedMemPerMultiprocessor`.
-  ///
-  /// One wants to increase the carveout to make sure a good GPU occupancy for the main search
-  /// kernel, but not to keep it too high to leave some memory to be used as L1 cache. Note, this
-  /// value is interpreted only as a hint. Moreover, a GPU usually allows only a fixed set of cache
-  /// configurations, so the provided value is rounded up to the nearest configuration. Refer to the
-  /// NVIDIA tuning guide for the target GPU architecture.
-  ///
-  /// Note, this is a low-level tuning parameter that can have drastic negative effects on the search
-  /// performance if tweaked incorrectly.
+    ///
+    /// Preferred fraction of SM's unified memory / L1 cache to be used as shared memory.
+    ///
+    /// Possible values: [0.0 - 1.0] as a fraction of the `sharedMemPerMultiprocessor`.
+    ///
+    /// One wants to increase the carveout to make sure a good GPU occupancy for the main search
+    /// kernel, but not to keep it too high to leave some memory to be used as L1 cache. Note, this
+    /// value is interpreted only as a hint. Moreover, a GPU usually allows only a fixed set of cache
+    /// configurations, so the provided value is rounded up to the nearest configuration. Refer to the
+    /// NVIDIA tuning guide for the target GPU architecture.
+    ///
+    /// Note, this is a low-level tuning parameter that can have drastic negative effects on the search
+    /// performance if tweaked incorrectly.
    
   double preferred_shmem_carveout = 1.0;
 };
 
 
 struct GpuIndexCagraConfig : public GpuIndexConfig {
-    /// Degree of input graph for pruning. 
+    /// Degree of input graph for pruning.
     size_t intermediate_graph_degree = 128;
-    /// Degree of output graph. 
+    /// Degree of output graph.
     size_t graph_degree = 64;
-    /// ANN algorithm to build knn graph. 
+    /// ANN algorithm to build knn graph.
     graph_build_algo build_algo = graph_build_algo::IVF_PQ;
-    /// Number of Iterations to run if building with NN_DESCENT 
+    /// Number of Iterations to run if building with NN_DESCENT
     size_t nn_descent_niter = 20;
 
     IVFPQBuildCagraConfig *ivf_pq_params = nullptr;
@@ -166,9 +166,9 @@ struct GpuIndexCagraConfig : public GpuIndexConfig {
 };
 
 enum class search_algo {
-    /// For large batch sizes. 
+    /// For large batch sizes.
     SINGLE_CTA,
-    /// For small batch sizes. 
+    /// For small batch sizes.
     MULTI_CTA,
     MULTI_KERNEL,
     AUTO
@@ -194,7 +194,7 @@ struct SearchParametersCagra : SearchParameters {
     // In the following we list additional search parameters for fine tuning.
     // Reasonable default values are automatically chosen.
 
-    /// Which search implementation to use. 
+    /// Which search implementation to use.
     search_algo algo = search_algo::AUTO;
 
     /// Number of threads used to calculate a single distance. 4, 8, 16, or 32.
@@ -204,14 +204,14 @@ struct SearchParametersCagra : SearchParameters {
     /// Number of graph nodes to select as the starting point for the search in
     /// each iteration. aka search width?
     size_t search_width = 1;
-    /// Lower limit of search iterations. 
+    /// Lower limit of search iterations.
     size_t min_iterations = 0;
 
-    /// Thread block size. 0, 64, 128, 256, 512, 1024. Auto selection when 0. 
+    /// Thread block size. 0, 64, 128, 256, 512, 1024. Auto selection when 0.
     size_t thread_block_size = 0;
     /// Hashmap type. Auto selection when AUTO. 
     hash_mode hashmap_mode = hash_mode::AUTO;
-    /// Lower limit of hashmap bit length. More than 8. 
+    /// Lower limit of hashmap bit length. More than 8.
     size_t hashmap_min_bitlen = 0;
     /// Upper limit of hashmap fill rate. More than 0.1, less than 0.9.
     float hashmap_max_fill_rate = 0.5;
@@ -219,7 +219,7 @@ struct SearchParametersCagra : SearchParameters {
     /// Number of iterations of initial random seed node selection. 1 or more.
      
     uint32_t num_random_samplings = 1;
-    /// Bit mask used for initial random seed node selection. 
+    /// Bit mask used for initial random seed node selection.
     uint64_t rand_xor_mask = 0x128394;
 };
 
diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp
index efc837f267..26a2860d2b 100644
--- a/faiss/impl/HNSW.cpp
+++ b/faiss/impl/HNSW.cpp
@@ -265,7 +265,8 @@ void HNSW::shrink_neighbor_list(
         }
     }
     size_t idx = 0;
-    while (keep_max_size_level0 && (output.size() < max_size) && (idx < outsiders.size())) {
+    while (keep_max_size_level0 && (output.size() < max_size) && 
+           (idx < outsiders.size())) {
         output.push_back(outsiders[idx++]);
     }
 }
@@ -338,8 +339,7 @@ void add_link(
         resultSet.emplace(qdis.symmetric_dis(src, neigh), neigh);
     }
 
-    shrink_neighbor_list(
-            qdis, resultSet, end - begin, keep_max_size_level0);
+    shrink_neighbor_list(qdis, resultSet, end - begin, keep_max_size_level0);
 
     // ...and back
     size_t i = begin;
@@ -459,8 +459,7 @@ void HNSW::add_links_starting_from(
     // but we can afford only this many neighbors
     int M = nb_neighbors(level);
 
-    ::faiss::shrink_neighbor_list(
-            ptdis, link_targets, M, keep_max_size_level0);
+    ::faiss::shrink_neighbor_list(ptdis, link_targets, M, keep_max_size_level0);
 
     std::vector<storage_idx_t> neighbors;
     neighbors.reserve(link_targets.size());
diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp
index 1f27a68451..efdc488112 100644
--- a/faiss/impl/index_write.cpp
+++ b/faiss/impl/index_write.cpp
@@ -760,7 +760,7 @@ void write_index(const Index* idx, IOWriter* f) {
                 : dynamic_cast<const IndexHNSWPQ*>(idx)      ? fourcc("IHNp")
                 : dynamic_cast<const IndexHNSWSQ*>(idx)      ? fourcc("IHNs")
                 : dynamic_cast<const IndexHNSW2Level*>(idx)  ? fourcc("IHN2")
-                : dynamic_cast<const IndexHNSWCagra*>(idx)  ? fourcc("IHNc")
+                : dynamic_cast<const IndexHNSWCagra*>(idx)   ? fourcc("IHNc")
                                                              : 0;
         FAISS_THROW_IF_NOT(h != 0);
         WRITE1(h);

From 949e6349f9dece623d097e502631a102db6195d4 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Mon, 22 Apr 2024 14:46:52 -0700
Subject: [PATCH 031/148] format properly

---
 faiss/gpu/GpuIndexCagra.h | 160 ++++++++++++++++++++------------------
 faiss/impl/HNSW.cpp       |   2 +-
 2 files changed, 87 insertions(+), 75 deletions(-)

diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
index 563208857c..a97543f3d9 100644
--- a/faiss/gpu/GpuIndexCagra.h
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -44,112 +44,124 @@ enum class graph_build_algo {
 };
 
 /// A type for specifying how PQ codebooks are created.
-enum class codebook_gen {  // NOLINT
-  PER_SUBSPACE = 0,        // NOLINT
-  PER_CLUSTER  = 1,        // NOLINT
+enum class codebook_gen { // NOLINT
+    PER_SUBSPACE = 0,     // NOLINT
+    PER_CLUSTER = 1,      // NOLINT
 };
 
 struct IVFPQBuildCagraConfig {
     ///
     /// The number of inverted lists (clusters)
     ///
-    /// Hint: the number of vectors per cluster (`n_rows/n_lists`) should be approximately 1,000 to
-    /// 10,000.
-   
-  uint32_t n_lists = 1024;
-    /// The number of iterations searching for kmeans centers (index building). 
-  uint32_t kmeans_n_iters = 20;
-    /// The fraction of data to use during iterative kmeans building. 
-  double kmeans_trainset_fraction = 0.5;
+    /// Hint: the number of vectors per cluster (`n_rows/n_lists`) should be
+    /// approximately 1,000 to 10,000.
+
+    uint32_t n_lists = 1024;
+    /// The number of iterations searching for kmeans centers (index building).
+    uint32_t kmeans_n_iters = 20;
+    /// The fraction of data to use during iterative kmeans building.
+    double kmeans_trainset_fraction = 0.5;
     ///
     /// The bit length of the vector element after compression by PQ.
     ///
     /// Possible values: [4, 5, 6, 7, 8].
     ///
-    /// Hint: the smaller the 'pq_bits', the smaller the index size and the better the search
-    /// performance, but the lower the recall.
-   
-  uint32_t pq_bits = 8;
+    /// Hint: the smaller the 'pq_bits', the smaller the index size and the
+    /// better the search performance, but the lower the recall.
+
+    uint32_t pq_bits = 8;
     ///
-    /// The dimensionality of the vector after compression by PQ. When zero, an optimal value is
-    /// selected using a heuristic.
+    /// The dimensionality of the vector after compression by PQ. When zero, an
+    /// optimal value is selected using a heuristic.
     ///
     /// NB: `pq_dim  /// pq_bits` must be a multiple of 8.
     ///
-    /// Hint: a smaller 'pq_dim' results in a smaller index size and better search performance, but
-    /// lower recall. If 'pq_bits' is 8, 'pq_dim' can be set to any number, but multiple of 8 are
-    /// desirable for good performance. If 'pq_bits' is not 8, 'pq_dim' should be a multiple of 8.
-    /// For good performance, it is desirable that 'pq_dim' is a multiple of 32. Ideally, 'pq_dim'
-    /// should be also a divisor of the dataset dim.
-   
-  uint32_t pq_dim = 0;
-    /// How PQ codebooks are created. 
-  codebook_gen codebook_kind = codebook_gen::PER_SUBSPACE;
+    /// Hint: a smaller 'pq_dim' results in a smaller index size and better
+    /// search performance, but lower recall. If 'pq_bits' is 8, 'pq_dim' can be
+    /// set to any number, but multiple of 8 are desirable for good performance.
+    /// If 'pq_bits' is not 8, 'pq_dim' should be a multiple of 8. For good
+    /// performance, it is desirable that 'pq_dim' is a multiple of 32. Ideally,
+    /// 'pq_dim' should be also a divisor of the dataset dim.
+
+    uint32_t pq_dim = 0;
+    /// How PQ codebooks are created.
+    codebook_gen codebook_kind = codebook_gen::PER_SUBSPACE;
     ///
-    /// Apply a random rotation matrix on the input data and queries even if `dim % pq_dim == 0`.
+    /// Apply a random rotation matrix on the input data and queries even if
+    /// `dim % pq_dim == 0`.
     ///
-    /// Note: if `dim` is not multiple of `pq_dim`, a random rotation is always applied to the input
-    /// data and queries to transform the working space from `dim` to `rot_dim`, which may be slightly
-    /// larger than the original space and and is a multiple of `pq_dim` (`rot_dim % pq_dim == 0`).
-    /// However, this transform is not necessary when `dim` is multiple of `pq_dim`
-    ///   (`dim == rot_dim`, hence no need in adding "extra" data columns / features).
+    /// Note: if `dim` is not multiple of `pq_dim`, a random rotation is always
+    /// applied to the input data and queries to transform the working space
+    /// from `dim` to `rot_dim`, which may be slightly larger than the original
+    /// space and is a multiple of `pq_dim` (`rot_dim % pq_dim == 0`).
+    /// However, this transform is not necessary when `dim` is multiple of
+    /// `pq_dim`
+    ///   (`dim == rot_dim`, hence no need to add "extra" data columns /
+    ///   features).
     ///
-    /// By default, if `dim == rot_dim`, the rotation transform is initialized with the identity
-    /// matrix. When `force_random_rotation == true`, a random orthogonal transform matrix is generated
-    /// regardless of the values of `dim` and `pq_dim`.
-   
-  bool force_random_rotation = false;
+    /// By default, if `dim == rot_dim`, the rotation transform is initialized
+    /// with the identity matrix. When `force_random_rotation == true`, a random
+    /// orthogonal transform matrix is generated regardless of the values of
+    /// `dim` and `pq_dim`.
+
+    bool force_random_rotation = false;
     ///
-    /// By default, the algorithm allocates more space than necessary for individual clusters
-    /// (`list_data`). This allows to amortize the cost of memory allocation and reduce the number of
-    /// data copies during repeated calls to `extend` (extending the database).
+    /// By default, the algorithm allocates more space than necessary for
+    /// individual clusters
+    /// (`list_data`). This allows to amortize the cost of memory allocation and
+    /// reduce the number of data copies during repeated calls to `extend`
+    /// (extending the database).
     ///
-    /// The alternative is the conservative allocation behavior; when enabled, the algorithm always
-    /// allocates the minimum amount of memory required to store the given number of records. Set this
-    /// flag to `true` if you prefer to use as little GPU memory for the database as possible.
-   
-  bool conservative_memory_allocation = false;
+    /// The alternative is the conservative allocation behavior; when enabled,
+    /// the algorithm always allocates the minimum amount of memory required to
+    /// store the given number of records. Set this flag to `true` if you prefer
+    /// to use as little GPU memory for the database as possible.
+
+    bool conservative_memory_allocation = false;
 };
 
 struct IVFPQSearchCagraConfig {
-    /// The number of clusters to search. 
-  uint32_t n_probes = 20;
+    /// The number of clusters to search.
+    uint32_t n_probes = 20;
     ///
     /// Data type of look up table to be created dynamically at search time.
     ///
     /// Possible values: [CUDA_R_32F, CUDA_R_16F, CUDA_R_8U]
     ///
-    /// The use of low-precision types reduces the amount of shared memory required at search time, so
-    /// fast shared memory kernels can be used even for datasets with large dimansionality. Note that
-    /// the recall is slightly degraded when low-precision type is selected.
-   
-  cudaDataType_t lut_dtype = CUDA_R_32F;
+    /// The use of low-precision types reduces the amount of shared memory
+    /// required at search time, so fast shared memory kernels can be used even
+    /// for datasets with large dimensionality. Note that the recall is slightly
+    /// degraded when low-precision type is selected.
+
+    cudaDataType_t lut_dtype = CUDA_R_32F;
     ///
     /// Storage data type for distance/similarity computed at search time.
     ///
     /// Possible values: [CUDA_R_16F, CUDA_R_32F]
     ///
-    /// If the performance limiter at search time is device memory access, selecting FP16 will improve
-    /// performance slightly.
-   
-  cudaDataType_t internal_distance_dtype = CUDA_R_32F;
+    /// If the performance limiter at search time is device memory access,
+    /// selecting FP16 will improve performance slightly.
+
+    cudaDataType_t internal_distance_dtype = CUDA_R_32F;
     ///
-    /// Preferred fraction of SM's unified memory / L1 cache to be used as shared memory.
+    /// Preferred fraction of SM's unified memory / L1 cache to be used as
+    /// shared memory.
     ///
-    /// Possible values: [0.0 - 1.0] as a fraction of the `sharedMemPerMultiprocessor`.
+    /// Possible values: [0.0 - 1.0] as a fraction of the
+    /// `sharedMemPerMultiprocessor`.
     ///
-    /// One wants to increase the carveout to make sure a good GPU occupancy for the main search
-    /// kernel, but not to keep it too high to leave some memory to be used as L1 cache. Note, this
-    /// value is interpreted only as a hint. Moreover, a GPU usually allows only a fixed set of cache
-    /// configurations, so the provided value is rounded up to the nearest configuration. Refer to the
-    /// NVIDIA tuning guide for the target GPU architecture.
+    /// One wants to increase the carveout to make sure a good GPU occupancy for
+    /// the main search kernel, but not to keep it too high to leave some memory
+    /// to be used as L1 cache. Note, this value is interpreted only as a hint.
+    /// Moreover, a GPU usually allows only a fixed set of cache configurations,
+    /// so the provided value is rounded up to the nearest configuration. Refer
+    /// to the NVIDIA tuning guide for the target GPU architecture.
     ///
-    /// Note, this is a low-level tuning parameter that can have drastic negative effects on the search
-    /// performance if tweaked incorrectly.
-   
-  double preferred_shmem_carveout = 1.0;
-};
+    /// Note, this is a low-level tuning parameter that can have drastic
+    /// negative effects on the search performance if tweaked incorrectly.
 
+    double preferred_shmem_carveout = 1.0;
+};
 
 struct GpuIndexCagraConfig : public GpuIndexConfig {
     /// Degree of input graph for pruning.
@@ -161,8 +173,8 @@ struct GpuIndexCagraConfig : public GpuIndexConfig {
     /// Number of Iterations to run if building with NN_DESCENT
     size_t nn_descent_niter = 20;
 
-    IVFPQBuildCagraConfig *ivf_pq_params = nullptr;
-    IVFPQSearchCagraConfig *ivf_pq_search_params = nullptr;
+    IVFPQBuildCagraConfig* ivf_pq_params = nullptr;
+    IVFPQSearchCagraConfig* ivf_pq_search_params = nullptr;
 };
 
 enum class search_algo {
@@ -185,7 +197,7 @@ struct SearchParametersCagra : SearchParameters {
     ///
     ///  This is the main knob to adjust trade off between accuracy and search
     /// speed. Higher values improve the search accuracy.
-     
+
     size_t itopk_size = 64;
 
     /// Upper limit of search iterations. Auto select when 0.
@@ -198,7 +210,7 @@ struct SearchParametersCagra : SearchParameters {
     search_algo algo = search_algo::AUTO;
 
     /// Number of threads used to calculate a single distance. 4, 8, 16, or 32.
-     
+
     size_t team_size = 0;
 
     /// Number of graph nodes to select as the starting point for the search in
@@ -209,7 +221,7 @@ struct SearchParametersCagra : SearchParameters {
 
     /// Thread block size. 0, 64, 128, 256, 512, 1024. Auto selection when 0.
     size_t thread_block_size = 0;
-    /// Hashmap type. Auto selection when AUTO. 
+    /// Hashmap type. Auto selection when AUTO.
     hash_mode hashmap_mode = hash_mode::AUTO;
     /// Lower limit of hashmap bit length. More than 8.
     size_t hashmap_min_bitlen = 0;
@@ -217,7 +229,7 @@ struct SearchParametersCagra : SearchParameters {
     float hashmap_max_fill_rate = 0.5;
 
     /// Number of iterations of initial random seed node selection. 1 or more.
-     
+
     uint32_t num_random_samplings = 1;
     /// Bit mask used for initial random seed node selection.
     uint64_t rand_xor_mask = 0x128394;
diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp
index 26a2860d2b..fedfd801a8 100644
--- a/faiss/impl/HNSW.cpp
+++ b/faiss/impl/HNSW.cpp
@@ -265,7 +265,7 @@ void HNSW::shrink_neighbor_list(
         }
     }
     size_t idx = 0;
-    while (keep_max_size_level0 && (output.size() < max_size) && 
+    while (keep_max_size_level0 && (output.size() < max_size) &&
            (idx < outsiders.size())) {
         output.push_back(outsiders[idx++]);
     }

From bccd54a3f6a8308ed5e10fab748bf2a6c7c6949e Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Tue, 30 Apr 2024 11:30:33 -0700
Subject: [PATCH 032/148] InnerProduct

---
 faiss/gpu/impl/RaftCagra.cu         | 50 ++++++++++++++++++-----------
 faiss/gpu/impl/RaftCagra.cuh        |  2 +-
 faiss/gpu/test/TestGpuIndexCagra.cu | 36 +++++++++++++--------
 3 files changed, 55 insertions(+), 33 deletions(-)

diff --git a/faiss/gpu/impl/RaftCagra.cu b/faiss/gpu/impl/RaftCagra.cu
index 2f51b6b35a..292079321d 100644
--- a/faiss/gpu/impl/RaftCagra.cu
+++ b/faiss/gpu/impl/RaftCagra.cu
@@ -52,22 +52,22 @@ RaftCagra::RaftCagra(
           dim_(dim),
           metric_(metric),
           metricArg_(metricArg),
-          index_pams_(),
+          index_params_(),
           ivf_pq_params_(ivf_pq_params),
           ivf_pq_search_params_(ivf_pq_search_params) {
     FAISS_THROW_IF_NOT_MSG(
-            metric == faiss::METRIC_L2,
-            "CAGRA currently only supports L2 metric.");
+            metric == faiss::METRIC_L2 || metric == faiss::METRIC_INNER_PRODUCT,
+            "CAGRA currently only supports L2 or Inner Product metric.");
     FAISS_THROW_IF_NOT_MSG(
             indicesOptions == faiss::gpu::INDICES_64_BIT,
             "only INDICES_64_BIT is supported for RAFT CAGRA index");
 
-    index_pams_.intermediate_graph_degree = intermediate_graph_degree;
-    index_pams_.graph_degree = graph_degree;
-    index_pams_.build_algo =
+    index_params_.intermediate_graph_degree = intermediate_graph_degree;
+    index_params_.graph_degree = graph_degree;
+    index_params_.build_algo =
             static_cast<raft::neighbors::cagra::graph_build_algo>(
                     graph_build_algo);
-    index_pams_.nn_descent_niter = nn_descent_niter;
+    index_params_.nn_descent_niter = nn_descent_niter;
 
     if (!ivf_pq_params_) {
         ivf_pq_params_ =
@@ -77,6 +77,12 @@ RaftCagra::RaftCagra(
         ivf_pq_search_params_ =
                 std::make_optional<raft::neighbors::ivf_pq::search_params>();
     }
+    index_params_.metric = metric_ == faiss::METRIC_L2
+            ? raft::distance::DistanceType::L2Expanded
+            : raft::distance::DistanceType::InnerProduct;
+    ivf_pq_params_->metric = metric_ == faiss::METRIC_L2
+            ? raft::distance::DistanceType::L2Expanded
+            : raft::distance::DistanceType::InnerProduct;
 
     reset();
 }
@@ -96,8 +102,8 @@ RaftCagra::RaftCagra(
           metric_(metric),
           metricArg_(metricArg) {
     FAISS_THROW_IF_NOT_MSG(
-            metric == faiss::METRIC_L2,
-            "CAGRA currently only supports L2 metric.");
+            metric == faiss::METRIC_L2 || metric == faiss::METRIC_INNER_PRODUCT,
+            "CAGRA currently only supports L2 or Inner Product metric.");
     FAISS_THROW_IF_NOT_MSG(
             indicesOptions == faiss::gpu::INDICES_64_BIT,
             "only INDICES_64_BIT is supported for RAFT CAGRA index");
@@ -127,7 +133,9 @@ RaftCagra::RaftCagra(
 
         raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
                 raft_handle,
-                raft::distance::DistanceType::L2Expanded,
+                metric_ == faiss::METRIC_L2
+                        ? raft::distance::DistanceType::L2Expanded
+                        : raft::distance::DistanceType::InnerProduct,
                 distances_mds,
                 raft::make_const_mdspan(knn_graph_copy.view()));
     } else if (!distances_on_gpu && !knn_graph_on_gpu) {
@@ -144,7 +152,9 @@ RaftCagra::RaftCagra(
 
         raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
                 raft_handle,
-                raft::distance::DistanceType::L2Expanded,
+                metric_ == faiss::METRIC_L2
+                        ? raft::distance::DistanceType::L2Expanded
+                        : raft::distance::DistanceType::InnerProduct,
                 distances_mds,
                 raft::make_const_mdspan(knn_graph_copy.view()));
     } else {
@@ -156,11 +166,11 @@ RaftCagra::RaftCagra(
 void RaftCagra::train(idx_t n, const float* x) {
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
-    if (index_pams_.build_algo ==
+    if (index_params_.build_algo ==
         raft::neighbors::cagra::graph_build_algo::IVF_PQ) {
         std::optional<raft::host_matrix<uint32_t, int64_t>> knn_graph(
                 raft::make_host_matrix<uint32_t, int64_t>(
-                        n, index_pams_.intermediate_graph_degree));
+                        n, index_params_.intermediate_graph_degree));
         if (getDeviceForAddress(x) >= 0) {
             auto dataset_d =
                     raft::make_device_matrix_view<const float, int64_t>(
@@ -184,7 +194,7 @@ void RaftCagra::train(idx_t n, const float* x) {
                     ivf_pq_search_params_);
         }
         auto cagra_graph = raft::make_host_matrix<uint32_t, int64_t>(
-                n, index_pams_.graph_degree);
+                n, index_params_.graph_degree);
 
         raft::neighbors::cagra::optimize<uint32_t>(
                 raft_handle, knn_graph->view(), cagra_graph.view());
@@ -198,7 +208,9 @@ void RaftCagra::train(idx_t n, const float* x) {
                             x, n, dim_);
             raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
                     raft_handle,
-                    index_pams_.metric,
+                    metric_ == faiss::METRIC_L2
+                            ? raft::distance::DistanceType::L2Expanded
+                            : raft::distance::DistanceType::InnerProduct,
                     dataset_d,
                     raft::make_const_mdspan(cagra_graph.view()));
         } else {
@@ -206,7 +218,9 @@ void RaftCagra::train(idx_t n, const float* x) {
                     x, n, dim_);
             raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
                     raft_handle,
-                    index_pams_.metric,
+                    metric_ == faiss::METRIC_L2
+                            ? raft::distance::DistanceType::L2Expanded
+                            : raft::distance::DistanceType::InnerProduct,
                     dataset_h,
                     raft::make_const_mdspan(cagra_graph.view()));
         }
@@ -215,13 +229,13 @@ void RaftCagra::train(idx_t n, const float* x) {
         if (getDeviceForAddress(x) >= 0) {
             raft_knn_index = raft::runtime::neighbors::cagra::build(
                     raft_handle,
-                    index_pams_,
+                    index_params_,
                     raft::make_device_matrix_view<const float, int64_t>(
                             x, n, dim_));
         } else {
             raft_knn_index = raft::runtime::neighbors::cagra::build(
                     raft_handle,
-                    index_pams_,
+                    index_params_,
                     raft::make_host_matrix_view<const float, int64_t>(
                             x, n, dim_));
         }
diff --git a/faiss/gpu/impl/RaftCagra.cuh b/faiss/gpu/impl/RaftCagra.cuh
index 878198d609..95f6c03fca 100644
--- a/faiss/gpu/impl/RaftCagra.cuh
+++ b/faiss/gpu/impl/RaftCagra.cuh
@@ -117,7 +117,7 @@ class RaftCagra {
     float metricArg_;
 
     /// Parameters to build RAFT CAGRA index
-    raft::neighbors::cagra::index_params index_pams_;
+    raft::neighbors::cagra::index_params index_params_;
 
     /// Parameters to build CAGRA graph using IVF PQ
     std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_params_;
diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu
index 54987cd5f5..c4b6c8a768 100644
--- a/faiss/gpu/test/TestGpuIndexCagra.cu
+++ b/faiss/gpu/test/TestGpuIndexCagra.cu
@@ -75,7 +75,7 @@ struct Options {
     int device;
 };
 
-void queryTest() {
+void queryTest(faiss::MetricType metric) {
     for (int tries = 0; tries < 5; ++tries) {
         Options opt;
 
@@ -97,8 +97,7 @@ void queryTest() {
         config.intermediate_graph_degree = opt.intermediateGraphDegree;
         config.build_algo = opt.buildAlgo;
 
-        faiss::gpu::GpuIndexCagra gpuIndex(
-                &res, cpuIndex.d, faiss::METRIC_L2, config);
+        faiss::gpu::GpuIndexCagra gpuIndex(&res, cpuIndex.d, metric, config);
         gpuIndex.train(opt.numTrain, trainVecs.data());
 
         // query
@@ -177,10 +176,14 @@ void queryTest() {
 }
 
 TEST(TestGpuIndexCagra, Float32_Query_L2) {
-    queryTest();
+    queryTest(faiss::METRIC_L2);
 }
 
-void copyToTest() {
+TEST(TestGpuIndexCagra, Float32_Query_IP) {
+    queryTest(faiss::METRIC_INNER_PRODUCT);
+}
+
+void copyToTest(faiss::MetricType metric) {
     for (int tries = 0; tries < 5; ++tries) {
         Options opt;
 
@@ -198,8 +201,7 @@ void copyToTest() {
         config.intermediate_graph_degree = opt.intermediateGraphDegree;
         config.build_algo = opt.buildAlgo;
 
-        faiss::gpu::GpuIndexCagra gpuIndex(
-                &res, opt.dim, faiss::METRIC_L2, config);
+        faiss::gpu::GpuIndexCagra gpuIndex(&res, opt.dim, metric, config);
         gpuIndex.train(opt.numTrain, trainVecs.data());
 
         faiss::IndexHNSWCagra copiedCpuIndex;
@@ -300,10 +302,14 @@ void copyToTest() {
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyTo_L2) {
-    copyToTest();
+    copyToTest(faiss::METRIC_L2);
+}
+
+TEST(TestGpuIndexCagra, Float32_CopyTo_IP) {
+    copyToTest(faiss::METRIC_INNER_PRODUCT);
 }
 
-void copyFromTest() {
+void copyFromTest(faiss::MetricType metric) {
     for (int tries = 0; tries < 5; ++tries) {
         Options opt;
 
@@ -319,8 +325,7 @@ void copyFromTest() {
         res.noTempMemory();
 
         // convert to gpu index
-        faiss::gpu::GpuIndexCagra copiedGpuIndex(
-                &res, cpuIndex.d, faiss::METRIC_L2);
+        faiss::gpu::GpuIndexCagra copiedGpuIndex(&res, cpuIndex.d, metric);
         copiedGpuIndex.copyFrom(&cpuIndex);
 
         // train gpu index
@@ -330,8 +335,7 @@ void copyFromTest() {
         config.intermediate_graph_degree = opt.intermediateGraphDegree;
         config.build_algo = opt.buildAlgo;
 
-        faiss::gpu::GpuIndexCagra gpuIndex(
-                &res, opt.dim, faiss::METRIC_L2, config);
+        faiss::gpu::GpuIndexCagra gpuIndex(&res, opt.dim, metric, config);
         gpuIndex.train(opt.numTrain, trainVecs.data());
 
         // query
@@ -402,7 +406,11 @@ void copyFromTest() {
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyFrom_L2) {
-    copyFromTest();
+    copyFromTest(faiss::METRIC_L2);
+}
+
+TEST(TestGpuIndexCagra, Float32_CopyFrom_IP) {
+    copyFromTest(faiss::METRIC_INNER_PRODUCT);
 }
 
 int main(int argc, char** argv) {

From 2aaa6e91476ee079ade9cf4a5197bb4783f7fda3 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Tue, 7 May 2024 16:01:34 -0700
Subject: [PATCH 033/148] passing ip tests

---
 faiss/IndexHNSW.cpp                 | 12 ++++++--
 faiss/IndexHNSW.h                   |  2 +-
 faiss/gpu/GpuIndexCagra.cu          | 14 +++++----
 faiss/gpu/test/TestGpuIndexCagra.cu | 45 +++++++++++++++++++----------
 4 files changed, 49 insertions(+), 24 deletions(-)

diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp
index f6a2ca9587..7d5d37e838 100644
--- a/faiss/IndexHNSW.cpp
+++ b/faiss/IndexHNSW.cpp
@@ -19,6 +19,7 @@
 #include <cstdlib>
 #include <cstring>
 
+#include <memory>
 #include <queue>
 #include <unordered_set>
 
@@ -932,8 +933,15 @@ IndexHNSWCagra::IndexHNSWCagra() {
     is_trained = true;
 }
 
-IndexHNSWCagra::IndexHNSWCagra(int d, int M)
-        : IndexHNSW(new IndexFlatL2(d), M) {
+IndexHNSWCagra::IndexHNSWCagra(int d, int M, MetricType metric)
+        : IndexHNSW(
+                  (metric == METRIC_L2)
+                          ? static_cast<IndexFlat*>(new IndexFlatL2(d))
+                          : static_cast<IndexFlat*>(new IndexFlatIP(d)),
+                  M) {
+    FAISS_THROW_IF_NOT_MSG(
+            ((metric == METRIC_L2) || (metric == METRIC_INNER_PRODUCT)),
+            "unsupported metric type for IndexHNSWCagra");
     own_fields = true;
     is_trained = true;
     init_level0 = true;
diff --git a/faiss/IndexHNSW.h b/faiss/IndexHNSW.h
index 3d3162e423..12a90cabe4 100644
--- a/faiss/IndexHNSW.h
+++ b/faiss/IndexHNSW.h
@@ -153,7 +153,7 @@ struct IndexHNSW2Level : IndexHNSW {
 
 struct IndexHNSWCagra : IndexHNSW {
     IndexHNSWCagra();
-    IndexHNSWCagra(int d, int M);
+    IndexHNSWCagra(int d, int M, MetricType metric = METRIC_L2);
 };
 
 } // namespace faiss
diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index 916f774bc1..dcd2e6944b 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -160,10 +160,9 @@ void GpuIndexCagra::copyFrom(const faiss::IndexHNSWCagra* index) {
 
     GpuIndex::copyFrom(index);
 
-    auto base_index = index->storage;
-    auto l2_index = dynamic_cast<IndexFlatL2*>(base_index);
-    FAISS_ASSERT(l2_index);
-    auto distances = l2_index->get_xb();
+    auto base_index = dynamic_cast<IndexFlat*>(index->storage);
+    FAISS_ASSERT(base_index);
+    auto distances = base_index->get_xb();
 
     auto hnsw = index->hnsw;
     // copy level 0 to a dense knn graph matrix
@@ -213,7 +212,12 @@ void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const {
     if (index->storage and index->own_fields) {
         delete index->storage;
     }
-    index->storage = new IndexFlatL2(index->d);
+
+    if (this->metric_type == METRIC_L2) {
+        index->storage = new IndexFlatL2(index->d);
+    } else if (this->metric_type == METRIC_INNER_PRODUCT) {
+        index->storage = new IndexFlatIP(index->d);
+    }
     index->own_fields = true;
     index->keep_max_size_level0 = true;
     index->hnsw.reset();
diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu
index c4b6c8a768..228c9ac39e 100644
--- a/faiss/gpu/test/TestGpuIndexCagra.cu
+++ b/faiss/gpu/test/TestGpuIndexCagra.cu
@@ -75,15 +75,19 @@ struct Options {
     int device;
 };
 
-void queryTest(faiss::MetricType metric) {
+void queryTest(faiss::MetricType metric, double expected_recall) {
     for (int tries = 0; tries < 5; ++tries) {
         Options opt;
+        if (opt.buildAlgo == faiss::gpu::graph_build_algo::NN_DESCENT &&
+            metric == faiss::METRIC_INNER_PRODUCT) {
+            continue;
+        }
 
         std::vector<float> trainVecs =
                 faiss::gpu::randVecs(opt.numTrain, opt.dim);
 
         // train cpu index
-        faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2);
+        faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2, metric);
         cpuIndex.hnsw.efConstruction = opt.k * 2;
         cpuIndex.add(opt.numTrain, trainVecs.data());
 
@@ -171,21 +175,25 @@ void queryTest(faiss::MetricType metric) {
                 recall_score.view(),
                 test_dis_mds_opt,
                 ref_dis_mds_opt);
-        ASSERT_TRUE(*recall_score.data_handle() > 0.98);
+        ASSERT_TRUE(*recall_score.data_handle() > expected_recall);
     }
 }
 
 TEST(TestGpuIndexCagra, Float32_Query_L2) {
-    queryTest(faiss::METRIC_L2);
+    queryTest(faiss::METRIC_L2, 0.98);
 }
 
 TEST(TestGpuIndexCagra, Float32_Query_IP) {
-    queryTest(faiss::METRIC_INNER_PRODUCT);
+    queryTest(faiss::METRIC_INNER_PRODUCT, 0.88);
 }
 
-void copyToTest(faiss::MetricType metric) {
+void copyToTest(faiss::MetricType metric, double expected_recall) {
     for (int tries = 0; tries < 5; ++tries) {
         Options opt;
+        if (opt.buildAlgo == faiss::gpu::graph_build_algo::NN_DESCENT &&
+            metric == faiss::METRIC_INNER_PRODUCT) {
+            continue;
+        }
 
         std::vector<float> trainVecs =
                 faiss::gpu::randVecs(opt.numTrain, opt.dim);
@@ -204,7 +212,8 @@ void copyToTest(faiss::MetricType metric) {
         faiss::gpu::GpuIndexCagra gpuIndex(&res, opt.dim, metric, config);
         gpuIndex.train(opt.numTrain, trainVecs.data());
 
-        faiss::IndexHNSWCagra copiedCpuIndex;
+        faiss::IndexHNSWCagra copiedCpuIndex(
+                opt.dim, opt.graphDegree / 2, metric);
         gpuIndex.copyTo(&copiedCpuIndex);
         copiedCpuIndex.hnsw.efConstruction = opt.k * 2;
 
@@ -212,7 +221,7 @@ void copyToTest(faiss::MetricType metric) {
         copiedCpuIndex.add(opt.numAdd, addVecs.data());
 
         // train cpu index
-        faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2);
+        faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2, metric);
         cpuIndex.hnsw.efConstruction = opt.k * 2;
         cpuIndex.add(opt.numTrain, trainVecs.data());
 
@@ -297,27 +306,31 @@ void copyToTest(faiss::MetricType metric) {
                 recall_score.view(),
                 copy_ref_dis_mds_opt,
                 ref_dis_mds_opt);
-        ASSERT_TRUE(*recall_score.data_handle() > 0.99);
+        ASSERT_TRUE(*recall_score.data_handle() > expected_recall);
     }
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyTo_L2) {
-    copyToTest(faiss::METRIC_L2);
+    copyToTest(faiss::METRIC_L2, 0.98);
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyTo_IP) {
-    copyToTest(faiss::METRIC_INNER_PRODUCT);
+    copyToTest(faiss::METRIC_INNER_PRODUCT, 0.88);
 }
 
-void copyFromTest(faiss::MetricType metric) {
+void copyFromTest(faiss::MetricType metric, double expected_recall) {
     for (int tries = 0; tries < 5; ++tries) {
         Options opt;
+        if (opt.buildAlgo == faiss::gpu::graph_build_algo::NN_DESCENT &&
+            metric == faiss::METRIC_INNER_PRODUCT) {
+            continue;
+        }
 
         std::vector<float> trainVecs =
                 faiss::gpu::randVecs(opt.numTrain, opt.dim);
 
         // train cpu index
-        faiss::IndexHNSWCagra cpuIndex(opt.dim, opt.graphDegree / 2);
+        faiss::IndexHNSWCagra cpuIndex(opt.dim, opt.graphDegree / 2, metric);
         cpuIndex.hnsw.efConstruction = opt.k * 2;
         cpuIndex.add(opt.numTrain, trainVecs.data());
 
@@ -401,16 +414,16 @@ void copyFromTest(faiss::MetricType metric) {
                 recall_score.view(),
                 copy_test_dis_mds_opt,
                 test_dis_mds_opt);
-        ASSERT_TRUE(*recall_score.data_handle() > 0.99);
+        ASSERT_TRUE(*recall_score.data_handle() > expected_recall);
     }
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyFrom_L2) {
-    copyFromTest(faiss::METRIC_L2);
+    copyFromTest(faiss::METRIC_L2, 0.98);
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyFrom_IP) {
-    copyFromTest(faiss::METRIC_INNER_PRODUCT);
+    copyFromTest(faiss::METRIC_INNER_PRODUCT, 0.88);
 }
 
 int main(int argc, char** argv) {

From 70b0ab8fab26f1751625165b1adf3a5c8e968997 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Thu, 9 May 2024 13:02:23 -0700
Subject: [PATCH 034/148] address review

---
 faiss/IndexHNSW.cpp                 | 2 --
 faiss/IndexHNSW.h                   | 9 +++++++++
 faiss/gpu/GpuIndexCagra.cu          | 3 +--
 faiss/gpu/GpuIndexCagra.h           | 3 +--
 faiss/gpu/test/TestGpuIndexCagra.cu | 6 +++---
 faiss/impl/HNSW.cpp                 | 2 --
 faiss/impl/index_read.cpp           | 3 ++-
 faiss/impl/index_write.cpp          | 3 ++-
 8 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp
index 7d5d37e838..f1e018fd4e 100644
--- a/faiss/IndexHNSW.cpp
+++ b/faiss/IndexHNSW.cpp
@@ -7,8 +7,6 @@
 
 // -*- c++ -*-
 
-#include <iostream>
-
 #include <faiss/IndexHNSW.h>
 
 #include <omp.h>
diff --git a/faiss/IndexHNSW.h b/faiss/IndexHNSW.h
index 12a90cabe4..a4675157de 100644
--- a/faiss/IndexHNSW.h
+++ b/faiss/IndexHNSW.h
@@ -34,7 +34,16 @@ struct IndexHNSW : Index {
     bool own_fields = false;
     Index* storage = nullptr;
 
+    // When set to false, level 0 in the knn graph is not initialized.
+    // This option is used by GpuIndexCagra::copyTo(IndexHNSWCagra*)
+    // as level 0 knn graph is copied over from the index built by
+    // GpuIndexCagra.
     bool init_level0 = true;
+
+    // When set to true, all neighbors in level 0 are filled up
+    // to the maximum size allowed (2 * M). This option is used by
+    // IndexHNSWCagra to create a full base layer graph that is
+    // used when GpuIndexCagra::copyFrom(IndexHNSWCagra*) is invoked.
     bool keep_max_size_level0 = false;
 
     explicit IndexHNSW(int d = 0, int M = 32, MetricType metric = METRIC_L2);
diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index dcd2e6944b..634a6c1095 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -25,7 +25,6 @@
 #include <cstddef>
 #include <faiss/gpu/impl/RaftCagra.cuh>
 #include <optional>
-#include "GpuIndexCagra.h"
 
 namespace faiss {
 namespace gpu {
@@ -146,7 +145,7 @@ void GpuIndexCagra::searchImpl_(
             params->hashmap_min_bitlen,
             params->hashmap_max_fill_rate,
             params->num_random_samplings,
-            params->rand_xor_mask);
+            params->seed);
 
     if (not search_params) {
         delete params;
diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
index a97543f3d9..6ecee3ae03 100644
--- a/faiss/gpu/GpuIndexCagra.h
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -25,7 +25,6 @@
 #include <faiss/IndexIVF.h>
 #include <faiss/gpu/GpuIndex.h>
 #include <faiss/gpu/GpuIndexIVFPQ.h>
-#include "GpuIndexIVFPQ.h"
 
 namespace faiss {
 struct IndexHNSWCagra;
@@ -232,7 +231,7 @@ struct SearchParametersCagra : SearchParameters {
 
     uint32_t num_random_samplings = 1;
     /// Bit mask used for initial random seed node selection.
-    uint64_t rand_xor_mask = 0x128394;
+    uint64_t seed = 0x128394;
 };
 
 struct GpuIndexCagra : public GpuIndex {
diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu
index 228c9ac39e..b763c591ae 100644
--- a/faiss/gpu/test/TestGpuIndexCagra.cu
+++ b/faiss/gpu/test/TestGpuIndexCagra.cu
@@ -184,7 +184,7 @@ TEST(TestGpuIndexCagra, Float32_Query_L2) {
 }
 
 TEST(TestGpuIndexCagra, Float32_Query_IP) {
-    queryTest(faiss::METRIC_INNER_PRODUCT, 0.88);
+    queryTest(faiss::METRIC_INNER_PRODUCT, 0.85);
 }
 
 void copyToTest(faiss::MetricType metric, double expected_recall) {
@@ -315,7 +315,7 @@ TEST(TestGpuIndexCagra, Float32_CopyTo_L2) {
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyTo_IP) {
-    copyToTest(faiss::METRIC_INNER_PRODUCT, 0.88);
+    copyToTest(faiss::METRIC_INNER_PRODUCT, 0.85);
 }
 
 void copyFromTest(faiss::MetricType metric, double expected_recall) {
@@ -423,7 +423,7 @@ TEST(TestGpuIndexCagra, Float32_CopyFrom_L2) {
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyFrom_IP) {
-    copyFromTest(faiss::METRIC_INNER_PRODUCT, 0.88);
+    copyFromTest(faiss::METRIC_INNER_PRODUCT, 0.85);
 }
 
 int main(int argc, char** argv) {
diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp
index fedfd801a8..a07d1556d5 100644
--- a/faiss/impl/HNSW.cpp
+++ b/faiss/impl/HNSW.cpp
@@ -5,8 +5,6 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <iostream>
-
 #include <faiss/impl/HNSW.h>
 
 #include <cstddef>
diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp
index 5f231997ff..5b08640295 100644
--- a/faiss/impl/index_read.cpp
+++ b/faiss/impl/index_read.cpp
@@ -961,7 +961,8 @@ Index* read_index(IOReader* f, int io_flags) {
         if (h == fourcc("IHNc"))
             idxhnsw = new IndexHNSWCagra();
         read_index_header(idxhnsw, f);
-        READ1(idxhnsw->keep_max_size_level0);
+        if (h == fourcc("IHNc"))
+            READ1(idxhnsw->keep_max_size_level0);
         read_HNSW(&idxhnsw->hnsw, f);
         idxhnsw->storage = read_index(f, io_flags);
         idxhnsw->own_fields = true;
diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp
index efdc488112..e9c6a23a64 100644
--- a/faiss/impl/index_write.cpp
+++ b/faiss/impl/index_write.cpp
@@ -765,7 +765,8 @@ void write_index(const Index* idx, IOWriter* f) {
         FAISS_THROW_IF_NOT(h != 0);
         WRITE1(h);
         write_index_header(idxhnsw, f);
-        WRITE1(idxhnsw->keep_max_size_level0);
+        if (h == fourcc("IHNc"))
+            WRITE1(idxhnsw->keep_max_size_level0);
         write_HNSW(&idxhnsw->hnsw, f);
         write_index(idxhnsw->storage, f);
     } else if (const IndexNSG* idxnsg = dynamic_cast<const IndexNSG*>(idx)) {

From 4148feaef697b4974321d1225e7724be30049722 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Tue, 21 May 2024 13:05:01 -0700
Subject: [PATCH 035/148] base level only search

---
 faiss/IndexHNSW.cpp                 | 64 +++++++++++++++++++++++++++++
 faiss/IndexHNSW.h                   | 24 +++++++++++
 faiss/gpu/GpuIndexCagra.cu          |  8 +++-
 faiss/gpu/test/TestGpuIndexCagra.cu | 54 ++++++++++++++++++++----
 faiss/gpu/test/test_cagra.py        | 44 ++++++++++++++++++++
 5 files changed, 186 insertions(+), 8 deletions(-)
 create mode 100644 faiss/gpu/test/test_cagra.py

diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp
index f1e018fd4e..5d5a455ab6 100644
--- a/faiss/IndexHNSW.cpp
+++ b/faiss/IndexHNSW.cpp
@@ -17,8 +17,10 @@
 #include <cstdlib>
 #include <cstring>
 
+#include <limits>
 #include <memory>
 #include <queue>
+#include <random>
 #include <unordered_set>
 
 #include <sys/stat.h>
@@ -946,4 +948,66 @@ IndexHNSWCagra::IndexHNSWCagra(int d, int M, MetricType metric)
     keep_max_size_level0 = true;
 }
 
+void IndexHNSWCagra::add(idx_t n, const float* x) {
+    FAISS_THROW_IF_NOT_MSG(
+            !base_level_only,
+            "Cannot add vectors when base_level_only is set to True");
+
+    IndexHNSW::add(n, x);
+}
+
+void IndexHNSWCagra::search(
+        idx_t n,
+        const float* x,
+        idx_t k,
+        float* distances,
+        idx_t* labels,
+        const SearchParameters* params) {
+    if (!base_level_only) {
+        IndexHNSW::search(n, x, k, distances, labels, params);
+    } else {
+        std::vector<storage_idx_t> nearest(n);
+        std::vector<float> nearest_d(n);
+
+#pragma omp for
+        for (idx_t i = 0; i < n; i++) {
+            std::unique_ptr<DistanceComputer> dis(
+                    storage_distance_computer(this->storage));
+            dis->set_query(x + i * d);
+            storage_idx_t entrypoint = -1;
+            float entrypoint_d = std::numeric_limits<float>::max();
+
+            std::random_device rd;
+            std::mt19937 gen(i);
+            std::uniform_int_distribution<idx_t> distrib(0, this->ntotal);
+
+            for (idx_t j = 0; j < num_base_level_search_entrypoints; j++) {
+                auto idx = distrib(gen);
+                auto distance = (*dis)(idx);
+                if (distance < entrypoint_d) {
+                    entrypoint_d = distance;
+                    entrypoint = idx;
+                }
+            }
+
+            FAISS_THROW_IF_NOT_MSG(
+                    entrypoint >= 0, "Could not find a valid entrypoint.");
+
+            nearest[i] = entrypoint;
+            nearest_d[i] = entrypoint_d;
+        }
+
+        if (params) {
+            const SearchParametersHNSW* params_hnsw =
+                    dynamic_cast<const SearchParametersHNSW*>(params);
+            this->hnsw.efSearch = params_hnsw->efSearch;
+            this->hnsw.check_relative_distance =
+                    params_hnsw->check_relative_distance;
+        }
+
+        search_level_0(
+                n, x, k, nearest.data(), nearest_d.data(), distances, labels);
+    }
+}
+
 } // namespace faiss
diff --git a/faiss/IndexHNSW.h b/faiss/IndexHNSW.h
index a4675157de..e1f1d68dee 100644
--- a/faiss/IndexHNSW.h
+++ b/faiss/IndexHNSW.h
@@ -163,6 +163,30 @@ struct IndexHNSW2Level : IndexHNSW {
 struct IndexHNSWCagra : IndexHNSW {
     IndexHNSWCagra();
     IndexHNSWCagra(int d, int M, MetricType metric = METRIC_L2);
+
+    /// When set to true, the index is immutable.
+    /// This option is used to copy the knn graph from GpuIndexCagra
+    /// to the base level of IndexHNSWCagra without adding upper levels.
+    /// Doing so enables searching the HNSW index, but removes the
+    /// ability to add vectors.
+    bool base_level_only = false;
+
+    /// When `base_level_only` is set to `True`, the search function
+    /// searches only the base level knn graph of the HNSW index.
+    /// This parameter selects the entry point by randomly selecting
+    /// some points and using the best one.
+    int num_base_level_search_entrypoints = 32;
+
+    void add(idx_t n, const float* x);
+
+    /// entry point for search
+    void search(
+            idx_t n,
+            const float* x,
+            idx_t k,
+            float* distances,
+            idx_t* labels,
+            const SearchParameters* params = nullptr);
 };
 
 } // namespace faiss
diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index 634a6c1095..4ae56df10d 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -229,7 +229,13 @@ void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const {
 
     // turn off as level 0 is copied from CAGRA graph
     index->init_level0 = false;
-    index->add(n_train, train_dataset.data());
+    if (!index->base_level_only) {
+        index->add(n_train, train_dataset.data());
+    } else {
+        index->hnsw.prepare_level_tab(n_train, false);
+        index->storage->add(n_train, train_dataset.data());
+        index->ntotal = n_train;
+    }
 
     auto graph = get_knngraph();
 
diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu
index b763c591ae..c77c7974d0 100644
--- a/faiss/gpu/test/TestGpuIndexCagra.cu
+++ b/faiss/gpu/test/TestGpuIndexCagra.cu
@@ -26,6 +26,7 @@
 #include <faiss/gpu/GpuResources.h>
 #include <faiss/gpu/StandardGpuResources.h>
 #include <faiss/gpu/test/TestUtils.h>
+#include <faiss/utils/distances.h>
 #include <cstddef>
 #include <faiss/gpu/utils/CopyUtils.cuh>
 #include <faiss/gpu/utils/DeviceTensor.cuh>
@@ -85,6 +86,9 @@ void queryTest(faiss::MetricType metric, double expected_recall) {
 
         std::vector<float> trainVecs =
                 faiss::gpu::randVecs(opt.numTrain, opt.dim);
+        if (metric == faiss::METRIC_INNER_PRODUCT) {
+            faiss::fvec_renorm_L2(opt.numTrain, opt.dim, trainVecs.data());
+        }
 
         // train cpu index
         faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2, metric);
@@ -106,6 +110,9 @@ void queryTest(faiss::MetricType metric, double expected_recall) {
 
         // query
         auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim);
+        if (metric == faiss::METRIC_INNER_PRODUCT) {
+            faiss::fvec_renorm_L2(opt.numQuery, opt.dim, queryVecs.data());
+        }
 
         std::vector<float> refDistance(opt.numQuery * opt.k, 0);
         std::vector<faiss::idx_t> refIndices(opt.numQuery * opt.k, -1);
@@ -184,10 +191,13 @@ TEST(TestGpuIndexCagra, Float32_Query_L2) {
 }
 
 TEST(TestGpuIndexCagra, Float32_Query_IP) {
-    queryTest(faiss::METRIC_INNER_PRODUCT, 0.85);
+    queryTest(faiss::METRIC_INNER_PRODUCT, 0.98);
 }
 
-void copyToTest(faiss::MetricType metric, double expected_recall) {
+void copyToTest(
+        faiss::MetricType metric,
+        double expected_recall,
+        bool base_level_only) {
     for (int tries = 0; tries < 5; ++tries) {
         Options opt;
         if (opt.buildAlgo == faiss::gpu::graph_build_algo::NN_DESCENT &&
@@ -197,7 +207,13 @@ void copyToTest(faiss::MetricType metric, double expected_recall) {
 
         std::vector<float> trainVecs =
                 faiss::gpu::randVecs(opt.numTrain, opt.dim);
+        if (metric == faiss::METRIC_INNER_PRODUCT) {
+            faiss::fvec_renorm_L2(opt.numTrain, opt.dim, trainVecs.data());
+        }
         std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
+        if (metric == faiss::METRIC_INNER_PRODUCT) {
+            faiss::fvec_renorm_L2(opt.numAdd, opt.dim, addVecs.data());
+        }
 
         faiss::gpu::StandardGpuResources res;
         res.noTempMemory();
@@ -214,11 +230,14 @@ void copyToTest(faiss::MetricType metric, double expected_recall) {
 
         faiss::IndexHNSWCagra copiedCpuIndex(
                 opt.dim, opt.graphDegree / 2, metric);
+        copiedCpuIndex.base_level_only = base_level_only;
         gpuIndex.copyTo(&copiedCpuIndex);
         copiedCpuIndex.hnsw.efConstruction = opt.k * 2;
 
         // add more vecs to copied cpu index
-        copiedCpuIndex.add(opt.numAdd, addVecs.data());
+        if (!base_level_only) {
+            copiedCpuIndex.add(opt.numAdd, addVecs.data());
+        }
 
         // train cpu index
         faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2, metric);
@@ -226,10 +245,15 @@ void copyToTest(faiss::MetricType metric, double expected_recall) {
         cpuIndex.add(opt.numTrain, trainVecs.data());
 
         // add more vecs to cpu index
-        cpuIndex.add(opt.numAdd, addVecs.data());
+        if (!base_level_only) {
+            cpuIndex.add(opt.numAdd, addVecs.data());
+        }
 
         // query indexes
         auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim);
+        if (metric == faiss::METRIC_INNER_PRODUCT) {
+            faiss::fvec_renorm_L2(opt.numQuery, opt.dim, queryVecs.data());
+        }
 
         std::vector<float> refDistance(opt.numQuery * opt.k, 0);
         std::vector<faiss::idx_t> refIndices(opt.numQuery * opt.k, -1);
@@ -306,16 +330,26 @@ void copyToTest(faiss::MetricType metric, double expected_recall) {
                 recall_score.view(),
                 copy_ref_dis_mds_opt,
                 ref_dis_mds_opt);
+        std::cout << "recall_score: " << *recall_score.data_handle()
+                  << std::endl;
         ASSERT_TRUE(*recall_score.data_handle() > expected_recall);
     }
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyTo_L2) {
-    copyToTest(faiss::METRIC_L2, 0.98);
+    copyToTest(faiss::METRIC_L2, 0.98, false);
+}
+
+TEST(TestGpuIndexCagra, Float32_CopyTo_L2_BaseLevelOnly) {
+    copyToTest(faiss::METRIC_L2, 0.98, true);
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyTo_IP) {
-    copyToTest(faiss::METRIC_INNER_PRODUCT, 0.85);
+    copyToTest(faiss::METRIC_INNER_PRODUCT, 0.98, false);
+}
+
+TEST(TestGpuIndexCagra, Float32_CopyTo_IP_BaseLevelOnly) {
+    copyToTest(faiss::METRIC_INNER_PRODUCT, 0.8, true);
 }
 
 void copyFromTest(faiss::MetricType metric, double expected_recall) {
@@ -328,6 +362,9 @@ void copyFromTest(faiss::MetricType metric, double expected_recall) {
 
         std::vector<float> trainVecs =
                 faiss::gpu::randVecs(opt.numTrain, opt.dim);
+        if (metric == faiss::METRIC_INNER_PRODUCT) {
+            faiss::fvec_renorm_L2(opt.numTrain, opt.dim, trainVecs.data());
+        }
 
         // train cpu index
         faiss::IndexHNSWCagra cpuIndex(opt.dim, opt.graphDegree / 2, metric);
@@ -353,6 +390,9 @@ void copyFromTest(faiss::MetricType metric, double expected_recall) {
 
         // query
         auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim);
+        if (metric == faiss::METRIC_INNER_PRODUCT) {
+            faiss::fvec_renorm_L2(opt.numQuery, opt.dim, queryVecs.data());
+        }
 
         auto gpuRes = res.getResources();
         auto devAlloc = faiss::gpu::makeDevAlloc(
@@ -423,7 +463,7 @@ TEST(TestGpuIndexCagra, Float32_CopyFrom_L2) {
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyFrom_IP) {
-    copyFromTest(faiss::METRIC_INNER_PRODUCT, 0.85);
+    copyFromTest(faiss::METRIC_INNER_PRODUCT, 0.98);
 }
 
 int main(int argc, char** argv) {
diff --git a/faiss/gpu/test/test_cagra.py b/faiss/gpu/test/test_cagra.py
new file mode 100644
index 0000000000..d670a75c08
--- /dev/null
+++ b/faiss/gpu/test/test_cagra.py
@@ -0,0 +1,44 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import faiss
+import numpy as np
+
+from common_faiss_tests import get_dataset_2
+
+from faiss.contrib import datasets, evaluation, big_batch_search
+from faiss.contrib.exhaustive_search import knn_ground_truth, \
+    range_ground_truth, range_search_gpu, \
+    range_search_max_results, exponential_query_iterator
+
+
+class TestComputeGT(unittest.TestCase):
+
+    def do_compute_GT(self, metric):
+        d = 64
+        xt, xb, xq = get_dataset_2(d, 0, 10000, 100)
+
+        index = faiss.GpuIndexCagra(d)
+        index.train(xb)
+        Dref, Iref = index.search(xq, 10)
+
+        # iterator function on the matrix
+
+        def matrix_iterator(xb, bs):
+            for i0 in range(0, xb.shape[0], bs):
+                yield xb[i0:i0 + bs]
+
+        Dnew, Inew = knn_ground_truth(xq, matrix_iterator(xb, 1000), 10)
+
+        np.testing.assert_array_equal(Iref, Inew)
+        np.testing.assert_almost_equal(Dref, Dnew, decimal=4)
+
+    def test_compute_GT_L2(self):
+        self.do_compute_GT(faiss.METRIC_L2)
+
+    def test_range_IP(self):
+        self.do_compute_GT(faiss.METRIC_INNER_PRODUCT)

From 24a555d78a73e54bc0f2c90fe2153a8c08386ff1 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 22 May 2024 18:32:32 -0700
Subject: [PATCH 036/148] fix virtual functions and serialization

---
 faiss/IndexHNSW.cpp                 | 49 ++++++++++++++++-------------
 faiss/IndexHNSW.h                   |  7 +++--
 faiss/gpu/test/TestGpuIndexCagra.cu |  1 +
 faiss/impl/HNSW.cpp                 | 20 +++++++++---
 faiss/impl/HNSW.h                   |  3 +-
 faiss/impl/index_read.cpp           |  6 +++-
 faiss/impl/index_write.cpp          |  6 +++-
 7 files changed, 59 insertions(+), 33 deletions(-)

diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp
index 5d5a455ab6..35f18014ed 100644
--- a/faiss/IndexHNSW.cpp
+++ b/faiss/IndexHNSW.cpp
@@ -26,6 +26,7 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <cstdint>
+#include "impl/HNSW.h"
 
 #include <faiss/Index2Layer.h>
 #include <faiss/IndexFlat.h>
@@ -466,7 +467,8 @@ void IndexHNSW::search_level_0(
         float* distances,
         idx_t* labels,
         int nprobe,
-        int search_type) const {
+        int search_type,
+        const SearchParameters* params) const {
     FAISS_THROW_IF_NOT(k > 0);
     FAISS_THROW_IF_NOT(nprobe > 0);
 
@@ -501,7 +503,9 @@ void IndexHNSW::search_level_0(
             vt.advance();
         }
 #pragma omp critical
-        { hnsw_stats.combine(search_stats); }
+        {
+            hnsw_stats.combine(search_stats);
+        }
     }
 }
 
@@ -962,7 +966,7 @@ void IndexHNSWCagra::search(
         idx_t k,
         float* distances,
         idx_t* labels,
-        const SearchParameters* params) {
+        const SearchParameters* params) const {
     if (!base_level_only) {
         IndexHNSW::search(n, x, k, distances, labels, params);
     } else {
@@ -971,42 +975,43 @@ void IndexHNSWCagra::search(
 
 #pragma omp for
         for (idx_t i = 0; i < n; i++) {
+            // std::unique_ptr<DistanceComputer> dis(
+            //         this->storage->get_distance_computer());
             std::unique_ptr<DistanceComputer> dis(
                     storage_distance_computer(this->storage));
             dis->set_query(x + i * d);
-            storage_idx_t entrypoint = -1;
-            float entrypoint_d = std::numeric_limits<float>::max();
+            nearest[i] = -1;
+            nearest_d[i] = std::numeric_limits<float>::max();
 
             std::random_device rd;
-            std::mt19937 gen(i);
+            std::mt19937 gen(rd());
             std::uniform_int_distribution<idx_t> distrib(0, this->ntotal);
 
             for (idx_t j = 0; j < num_base_level_search_entrypoints; j++) {
                 auto idx = distrib(gen);
                 auto distance = (*dis)(idx);
-                if (distance < entrypoint_d) {
-                    entrypoint_d = distance;
-                    entrypoint = idx;
+                // std::cout << "distance: " << distance << std::endl;
+                if (distance > nearest_d[i]) {
+                    nearest[i] = idx;
+                    nearest_d[i] = distance;
                 }
             }
 
             FAISS_THROW_IF_NOT_MSG(
-                    entrypoint >= 0, "Could not find a valid entrypoint.");
-
-            nearest[i] = entrypoint;
-            nearest_d[i] = entrypoint_d;
-        }
-
-        if (params) {
-            const SearchParametersHNSW* params_hnsw =
-                    dynamic_cast<const SearchParametersHNSW*>(params);
-            this->hnsw.efSearch = params_hnsw->efSearch;
-            this->hnsw.check_relative_distance =
-                    params_hnsw->check_relative_distance;
+                    nearest[i] >= 0, "Could not find a valid entrypoint.");
         }
 
         search_level_0(
-                n, x, k, nearest.data(), nearest_d.data(), distances, labels);
+                n,
+                x,
+                k,
+                nearest.data(),
+                nearest_d.data(),
+                distances,
+                labels,
+                1, // n_probes
+                1, // search_type
+                dynamic_cast<const SearchParametersHNSW*>(params));
     }
 }
 
diff --git a/faiss/IndexHNSW.h b/faiss/IndexHNSW.h
index e1f1d68dee..71807c6537 100644
--- a/faiss/IndexHNSW.h
+++ b/faiss/IndexHNSW.h
@@ -93,7 +93,8 @@ struct IndexHNSW : Index {
             float* distances,
             idx_t* labels,
             int nprobe = 1,
-            int search_type = 1) const;
+            int search_type = 1,
+            const SearchParameters* params = nullptr) const;
 
     /// alternative graph building
     void init_level_0_from_knngraph(int k, const float* D, const idx_t* I);
@@ -177,7 +178,7 @@ struct IndexHNSWCagra : IndexHNSW {
     /// some points and using the best one.
     int num_base_level_search_entrypoints = 32;
 
-    void add(idx_t n, const float* x);
+    void add(idx_t n, const float* x) override;
 
     /// entry point for search
     void search(
@@ -186,7 +187,7 @@ struct IndexHNSWCagra : IndexHNSW {
             idx_t k,
             float* distances,
             idx_t* labels,
-            const SearchParameters* params = nullptr);
+            const SearchParameters* params = nullptr) const override;
 };
 
 } // namespace faiss
diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu
index c77c7974d0..7ed08aaa9d 100644
--- a/faiss/gpu/test/TestGpuIndexCagra.cu
+++ b/faiss/gpu/test/TestGpuIndexCagra.cu
@@ -278,6 +278,7 @@ void copyToTest(
                 copyRefDistance.data(),
                 copyRefIndices.data(),
                 &cpuSearchParamstwo);
+        std::cout << "copyRefIndices[0]: " << copyRefIndices[0] << std::endl;
 
         // test quality of search
         auto gpuRes = res.getResources();
diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp
index a07d1556d5..dc61e12152 100644
--- a/faiss/impl/HNSW.cpp
+++ b/faiss/impl/HNSW.cpp
@@ -937,8 +937,10 @@ void HNSW::search_level_0(
         const float* nearest_d,
         int search_type,
         HNSWStats& search_stats,
-        VisitedTable& vt) const {
+        VisitedTable& vt,
+        const SearchParametersHNSW* params) const {
     const HNSW& hnsw = *this;
+    auto efSearch = params ? params->efSearch : hnsw.efSearch;
     int k = extract_k_from_ResultHandler(res);
     if (search_type == 1) {
         int nres = 0;
@@ -952,16 +954,24 @@ void HNSW::search_level_0(
             if (vt.get(cj))
                 continue;
 
-            int candidates_size = std::max(hnsw.efSearch, k);
+            int candidates_size = std::max(efSearch, k);
             MinimaxHeap candidates(candidates_size);
 
             candidates.push(cj, nearest_d[j]);
 
             nres = search_from_candidates(
-                    hnsw, qdis, res, candidates, vt, search_stats, 0, nres);
+                    hnsw,
+                    qdis,
+                    res,
+                    candidates,
+                    vt,
+                    search_stats,
+                    0,
+                    nres,
+                    params);
         }
     } else if (search_type == 2) {
-        int candidates_size = std::max(hnsw.efSearch, int(k));
+        int candidates_size = std::max(efSearch, int(k));
         candidates_size = std::max(candidates_size, int(nprobe));
 
         MinimaxHeap candidates(candidates_size);
@@ -974,7 +984,7 @@ void HNSW::search_level_0(
         }
 
         search_from_candidates(
-                hnsw, qdis, res, candidates, vt, search_stats, 0);
+                hnsw, qdis, res, candidates, vt, search_stats, 0, 0, params);
     }
 }
 
diff --git a/faiss/impl/HNSW.h b/faiss/impl/HNSW.h
index 7b08096a86..f3aacf8a5b 100644
--- a/faiss/impl/HNSW.h
+++ b/faiss/impl/HNSW.h
@@ -213,7 +213,8 @@ struct HNSW {
             const float* nearest_d,
             int search_type,
             HNSWStats& search_stats,
-            VisitedTable& vt) const;
+            VisitedTable& vt,
+            const SearchParametersHNSW* params = nullptr) const;
 
     void reset();
 
diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp
index 5b08640295..1085d3a0d1 100644
--- a/faiss/impl/index_read.cpp
+++ b/faiss/impl/index_read.cpp
@@ -961,8 +961,12 @@ Index* read_index(IOReader* f, int io_flags) {
         if (h == fourcc("IHNc"))
             idxhnsw = new IndexHNSWCagra();
         read_index_header(idxhnsw, f);
-        if (h == fourcc("IHNc"))
+        if (h == fourcc("IHNc")) {
             READ1(idxhnsw->keep_max_size_level0);
+            auto idx_hnsw_cagra = dynamic_cast<IndexHNSWCagra*>(idxhnsw);
+            READ1(idx_hnsw_cagra->base_level_only);
+            READ1(idx_hnsw_cagra->num_base_level_search_entrypoints);
+        }
         read_HNSW(&idxhnsw->hnsw, f);
         idxhnsw->storage = read_index(f, io_flags);
         idxhnsw->own_fields = true;
diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp
index e9c6a23a64..24303ac376 100644
--- a/faiss/impl/index_write.cpp
+++ b/faiss/impl/index_write.cpp
@@ -765,8 +765,12 @@ void write_index(const Index* idx, IOWriter* f) {
         FAISS_THROW_IF_NOT(h != 0);
         WRITE1(h);
         write_index_header(idxhnsw, f);
-        if (h == fourcc("IHNc"))
+        if (h == fourcc("IHNc")) {
             WRITE1(idxhnsw->keep_max_size_level0);
+            auto idx_hnsw_cagra = dynamic_cast<const IndexHNSWCagra*>(idxhnsw);
+            WRITE1(idx_hnsw_cagra->base_level_only);
+            WRITE1(idx_hnsw_cagra->num_base_level_search_entrypoints);
+        }
         write_HNSW(&idxhnsw->hnsw, f);
         write_index(idxhnsw->storage, f);
     } else if (const IndexNSG* idxnsg = dynamic_cast<const IndexNSG*>(idx)) {

From 51227b1df6cf97d8b71e73c38e89c421391e9e6a Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 22 May 2024 18:56:38 -0700
Subject: [PATCH 037/148] invert conditional

---
 faiss/IndexHNSW.cpp | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp
index 35f18014ed..c20ea99b6e 100644
--- a/faiss/IndexHNSW.cpp
+++ b/faiss/IndexHNSW.cpp
@@ -975,8 +975,6 @@ void IndexHNSWCagra::search(
 
 #pragma omp for
         for (idx_t i = 0; i < n; i++) {
-            // std::unique_ptr<DistanceComputer> dis(
-            //         this->storage->get_distance_computer());
             std::unique_ptr<DistanceComputer> dis(
                     storage_distance_computer(this->storage));
             dis->set_query(x + i * d);
@@ -990,13 +988,11 @@ void IndexHNSWCagra::search(
             for (idx_t j = 0; j < num_base_level_search_entrypoints; j++) {
                 auto idx = distrib(gen);
                 auto distance = (*dis)(idx);
-                // std::cout << "distance: " << distance << std::endl;
-                if (distance > nearest_d[i]) {
+                if (distance < nearest_d[i]) {
                     nearest[i] = idx;
                     nearest_d[i] = distance;
                 }
             }
-
             FAISS_THROW_IF_NOT_MSG(
                     nearest[i] >= 0, "Could not find a valid entrypoint.");
         }

From 579a301be18282dfcbba1376e395e0b964c5e5dd Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Thu, 23 May 2024 09:59:36 -0700
Subject: [PATCH 038/148] debug msg

---
 faiss/IndexHNSW.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp
index c20ea99b6e..023e39a526 100644
--- a/faiss/IndexHNSW.cpp
+++ b/faiss/IndexHNSW.cpp
@@ -970,6 +970,7 @@ void IndexHNSWCagra::search(
     if (!base_level_only) {
         IndexHNSW::search(n, x, k, distances, labels, params);
     } else {
+        std::cout << "LEVEL 0 SEARCH" << std::endl;
         std::vector<storage_idx_t> nearest(n);
         std::vector<float> nearest_d(n);
 

From ae0b8ba7a708b13c55ca0d887cedeeba9c8040ae Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Thu, 23 May 2024 10:46:36 -0700
Subject: [PATCH 039/148] more debug prints

---
 faiss/impl/HNSW.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp
index dc61e12152..65fc93cfac 100644
--- a/faiss/impl/HNSW.cpp
+++ b/faiss/impl/HNSW.cpp
@@ -942,6 +942,8 @@ void HNSW::search_level_0(
     const HNSW& hnsw = *this;
     auto efSearch = params ? params->efSearch : hnsw.efSearch;
     int k = extract_k_from_ResultHandler(res);
+    std::cout << "efSearch: " << efSearch << ", k: " << k << std::endl;
+
     if (search_type == 1) {
         int nres = 0;
 

From 4170a3e4a3d0a8284519c5bf1d529159e47e7e5e Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Thu, 23 May 2024 11:50:24 -0700
Subject: [PATCH 040/148] fix efSearch setting in base search

---
 faiss/IndexHNSW.cpp | 15 +++++++++++----
 faiss/impl/HNSW.cpp |  1 -
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp
index 023e39a526..02445989a4 100644
--- a/faiss/IndexHNSW.cpp
+++ b/faiss/IndexHNSW.cpp
@@ -468,10 +468,17 @@ void IndexHNSW::search_level_0(
         idx_t* labels,
         int nprobe,
         int search_type,
-        const SearchParameters* params) const {
+        const SearchParameters* params_in) const {
     FAISS_THROW_IF_NOT(k > 0);
     FAISS_THROW_IF_NOT(nprobe > 0);
 
+    const SearchParametersHNSW* params = nullptr;
+
+    if (params_in) {
+        params = dynamic_cast<const SearchParametersHNSW*>(params_in);
+        FAISS_THROW_IF_NOT_MSG(params, "params type invalid");
+    }
+
     storage_idx_t ntotal = hnsw.levels.size();
 
     using RH = HeapBlockResultHandler<HNSW::C>;
@@ -498,7 +505,8 @@ void IndexHNSW::search_level_0(
                     nearest_d + i * nprobe,
                     search_type,
                     search_stats,
-                    vt);
+                    vt,
+                    params);
             res.end();
             vt.advance();
         }
@@ -970,7 +978,6 @@ void IndexHNSWCagra::search(
     if (!base_level_only) {
         IndexHNSW::search(n, x, k, distances, labels, params);
     } else {
-        std::cout << "LEVEL 0 SEARCH" << std::endl;
         std::vector<storage_idx_t> nearest(n);
         std::vector<float> nearest_d(n);
 
@@ -1008,7 +1015,7 @@ void IndexHNSWCagra::search(
                 labels,
                 1, // n_probes
                 1, // search_type
-                dynamic_cast<const SearchParametersHNSW*>(params));
+                params);
     }
 }
 
diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp
index 65fc93cfac..3ba5f72f68 100644
--- a/faiss/impl/HNSW.cpp
+++ b/faiss/impl/HNSW.cpp
@@ -942,7 +942,6 @@ void HNSW::search_level_0(
     const HNSW& hnsw = *this;
     auto efSearch = params ? params->efSearch : hnsw.efSearch;
     int k = extract_k_from_ResultHandler(res);
-    std::cout << "efSearch: " << efSearch << ", k: " << k << std::endl;
 
     if (search_type == 1) {
         int nres = 0;

From 75808b1ab5d427948797a26ad9739bb972db3787 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Thu, 23 May 2024 15:42:57 -0700
Subject: [PATCH 041/148] re-negate ip distances in search_level

---
 faiss/IndexHNSW.cpp                 | 7 +++++++
 faiss/gpu/test/TestGpuIndexCagra.cu | 3 ---
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp
index 02445989a4..19ebfcfe01 100644
--- a/faiss/IndexHNSW.cpp
+++ b/faiss/IndexHNSW.cpp
@@ -515,6 +515,13 @@ void IndexHNSW::search_level_0(
             hnsw_stats.combine(search_stats);
         }
     }
+    if (is_similarity_metric(this->metric_type)) {
+// we need to revert the negated distances
+#pragma omp parallel for
+        for (size_t i = 0; i < k * n; i++) {
+            distances[i] = -distances[i];
+        }
+    }
 }
 
 void IndexHNSW::init_level_0_from_knngraph(
diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu
index 7ed08aaa9d..62f4c7627a 100644
--- a/faiss/gpu/test/TestGpuIndexCagra.cu
+++ b/faiss/gpu/test/TestGpuIndexCagra.cu
@@ -278,7 +278,6 @@ void copyToTest(
                 copyRefDistance.data(),
                 copyRefIndices.data(),
                 &cpuSearchParamstwo);
-        std::cout << "copyRefIndices[0]: " << copyRefIndices[0] << std::endl;
 
         // test quality of search
         auto gpuRes = res.getResources();
@@ -331,8 +330,6 @@ void copyToTest(
                 recall_score.view(),
                 copy_ref_dis_mds_opt,
                 ref_dis_mds_opt);
-        std::cout << "recall_score: " << *recall_score.data_handle()
-                  << std::endl;
         ASSERT_TRUE(*recall_score.data_handle() > expected_recall);
     }
 }

From 9bd10399778e24ce570d1105fe82e41669af5c7d Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Thu, 23 May 2024 15:46:34 -0700
Subject: [PATCH 042/148] fix format

---
 faiss/IndexHNSW.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp
index 35e2a9f286..efd8972477 100644
--- a/faiss/IndexHNSW.cpp
+++ b/faiss/IndexHNSW.cpp
@@ -465,9 +465,7 @@ void IndexHNSW::search_level_0(
             vt.advance();
         }
 #pragma omp critical
-        {
-            hnsw_stats.combine(search_stats);
-        }
+        { hnsw_stats.combine(search_stats); }
     }
     if (is_similarity_metric(this->metric_type)) {
 // we need to revert the negated distances

From ea8028dc559add56f60c9531464277ad0127a77d Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Thu, 23 May 2024 15:53:36 -0700
Subject: [PATCH 043/148] re-up minimum recall for base only IP distance

---
 faiss/gpu/test/TestGpuIndexCagra.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu
index 62f4c7627a..8d330a81cb 100644
--- a/faiss/gpu/test/TestGpuIndexCagra.cu
+++ b/faiss/gpu/test/TestGpuIndexCagra.cu
@@ -347,7 +347,7 @@ TEST(TestGpuIndexCagra, Float32_CopyTo_IP) {
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyTo_IP_BaseLevelOnly) {
-    copyToTest(faiss::METRIC_INNER_PRODUCT, 0.8, true);
+    copyToTest(faiss::METRIC_INNER_PRODUCT, 0.98, true);
 }
 
 void copyFromTest(faiss::MetricType metric, double expected_recall) {

From fc313514e21954353fd53995d8f790e34b74bf99 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 29 May 2024 17:17:04 -0700
Subject: [PATCH 044/148] add python tests

---
 faiss/gpu/GpuCloner.cpp      | 13 +++++++
 faiss/gpu/test/test_cagra.py | 69 +++++++++++++++++++++++++-----------
 2 files changed, 61 insertions(+), 21 deletions(-)

diff --git a/faiss/gpu/GpuCloner.cpp b/faiss/gpu/GpuCloner.cpp
index 8f895ac9c7..de5041b8f6 100644
--- a/faiss/gpu/GpuCloner.cpp
+++ b/faiss/gpu/GpuCloner.cpp
@@ -14,6 +14,7 @@
 
 #include <faiss/IndexBinaryFlat.h>
 #include <faiss/IndexFlat.h>
+#include <faiss/IndexHNSW.h>
 #include <faiss/IndexIVF.h>
 #include <faiss/IndexIVFFlat.h>
 #include <faiss/IndexIVFPQ.h>
@@ -24,6 +25,7 @@
 #include <faiss/MetaIndexes.h>
 #include <faiss/gpu/GpuIndex.h>
 #include <faiss/gpu/GpuIndexBinaryFlat.h>
+#include <faiss/gpu/GpuIndexCagra.h>
 #include <faiss/gpu/GpuIndexFlat.h>
 #include <faiss/gpu/GpuIndexIVFFlat.h>
 #include <faiss/gpu/GpuIndexIVFPQ.h>
@@ -85,6 +87,10 @@ Index* ToCPUCloner::clone_Index(const Index* index) {
         // objective is to make a single component out of them
         // (inverse op of ToGpuClonerMultiple)
 
+    } else if (auto icg = dynamic_cast<const GpuIndexCagra*>(index)) {
+        IndexHNSWCagra* res = new IndexHNSWCagra();
+        icg->copyTo(res);
+        return res;
     } else if (auto ish = dynamic_cast<const IndexShards*>(index)) {
         int nshard = ish->count();
         FAISS_ASSERT(nshard > 0);
@@ -214,6 +220,13 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
             res->reserveMemory(reserveVecs);
         }
 
+        return res;
+    } else if (auto icg = dynamic_cast<const faiss::IndexHNSWCagra*>(index)) {
+        GpuIndexCagraConfig config;
+        config.device = device;
+        GpuIndexCagra* res =
+                new GpuIndexCagra(provider, icg->d, icg->metric_type, config);
+        res->copyFrom(icg);
         return res;
     } else {
         // use CPU cloner for IDMap and PreTransform
diff --git a/faiss/gpu/test/test_cagra.py b/faiss/gpu/test/test_cagra.py
index d670a75c08..dd7d09f2de 100644
--- a/faiss/gpu/test/test_cagra.py
+++ b/faiss/gpu/test/test_cagra.py
@@ -8,37 +8,64 @@
 import faiss
 import numpy as np
 
-from common_faiss_tests import get_dataset_2
-
-from faiss.contrib import datasets, evaluation, big_batch_search
-from faiss.contrib.exhaustive_search import knn_ground_truth, \
-    range_ground_truth, range_search_gpu, \
-    range_search_max_results, exponential_query_iterator
+from faiss.contrib import datasets, evaluation
 
 
+@unittest.skipIf(
+    "RAFT" not in faiss.get_compile_options(),
+    "only if RAFT is compiled in")
 class TestComputeGT(unittest.TestCase):
 
     def do_compute_GT(self, metric):
         d = 64
-        xt, xb, xq = get_dataset_2(d, 0, 10000, 100)
-
-        index = faiss.GpuIndexCagra(d)
-        index.train(xb)
-        Dref, Iref = index.search(xq, 10)
-
-        # iterator function on the matrix
-
-        def matrix_iterator(xb, bs):
-            for i0 in range(0, xb.shape[0], bs):
-                yield xb[i0:i0 + bs]
+        k = 12
+        ds = datasets.SyntheticDataset(d, 0, 10000, 100)
+        Dref, Iref = faiss.knn(ds.get_queries(), ds.get_database(), k, metric)
 
-        Dnew, Inew = knn_ground_truth(xq, matrix_iterator(xb, 1000), 10)
+        res = faiss.StandardGpuResources()
 
-        np.testing.assert_array_equal(Iref, Inew)
-        np.testing.assert_almost_equal(Dref, Dnew, decimal=4)
+        index = faiss.GpuIndexCagra(res, d, metric)
+        index.train(ds.get_database())
+        Dnew, Inew = index.search(ds.get_queries(), k)
+        
+        evaluation.check_ref_knn_with_draws(Dref, Iref, Dnew, Inew, k)
 
     def test_compute_GT_L2(self):
         self.do_compute_GT(faiss.METRIC_L2)
 
-    def test_range_IP(self):
+    def test_compute_GT_IP(self):
         self.do_compute_GT(faiss.METRIC_INNER_PRODUCT)
+
+@unittest.skipIf(
+    "RAFT" not in faiss.get_compile_options(),
+    "only if RAFT is compiled in")
+class TestInterop(unittest.TestCase):
+
+    def do_interop(self, metric):
+        d = 64
+        k = 12
+        ds = datasets.SyntheticDataset(d, 0, 10000, 100)
+
+        res = faiss.StandardGpuResources()
+
+        index = faiss.GpuIndexCagra(res, d, metric)
+        index.train(ds.get_database())
+        Dnew, Inew = index.search(ds.get_queries(), k)
+
+        cpu_index = faiss.index_gpu_to_cpu(index)
+        Dref, Iref = cpu_index.search(ds.get_queries(), k)
+        
+        evaluation.check_ref_knn_with_draws(Dref, Iref, Dnew, Inew, k)
+
+        faiss.write_index(cpu_index, "index_hnsw_cagra.index")
+        deserialized_index = faiss.read_index("index_hnsw_cagra.index")
+        gpu_index = faiss.index_cpu_to_gpu(res, 0, deserialized_index)
+        Dnew2, Inew2 = gpu_index.search(ds.get_queries(), k)
+
+        evaluation.check_ref_knn_with_draws(Dnew2, Inew2, Dnew, Inew, k)
+
+    def test_interop_L2(self):
+        self.do_interop(faiss.METRIC_L2)
+
+    def test_interop_IP(self):
+        self.do_interop(faiss.METRIC_INNER_PRODUCT)

From 03ee1fb65bdc423859855af8d4aa042e94595b44 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Wed, 29 May 2024 19:17:07 -0700
Subject: [PATCH 045/148] ifdef guards in gpu cloner

---
 faiss/gpu/GpuCloner.cpp | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/faiss/gpu/GpuCloner.cpp b/faiss/gpu/GpuCloner.cpp
index de5041b8f6..b6d55a47aa 100644
--- a/faiss/gpu/GpuCloner.cpp
+++ b/faiss/gpu/GpuCloner.cpp
@@ -14,7 +14,9 @@
 
 #include <faiss/IndexBinaryFlat.h>
 #include <faiss/IndexFlat.h>
+#if defined USE_NVIDIA_RAFT
 #include <faiss/IndexHNSW.h>
+#endif
 #include <faiss/IndexIVF.h>
 #include <faiss/IndexIVFFlat.h>
 #include <faiss/IndexIVFPQ.h>
@@ -25,7 +27,9 @@
 #include <faiss/MetaIndexes.h>
 #include <faiss/gpu/GpuIndex.h>
 #include <faiss/gpu/GpuIndexBinaryFlat.h>
+#if defined USE_NVIDIA_RAFT
 #include <faiss/gpu/GpuIndexCagra.h>
+#endif
 #include <faiss/gpu/GpuIndexFlat.h>
 #include <faiss/gpu/GpuIndexIVFFlat.h>
 #include <faiss/gpu/GpuIndexIVFPQ.h>
@@ -87,11 +91,15 @@ Index* ToCPUCloner::clone_Index(const Index* index) {
         // objective is to make a single component out of them
         // (inverse op of ToGpuClonerMultiple)
 
-    } else if (auto icg = dynamic_cast<const GpuIndexCagra*>(index)) {
+    }
+#if defined USE_NVIDIA_RAFT
+    else if (auto icg = dynamic_cast<const GpuIndexCagra*>(index)) {
         IndexHNSWCagra* res = new IndexHNSWCagra();
         icg->copyTo(res);
         return res;
-    } else if (auto ish = dynamic_cast<const IndexShards*>(index)) {
+    }
+#endif
+    else if (auto ish = dynamic_cast<const IndexShards*>(index)) {
         int nshard = ish->count();
         FAISS_ASSERT(nshard > 0);
         Index* res = clone_Index(ish->at(0));
@@ -221,14 +229,18 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         }
 
         return res;
-    } else if (auto icg = dynamic_cast<const faiss::IndexHNSWCagra*>(index)) {
+    }
+#if defined USE_NVIDIA_RAFT
+    else if (auto icg = dynamic_cast<const faiss::IndexHNSWCagra*>(index)) {
         GpuIndexCagraConfig config;
         config.device = device;
         GpuIndexCagra* res =
                 new GpuIndexCagra(provider, icg->d, icg->metric_type, config);
         res->copyFrom(icg);
         return res;
-    } else {
+    }
+#endif
+    else {
         // use CPU cloner for IDMap and PreTransform
         auto index_idmap = dynamic_cast<const IndexIDMap*>(index);
         auto index_pt = dynamic_cast<const IndexPreTransform*>(index);

From 2e9cbc839b1bb11f20e0eac19cca034a49fd0abc Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Thu, 6 Jun 2024 15:02:39 -0700
Subject: [PATCH 046/148] option to exclude dataset store on index

---
 faiss/gpu/GpuIndexCagra.cu          |  26 ++++-
 faiss/gpu/GpuIndexCagra.h           |   2 +
 faiss/gpu/impl/RaftCagra.cu         | 157 +++++++++++-----------------
 faiss/gpu/impl/RaftCagra.cuh        |  15 ++-
 faiss/gpu/test/TestGpuIndexCagra.cu |  15 +--
 5 files changed, 108 insertions(+), 107 deletions(-)

diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index 4ae56df10d..b183e74568 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -86,11 +86,13 @@ void GpuIndexCagra::train(idx_t n, const float* x) {
             cagraConfig_.graph_degree,
             static_cast<faiss::cagra_build_algo>(cagraConfig_.build_algo),
             cagraConfig_.nn_descent_niter,
+            cagraConfig_.store_dataset,
             this->metric_type,
             this->metric_arg,
             INDICES_64_BIT,
             ivf_pq_params,
-            ivf_pq_search_params);
+            ivf_pq_search_params,
+            cagraConfig_.refine_rate);
 
     index_->train(n, x);
 
@@ -225,17 +227,33 @@ void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const {
     index->hnsw.set_default_probas(M, 1.0 / log(M));
 
     auto n_train = this->ntotal;
-    auto train_dataset = index_->get_training_dataset();
+    float* train_dataset;
+    auto dataset = index_->get_training_dataset();
+    bool allocation = false;
+    if (getDeviceForAddress(dataset) >= 0) {
+        train_dataset = new float[n_train * index->d];
+        allocation = true;
+        raft::copy(
+                train_dataset,
+                dataset,
+                n_train * index->d,
+                this->resources_->getRaftHandleCurrentDevice().get_stream());
+    } else {
+        train_dataset = const_cast<float*>(dataset);
+    }
 
     // turn off as level 0 is copied from CAGRA graph
     index->init_level0 = false;
     if (!index->base_level_only) {
-        index->add(n_train, train_dataset.data());
+        index->add(n_train, train_dataset);
     } else {
         index->hnsw.prepare_level_tab(n_train, false);
-        index->storage->add(n_train, train_dataset.data());
+        index->storage->add(n_train, train_dataset);
         index->ntotal = n_train;
     }
+    if (allocation) {
+        delete[] train_dataset;
+    }
 
     auto graph = get_knngraph();
 
diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
index 6ecee3ae03..62042d531f 100644
--- a/faiss/gpu/GpuIndexCagra.h
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -174,6 +174,8 @@ struct GpuIndexCagraConfig : public GpuIndexConfig {
 
     IVFPQBuildCagraConfig* ivf_pq_params = nullptr;
     IVFPQSearchCagraConfig* ivf_pq_search_params = nullptr;
+    float refine_rate = 2.0f;
+    bool store_dataset = true;
 };
 
 enum class search_algo {
diff --git a/faiss/gpu/impl/RaftCagra.cu b/faiss/gpu/impl/RaftCagra.cu
index 292079321d..50903220df 100644
--- a/faiss/gpu/impl/RaftCagra.cu
+++ b/faiss/gpu/impl/RaftCagra.cu
@@ -42,19 +42,23 @@ RaftCagra::RaftCagra(
         idx_t graph_degree,
         faiss::cagra_build_algo graph_build_algo,
         size_t nn_descent_niter,
+        bool store_dataset,
         faiss::MetricType metric,
         float metricArg,
         IndicesOptions indicesOptions,
         std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_params,
         std::optional<raft::neighbors::ivf_pq::search_params>
-                ivf_pq_search_params)
+                ivf_pq_search_params,
+        float refine_rate)
         : resources_(resources),
           dim_(dim),
+          store_dataset_(store_dataset),
           metric_(metric),
           metricArg_(metricArg),
           index_params_(),
           ivf_pq_params_(ivf_pq_params),
-          ivf_pq_search_params_(ivf_pq_search_params) {
+          ivf_pq_search_params_(ivf_pq_search_params),
+          refine_rate_(refine_rate) {
     FAISS_THROW_IF_NOT_MSG(
             metric == faiss::METRIC_L2 || metric == faiss::METRIC_INNER_PRODUCT,
             "CAGRA currently only supports L2 or Inner Product metric.");
@@ -113,6 +117,9 @@ RaftCagra::RaftCagra(
 
     FAISS_ASSERT(distances_on_gpu == knn_graph_on_gpu);
 
+    storage_ = distances;
+    n_ = n;
+
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
 
@@ -164,81 +171,50 @@ RaftCagra::RaftCagra(
 }
 
 void RaftCagra::train(idx_t n, const float* x) {
+    storage_ = x;
+    n_ = n;
+
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
+
+    auto nn_descent_params = std::make_optional<
+            raft::neighbors::experimental::nn_descent::index_params>();
+    nn_descent_params->graph_degree = index_params_.intermediate_graph_degree;
+    nn_descent_params->intermediate_graph_degree =
+            1.5 * index_params_.intermediate_graph_degree;
+    nn_descent_params->max_iterations = index_params_.nn_descent_niter;
+
     if (index_params_.build_algo ==
-        raft::neighbors::cagra::graph_build_algo::IVF_PQ) {
-        std::optional<raft::host_matrix<uint32_t, int64_t>> knn_graph(
-                raft::make_host_matrix<uint32_t, int64_t>(
-                        n, index_params_.intermediate_graph_degree));
-        if (getDeviceForAddress(x) >= 0) {
-            auto dataset_d =
-                    raft::make_device_matrix_view<const float, int64_t>(
-                            x, n, dim_);
-            raft::neighbors::cagra::build_knn_graph(
-                    raft_handle,
-                    dataset_d,
-                    knn_graph->view(),
-                    1.0f,
-                    ivf_pq_params_,
-                    ivf_pq_search_params_);
-        } else {
-            auto dataset_h = raft::make_host_matrix_view<const float, int64_t>(
-                    x, n, dim_);
-            raft::neighbors::cagra::build_knn_graph(
-                    raft_handle,
-                    dataset_h,
-                    knn_graph->view(),
-                    1.0f,
-                    ivf_pq_params_,
-                    ivf_pq_search_params_);
-        }
-        auto cagra_graph = raft::make_host_matrix<uint32_t, int64_t>(
-                n, index_params_.graph_degree);
-
-        raft::neighbors::cagra::optimize<uint32_t>(
-                raft_handle, knn_graph->view(), cagra_graph.view());
-
-        // free intermediate graph before trying to create the index
-        knn_graph.reset();
-
-        if (getDeviceForAddress(x) >= 0) {
-            auto dataset_d =
-                    raft::make_device_matrix_view<const float, int64_t>(
-                            x, n, dim_);
-            raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
-                    raft_handle,
-                    metric_ == faiss::METRIC_L2
-                            ? raft::distance::DistanceType::L2Expanded
-                            : raft::distance::DistanceType::InnerProduct,
-                    dataset_d,
-                    raft::make_const_mdspan(cagra_graph.view()));
-        } else {
-            auto dataset_h = raft::make_host_matrix_view<const float, int64_t>(
-                    x, n, dim_);
-            raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
-                    raft_handle,
-                    metric_ == faiss::METRIC_L2
-                            ? raft::distance::DistanceType::L2Expanded
-                            : raft::distance::DistanceType::InnerProduct,
-                    dataset_h,
-                    raft::make_const_mdspan(cagra_graph.view()));
-        }
+                raft::neighbors::cagra::graph_build_algo::IVF_PQ &&
+        index_params_.graph_degree == index_params_.intermediate_graph_degree) {
+        index_params_.intermediate_graph_degree =
+                1.5 * index_params_.graph_degree;
+    }
 
+    if (getDeviceForAddress(x) >= 0) {
+        auto dataset =
+                raft::make_device_matrix_view<const float, int64_t>(x, n, dim_);
+        raft_knn_index = raft::neighbors::cagra::detail::build<float, uint32_t>(
+                raft_handle,
+                index_params_,
+                dataset,
+                nn_descent_params,
+                refine_rate_,
+                ivf_pq_params_,
+                ivf_pq_search_params_,
+                store_dataset_);
     } else {
-        if (getDeviceForAddress(x) >= 0) {
-            raft_knn_index = raft::runtime::neighbors::cagra::build(
-                    raft_handle,
-                    index_params_,
-                    raft::make_device_matrix_view<const float, int64_t>(
-                            x, n, dim_));
-        } else {
-            raft_knn_index = raft::runtime::neighbors::cagra::build(
-                    raft_handle,
-                    index_params_,
-                    raft::make_host_matrix_view<const float, int64_t>(
-                            x, n, dim_));
-        }
+        auto dataset =
+                raft::make_host_matrix_view<const float, int64_t>(x, n, dim_);
+        raft_knn_index = raft::neighbors::cagra::detail::build<float, uint32_t>(
+                raft_handle,
+                index_params_,
+                dataset,
+                nn_descent_params,
+                refine_rate_,
+                ivf_pq_params_,
+                ivf_pq_search_params_,
+                store_dataset_);
     }
 }
 
@@ -270,6 +246,18 @@ void RaftCagra::search(
     FAISS_ASSERT(numQueries > 0);
     FAISS_ASSERT(cols == dim_);
 
+    if (!store_dataset_) {
+        if (getDeviceForAddress(storage_) >= 0) {
+            auto dataset = raft::make_device_matrix_view<const float, int64_t>(
+                    storage_, n_, dim_);
+            raft_knn_index.value().update_dataset(raft_handle, dataset);
+        } else {
+            auto dataset = raft::make_host_matrix_view<const float, int64_t>(
+                    storage_, n_, dim_);
+            raft_knn_index.value().update_dataset(raft_handle, dataset);
+        }
+    }
+
     auto queries_view = raft::make_device_matrix_view<const float, int64_t>(
             queries.data(), numQueries, cols);
     auto distances_view = raft::make_device_matrix_view<float, int64_t>(
@@ -342,29 +330,8 @@ std::vector<idx_t> RaftCagra::get_knngraph() const {
     return host_graph;
 }
 
-std::vector<float> RaftCagra::get_training_dataset() const {
-    FAISS_ASSERT(raft_knn_index.has_value());
-    const raft::device_resources& raft_handle =
-            resources_->getRaftHandleCurrentDevice();
-    auto stream = raft_handle.get_stream();
-
-    auto device_dataset = raft_knn_index.value().dataset();
-
-    std::vector<float> host_dataset(
-            device_dataset.extent(0) * device_dataset.extent(1));
-
-    RAFT_CUDA_TRY(cudaMemcpy2DAsync(
-            host_dataset.data(),
-            sizeof(float) * dim_,
-            device_dataset.data_handle(),
-            sizeof(float) * device_dataset.stride(0),
-            sizeof(float) * dim_,
-            device_dataset.extent(0),
-            cudaMemcpyDefault,
-            raft_handle.get_stream()));
-    raft_handle.sync_stream();
-
-    return host_dataset;
+const float* RaftCagra::get_training_dataset() const {
+    return storage_;
 }
 
 } // namespace gpu
diff --git a/faiss/gpu/impl/RaftCagra.cuh b/faiss/gpu/impl/RaftCagra.cuh
index 95f6c03fca..0913ba5947 100644
--- a/faiss/gpu/impl/RaftCagra.cuh
+++ b/faiss/gpu/impl/RaftCagra.cuh
@@ -53,13 +53,15 @@ class RaftCagra {
             idx_t graph_degree,
             faiss::cagra_build_algo graph_build_algo,
             size_t nn_descent_niter,
+            bool store_dataset,
             faiss::MetricType metric,
             float metricArg,
             IndicesOptions indicesOptions,
             std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_params =
                     std::nullopt,
             std::optional<raft::neighbors::ivf_pq::search_params>
-                    ivf_pq_search_params = std::nullopt);
+                    ivf_pq_search_params = std::nullopt,
+            float refine_rate = 2.0f);
 
     RaftCagra(
             GpuResources* resources,
@@ -101,15 +103,23 @@ class RaftCagra {
 
     std::vector<idx_t> get_knngraph() const;
 
-    std::vector<float> get_training_dataset() const;
+    const float* get_training_dataset() const;
 
    private:
     /// Collection of GPU resources that we use
     GpuResources* resources_;
 
+    /// Training dataset
+    const float* storage_;
+    int n_;
+
     /// Expected dimensionality of the vectors
     const int dim_;
 
+    /// Controls the underlying RAFT index if it should store the dataset in
+    /// device memory
+    bool store_dataset_;
+
     /// Metric type of the index
     faiss::MetricType metric_;
 
@@ -122,6 +132,7 @@ class RaftCagra {
     /// Parameters to build CAGRA graph using IVF PQ
     std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_params_;
     std::optional<raft::neighbors::ivf_pq::search_params> ivf_pq_search_params_;
+    std::optional<float> refine_rate_;
 
     /// Instance of trained RAFT CAGRA index
     std::optional<raft::neighbors::cagra::index<float, uint32_t>>
diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu
index 8d330a81cb..3d9e14ae34 100644
--- a/faiss/gpu/test/TestGpuIndexCagra.cu
+++ b/faiss/gpu/test/TestGpuIndexCagra.cu
@@ -38,7 +38,7 @@
 
 struct Options {
     Options() {
-        numTrain = 2 * faiss::gpu::randVal(2000, 5000);
+        numTrain = 2 * faiss::gpu::randVal(4000, 10000);
         dim = faiss::gpu::randVal(4, 10);
         numAdd = faiss::gpu::randVal(1000, 3000);
 
@@ -47,8 +47,9 @@ struct Options {
         buildAlgo = faiss::gpu::randSelect(
                 {faiss::gpu::graph_build_algo::IVF_PQ,
                  faiss::gpu::graph_build_algo::NN_DESCENT});
+        storeDataset = faiss::gpu::randSelect({true, false});
 
-        numQuery = faiss::gpu::randVal(32, 100);
+        numQuery = faiss::gpu::randVal(300, 600);
         k = faiss::gpu::randVal(10, 30);
 
         device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
@@ -71,6 +72,7 @@ struct Options {
     size_t graphDegree;
     size_t intermediateGraphDegree;
     faiss::gpu::graph_build_algo buildAlgo;
+    bool storeDataset;
     int numQuery;
     int k;
     int device;
@@ -224,6 +226,7 @@ void copyToTest(
         config.graph_degree = opt.graphDegree;
         config.intermediate_graph_degree = opt.intermediateGraphDegree;
         config.build_algo = opt.buildAlgo;
+        config.store_dataset = opt.storeDataset;
 
         faiss::gpu::GpuIndexCagra gpuIndex(&res, opt.dim, metric, config);
         gpuIndex.train(opt.numTrain, trainVecs.data());
@@ -339,7 +342,7 @@ TEST(TestGpuIndexCagra, Float32_CopyTo_L2) {
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyTo_L2_BaseLevelOnly) {
-    copyToTest(faiss::METRIC_L2, 0.98, true);
+    copyToTest(faiss::METRIC_L2, 0.95, true);
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyTo_IP) {
@@ -347,7 +350,7 @@ TEST(TestGpuIndexCagra, Float32_CopyTo_IP) {
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyTo_IP_BaseLevelOnly) {
-    copyToTest(faiss::METRIC_INNER_PRODUCT, 0.98, true);
+    copyToTest(faiss::METRIC_INNER_PRODUCT, 0.95, true);
 }
 
 void copyFromTest(faiss::MetricType metric, double expected_recall) {
@@ -457,11 +460,11 @@ void copyFromTest(faiss::MetricType metric, double expected_recall) {
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyFrom_L2) {
-    copyFromTest(faiss::METRIC_L2, 0.98);
+    copyFromTest(faiss::METRIC_L2, 0.95);
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyFrom_IP) {
-    copyFromTest(faiss::METRIC_INNER_PRODUCT, 0.98);
+    copyFromTest(faiss::METRIC_INNER_PRODUCT, 0.95);
 }
 
 int main(int argc, char** argv) {

From 826ed98cb1834f04a80f413243d46c6a2319d4c6 Mon Sep 17 00:00:00 2001
From: Kumar Saurabh Arora <kuarora@meta.com>
Date: Thu, 30 May 2024 09:27:55 -0700
Subject: [PATCH 047/148] sys.big_endian to sys.byteorder (#3422)

Summary:
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3422

Found vecs_io failing when running some benchmarks.
The sys module has no attribute named big_endian, so this reverts to the original sys.byteorder check.

Reviewed By: algoriddle

Differential Revision: D56718607

fbshipit-source-id: 553f1d2d6bc967581142a92282e534f3f164e8f9
---
 contrib/vecs_io.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/contrib/vecs_io.py b/contrib/vecs_io.py
index 5d18c0b162..9ef9e0ab64 100644
--- a/contrib/vecs_io.py
+++ b/contrib/vecs_io.py
@@ -14,7 +14,7 @@
 
 def ivecs_read(fname):
     a = np.fromfile(fname, dtype='int32')
-    if sys.big_endian:
+    if sys.byteorder == 'big':
         a.byteswap(inplace=True)
     d = a[0]
     return a.reshape(-1, d + 1)[:, 1:].copy()
@@ -25,7 +25,7 @@ def fvecs_read(fname):
 
 
 def ivecs_mmap(fname):
-    assert not sys.big_endian
+    assert sys.byteorder != 'big'
     a = np.memmap(fname, dtype='int32', mode='r')
     d = a[0]
     return a.reshape(-1, d + 1)[:, 1:]
@@ -37,7 +37,7 @@ def fvecs_mmap(fname):
 
 def bvecs_mmap(fname):
     x = np.memmap(fname, dtype='uint8', mode='r')
-    if sys.big_endian:
+    if sys.byteorder == 'big':
         da = x[:4][::-1].copy()
         d = da.view('int32')[0]
     else:
@@ -50,7 +50,7 @@ def ivecs_write(fname, m):
     m1 = np.empty((n, d + 1), dtype='int32')
     m1[:, 0] = d
     m1[:, 1:] = m
-    if sys.big_endian:
+    if sys.byteorder == 'big':
         m1.byteswap(inplace=True)
     m1.tofile(fname)
 

From 4dcdff5dafc6d865ba271e86f743207f7a41d0a4 Mon Sep 17 00:00:00 2001
From: Kumar Saurabh Arora <kuarora@meta.com>
Date: Fri, 31 May 2024 14:30:39 -0700
Subject: [PATCH 048/148] Adding buck target for experiment bench_fw_ivf
 (#3423)

Summary:
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3423

Adding small fixes to run experiments from fbcode.
1. Added buck target
2. Full import path of faiss bench_fw modules
3. New dataset path to run tests locally, as we can't use the existing ./data directory in fbcode.

Reviewed By: algoriddle, junjieqi

Differential Revision: D57235092

fbshipit-source-id: f78a23199e619b640a19ca37f8b52ff0abdd8298
---
 benchs/bench_fw_ivf.py | 13 +++++++++----
 contrib/datasets.py    |  6 +++++-
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/benchs/bench_fw_ivf.py b/benchs/bench_fw_ivf.py
index 8c84743e27..e9e144c569 100644
--- a/benchs/bench_fw_ivf.py
+++ b/benchs/bench_fw_ivf.py
@@ -3,16 +3,20 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-import logging
 import argparse
+import logging
 import os
 
-from bench_fw.benchmark import Benchmark
-from bench_fw.benchmark_io import BenchmarkIO
-from bench_fw.descriptors import DatasetDescriptor, IndexDescriptor
+from faiss.benchs.bench_fw.benchmark import Benchmark
+from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO
+from faiss.benchs.bench_fw.descriptors import (
+    DatasetDescriptor,
+    IndexDescriptor,
+)
 
 logging.basicConfig(level=logging.INFO)
 
+
 def sift1M(bio):
     benchmark = Benchmark(
         num_threads=32,
@@ -37,6 +41,7 @@ def sift1M(bio):
     benchmark.set_io(bio)
     benchmark.benchmark(result_file="result.json", local=False, train=True, reconstruct=False, knn=True, range=False)
 
+
 def bigann(bio):
     for scale in [1, 2, 5, 10, 20, 50]:
         benchmark = Benchmark(
diff --git a/contrib/datasets.py b/contrib/datasets.py
index f37a2fb6e4..281f16e2fa 100644
--- a/contrib/datasets.py
+++ b/contrib/datasets.py
@@ -6,6 +6,8 @@
 import os
 import numpy as np
 import faiss
+import getpass
+
 
 from .vecs_io import fvecs_read, ivecs_read, bvecs_mmap, fvecs_mmap
 from .exhaustive_search import knn
@@ -115,10 +117,12 @@ def get_groundtruth(self, k=100):
 # that directory is
 ############################################################################
 
+username = getpass.getuser()
 
 for dataset_basedir in (
         '/datasets01/simsearch/041218/',
-        '/mnt/vol/gfsai-flash3-east/ai-group/datasets/simsearch/'):
+        '/mnt/vol/gfsai-flash3-east/ai-group/datasets/simsearch/',
+        f'/home/{username}/simsearch/data/'):
     if os.path.exists(dataset_basedir):
         break
 else:

From 756ad46f3b4a915779f68c6e39078ea22daa4dd3 Mon Sep 17 00:00:00 2001
From: Matthijs Douze <matthijs@meta.com>
Date: Fri, 31 May 2024 14:48:13 -0700
Subject: [PATCH 049/148] add skip_storage flag to HNSW (#3487)

Summary:
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3487

Sometimes it is not useful to serialize the storage index along with a HNSW index. This diff adds a flag that supports skipping the storage of the index.

Searching and adding to the index are not possible until a storage index is added back in.

Reviewed By: junjieqi

Differential Revision: D57911060

fbshipit-source-id: 5a4ceee4a8f53f6f746df59af3942b813a99c14f
---
 faiss/IndexHNSW.cpp        |  5 ++---
 faiss/impl/index_read.cpp  | 10 ++++++----
 faiss/impl/index_write.cpp | 25 ++++++++++++++++---------
 faiss/index_io.h           | 11 ++++++-----
 faiss/python/__init__.py   |  4 ++--
 tests/test_graph_based.py  | 36 ++++++++++++++++++++++++++++++++++++
 6 files changed, 68 insertions(+), 23 deletions(-)

diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp
index efd8972477..c04642d218 100644
--- a/faiss/IndexHNSW.cpp
+++ b/faiss/IndexHNSW.cpp
@@ -5,8 +5,6 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// -*- c++ -*-
-
 #include <faiss/IndexHNSW.h>
 
 #include <omp.h>
@@ -267,7 +265,8 @@ void hnsw_search(
         const SearchParameters* params_in) {
     FAISS_THROW_IF_NOT_MSG(
             index->storage,
-            "Please use IndexHNSWFlat (or variants) instead of IndexHNSW directly");
+            "No storage index, please use IndexHNSWFlat (or variants) "
+            "instead of IndexHNSW directly");
     const SearchParametersHNSW* params = nullptr;
     const HNSW& hnsw = index->hnsw;
 
diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp
index 1085d3a0d1..aa041c0fac 100644
--- a/faiss/impl/index_read.cpp
+++ b/faiss/impl/index_read.cpp
@@ -5,8 +5,6 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// -*- c++ -*-
-
 #include <faiss/index_io.h>
 
 #include <faiss/impl/io_macros.h>
@@ -531,7 +529,11 @@ Index* read_index(IOReader* f, int io_flags) {
     Index* idx = nullptr;
     uint32_t h;
     READ1(h);
-    if (h == fourcc("IxFI") || h == fourcc("IxF2") || h == fourcc("IxFl")) {
+    if (h == fourcc("null")) {
+        // denotes a missing index, useful for some cases
+        return nullptr;
+    } else if (
+            h == fourcc("IxFI") || h == fourcc("IxF2") || h == fourcc("IxFl")) {
         IndexFlat* idxf;
         if (h == fourcc("IxFI")) {
             idxf = new IndexFlatIP();
@@ -969,7 +971,7 @@ Index* read_index(IOReader* f, int io_flags) {
         }
         read_HNSW(&idxhnsw->hnsw, f);
         idxhnsw->storage = read_index(f, io_flags);
-        idxhnsw->own_fields = true;
+        idxhnsw->own_fields = idxhnsw->storage != nullptr;
         if (h == fourcc("IHNp") && !(io_flags & IO_FLAG_PQ_SKIP_SDC_TABLE)) {
             dynamic_cast<IndexPQ*>(idxhnsw->storage)->pq.compute_sdc_table();
         }
diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp
index 24303ac376..0a924d0225 100644
--- a/faiss/impl/index_write.cpp
+++ b/faiss/impl/index_write.cpp
@@ -5,8 +5,6 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// -*- c++ -*-
-
 #include <faiss/index_io.h>
 
 #include <faiss/impl/io.h>
@@ -390,8 +388,12 @@ static void write_ivf_header(const IndexIVF* ivf, IOWriter* f) {
     write_direct_map(&ivf->direct_map, f);
 }
 
-void write_index(const Index* idx, IOWriter* f) {
-    if (const IndexFlat* idxf = dynamic_cast<const IndexFlat*>(idx)) {
+void write_index(const Index* idx, IOWriter* f, int io_flags) {
+    if (idx == nullptr) {
+        // eg. for a storage component of HNSW that is set to nullptr
+        uint32_t h = fourcc("null");
+        WRITE1(h);
+    } else if (const IndexFlat* idxf = dynamic_cast<const IndexFlat*>(idx)) {
         uint32_t h =
                 fourcc(idxf->metric_type == METRIC_INNER_PRODUCT ? "IxFI"
                                : idxf->metric_type == METRIC_L2  ? "IxF2"
@@ -772,7 +774,12 @@ void write_index(const Index* idx, IOWriter* f) {
             WRITE1(idx_hnsw_cagra->num_base_level_search_entrypoints);
         }
         write_HNSW(&idxhnsw->hnsw, f);
-        write_index(idxhnsw->storage, f);
+        if (io_flags & IO_FLAG_SKIP_STORAGE) {
+            uint32_t n4 = fourcc("null");
+            WRITE1(n4);
+        } else {
+            write_index(idxhnsw->storage, f);
+        }
     } else if (const IndexNSG* idxnsg = dynamic_cast<const IndexNSG*>(idx)) {
         uint32_t h = dynamic_cast<const IndexNSGFlat*>(idx) ? fourcc("INSf")
                 : dynamic_cast<const IndexNSGPQ*>(idx)      ? fourcc("INSp")
@@ -848,14 +855,14 @@ void write_index(const Index* idx, IOWriter* f) {
     }
 }
 
-void write_index(const Index* idx, FILE* f) {
+void write_index(const Index* idx, FILE* f, int io_flags) {
     FileIOWriter writer(f);
-    write_index(idx, &writer);
+    write_index(idx, &writer, io_flags);
 }
 
-void write_index(const Index* idx, const char* fname) {
+void write_index(const Index* idx, const char* fname, int io_flags) {
     FileIOWriter writer(fname);
-    write_index(idx, &writer);
+    write_index(idx, &writer, io_flags);
 }
 
 void write_VectorTransform(const VectorTransform* vt, const char* fname) {
diff --git a/faiss/index_io.h b/faiss/index_io.h
index f73cd073b7..3e77d0227c 100644
--- a/faiss/index_io.h
+++ b/faiss/index_io.h
@@ -5,8 +5,6 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// -*- c++ -*-
-
 // I/O code for indexes
 
 #ifndef FAISS_INDEX_IO_H
@@ -35,9 +33,12 @@ struct IOReader;
 struct IOWriter;
 struct InvertedLists;
 
-void write_index(const Index* idx, const char* fname);
-void write_index(const Index* idx, FILE* f);
-void write_index(const Index* idx, IOWriter* writer);
+/// skip the storage for graph-based indexes
+const int IO_FLAG_SKIP_STORAGE = 1;
+
+void write_index(const Index* idx, const char* fname, int io_flags = 0);
+void write_index(const Index* idx, FILE* f, int io_flags = 0);
+void write_index(const Index* idx, IOWriter* writer, int io_flags = 0);
 
 void write_index_binary(const IndexBinary* idx, const char* fname);
 void write_index_binary(const IndexBinary* idx, FILE* f);
diff --git a/faiss/python/__init__.py b/faiss/python/__init__.py
index 0562d1dd89..ce4b42c618 100644
--- a/faiss/python/__init__.py
+++ b/faiss/python/__init__.py
@@ -292,10 +292,10 @@ def range_search_with_parameters(index, x, radius, params=None, output_stats=Fal
 ###########################################
 
 
-def serialize_index(index):
+def serialize_index(index, io_flags=0):
     """ convert an index to a numpy uint8 array  """
     writer = VectorIOWriter()
-    write_index(index, writer)
+    write_index(index, writer, io_flags)
     return vector_to_array(writer.data)
 
 
diff --git a/tests/test_graph_based.py b/tests/test_graph_based.py
index d5ddbeec37..95925d7ae9 100644
--- a/tests/test_graph_based.py
+++ b/tests/test_graph_based.py
@@ -133,6 +133,42 @@ def test_ndis_stats(self):
         Dhnsw, Ihnsw = index.search(self.xq, 1)
         self.assertGreater(stats.ndis, len(self.xq) * index.hnsw.efSearch)
 
+    def test_io_no_storage(self):
+        d = self.xq.shape[1]
+        index = faiss.IndexHNSWFlat(d, 16)
+        index.add(self.xb)
+
+        Dref, Iref = index.search(self.xq, 5)
+
+        # test writing without storage
+        index2 = faiss.deserialize_index(
+            faiss.serialize_index(index, faiss.IO_FLAG_SKIP_STORAGE)
+        )
+        self.assertEquals(index2.storage, None)
+        self.assertRaises(
+            RuntimeError,
+            index2.search, self.xb, 1)
+
+        # make sure we can store an index with empty storage
+        index4 = faiss.deserialize_index(
+            faiss.serialize_index(index2))
+
+        # add storage afterwards
+        index.storage = faiss.clone_index(index.storage)
+        index.own_fields = True
+
+        Dnew, Inew = index.search(self.xq, 5)
+        np.testing.assert_array_equal(Dnew, Dref)
+        np.testing.assert_array_equal(Inew, Iref)
+
+        if False:
+            # test reading without storage
+            # not implemented because it is hard to skip over an index
+            index3 = faiss.deserialize_index(
+                faiss.serialize_index(index), faiss.IO_FLAG_SKIP_STORAGE
+            )
+            self.assertEquals(index3.storage, None)
+
 
 class TestNSG(unittest.TestCase):
 

From c48fba3a87f297df9a45c37429afa797d81d3d52 Mon Sep 17 00:00:00 2001
From: Xiao Fu <xiaofu@meta.com>
Date: Tue, 4 Jun 2024 19:44:27 -0700
Subject: [PATCH 050/148] Add cpp tutorial for index factory refine index
 construction (#3494)

Summary:
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3494

This task adds a C++ tutorial on constructing refine indexes with the index factory, comparing index refinement with fp16 and SQ8 quantization. The Python version was added a while ago.

Reviewed By: junjieqi

Differential Revision: D58161983

fbshipit-source-id: 1c598fe612b5dee3952c5f7398e6802e117f141d
---
 tutorial/cpp/9-RefineComparison.cpp | 104 ++++++++++++++++++++++++++++
 tutorial/cpp/CMakeLists.txt         |   3 +
 2 files changed, 107 insertions(+)
 create mode 100644 tutorial/cpp/9-RefineComparison.cpp

diff --git a/tutorial/cpp/9-RefineComparison.cpp b/tutorial/cpp/9-RefineComparison.cpp
new file mode 100644
index 0000000000..d7fbc90aec
--- /dev/null
+++ b/tutorial/cpp/9-RefineComparison.cpp
@@ -0,0 +1,104 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+
+#include <faiss/IndexPQFastScan.h>
+#include <faiss/IndexRefine.h>
+#include <faiss/index_factory.h>
+using idx_t = faiss::idx_t;
+
+int main() {
+    int d = 64;      // dimension
+    int nb = 100000; // database size
+    int nq = 10000;  // nb of queries
+
+    std::mt19937 rng;
+    std::uniform_real_distribution<> distrib;
+
+    float* xb = new float[(int)(d * nb)];
+    float* xq = new float[(int)(d * nq)];
+
+    for (int i = 0; i < nb; i++) {
+        for (int j = 0; j < d; j++) {
+            xb[d * i + j] = distrib(rng);
+        }
+        xb[d * i] += i / 1000.;
+    }
+
+    for (int i = 0; i < nq; i++) {
+        for (int j = 0; j < d; j++) {
+            xq[d * i + j] = distrib(rng);
+        }
+        xq[d * i] += i / 1000.;
+    }
+
+    // Build a PQ fast-scan index refined with SQfp16 via the index factory
+    faiss::Index* index_fp16;
+    index_fp16 = faiss::index_factory(
+            d, "PQ32x4fs,Refine(SQfp16)", faiss::METRIC_L2);
+    index_fp16->train(nb, xb);
+    index_fp16->add(nb, xb);
+
+    // Build the same PQ fast-scan index refined with SQ8 instead
+    faiss::Index* index_sq8;
+    index_sq8 =
+            faiss::index_factory(d, "PQ32x4fs,Refine(SQ8)", faiss::METRIC_L2);
+    index_sq8->train(nb, xb);
+    index_sq8->add(nb, xb);
+
+    int k = 10;
+    { // search xq
+        idx_t* I_fp16 = new idx_t[(int)(k * nq)];
+        float* D_fp16 = new float[(int)(k * nq)];
+        idx_t* I_sq8 = new idx_t[(int)(k * nq)];
+        float* D_sq8 = new float[(int)(k * nq)];
+
+        // Set the k_factor search parameter used by the refine indexes
+        float k_factor = 3;
+        faiss::IndexRefineSearchParameters* params =
+                new faiss::IndexRefineSearchParameters();
+        params->k_factor = k_factor;
+
+        // Run the same queries against both refinement variants
+        index_fp16->search(nq, xq, k, D_fp16, I_fp16, params);
+        index_sq8->search(nq, xq, k, D_sq8, I_sq8, params);
+
+        printf("I_fp16=\n");
+        for (int i = nq - 5; i < nq; i++) {
+            for (int j = 0; j < k; j++) {
+                printf("%5lld ", (long long)I_fp16[i * k + j]);
+            }
+            printf("\n");
+        }
+
+        printf("I_sq8=\n");
+        for (int i = nq - 5; i < nq; i++) {
+            for (int j = 0; j < k; j++) {
+                printf("%5lld ", (long long)I_sq8[i * k + j]);
+            }
+            printf("\n");
+        }
+
+        delete[] I_fp16;
+        delete[] D_fp16;
+        delete[] I_sq8;
+        delete[] D_sq8;
+        delete params;
+
+        delete index_fp16;
+        delete index_sq8;
+    }
+
+    delete[] xb;
+    delete[] xq;
+
+    return 0;
+}
diff --git a/tutorial/cpp/CMakeLists.txt b/tutorial/cpp/CMakeLists.txt
index ad152c499d..f964b3dda9 100644
--- a/tutorial/cpp/CMakeLists.txt
+++ b/tutorial/cpp/CMakeLists.txt
@@ -27,3 +27,6 @@ target_link_libraries(7-PQFastScan PRIVATE faiss)
 
 add_executable(8-PQFastScanRefine EXCLUDE_FROM_ALL 8-PQFastScanRefine.cpp)
 target_link_libraries(8-PQFastScanRefine PRIVATE faiss)
+
+add_executable(9-RefineComparison EXCLUDE_FROM_ALL 9-RefineComparison.cpp)
+target_link_libraries(9-RefineComparison PRIVATE faiss)

From 66fd9acaf85de0d0623a9f0c1dfb786ad5d6897c Mon Sep 17 00:00:00 2001
From: Abhiram Vadlapatla <v.abhiram97@gmail.com>
Date: Tue, 4 Jun 2024 21:59:57 -0700
Subject: [PATCH 051/148] Update .gitignore (#3492)

Summary:
Adding the build folder to .gitignore so that its contents don't show up in the commit tree while building from source

Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3492

Reviewed By: junjieqi

Differential Revision: D58171359

Pulled By: asadoughi

fbshipit-source-id: b0efed348769328a3bdbcc13098dcb84cadb6c4f
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index caab1304c8..d6df432fa5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@
 *.dylib
 *.pyc
 *~
+/build/
 /config.*
 /aclocal.m4
 /autom4te.cache/

From 4615b007a01c190f9d06e13876c9b571cd9bef24 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 25 Jun 2024 10:57:18 -0700
Subject: [PATCH 052/148] rebase on latest changes

---
 faiss/gpu/GpuIndexCagra.cu          |  26 +----
 faiss/gpu/GpuIndexCagra.h           |   2 -
 faiss/gpu/impl/RaftCagra.cu         | 157 +++++++++++++++++-----------
 faiss/gpu/impl/RaftCagra.cuh        |  15 +--
 faiss/gpu/test/TestGpuIndexCagra.cu |  15 ++-
 5 files changed, 107 insertions(+), 108 deletions(-)

diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index b183e74568..4ae56df10d 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -86,13 +86,11 @@ void GpuIndexCagra::train(idx_t n, const float* x) {
             cagraConfig_.graph_degree,
             static_cast<faiss::cagra_build_algo>(cagraConfig_.build_algo),
             cagraConfig_.nn_descent_niter,
-            cagraConfig_.store_dataset,
             this->metric_type,
             this->metric_arg,
             INDICES_64_BIT,
             ivf_pq_params,
-            ivf_pq_search_params,
-            cagraConfig_.refine_rate);
+            ivf_pq_search_params);
 
     index_->train(n, x);
 
@@ -227,33 +225,17 @@ void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const {
     index->hnsw.set_default_probas(M, 1.0 / log(M));
 
     auto n_train = this->ntotal;
-    float* train_dataset;
-    auto dataset = index_->get_training_dataset();
-    bool allocation = false;
-    if (getDeviceForAddress(dataset) >= 0) {
-        train_dataset = new float[n_train * index->d];
-        allocation = true;
-        raft::copy(
-                train_dataset,
-                dataset,
-                n_train * index->d,
-                this->resources_->getRaftHandleCurrentDevice().get_stream());
-    } else {
-        train_dataset = const_cast<float*>(dataset);
-    }
+    auto train_dataset = index_->get_training_dataset();
 
     // turn off as level 0 is copied from CAGRA graph
     index->init_level0 = false;
     if (!index->base_level_only) {
-        index->add(n_train, train_dataset);
+        index->add(n_train, train_dataset.data());
     } else {
         index->hnsw.prepare_level_tab(n_train, false);
-        index->storage->add(n_train, train_dataset);
+        index->storage->add(n_train, train_dataset.data());
         index->ntotal = n_train;
     }
-    if (allocation) {
-        delete[] train_dataset;
-    }
 
     auto graph = get_knngraph();
 
diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
index 62042d531f..6ecee3ae03 100644
--- a/faiss/gpu/GpuIndexCagra.h
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -174,8 +174,6 @@ struct GpuIndexCagraConfig : public GpuIndexConfig {
 
     IVFPQBuildCagraConfig* ivf_pq_params = nullptr;
     IVFPQSearchCagraConfig* ivf_pq_search_params = nullptr;
-    float refine_rate = 2.0f;
-    bool store_dataset = true;
 };
 
 enum class search_algo {
diff --git a/faiss/gpu/impl/RaftCagra.cu b/faiss/gpu/impl/RaftCagra.cu
index 50903220df..292079321d 100644
--- a/faiss/gpu/impl/RaftCagra.cu
+++ b/faiss/gpu/impl/RaftCagra.cu
@@ -42,23 +42,19 @@ RaftCagra::RaftCagra(
         idx_t graph_degree,
         faiss::cagra_build_algo graph_build_algo,
         size_t nn_descent_niter,
-        bool store_dataset,
         faiss::MetricType metric,
         float metricArg,
         IndicesOptions indicesOptions,
         std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_params,
         std::optional<raft::neighbors::ivf_pq::search_params>
-                ivf_pq_search_params,
-        float refine_rate)
+                ivf_pq_search_params)
         : resources_(resources),
           dim_(dim),
-          store_dataset_(store_dataset),
           metric_(metric),
           metricArg_(metricArg),
           index_params_(),
           ivf_pq_params_(ivf_pq_params),
-          ivf_pq_search_params_(ivf_pq_search_params),
-          refine_rate_(refine_rate) {
+          ivf_pq_search_params_(ivf_pq_search_params) {
     FAISS_THROW_IF_NOT_MSG(
             metric == faiss::METRIC_L2 || metric == faiss::METRIC_INNER_PRODUCT,
             "CAGRA currently only supports L2 or Inner Product metric.");
@@ -117,9 +113,6 @@ RaftCagra::RaftCagra(
 
     FAISS_ASSERT(distances_on_gpu == knn_graph_on_gpu);
 
-    storage_ = distances;
-    n_ = n;
-
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
 
@@ -171,50 +164,81 @@ RaftCagra::RaftCagra(
 }
 
 void RaftCagra::train(idx_t n, const float* x) {
-    storage_ = x;
-    n_ = n;
-
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
-
-    auto nn_descent_params = std::make_optional<
-            raft::neighbors::experimental::nn_descent::index_params>();
-    nn_descent_params->graph_degree = index_params_.intermediate_graph_degree;
-    nn_descent_params->intermediate_graph_degree =
-            1.5 * index_params_.intermediate_graph_degree;
-    nn_descent_params->max_iterations = index_params_.nn_descent_niter;
-
     if (index_params_.build_algo ==
-                raft::neighbors::cagra::graph_build_algo::IVF_PQ &&
-        index_params_.graph_degree == index_params_.intermediate_graph_degree) {
-        index_params_.intermediate_graph_degree =
-                1.5 * index_params_.graph_degree;
-    }
+        raft::neighbors::cagra::graph_build_algo::IVF_PQ) {
+        std::optional<raft::host_matrix<uint32_t, int64_t>> knn_graph(
+                raft::make_host_matrix<uint32_t, int64_t>(
+                        n, index_params_.intermediate_graph_degree));
+        if (getDeviceForAddress(x) >= 0) {
+            auto dataset_d =
+                    raft::make_device_matrix_view<const float, int64_t>(
+                            x, n, dim_);
+            raft::neighbors::cagra::build_knn_graph(
+                    raft_handle,
+                    dataset_d,
+                    knn_graph->view(),
+                    1.0f,
+                    ivf_pq_params_,
+                    ivf_pq_search_params_);
+        } else {
+            auto dataset_h = raft::make_host_matrix_view<const float, int64_t>(
+                    x, n, dim_);
+            raft::neighbors::cagra::build_knn_graph(
+                    raft_handle,
+                    dataset_h,
+                    knn_graph->view(),
+                    1.0f,
+                    ivf_pq_params_,
+                    ivf_pq_search_params_);
+        }
+        auto cagra_graph = raft::make_host_matrix<uint32_t, int64_t>(
+                n, index_params_.graph_degree);
+
+        raft::neighbors::cagra::optimize<uint32_t>(
+                raft_handle, knn_graph->view(), cagra_graph.view());
+
+        // free intermediate graph before trying to create the index
+        knn_graph.reset();
+
+        if (getDeviceForAddress(x) >= 0) {
+            auto dataset_d =
+                    raft::make_device_matrix_view<const float, int64_t>(
+                            x, n, dim_);
+            raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
+                    raft_handle,
+                    metric_ == faiss::METRIC_L2
+                            ? raft::distance::DistanceType::L2Expanded
+                            : raft::distance::DistanceType::InnerProduct,
+                    dataset_d,
+                    raft::make_const_mdspan(cagra_graph.view()));
+        } else {
+            auto dataset_h = raft::make_host_matrix_view<const float, int64_t>(
+                    x, n, dim_);
+            raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
+                    raft_handle,
+                    metric_ == faiss::METRIC_L2
+                            ? raft::distance::DistanceType::L2Expanded
+                            : raft::distance::DistanceType::InnerProduct,
+                    dataset_h,
+                    raft::make_const_mdspan(cagra_graph.view()));
+        }
 
-    if (getDeviceForAddress(x) >= 0) {
-        auto dataset =
-                raft::make_device_matrix_view<const float, int64_t>(x, n, dim_);
-        raft_knn_index = raft::neighbors::cagra::detail::build<float, uint32_t>(
-                raft_handle,
-                index_params_,
-                dataset,
-                nn_descent_params,
-                refine_rate_,
-                ivf_pq_params_,
-                ivf_pq_search_params_,
-                store_dataset_);
     } else {
-        auto dataset =
-                raft::make_host_matrix_view<const float, int64_t>(x, n, dim_);
-        raft_knn_index = raft::neighbors::cagra::detail::build<float, uint32_t>(
-                raft_handle,
-                index_params_,
-                dataset,
-                nn_descent_params,
-                refine_rate_,
-                ivf_pq_params_,
-                ivf_pq_search_params_,
-                store_dataset_);
+        if (getDeviceForAddress(x) >= 0) {
+            raft_knn_index = raft::runtime::neighbors::cagra::build(
+                    raft_handle,
+                    index_params_,
+                    raft::make_device_matrix_view<const float, int64_t>(
+                            x, n, dim_));
+        } else {
+            raft_knn_index = raft::runtime::neighbors::cagra::build(
+                    raft_handle,
+                    index_params_,
+                    raft::make_host_matrix_view<const float, int64_t>(
+                            x, n, dim_));
+        }
     }
 }
 
@@ -246,18 +270,6 @@ void RaftCagra::search(
     FAISS_ASSERT(numQueries > 0);
     FAISS_ASSERT(cols == dim_);
 
-    if (!store_dataset_) {
-        if (getDeviceForAddress(storage_) >= 0) {
-            auto dataset = raft::make_device_matrix_view<const float, int64_t>(
-                    storage_, n_, dim_);
-            raft_knn_index.value().update_dataset(raft_handle, dataset);
-        } else {
-            auto dataset = raft::make_host_matrix_view<const float, int64_t>(
-                    storage_, n_, dim_);
-            raft_knn_index.value().update_dataset(raft_handle, dataset);
-        }
-    }
-
     auto queries_view = raft::make_device_matrix_view<const float, int64_t>(
             queries.data(), numQueries, cols);
     auto distances_view = raft::make_device_matrix_view<float, int64_t>(
@@ -330,8 +342,29 @@ std::vector<idx_t> RaftCagra::get_knngraph() const {
     return host_graph;
 }
 
-const float* RaftCagra::get_training_dataset() const {
-    return storage_;
+std::vector<float> RaftCagra::get_training_dataset() const {
+    FAISS_ASSERT(raft_knn_index.has_value());
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+    auto stream = raft_handle.get_stream();
+
+    auto device_dataset = raft_knn_index.value().dataset();
+
+    std::vector<float> host_dataset(
+            device_dataset.extent(0) * device_dataset.extent(1));
+
+    RAFT_CUDA_TRY(cudaMemcpy2DAsync(
+            host_dataset.data(),
+            sizeof(float) * dim_,
+            device_dataset.data_handle(),
+            sizeof(float) * device_dataset.stride(0),
+            sizeof(float) * dim_,
+            device_dataset.extent(0),
+            cudaMemcpyDefault,
+            raft_handle.get_stream()));
+    raft_handle.sync_stream();
+
+    return host_dataset;
 }
 
 } // namespace gpu
diff --git a/faiss/gpu/impl/RaftCagra.cuh b/faiss/gpu/impl/RaftCagra.cuh
index 0913ba5947..95f6c03fca 100644
--- a/faiss/gpu/impl/RaftCagra.cuh
+++ b/faiss/gpu/impl/RaftCagra.cuh
@@ -53,15 +53,13 @@ class RaftCagra {
             idx_t graph_degree,
             faiss::cagra_build_algo graph_build_algo,
             size_t nn_descent_niter,
-            bool store_dataset,
             faiss::MetricType metric,
             float metricArg,
             IndicesOptions indicesOptions,
             std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_params =
                     std::nullopt,
             std::optional<raft::neighbors::ivf_pq::search_params>
-                    ivf_pq_search_params = std::nullopt,
-            float refine_rate = 2.0f);
+                    ivf_pq_search_params = std::nullopt);
 
     RaftCagra(
             GpuResources* resources,
@@ -103,23 +101,15 @@ class RaftCagra {
 
     std::vector<idx_t> get_knngraph() const;
 
-    const float* get_training_dataset() const;
+    std::vector<float> get_training_dataset() const;
 
    private:
     /// Collection of GPU resources that we use
     GpuResources* resources_;
 
-    /// Training dataset
-    const float* storage_;
-    int n_;
-
     /// Expected dimensionality of the vectors
     const int dim_;
 
-    /// Controls the underlying RAFT index if it should store the dataset in
-    /// device memory
-    bool store_dataset_;
-
     /// Metric type of the index
     faiss::MetricType metric_;
 
@@ -132,7 +122,6 @@ class RaftCagra {
     /// Parameters to build CAGRA graph using IVF PQ
     std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_params_;
     std::optional<raft::neighbors::ivf_pq::search_params> ivf_pq_search_params_;
-    std::optional<float> refine_rate_;
 
     /// Instance of trained RAFT CAGRA index
     std::optional<raft::neighbors::cagra::index<float, uint32_t>>
diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu
index 3d9e14ae34..8d330a81cb 100644
--- a/faiss/gpu/test/TestGpuIndexCagra.cu
+++ b/faiss/gpu/test/TestGpuIndexCagra.cu
@@ -38,7 +38,7 @@
 
 struct Options {
     Options() {
-        numTrain = 2 * faiss::gpu::randVal(4000, 10000);
+        numTrain = 2 * faiss::gpu::randVal(2000, 5000);
         dim = faiss::gpu::randVal(4, 10);
         numAdd = faiss::gpu::randVal(1000, 3000);
 
@@ -47,9 +47,8 @@ struct Options {
         buildAlgo = faiss::gpu::randSelect(
                 {faiss::gpu::graph_build_algo::IVF_PQ,
                  faiss::gpu::graph_build_algo::NN_DESCENT});
-        storeDataset = faiss::gpu::randSelect({true, false});
 
-        numQuery = faiss::gpu::randVal(300, 600);
+        numQuery = faiss::gpu::randVal(32, 100);
         k = faiss::gpu::randVal(10, 30);
 
         device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
@@ -72,7 +71,6 @@ struct Options {
     size_t graphDegree;
     size_t intermediateGraphDegree;
     faiss::gpu::graph_build_algo buildAlgo;
-    bool storeDataset;
     int numQuery;
     int k;
     int device;
@@ -226,7 +224,6 @@ void copyToTest(
         config.graph_degree = opt.graphDegree;
         config.intermediate_graph_degree = opt.intermediateGraphDegree;
         config.build_algo = opt.buildAlgo;
-        config.store_dataset = opt.storeDataset;
 
         faiss::gpu::GpuIndexCagra gpuIndex(&res, opt.dim, metric, config);
         gpuIndex.train(opt.numTrain, trainVecs.data());
@@ -342,7 +339,7 @@ TEST(TestGpuIndexCagra, Float32_CopyTo_L2) {
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyTo_L2_BaseLevelOnly) {
-    copyToTest(faiss::METRIC_L2, 0.95, true);
+    copyToTest(faiss::METRIC_L2, 0.98, true);
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyTo_IP) {
@@ -350,7 +347,7 @@ TEST(TestGpuIndexCagra, Float32_CopyTo_IP) {
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyTo_IP_BaseLevelOnly) {
-    copyToTest(faiss::METRIC_INNER_PRODUCT, 0.95, true);
+    copyToTest(faiss::METRIC_INNER_PRODUCT, 0.98, true);
 }
 
 void copyFromTest(faiss::MetricType metric, double expected_recall) {
@@ -460,11 +457,11 @@ void copyFromTest(faiss::MetricType metric, double expected_recall) {
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyFrom_L2) {
-    copyFromTest(faiss::METRIC_L2, 0.95);
+    copyFromTest(faiss::METRIC_L2, 0.98);
 }
 
 TEST(TestGpuIndexCagra, Float32_CopyFrom_IP) {
-    copyFromTest(faiss::METRIC_INNER_PRODUCT, 0.95);
+    copyFromTest(faiss::METRIC_INNER_PRODUCT, 0.98);
 }
 
 int main(int argc, char** argv) {

From 2ce4b2895206e6150c1b43f3b6eef8847453f274 Mon Sep 17 00:00:00 2001
From: Matthijs Douze <matthijs@meta.com>
Date: Tue, 11 Jun 2024 08:14:48 -0700
Subject: [PATCH 053/148] fix spurious include to land the cagra diff (#3502)

Summary:
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3502

include probably added by vscode

Reviewed By: mengdilin

Differential Revision: D58411537

fbshipit-source-id: 3035f690d26decc937fb492c54ffa2f974ee2db8
---
 CMakeLists.txt               | 2 --
 faiss/IndexHNSW.cpp          | 1 -
 faiss/gpu/test/test_cagra.py | 6 +++---
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1a468fb247..cedee9c456 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,8 +46,6 @@ project(faiss
   LANGUAGES ${FAISS_LANGUAGES})
 include(GNUInstallDirs)
 
-set(CMAKE_INSTALL_PREFIX "$ENV{CONDA_PREFIX}")
-
 set(CMAKE_CXX_STANDARD 17)
 
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp
index c04642d218..fd80b87df7 100644
--- a/faiss/IndexHNSW.cpp
+++ b/faiss/IndexHNSW.cpp
@@ -24,7 +24,6 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <cstdint>
-#include "impl/HNSW.h"
 
 #include <faiss/Index2Layer.h>
 #include <faiss/IndexFlat.h>
diff --git a/faiss/gpu/test/test_cagra.py b/faiss/gpu/test/test_cagra.py
index dd7d09f2de..4c7e532c2b 100644
--- a/faiss/gpu/test/test_cagra.py
+++ b/faiss/gpu/test/test_cagra.py
@@ -6,7 +6,6 @@
 import unittest
 
 import faiss
-import numpy as np
 
 from faiss.contrib import datasets, evaluation
 
@@ -57,8 +56,9 @@ def do_interop(self, metric):
         
         evaluation.check_ref_knn_with_draws(Dref, Iref, Dnew, Inew, k)
 
-        faiss.write_index(cpu_index, "index_hnsw_cagra.index")
-        deserialized_index = faiss.read_index("index_hnsw_cagra.index")
+        deserialized_index = faiss.deserialize_index(
+            faiss.serialize_index(cpu_index))
+
         gpu_index = faiss.index_cpu_to_gpu(res, 0, deserialized_index)
         Dnew2, Inew2 = gpu_index.search(ds.get_queries(), k)
 

From 959cd49a01367e533bfdc56bc26ce4c20f771054 Mon Sep 17 00:00:00 2001
From: Gergely Szilvasy <gsz@meta.com>
Date: Wed, 12 Jun 2024 19:19:23 -0700
Subject: [PATCH 054/148] add use_raft to knn_gpu (torch) (#3509)

Summary:
Add support for `use_raft` in the torch version of `knn_gpu`. The numpy version already has this support, see https://github.com/facebookresearch/faiss/blob/main/faiss/python/gpu_wrappers.py#L59

Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3509

Reviewed By: mlomeli1, junjieqi

Differential Revision: D58489851

Pulled By: algoriddle

fbshipit-source-id: cfad722fefd4809b135b765d0d43587cfd782d0e
---
 contrib/torch_utils.py                   |  4 +++-
 faiss/gpu/test/torch_test_contrib_gpu.py | 20 +++++++++++++-------
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/contrib/torch_utils.py b/contrib/torch_utils.py
index e371932c9f..18f136e914 100644
--- a/contrib/torch_utils.py
+++ b/contrib/torch_utils.py
@@ -492,8 +492,9 @@ def torch_replacement_sa_decode(self, codes, x=None):
         if issubclass(the_class, faiss.Index):
             handle_torch_Index(the_class)
 
+
 # allows torch tensor usage with bfKnn
-def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRIC_L2, device=-1):
+def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRIC_L2, device=-1, use_raft=False):
     if type(xb) is np.ndarray:
         # Forward to faiss __init__.py base method
         return faiss.knn_gpu_numpy(res, xq, xb, k, D, I, metric, device)
@@ -574,6 +575,7 @@ def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRI
     args.outIndices = I_ptr
     args.outIndicesType = I_type
     args.device = device
+    args.use_raft = use_raft
 
     with using_stream(res):
         faiss.bfKnn(res, args)
diff --git a/faiss/gpu/test/torch_test_contrib_gpu.py b/faiss/gpu/test/torch_test_contrib_gpu.py
index 0c949c29f2..f7444337f1 100644
--- a/faiss/gpu/test/torch_test_contrib_gpu.py
+++ b/faiss/gpu/test/torch_test_contrib_gpu.py
@@ -249,7 +249,7 @@ def test_sa_encode_decode(self):
         return
 
 class TestTorchUtilsKnnGpu(unittest.TestCase):
-    def test_knn_gpu(self):
+    def test_knn_gpu(self, use_raft=False):
         torch.manual_seed(10)
         d = 32
         nb = 1024
@@ -286,7 +286,7 @@ def test_knn_gpu(self):
                     else:
                         xb_c = xb_np
 
-                    D, I = faiss.knn_gpu(res, xq_c, xb_c, k)
+                    D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_raft=use_raft)
 
                     self.assertTrue(torch.equal(torch.from_numpy(I), gt_I))
                     self.assertLess((torch.from_numpy(D) - gt_D).abs().max(), 1e-4)
@@ -312,7 +312,7 @@ def test_knn_gpu(self):
                             xb_c = to_column_major_torch(xb)
                             assert not xb_c.is_contiguous()
 
-                        D, I = faiss.knn_gpu(res, xq_c, xb_c, k)
+                        D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_raft=use_raft)
 
                         self.assertTrue(torch.equal(I.cpu(), gt_I))
                         self.assertLess((D.cpu() - gt_D).abs().max(), 1e-4)
@@ -320,7 +320,7 @@ def test_knn_gpu(self):
                         # test on subset
                         try:
                             # This internally uses the current pytorch stream
-                            D, I = faiss.knn_gpu(res, xq_c[6:8], xb_c, k)
+                            D, I = faiss.knn_gpu(res, xq_c[6:8], xb_c, k, use_raft=use_raft)
                         except TypeError:
                             if not xq_row_major:
                                 # then it is expected
@@ -331,7 +331,13 @@ def test_knn_gpu(self):
                         self.assertTrue(torch.equal(I.cpu(), gt_I[6:8]))
                         self.assertLess((D.cpu() - gt_D[6:8]).abs().max(), 1e-4)
 
-    def test_knn_gpu_datatypes(self):
+    @unittest.skipUnless(
+        "RAFT" in faiss.get_compile_options(),
+        "only if RAFT is compiled in")
+    def test_knn_gpu_raft(self):
+        self.test_knn_gpu(use_raft=True)
+
+    def test_knn_gpu_datatypes(self, use_raft=False):
         torch.manual_seed(10)
         d = 10
         nb = 1024
@@ -354,7 +360,7 @@ def test_knn_gpu_datatypes(self):
         D = torch.zeros(nq, k, device=xb_c.device, dtype=torch.float32)
         I = torch.zeros(nq, k, device=xb_c.device, dtype=torch.int32)
 
-        faiss.knn_gpu(res, xq_c, xb_c, k, D, I)
+        faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_raft=use_raft)
 
         self.assertTrue(torch.equal(I.long().cpu(), gt_I))
         self.assertLess((D.float().cpu() - gt_D).abs().max(), 1.5e-3)
@@ -366,7 +372,7 @@ def test_knn_gpu_datatypes(self):
         xb_c = xb.half().numpy()
         xq_c = xq.half().numpy()
 
-        faiss.knn_gpu(res, xq_c, xb_c, k, D, I)
+        faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_raft=use_raft)
 
         self.assertTrue(torch.equal(torch.from_numpy(I).long(), gt_I))
         self.assertLess((torch.from_numpy(D) - gt_D).abs().max(), 1.5e-3)

From 2d1e49a4aac7a80e15fb3e749ee1572844ba8ebe Mon Sep 17 00:00:00 2001
From: Ramil Bakhshyiev <ramil@meta.com>
Date: Thu, 13 Jun 2024 08:14:38 -0700
Subject: [PATCH 055/148] Add conda bin to path early in the cmake GitHub
 action (#3512)

Summary:
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3512

Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3510

GitHub-hosted runners come with the build-essentials package pre-installed; self-hosted runners on AWS do not have this package. This made all steps other than the `all targets` one fall back to the system executables, which unintentionally worked on GitHub-hosted runners but not on the self-hosted ones. This diff fixes it by moving the line that adds the conda bin directory to the path earlier in the cmake build action.

Reviewed By: asadoughi

Differential Revision: D58513853

fbshipit-source-id: 23e95459e0031c96bd142515db07d1b700d713cf
---
 .github/actions/build_cmake/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index cd023aaca7..6e21f785ea 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -30,6 +30,7 @@ runs:
       run: |
         conda install -y -q -c conda-forge gxx_linux-64=11.2 sysroot_linux-64=2.28
         conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest
+        echo "$CONDA/bin" >> $GITHUB_PATH
     - name: Install CUDA
       if: inputs.gpu == 'ON' && inputs.raft == 'OFF'
       shell: bash
@@ -72,7 +73,6 @@ runs:
       shell: bash
       run: |
         conda install -y pytest
-        echo "$CONDA/bin" >> $GITHUB_PATH
     - name: Python tests (CPU only)
       if: inputs.gpu == 'OFF'
       shell: bash

From 11d6ce43eb306575baff948579b72a8d01367ce3 Mon Sep 17 00:00:00 2001
From: Gergely Szilvasy <gsz@meta.com>
Date: Thu, 13 Jun 2024 13:31:34 -0700
Subject: [PATCH 056/148] typo in test_io_no_storage (#3515)

Summary:
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3515

Fix typo `test_io_no_storage`

Reviewed By: kuarora, asadoughi

Differential Revision: D58540190

fbshipit-source-id: b8b9cacd7ea6005c0edb94014de74188450318c1
---
 tests/test_graph_based.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_graph_based.py b/tests/test_graph_based.py
index 95925d7ae9..c769e03ade 100644
--- a/tests/test_graph_based.py
+++ b/tests/test_graph_based.py
@@ -144,7 +144,7 @@ def test_io_no_storage(self):
         index2 = faiss.deserialize_index(
             faiss.serialize_index(index, faiss.IO_FLAG_SKIP_STORAGE)
         )
-        self.assertEquals(index2.storage, None)
+        self.assertEqual(index2.storage, None)
         self.assertRaises(
             RuntimeError,
             index2.search, self.xb, 1)

From a6515930f3e2597388082618a336990b8824c25d Mon Sep 17 00:00:00 2001
From: Ramil Bakhshyiev <ramil@meta.com>
Date: Thu, 13 Jun 2024 22:30:28 -0700
Subject: [PATCH 057/148] Consolidate build environment configuration steps in
 cmake builds (#3516)

Summary:
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3516

This diff seeks to simplify the steps that install conda packages and environment configuration into a single step at the start of the cmake build action.

Reviewed By: mnorris11

Differential Revision: D58560454

fbshipit-source-id: ee2c6b36865809f31eb335cfb3c2fffdccaa318d
---
 .github/actions/build_cmake/action.yml | 44 ++++++++++++++------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index 6e21f785ea..2bc476add5 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -20,27 +20,35 @@ runs:
       with:
         python-version: '3.11'
         miniconda-version: latest
-    - name: Initialize Conda environment
+    - name: Configure build environment
       shell: bash
       run: |
+        # initialize Conda
         conda config --set solver libmamba
         conda update -y -q conda
-    - name: Configure Conda environment
-      shell: bash
-      run: |
+        echo "$CONDA/bin" >> $GITHUB_PATH
+
+        # install base packages
         conda install -y -q -c conda-forge gxx_linux-64=11.2 sysroot_linux-64=2.28
         conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest
-        echo "$CONDA/bin" >> $GITHUB_PATH
-    - name: Install CUDA
-      if: inputs.gpu == 'ON' && inputs.raft == 'OFF'
-      shell: bash
-      run: |
-        conda install -y -q cuda-toolkit -c "nvidia/label/cuda-11.8.0"
-    - name: Install RAFT
-      if: inputs.raft == 'ON'
-      shell: bash
-      run: |
-        conda install -y -q libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge
+
+        # install CUDA packages
+        if [ "${{ inputs.gpu }}" = "ON" ] && [ "${{ inputs.raft }}" = "OFF" ]; then
+          conda install -y -q cuda-toolkit -c "nvidia/label/cuda-11.8.0"
+        fi
+
+        # install RAFT packages
+        if [ "${{ inputs.raft }}" = "ON" ]; then
+          conda install -y -q libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge
+        fi
+
+        # install test packages
+        conda install -y pytest
+        if [ "${{ inputs.gpu }}" = "ON" ]; then
+          conda install -y -q pytorch pytorch-cuda=11.8 -c pytorch -c nvidia/label/cuda-11.8.0
+        else
+          conda install -y -q pytorch -c pytorch
+        fi
     - name: Build all targets
       shell: bash
       run: |
@@ -69,22 +77,16 @@ runs:
       working-directory: build/faiss/python
       run: |
         $CONDA/bin/python setup.py install
-    - name: Install pytest
-      shell: bash
-      run: |
-        conda install -y pytest
     - name: Python tests (CPU only)
       if: inputs.gpu == 'OFF'
       shell: bash
       run: |
-        conda install -y -q pytorch -c pytorch
         pytest --junitxml=test-results/pytest/results.xml tests/test_*.py
         pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py
     - name: Python tests (CPU + GPU)
       if: inputs.gpu == 'ON'
       shell: bash
       run: |
-        conda install -y -q pytorch pytorch-cuda=11.8 -c pytorch -c nvidia/label/cuda-11.8.0
         pytest --junitxml=test-results/pytest/results.xml tests/test_*.py
         pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py
         cp tests/common_faiss_tests.py faiss/gpu/test

From 6d7317d8f07d30859a717091d54db2ea13ad2ed7 Mon Sep 17 00:00:00 2001
From: Gergely Szilvasy <gsz@meta.com>
Date: Fri, 14 Jun 2024 13:00:05 -0700
Subject: [PATCH 058/148] fix Windows build - signed int OMP for MSVC (#3517)

Summary:
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3517

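MSVC's OpenMP implementation does not support an unsigned loop index in `#pragma omp parallel for`, so the loop index in `IndexHNSW::search_level_0` is changed from `size_t` to a signed `int64_t`.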

Reviewed By: kuarora, junjieqi, ramilbakhshyiev

Differential Revision: D58591594

fbshipit-source-id: ac7d6b37a82f9543be3e0fe418f0f6b439751475
---
 faiss/IndexHNSW.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp
index fd80b87df7..8e5c654f04 100644
--- a/faiss/IndexHNSW.cpp
+++ b/faiss/IndexHNSW.cpp
@@ -468,7 +468,7 @@ void IndexHNSW::search_level_0(
     if (is_similarity_metric(this->metric_type)) {
 // we need to revert the negated distances
 #pragma omp parallel for
-        for (size_t i = 0; i < k * n; i++) {
+        for (int64_t i = 0; i < k * n; i++) {
             distances[i] = -distances[i];
         }
     }

From 1e0e35e428bb516965e36b7c6841a4a6d5cee2d8 Mon Sep 17 00:00:00 2001
From: Ramil Bakhshyiev <ramil@meta.com>
Date: Mon, 17 Jun 2024 01:40:32 -0700
Subject: [PATCH 059/148] Unbreak RAFT conda builds (#3519)

Summary:
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3519

Fixing the conda conflicts because of `_openmp_mutex` build versions. This change pins that version for RAFT conda package builds.

Reviewed By: algoriddle

Differential Revision: D58646659

fbshipit-source-id: 4c1eaa9f08bd354da016b9399a36698007a497d8
---
 conda/faiss-gpu-raft/meta.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/conda/faiss-gpu-raft/meta.yaml b/conda/faiss-gpu-raft/meta.yaml
index 23e4835032..9a5fd542f1 100644
--- a/conda/faiss-gpu-raft/meta.yaml
+++ b/conda/faiss-gpu-raft/meta.yaml
@@ -50,14 +50,18 @@ outputs:
         - llvm-openmp  # [osx]
         - cmake >=3.24.0
         - make  # [not win]
+        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
+        - mkl =2023  # [x86_64]
         - mkl-devel =2023  # [x86_64]
         - cuda-toolkit {{ cudatoolkit }}
       host:
+        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
         - mkl =2023  # [x86_64]
         - openblas  # [not x86_64]
         - libraft =24.04
         - cuda-version {{ cuda_constraints }}
       run:
+        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
         - mkl =2023  # [x86_64]
         - openblas  # [not x86_64]
         - cuda-cudart {{ cuda_constraints }}
@@ -87,12 +91,16 @@ outputs:
         - swig
         - cmake >=3.24.0
         - make  # [not win]
+        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
+        - mkl =2023  # [x86_64]
         - cuda-toolkit {{ cudatoolkit }}
       host:
+        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
         - python {{ python }}
         - numpy >=1.19,<2
         - {{ pin_subpackage('libfaiss', exact=True) }}
       run:
+        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
         - python {{ python }}
         - numpy >=1.19,<2
         - packaging

From 5109d96b9e2fb62c6910de721f52fa0b37e2acee Mon Sep 17 00:00:00 2001
From: Ramil Bakhshyiev <ramil@meta.com>
Date: Mon, 17 Jun 2024 17:59:13 -0700
Subject: [PATCH 060/148] Bump libraft to 24.06 to unblock nightly RAFT builds
 (#3522)

Summary:
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3522

Quick fix to unblock nightly

Reviewed By: mlomeli1

Differential Revision: D58694193

fbshipit-source-id: ea323991cc2e2b958fc11ab614dcd6e09d4c072c
---
 conda/faiss-gpu-raft/meta.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/conda/faiss-gpu-raft/meta.yaml b/conda/faiss-gpu-raft/meta.yaml
index 9a5fd542f1..1dde8e9868 100644
--- a/conda/faiss-gpu-raft/meta.yaml
+++ b/conda/faiss-gpu-raft/meta.yaml
@@ -58,7 +58,7 @@ outputs:
         - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
         - mkl =2023  # [x86_64]
         - openblas  # [not x86_64]
-        - libraft =24.04
+        - libraft =24.06
         - cuda-version {{ cuda_constraints }}
       run:
         - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
@@ -66,7 +66,7 @@ outputs:
         - openblas  # [not x86_64]
         - cuda-cudart {{ cuda_constraints }}
         - libcublas {{ libcublas_constraints }}
-        - libraft =24.04
+        - libraft =24.06
         - cuda-version {{ cuda_constraints }}
     test:
       requires:

From 4e98f8fcc431c2b93549cb7260ae2e8897e3c7a0 Mon Sep 17 00:00:00 2001
From: Matthijs Douze <matthijs@meta.com>
Date: Tue, 18 Jun 2024 03:13:51 -0700
Subject: [PATCH 061/148] Add ABS_INNER_PRODUCT metric (#3524)

Summary:
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3524

Adds a metric that searches with abs(dot(query, database)).
This makes it possible to search for the vectors that are closest to a hyperplane.

* adds support for alternative metrics in faiss.knn in python

* checks that it works with HNSW

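* simplifies the extra distances interface by removing the template on the heap comparator type (`C`) from `knn_extra_metrics`, which now takes `k`, `distances` and `labels` directly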

Reviewed By: asadoughi

Differential Revision: D58695971

fbshipit-source-id: 2a0ff49c7f7ac2c005d85f141cc5de148081c9c4
---
 faiss/IndexFlat.cpp               | 18 ++++++----
 faiss/MetricType.h                |  8 +++--
 faiss/python/extra_wrappers.py    | 12 +++++--
 faiss/utils/extra_distances-inl.h | 12 +++++++
 faiss/utils/extra_distances.cpp   | 55 +++++++++++--------------------
 faiss/utils/extra_distances.h     |  5 +--
 tests/test_extra_distances.py     |  7 ++++
 tests/test_graph_based.py         | 15 +++++++++
 8 files changed, 82 insertions(+), 50 deletions(-)

diff --git a/faiss/IndexFlat.cpp b/faiss/IndexFlat.cpp
index f606f8e621..7d29ca5387 100644
--- a/faiss/IndexFlat.cpp
+++ b/faiss/IndexFlat.cpp
@@ -41,15 +41,19 @@ void IndexFlat::search(
     } else if (metric_type == METRIC_L2) {
         float_maxheap_array_t res = {size_t(n), size_t(k), labels, distances};
         knn_L2sqr(x, get_xb(), d, n, ntotal, &res, nullptr, sel);
-    } else if (is_similarity_metric(metric_type)) {
-        float_minheap_array_t res = {size_t(n), size_t(k), labels, distances};
-        knn_extra_metrics(
-                x, get_xb(), d, n, ntotal, metric_type, metric_arg, &res);
     } else {
-        FAISS_THROW_IF_NOT(!sel);
-        float_maxheap_array_t res = {size_t(n), size_t(k), labels, distances};
+        FAISS_THROW_IF_NOT(!sel); // TODO implement with selector
         knn_extra_metrics(
-                x, get_xb(), d, n, ntotal, metric_type, metric_arg, &res);
+                x,
+                get_xb(),
+                d,
+                n,
+                ntotal,
+                metric_type,
+                metric_arg,
+                k,
+                distances,
+                labels);
     }
 }
 
diff --git a/faiss/MetricType.h b/faiss/MetricType.h
index 4689d4d018..8e889b1a03 100644
--- a/faiss/MetricType.h
+++ b/faiss/MetricType.h
@@ -31,9 +31,13 @@ enum MetricType {
     METRIC_Canberra = 20,
     METRIC_BrayCurtis,
     METRIC_JensenShannon,
-    METRIC_Jaccard, ///< defined as: sum_i(min(a_i, b_i)) / sum_i(max(a_i, b_i))
-                    ///< where a_i, b_i > 0
+
+    /// sum_i(min(a_i, b_i)) / sum_i(max(a_i, b_i)) where a_i, b_i > 0
+    METRIC_Jaccard,
+    /// Squared Eucliden distance, ignoring NaNs
     METRIC_NaNEuclidean,
+    /// abs(x | y): the distance to a hyperplane
+    METRIC_ABS_INNER_PRODUCT,
 };
 
 /// all vector indices are this type
diff --git a/faiss/python/extra_wrappers.py b/faiss/python/extra_wrappers.py
index d7fd05bc9f..a037b0280f 100644
--- a/faiss/python/extra_wrappers.py
+++ b/faiss/python/extra_wrappers.py
@@ -330,7 +330,7 @@ def lookup(self, keys):
 # KNN function
 ######################################################
 
-def knn(xq, xb, k, metric=METRIC_L2):
+def knn(xq, xb, k, metric=METRIC_L2, metric_arg=0.0):
     """
     Compute the k nearest neighbors of a vector without constructing an index
 
@@ -374,10 +374,16 @@ def knn(xq, xb, k, metric=METRIC_L2):
             swig_ptr(xq), swig_ptr(xb),
             d, nq, nb, k, swig_ptr(D), swig_ptr(I)
         )
-    else:
-        raise NotImplementedError("only L2 and INNER_PRODUCT are supported")
+    else: 
+        knn_extra_metrics(
+            swig_ptr(xq), swig_ptr(xb),
+            d, nq, nb, metric, metric_arg, k, 
+            swig_ptr(D), swig_ptr(I)
+        )
+
     return D, I
 
+
 def knn_hamming(xq, xb, k, variant="hc"):
     """
     Compute the k nearest neighbors of a set of vectors without constructing an index.
diff --git a/faiss/utils/extra_distances-inl.h b/faiss/utils/extra_distances-inl.h
index 5b21482d18..3171580f8c 100644
--- a/faiss/utils/extra_distances-inl.h
+++ b/faiss/utils/extra_distances-inl.h
@@ -150,4 +150,16 @@ inline float VectorDistance<METRIC_NaNEuclidean>::operator()(
     }
     return float(d) / float(present) * accu;
 }
+
+template <>
+inline float VectorDistance<METRIC_ABS_INNER_PRODUCT>::operator()(
+        const float* x,
+        const float* y) const {
+    float accu = 0;
+    for (size_t i = 0; i < d; i++) {
+        accu += fabs(x[i] * y[i]);
+    }
+    return accu;
+}
+
 } // namespace faiss
diff --git a/faiss/utils/extra_distances.cpp b/faiss/utils/extra_distances.cpp
index fb225e7c9e..407057e58e 100644
--- a/faiss/utils/extra_distances.cpp
+++ b/faiss/utils/extra_distances.cpp
@@ -50,16 +50,18 @@ void pairwise_extra_distances_template(
     }
 }
 
-template <class VD, class C>
+template <class VD>
 void knn_extra_metrics_template(
         VD vd,
         const float* x,
         const float* y,
         size_t nx,
         size_t ny,
-        HeapArray<C>* res) {
-    size_t k = res->k;
+        size_t k,
+        float* distances,
+        int64_t* labels) {
     size_t d = vd.d;
+    using C = typename VD::C;
     size_t check_period = InterruptCallback::get_period_hint(ny * d);
     check_period *= omp_get_max_threads();
 
@@ -71,18 +73,15 @@ void knn_extra_metrics_template(
             const float* x_i = x + i * d;
             const float* y_j = y;
             size_t j;
-            float* simi = res->get_val(i);
-            int64_t* idxi = res->get_ids(i);
+            float* simi = distances + k * i;
+            int64_t* idxi = labels + k * i;
 
             // maxheap_heapify(k, simi, idxi);
             heap_heapify<C>(k, simi, idxi);
             for (j = 0; j < ny; j++) {
                 float disij = vd(x_i, y_j);
 
-                // if (disij < simi[0]) {
-                if ((!vd.is_similarity && (disij < simi[0])) ||
-                    (vd.is_similarity && (disij > simi[0]))) {
-                    // maxheap_replace_top(k, simi, idxi, disij, j);
+                if (C::cmp(simi[0], disij)) {
                     heap_replace_top<C>(k, simi, idxi, disij, j);
                 }
                 y_j += d;
@@ -165,13 +164,13 @@ void pairwise_extra_distances(
         HANDLE_VAR(Lp);
         HANDLE_VAR(Jaccard);
         HANDLE_VAR(NaNEuclidean);
+        HANDLE_VAR(ABS_INNER_PRODUCT);
 #undef HANDLE_VAR
         default:
             FAISS_THROW_MSG("metric type not implemented");
     }
 }
 
-template <class C>
 void knn_extra_metrics(
         const float* x,
         const float* y,
@@ -180,13 +179,15 @@ void knn_extra_metrics(
         size_t ny,
         MetricType mt,
         float metric_arg,
-        HeapArray<C>* res) {
+        size_t k,
+        float* distances,
+        int64_t* indexes) {
     switch (mt) {
-#define HANDLE_VAR(kw)                                            \
-    case METRIC_##kw: {                                           \
-        VectorDistance<METRIC_##kw> vd = {(size_t)d, metric_arg}; \
-        knn_extra_metrics_template(vd, x, y, nx, ny, res);        \
-        break;                                                    \
+#define HANDLE_VAR(kw)                                                       \
+    case METRIC_##kw: {                                                      \
+        VectorDistance<METRIC_##kw> vd = {(size_t)d, metric_arg};            \
+        knn_extra_metrics_template(vd, x, y, nx, ny, k, distances, indexes); \
+        break;                                                               \
     }
         HANDLE_VAR(L2);
         HANDLE_VAR(L1);
@@ -197,32 +198,13 @@ void knn_extra_metrics(
         HANDLE_VAR(Lp);
         HANDLE_VAR(Jaccard);
         HANDLE_VAR(NaNEuclidean);
+        HANDLE_VAR(ABS_INNER_PRODUCT);
 #undef HANDLE_VAR
         default:
             FAISS_THROW_MSG("metric type not implemented");
     }
 }
 
-template void knn_extra_metrics<CMax<float, int64_t>>(
-        const float* x,
-        const float* y,
-        size_t d,
-        size_t nx,
-        size_t ny,
-        MetricType mt,
-        float metric_arg,
-        HeapArray<CMax<float, int64_t>>* res);
-
-template void knn_extra_metrics<CMin<float, int64_t>>(
-        const float* x,
-        const float* y,
-        size_t d,
-        size_t nx,
-        size_t ny,
-        MetricType mt,
-        float metric_arg,
-        HeapArray<CMin<float, int64_t>>* res);
-
 FlatCodesDistanceComputer* get_extra_distance_computer(
         size_t d,
         MetricType mt,
@@ -245,6 +227,7 @@ FlatCodesDistanceComputer* get_extra_distance_computer(
         HANDLE_VAR(Lp);
         HANDLE_VAR(Jaccard);
         HANDLE_VAR(NaNEuclidean);
+        HANDLE_VAR(ABS_INNER_PRODUCT);
 #undef HANDLE_VAR
         default:
             FAISS_THROW_MSG("metric type not implemented");
diff --git a/faiss/utils/extra_distances.h b/faiss/utils/extra_distances.h
index 79b65bc1e9..f8b47cfba5 100644
--- a/faiss/utils/extra_distances.h
+++ b/faiss/utils/extra_distances.h
@@ -33,7 +33,6 @@ void pairwise_extra_distances(
         int64_t ldb = -1,
         int64_t ldd = -1);
 
-template <class C>
 void knn_extra_metrics(
         const float* x,
         const float* y,
@@ -42,7 +41,9 @@ void knn_extra_metrics(
         size_t ny,
         MetricType mt,
         float metric_arg,
-        HeapArray<C>* res);
+        size_t k,
+        float* distances,
+        int64_t* indexes);
 
 /** get a DistanceComputer that refers to this type of distance and
  *  indexes a flat array of size nb */
diff --git a/tests/test_extra_distances.py b/tests/test_extra_distances.py
index 66318f76c5..fcaf4d383d 100644
--- a/tests/test_extra_distances.py
+++ b/tests/test_extra_distances.py
@@ -114,6 +114,13 @@ def test_nan_euclidean(self):
         new_dis = faiss.pairwise_distances(x, q, faiss.METRIC_NaNEuclidean)
         self.assertTrue(np.isnan(new_dis[0]))
 
+    def test_abs_inner_product(self):
+        xq, yb = self.make_example()
+        dis = faiss.pairwise_distances(xq, yb, faiss.METRIC_ABS_INNER_PRODUCT)
+
+        gt_dis = np.abs(xq @ yb.T)
+        np.testing.assert_allclose(dis, gt_dis, atol=1e-5)
+
 
 class TestKNN(unittest.TestCase):
     """ test that the knn search gives the same as distance matrix + argmin """
diff --git a/tests/test_graph_based.py b/tests/test_graph_based.py
index c769e03ade..d5797186da 100644
--- a/tests/test_graph_based.py
+++ b/tests/test_graph_based.py
@@ -169,6 +169,21 @@ def test_io_no_storage(self):
             )
             self.assertEquals(index3.storage, None)
 
+    def test_abs_inner_product(self):
+        """Test HNSW with abs inner product (not a real distance, so dubious that triangular inequality works)"""
+        d = self.xq.shape[1]
+        xb = self.xb - self.xb.mean(axis=0)  # need to be centered to give interesting directions
+        xq = self.xq - self.xq.mean(axis=0)
+        Dref, Iref = faiss.knn(xq, xb, 10, faiss.METRIC_ABS_INNER_PRODUCT)
+        
+        index = faiss.IndexHNSWFlat(d, 32, faiss.METRIC_ABS_INNER_PRODUCT)
+        index.add(xb)
+        Dnew, Inew = index.search(xq, 10)
+
+        inter = faiss.eval_intersection(Iref, Inew)
+        # 4769 vs. 500*10
+        self.assertGreater(inter, Iref.size * 0.9)
+ 
 
 class TestNSG(unittest.TestCase):
 

From bea734325dc2c4e9952dcffeb2996296a29c86b7 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 25 Jun 2024 08:51:33 -0700
Subject: [PATCH 062/148] initial commit

---
 CMakeLists.txt                                |   8 +-
 INSTALL.md                                    |   2 +-
 ..._ivfflat_raft.py => bench_ivfflat_cuvs.py} |  60 +++----
 ...ench_ivfpq_raft.py => bench_ivfpq_cuvs.py} |  36 ++--
 build.sh                                      |  58 +++++++
 c_api/gpu/CMakeLists.txt                      |   2 +-
 cmake/thirdparty/fetch_rapids.cmake           |   2 +-
 conda/faiss-gpu-raft/build-lib.sh             |   2 +-
 conda/faiss-gpu-raft/build-pkg.sh             |   2 +-
 conda/faiss-gpu/build-lib.sh                  |   2 +-
 conda/faiss-gpu/build-pkg.sh                  |   2 +-
 contrib/torch_utils.py                        |   4 +-
 faiss/gpu/CMakeLists.txt                      |  44 ++---
 faiss/gpu/GpuCloner.cpp                       |  22 +--
 faiss/gpu/GpuClonerOptions.h                  |   8 +-
 faiss/gpu/GpuDistance.cu                      |  46 ++---
 faiss/gpu/GpuDistance.h                       |   4 +-
 faiss/gpu/GpuIndex.cu                         |   8 +-
 faiss/gpu/GpuIndex.h                          |   8 +-
 faiss/gpu/GpuIndexCagra.cu                    |  18 +-
 faiss/gpu/GpuIndexCagra.h                     |   4 +-
 faiss/gpu/GpuIndexFlat.cu                     |  12 +-
 faiss/gpu/GpuIndexIVF.cu                      |   2 +-
 faiss/gpu/GpuIndexIVFFlat.cu                  |  74 ++++----
 faiss/gpu/GpuIndexIVFPQ.cu                    |  95 ++++++-----
 faiss/gpu/GpuIndexIVFPQ.h                     |   2 +-
 faiss/gpu/GpuResources.cpp                    |   2 +-
 faiss/gpu/GpuResources.h                      |   6 +-
 faiss/gpu/StandardGpuResources.cpp            |  24 +--
 faiss/gpu/StandardGpuResources.h              |   8 +-
 faiss/gpu/impl/{RaftCagra.cu => CuvsCagra.cu} | 160 ++++++------------
 .../gpu/impl/{RaftCagra.cuh => CuvsCagra.cuh} |  26 +--
 .../{RaftFlatIndex.cu => CuvsFlatIndex.cu}    |  26 +--
 .../{RaftFlatIndex.cuh => CuvsFlatIndex.cuh}  |   4 +-
 .../impl/{RaftIVFFlat.cu => CuvsIVFFlat.cu}   | 157 ++++++++---------
 .../impl/{RaftIVFFlat.cuh => CuvsIVFFlat.cuh} |  20 +--
 faiss/gpu/impl/{RaftIVFPQ.cu => CuvsIVFPQ.cu} | 140 +++++++--------
 .../gpu/impl/{RaftIVFPQ.cuh => CuvsIVFPQ.cuh} |  16 +-
 faiss/gpu/test/CMakeLists.txt                 |   4 +-
 faiss/gpu/test/TestGpuDistance.cu             |  34 ++--
 faiss/gpu/test/TestGpuIndexFlat.cpp           |  90 +++++-----
 faiss/gpu/test/TestGpuIndexIVFFlat.cpp        |  70 ++++----
 faiss/gpu/test/TestGpuIndexIVFPQ.cpp          |  26 +--
 faiss/gpu/test/TestGpuMemoryException.cpp     |   2 +-
 faiss/gpu/test/test_gpu_index.py              |  14 +-
 faiss/gpu/test/test_gpu_index_ivfflat.py      |   2 +-
 faiss/gpu/test/test_gpu_index_ivfsq.py        |   6 +-
 faiss/gpu/test/test_gpu_index_serialize.py    |   4 +-
 faiss/gpu/test/test_index_cpu_to_gpu.py       |  16 +-
 faiss/gpu/test/test_multi_gpu.py              |  10 +-
 faiss/gpu/test/test_raft.py                   |   6 +-
 faiss/gpu/test/torch_test_contrib_gpu.py      |  18 +-
 .../gpu/utils/{RaftUtils.cu => CuvsUtils.cu}  |   2 +-
 faiss/gpu/utils/{RaftUtils.h => CuvsUtils.h}  |  20 +--
 faiss/python/CMakeLists.txt                   |  12 +-
 faiss/python/gpu_wrappers.py                  |   4 +-
 faiss/python/swigfaiss.swig                   |   4 +-
 tests/CMakeLists.txt                          |   2 +-
 58 files changed, 742 insertions(+), 720 deletions(-)
 rename benchs/{bench_ivfflat_raft.py => bench_ivfflat_cuvs.py} (81%)
 rename benchs/{bench_ivfpq_raft.py => bench_ivfpq_cuvs.py} (88%)
 create mode 100755 build.sh
 rename faiss/gpu/impl/{RaftCagra.cu => CuvsCagra.cu} (62%)
 rename faiss/gpu/impl/{RaftCagra.cuh => CuvsCagra.cuh} (85%)
 rename faiss/gpu/impl/{RaftFlatIndex.cu => CuvsFlatIndex.cu} (88%)
 rename faiss/gpu/impl/{RaftFlatIndex.cuh => CuvsFlatIndex.cuh} (97%)
 rename faiss/gpu/impl/{RaftIVFFlat.cu => CuvsIVFFlat.cu} (78%)
 rename faiss/gpu/impl/{RaftIVFFlat.cuh => CuvsIVFFlat.cuh} (91%)
 rename faiss/gpu/impl/{RaftIVFPQ.cu => CuvsIVFPQ.cu} (81%)
 rename faiss/gpu/impl/{RaftIVFPQ.cuh => CuvsIVFPQ.cuh} (93%)
 rename faiss/gpu/utils/{RaftUtils.cu => CuvsUtils.cu} (98%)
 rename faiss/gpu/utils/{RaftUtils.h => CuvsUtils.h} (77%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cedee9c456..3064f22c53 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,7 +26,7 @@ if(FAISS_ENABLE_GPU)
   list(APPEND FAISS_LANGUAGES CUDA)
 endif()
 
-if(FAISS_ENABLE_RAFT)
+if(FAISS_ENABLE_CUVS)
 include(cmake/thirdparty/fetch_rapids.cmake)
 include(rapids-cmake)
 include(rapids-cpm)
@@ -53,7 +53,7 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
 # Valid values are "generic", "avx2", "avx512".
 option(FAISS_OPT_LEVEL "" "generic")
 option(FAISS_ENABLE_GPU "Enable support for GPU indexes." ON)
-option(FAISS_ENABLE_RAFT "Enable RAFT for GPU indexes." OFF)
+option(FAISS_ENABLE_CUVS "Enable CUVS for GPU indexes." OFF)
 option(FAISS_ENABLE_PYTHON "Build Python extension." ON)
 option(FAISS_ENABLE_C_API "Build C API." OFF)
 
@@ -62,8 +62,8 @@ if(FAISS_ENABLE_GPU)
   enable_language(CUDA)
 endif()
 
-if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)
-   find_package(raft COMPONENTS compiled distributed)
+if(FAISS_ENABLE_CUVS AND NOT TARGET cuvs::cuvs)
+   find_package(cuvs)
  endif()
 
 add_subdirectory(faiss)
diff --git a/INSTALL.md b/INSTALL.md
index 5bd4f6d448..6553ad5072 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -110,7 +110,7 @@ Several options can be passed to CMake, among which:
   values are `ON` and `OFF`),
   - `-DFAISS_ENABLE_PYTHON=OFF` in order to disable building python bindings
   (possible values are `ON` and `OFF`),
-  - `-DFAISS_ENABLE_RAFT=ON` in order to enable building the RAFT implementations
+  - `-DFAISS_ENABLE_CUVS=ON` in order to enable building the cuvs implementations
     of the IVF-Flat and IVF-PQ GPU-accelerated indices (default is `OFF`, possible
     values are `ON` and `OFF`)
   - `-DBUILD_TESTING=OFF` in order to disable building C++ tests,
diff --git a/benchs/bench_ivfflat_raft.py b/benchs/bench_ivfflat_cuvs.py
similarity index 81%
rename from benchs/bench_ivfflat_raft.py
rename to benchs/bench_ivfflat_cuvs.py
index 9ebfcb3422..269bd9cf21 100644
--- a/benchs/bench_ivfflat_raft.py
+++ b/benchs/bench_ivfflat_cuvs.py
@@ -43,8 +43,8 @@ def aa(*args, **kwargs):
    help='whether to benchmark add operation on GPU index')
 aa('--bm_search', default=True,
    help='whether to benchmark search operation on GPU index')
-aa('--raft_only', default=False, action='store_true',
-   help='whether to only produce RAFT enabled benchmarks')
+aa('--cuvs_only', default=False, action='store_true',
+   help='whether to only produce CUVS enabled benchmarks')
 
 
 group = parser.add_argument_group('IVF options')
@@ -69,9 +69,9 @@ def aa(*args, **kwargs):
 mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource())
 rmm.mr.set_current_device_resource(mr)
 
-def bench_train_milliseconds(index, trainVecs, use_raft):
+def bench_train_milliseconds(index, trainVecs, use_cuvs):
     co = faiss.GpuMultipleClonerOptions()
-    co.use_raft = use_raft
+    co.use_cuvs = use_cuvs
     index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
     t0 = time.time()
     index_gpu.train(trainVecs)
@@ -88,21 +88,21 @@ def bench_train_milliseconds(index, trainVecs, use_raft):
         for n_cols in dataset_dims:
             index = faiss.index_factory(n_cols, "IVF{},Flat".format(args.n_centroids))
             trainVecs = rs.rand(n_rows, n_cols).astype('float32')
-            raft_gpu_train_time = bench_train_milliseconds(
+            cuvs_gpu_train_time = bench_train_milliseconds(
                 index, trainVecs, True)
-            if args.raft_only:
-                print("Method: IVFFlat, Operation: TRAIN, dim: %d, n_centroids %d, numTrain: %d, RAFT enabled GPU train time: %.3f milliseconds" % (
-                    n_cols, args.n_centroids, n_rows, raft_gpu_train_time))
+            if args.cuvs_only:
+                print("Method: IVFFlat, Operation: TRAIN, dim: %d, n_centroids %d, numTrain: %d, CUVS enabled GPU train time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_rows, cuvs_gpu_train_time))
             else:
                 classical_gpu_train_time = bench_train_milliseconds(
                     index, trainVecs, False)
-                print("Method: IVFFlat, Operation: TRAIN, dim: %d, n_centroids %d, numTrain: %d, classical GPU train time: %.3f milliseconds, RAFT enabled GPU train time: %.3f milliseconds" % (
-                    n_cols, args.n_centroids, n_rows, classical_gpu_train_time, raft_gpu_train_time))
+                print("Method: IVFFlat, Operation: TRAIN, dim: %d, n_centroids %d, numTrain: %d, classical GPU train time: %.3f milliseconds, CUVS enabled GPU train time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_rows, classical_gpu_train_time, cuvs_gpu_train_time))
 
 
-def bench_add_milliseconds(index, addVecs, use_raft):
+def bench_add_milliseconds(index, addVecs, use_cuvs):
     co = faiss.GpuMultipleClonerOptions()
-    co.use_raft = use_raft
+    co.use_cuvs = use_cuvs
     index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
     index_gpu.copyFrom(index)
     t0 = time.time()
@@ -124,20 +124,20 @@ def bench_add_milliseconds(index, addVecs, use_raft):
     for n_rows in addset_sizes:
         for n_cols in dataset_dims:
             addVecs = rs.rand(n_rows, n_cols).astype('float32')
-            raft_gpu_add_time = bench_add_milliseconds(index, addVecs, True)
-            if args.raft_only:
-                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, RAFT enabled GPU add time: %.3f milliseconds" % (
-                    n_train, n_rows, n_cols, args.n_centroids, raft_gpu_add_time))
+            cuvs_gpu_add_time = bench_add_milliseconds(index, addVecs, True)
+            if args.cuvs_only:
+                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, CUVS enabled GPU add time: %.3f milliseconds" % (
+                    n_train, n_rows, n_cols, args.n_centroids, cuvs_gpu_add_time))
             else:
                 classical_gpu_add_time = bench_add_milliseconds(
                     index, addVecs, False)
-                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, classical GPU add time: %.3f milliseconds, RAFT enabled GPU add time: %.3f milliseconds" % (
-                    n_train, n_rows, n_cols, args.n_centroids, classical_gpu_add_time, raft_gpu_add_time))
+                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, classical GPU add time: %.3f milliseconds, CUVS enabled GPU add time: %.3f milliseconds" % (
+                    n_train, n_rows, n_cols, args.n_centroids, classical_gpu_add_time, cuvs_gpu_add_time))
 
 
-def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_raft):
+def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_cuvs):
     co = faiss.GpuMultipleClonerOptions()
-    co.use_raft = use_raft
+    co.use_cuvs = use_cuvs
     index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
     index_gpu.copyFrom(index)
     index_gpu.add(addVecs)
@@ -162,19 +162,19 @@ def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_raft):
         addVecs = rs.rand(n_add, n_cols).astype('float32')
         for n_rows in queryset_sizes:
             queryVecs = rs.rand(n_rows, n_cols).astype('float32')
-            raft_gpu_search_time = bench_search_milliseconds(
+            cuvs_gpu_search_time = bench_search_milliseconds(
                 index, addVecs, queryVecs, args.nprobe, args.k, True)
-            if args.raft_only:
-                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
-                    n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
+            if args.cuvs_only:
+                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, CUVS enabled GPU search time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, cuvs_gpu_search_time))
             else:
                 classical_gpu_search_time = bench_search_milliseconds(
                     index, addVecs, queryVecs, args.nprobe, args.k, False)
-                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, RAFT enabled GPU search time: %.3f milliseconds" % (
-                    n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, classical_gpu_search_time, raft_gpu_search_time))
+                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, CUVS enabled GPU search time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, classical_gpu_search_time, cuvs_gpu_search_time))
 
     print("=" * 40)
-    print("Large RAFT Enabled Benchmarks")
+    print("Large CUVS Enabled Benchmarks")
     print("=" * 40)
     # Avoid classical GPU Benchmarks for large datasets because of OOM for more than 500000 queries and/or large dims as well as for large k
     queryset_sizes = [100000, 500000, 1000000]
@@ -187,7 +187,7 @@ def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_raft):
         addVecs = rs.rand(n_add, n_cols).astype('float32')
         for n_rows in queryset_sizes:
             queryVecs = rs.rand(n_rows, n_cols).astype('float32')
-            raft_gpu_search_time = bench_search_milliseconds(
+            cuvs_gpu_search_time = bench_search_milliseconds(
                 index, addVecs, queryVecs, args.nprobe, args.k, True)
-            print("Method: IVFFlat, Operation: SEARCH, numTrain: %d, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
-                n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
+            print("Method: IVFFlat, Operation: SEARCH, numTrain: %d, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, CUVS enabled GPU search time: %.3f milliseconds" % (
+                n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, cuvs_gpu_search_time))
diff --git a/benchs/bench_ivfpq_raft.py b/benchs/bench_ivfpq_cuvs.py
similarity index 88%
rename from benchs/bench_ivfpq_raft.py
rename to benchs/bench_ivfpq_cuvs.py
index 3494a18741..02f1a487ae 100644
--- a/benchs/bench_ivfpq_raft.py
+++ b/benchs/bench_ivfpq_cuvs.py
@@ -41,13 +41,13 @@ def aa(*args, **kwargs):
 
 
 group = parser.add_argument_group('benchmarking options')
-aa('--raft_only', default=False, action='store_true',
-   help='whether to only produce RAFT enabled benchmarks')
+aa('--cuvs_only', default=False, action='store_true',
+   help='whether to only produce CUVS enabled benchmarks')
 
 group = parser.add_argument_group('IVF options')
 aa('--bits_per_code', default=8, type=int, help='bits per code. Note that < 8 is only supported when RAFT is enabled')
 aa('--pq_len', default=2, type=int, help='number of vector elements represented by one PQ code')
-aa('--use_precomputed', default=True, type=bool, help='use precomputed codes (not with RAFT enabled)')
+aa('--use_precomputed', default=True, type=bool, help='use precomputed codes (not with CUVS enabled)')
 
 group = parser.add_argument_group('searching')
 aa('--k', default=10, type=int, help='nb of nearest neighbors')
@@ -73,11 +73,11 @@ def compute_nlist(numVecs):
     return int(nlist)
 
 
-def bench_train_milliseconds(index, trainVecs, use_raft):
+def bench_train_milliseconds(index, trainVecs, use_cuvs):
     co = faiss.GpuMultipleClonerOptions()
     # use float 16 lookup tables to save space
     co.useFloat16LookupTables = True
-    co.use_raft = use_raft
+    co.use_cuvs = use_cuvs
     index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
     t0 = time.time()
     index_gpu.train(trainVecs)
@@ -93,21 +93,21 @@ def bench_train_milliseconds(index, trainVecs, use_raft):
 print("GPU Train Benchmarks")
 print("=" * 40)
 raft_gpu_train_time = bench_train_milliseconds(index, xt, True)
-if args.raft_only:
-    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, RAFT enabled GPU train time: %.3f milliseconds" % (
+if args.cuvs_only:
+    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, CUVS enabled GPU train time: %.3f milliseconds" % (
         n_cols, nlist, M, args.bits_per_code, n_train, raft_gpu_train_time))
 else:
     classical_gpu_train_time = bench_train_milliseconds(
         index, xt, False)
-    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, classical GPU train time: %.3f milliseconds, RAFT enabled GPU train time: %.3f milliseconds" % (
+    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, classical GPU train time: %.3f milliseconds, CUVS enabled GPU train time: %.3f milliseconds" % (
         n_cols, nlist, M, args.bits_per_code, n_train, classical_gpu_train_time, raft_gpu_train_time))
 
 
-def bench_add_milliseconds(index, addVecs, use_raft):
+def bench_add_milliseconds(index, addVecs, use_cuvs):
     co = faiss.GpuMultipleClonerOptions()
     # use float 16 lookup tables to save space
     co.useFloat16LookupTables = True
-    co.use_raft = use_raft
+    co.use_cuvs = use_cuvs
     index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
     index_gpu.copyFrom(index)
     t0 = time.time()
@@ -119,19 +119,19 @@ def bench_add_milliseconds(index, addVecs, use_raft):
 print("=" * 40)
 index.train(xt)
 raft_gpu_add_time = bench_add_milliseconds(index, xb, True)
-if args.raft_only:
-    print("Method: IVFPQ, Operation: ADD, dim: %d, n_centroids %d numSubQuantizers %d, bitsPerCode %d, numAdd %d, RAFT enabled GPU add time: %.3f milliseconds" % (
+if args.cuvs_only:
+    print("Method: IVFPQ, Operation: ADD, dim: %d, n_centroids %d numSubQuantizers %d, bitsPerCode %d, numAdd %d, CUVS enabled GPU add time: %.3f milliseconds" % (
         n_cols, nlist, M, args.bits_per_code, n_rows, raft_gpu_add_time))
 else:
     classical_gpu_add_time = bench_add_milliseconds(
         index, xb, False)
-    print("Method: IVFFPQ, Operation: ADD, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numAdd %d, classical GPU add time: %.3f milliseconds, RAFT enabled GPU add time: %.3f milliseconds" % (
+    print("Method: IVFFPQ, Operation: ADD, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numAdd %d, classical GPU add time: %.3f milliseconds, CUVS enabled GPU add time: %.3f milliseconds" % (
         n_cols, nlist, M, args.bits_per_code, n_rows, classical_gpu_add_time, raft_gpu_add_time))
 
 
-def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_raft):
+def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_cuvs):
     co = faiss.GpuMultipleClonerOptions()
-    co.use_raft = use_raft
+    co.use_cuvs = use_cuvs
     co.useFloat16LookupTables = True
     index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
     index_gpu.copyFrom(index)
@@ -158,11 +158,11 @@ def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_raft):
         queryVecs = xq[np.random.choice(xq.shape[0], n_rows, replace=False)]
         raft_gpu_search_time = bench_search_milliseconds(
             index, xb, queryVecs, args.nprobe, args.k, True)
-        if args.raft_only:
-            print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
+        if args.cuvs_only:
+            print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, CUVS enabled GPU search time: %.3f milliseconds" % (
                 n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
         else:
             classical_gpu_search_time = bench_search_milliseconds(
                 index, xb, queryVecs, args.nprobe, args.k, False)
-            print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, RAFT enabled GPU search time: %.3f milliseconds" % (
+            print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, CUVS enabled GPU search time: %.3f milliseconds" % (
                 n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, classical_gpu_search_time, raft_gpu_search_time))
\ No newline at end of file
diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000000..eaa767f2af
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+
+# NOTE: This file is temporary for the proof-of-concept branch and will be removed before this PR is merged
+#
+# Usage:
+#   ./build.sh            configure with CMake and build inside ${BUILD_DIR}
+#   ./build.sh clean      remove ${BUILD_DIR} and the .cache directory
+#   ./build.sh test       run the "test" make target of an existing build
+#   ./build.sh test-raft  run the TestRaftIndexIVFFlat binary of an existing build
+
+# Abort on the first failing command.
+set -e
+
+BUILD_TYPE=Release
+BUILD_DIR=build
+
+# Optional path to a local RAFT checkout; when non-empty it is passed to CMake
+# as CPM_raft_SOURCE.
+RAFT_REPO_REL=""
+EXTRA_CMAKE_ARGS=""
+
+if [[ -n "${RAFT_REPO_REL}" ]]; then
+  RAFT_REPO_PATH="$(readlink -f "${RAFT_REPO_REL}")"
+  EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCPM_raft_SOURCE=${RAFT_REPO_PATH}"
+fi
+
+if [ "$1" == "clean" ]; then
+  rm -rf "${BUILD_DIR}"
+  rm -rf .cache
+  exit 0
+fi
+
+if [ "$1" == "test" ]; then
+  make -C "${BUILD_DIR}" -j test
+  exit 0
+fi
+
+if [ "$1" == "test-raft" ]; then
+  "./${BUILD_DIR}/faiss/gpu/test/TestRaftIndexIVFFlat"
+  exit 0
+fi
+
+mkdir -p "${BUILD_DIR}"
+cd "${BUILD_DIR}"
+
+# EXTRA_CMAKE_ARGS is intentionally unquoted so it splits into separate arguments.
+cmake \
+ -DFAISS_ENABLE_GPU=ON \
+ -DFAISS_ENABLE_CUVS=ON \
+ -DFAISS_ENABLE_PYTHON=OFF \
+ -DBUILD_TESTING=ON \
+ -DBUILD_SHARED_LIBS=OFF \
+ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+ -DFAISS_OPT_LEVEL=avx2 \
+ -DRAFT_NVTX=OFF \
+ -DCMAKE_CUDA_ARCHITECTURES="NATIVE" \
+ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+ -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+ ${EXTRA_CMAKE_ARGS} \
+ ../
+
+
+# make -C build -j12 faiss
+cmake  --build . -j12
+# make -C build -j12 swigfaiss
+# (cd build/faiss/python && python setup.py install)
+
diff --git a/c_api/gpu/CMakeLists.txt b/c_api/gpu/CMakeLists.txt
index 4ec926439d..5dcea36ae9 100644
--- a/c_api/gpu/CMakeLists.txt
+++ b/c_api/gpu/CMakeLists.txt
@@ -16,7 +16,7 @@ file(GLOB FAISS_C_API_GPU_HEADERS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.h")
 faiss_install_headers("${FAISS_C_API_GPU_HEADERS}" c_api/gpu)
 
 find_package(CUDAToolkit REQUIRED)
-target_link_libraries(faiss_c PUBLIC CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass>)
+target_link_libraries(faiss_c PUBLIC CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
 
 add_executable(example_gpu_c EXCLUDE_FROM_ALL example_gpu_c.c)
 target_link_libraries(example_gpu_c PRIVATE faiss_c)
diff --git a/cmake/thirdparty/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake
index 3e0f6b6ac4..3b9d9b140a 100644
--- a/cmake/thirdparty/fetch_rapids.cmake
+++ b/cmake/thirdparty/fetch_rapids.cmake
@@ -15,7 +15,7 @@
 # or implied. See the License for the specific language governing permissions and limitations under
 # the License.
 # =============================================================================
-set(RAPIDS_VERSION "24.02")
+set(RAPIDS_VERSION "24.08")
 
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake)
     file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake
diff --git a/conda/faiss-gpu-raft/build-lib.sh b/conda/faiss-gpu-raft/build-lib.sh
index 79ca8da2cd..ba5e1f5338 100644
--- a/conda/faiss-gpu-raft/build-lib.sh
+++ b/conda/faiss-gpu-raft/build-lib.sh
@@ -13,7 +13,7 @@ cmake -B _build \
       -DBUILD_TESTING=OFF \
       -DFAISS_OPT_LEVEL=avx512 \
       -DFAISS_ENABLE_GPU=ON \
-      -DFAISS_ENABLE_RAFT=ON \
+      -DFAISS_ENABLE_CUVS=ON \
       -DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCHS}" \
       -DFAISS_ENABLE_PYTHON=OFF \
       -DBLA_VENDOR=Intel10_64lp \
diff --git a/conda/faiss-gpu-raft/build-pkg.sh b/conda/faiss-gpu-raft/build-pkg.sh
index da5fdefca9..88bfb4a801 100644
--- a/conda/faiss-gpu-raft/build-pkg.sh
+++ b/conda/faiss-gpu-raft/build-pkg.sh
@@ -12,7 +12,7 @@ cmake -B _build_python_${PY_VER} \
       -Dfaiss_ROOT=_libfaiss_stage/ \
       -DFAISS_OPT_LEVEL=avx512 \
       -DFAISS_ENABLE_GPU=ON \
-      -DFAISS_ENABLE_RAFT=ON \
+      -DFAISS_ENABLE_CUVS=ON \
       -DCMAKE_BUILD_TYPE=Release \
       -DPython_EXECUTABLE=$PYTHON \
       faiss/python
diff --git a/conda/faiss-gpu/build-lib.sh b/conda/faiss-gpu/build-lib.sh
index 9957be96ea..ed79535792 100755
--- a/conda/faiss-gpu/build-lib.sh
+++ b/conda/faiss-gpu/build-lib.sh
@@ -19,7 +19,7 @@ cmake -B _build \
       -DBUILD_TESTING=OFF \
       -DFAISS_OPT_LEVEL=avx512 \
       -DFAISS_ENABLE_GPU=ON \
-      -DFAISS_ENABLE_RAFT=OFF \
+      -DFAISS_ENABLE_CUVS=OFF \
       -DCMAKE_CUDA_ARCHITECTURES="${CUDA_ARCHS}" \
       -DFAISS_ENABLE_PYTHON=OFF \
       -DBLA_VENDOR=Intel10_64lp \
diff --git a/conda/faiss-gpu/build-pkg.sh b/conda/faiss-gpu/build-pkg.sh
index e529a83d80..0801a69d3e 100755
--- a/conda/faiss-gpu/build-pkg.sh
+++ b/conda/faiss-gpu/build-pkg.sh
@@ -12,7 +12,7 @@ cmake -B _build_python_${PY_VER} \
       -Dfaiss_ROOT=_libfaiss_stage/ \
       -DFAISS_OPT_LEVEL=avx512 \
       -DFAISS_ENABLE_GPU=ON \
-      -DFAISS_ENABLE_RAFT=OFF \
+      -DFAISS_ENABLE_CUVS=OFF \
       -DCMAKE_BUILD_TYPE=Release \
       -DPython_EXECUTABLE=$PYTHON \
       faiss/python
diff --git a/contrib/torch_utils.py b/contrib/torch_utils.py
index 18f136e914..bbb5282652 100644
--- a/contrib/torch_utils.py
+++ b/contrib/torch_utils.py
@@ -494,7 +494,7 @@ def torch_replacement_sa_decode(self, codes, x=None):
 
 
 # allows torch tensor usage with bfKnn
-def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRIC_L2, device=-1, use_raft=False):
+def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRIC_L2, device=-1, use_cuvs=False):
     if type(xb) is np.ndarray:
         # Forward to faiss __init__.py base method
         return faiss.knn_gpu_numpy(res, xq, xb, k, D, I, metric, device)
@@ -575,7 +575,7 @@ def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRI
     args.outIndices = I_ptr
     args.outIndicesType = I_type
     args.device = device
-    args.use_raft = use_raft
+    args.use_cuvs = use_cuvs
 
     with using_stream(res):
         faiss.bfKnn(res, args)
diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
index d20f3b7f8e..e53a2535f8 100644
--- a/faiss/gpu/CMakeLists.txt
+++ b/faiss/gpu/CMakeLists.txt
@@ -236,21 +236,21 @@ endfunction()
 
 generate_ivf_interleaved_code()
 
-if(FAISS_ENABLE_RAFT)
+if(FAISS_ENABLE_CUVS)
   list(APPEND FAISS_GPU_HEADERS
           GpuIndexCagra.h
-          impl/RaftCagra.cuh
-          impl/RaftFlatIndex.cuh
-          impl/RaftIVFFlat.cuh
-          impl/RaftIVFPQ.cuh
-          utils/RaftUtils.h)
+          impl/CuvsCagra.cuh
+          impl/CuvsFlatIndex.cuh
+          impl/CuvsIVFFlat.cuh
+          impl/CuvsIVFPQ.cuh
+          utils/CuvsUtils.h)
   list(APPEND FAISS_GPU_SRC
           GpuIndexCagra.cu
-          impl/RaftCagra.cu
-          impl/RaftFlatIndex.cu
-          impl/RaftIVFFlat.cu
-          impl/RaftIVFPQ.cu
-          utils/RaftUtils.cu)
+          impl/CuvsCagra.cu
+          impl/CuvsFlatIndex.cu
+          impl/CuvsIVFFlat.cu
+          impl/CuvsIVFPQ.cu
+          utils/CuvsUtils.cu)
 endif()
 
 add_library(faiss_gpu STATIC ${FAISS_GPU_SRC})
@@ -261,10 +261,10 @@ set_target_properties(faiss_gpu PROPERTIES
 target_include_directories(faiss_gpu PUBLIC
   $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>)
 
-if(FAISS_ENABLE_RAFT)
-  target_compile_definitions(faiss PUBLIC USE_NVIDIA_RAFT=1)
-  target_compile_definitions(faiss_avx2 PUBLIC USE_NVIDIA_RAFT=1)
-  target_compile_definitions(faiss_avx512 PUBLIC USE_NVIDIA_RAFT=1)
+if(FAISS_ENABLE_CUVS)
+  target_compile_definitions(faiss PUBLIC USE_NVIDIA_RAPIDS=1)
+  target_compile_definitions(faiss_avx2 PUBLIC USE_NVIDIA_RAPIDS=1)
+  target_compile_definitions(faiss_avx512 PUBLIC USE_NVIDIA_RAPIDS=1)
 
   # Mark all functions as hidden so that we don't generate
   # global 'public' functions that also exist in libraft.so
@@ -281,13 +281,13 @@ if(FAISS_ENABLE_RAFT)
   set_source_files_properties(
     GpuDistance.cu
     StandardGpuResources.cpp
-    impl/RaftFlatIndex.cu
-    impl/RaftIVFFlat.cu
-    impl/RaftIVFPQ.cu
-    utils/RaftUtils.cu
+    impl/CuvsFlatIndex.cu
+    impl/CuvsIVFFlat.cu
+    impl/CuvsIVFPQ.cu
+    utils/CuvsUtils.cu
     TARGET_DIRECTORY faiss
     PROPERTIES COMPILE_OPTIONS "-fvisibility=hidden")
-  target_compile_definitions(faiss_gpu PUBLIC USE_NVIDIA_RAFT=1)
+  target_compile_definitions(faiss_gpu PUBLIC USE_NVIDIA_RAPIDS=1)
 endif()
 
 # Export FAISS_GPU_HEADERS variable to parent scope.
@@ -320,5 +320,5 @@ __nv_relfatbin : { *(__nv_relfatbin) }
 target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
 
 find_package(CUDAToolkit REQUIRED)
-target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass> $<$<BOOL:${FAISS_ENABLE_RAFT}>:OpenMP::OpenMP_CXX>)
-target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$<BOOL:${FAISS_ENABLE_RAFT}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
+target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass> $<$<BOOL:${FAISS_ENABLE_CUVS}>:OpenMP::OpenMP_CXX>)
+target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$<BOOL:${FAISS_ENABLE_CUVS}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
diff --git a/faiss/gpu/GpuCloner.cpp b/faiss/gpu/GpuCloner.cpp
index b6d55a47aa..f005bfd364 100644
--- a/faiss/gpu/GpuCloner.cpp
+++ b/faiss/gpu/GpuCloner.cpp
@@ -14,7 +14,7 @@
 
 #include <faiss/IndexBinaryFlat.h>
 #include <faiss/IndexFlat.h>
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 #include <faiss/IndexHNSW.h>
 #endif
 #include <faiss/IndexIVF.h>
@@ -27,7 +27,7 @@
 #include <faiss/MetaIndexes.h>
 #include <faiss/gpu/GpuIndex.h>
 #include <faiss/gpu/GpuIndexBinaryFlat.h>
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 #include <faiss/gpu/GpuIndexCagra.h>
 #endif
 #include <faiss/gpu/GpuIndexFlat.h>
@@ -92,7 +92,7 @@ Index* ToCPUCloner::clone_Index(const Index* index) {
         // (inverse op of ToGpuClonerMultiple)
 
     }
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     else if (auto icg = dynamic_cast<const GpuIndexCagra*>(index)) {
         IndexHNSWCagra* res = new IndexHNSWCagra();
         icg->copyTo(res);
@@ -138,7 +138,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         GpuIndexFlatConfig config;
         config.device = device;
         config.useFloat16 = useFloat16;
-        config.use_raft = use_raft;
+        config.use_cuvs = use_cuvs;
         return new GpuIndexFlat(provider, ifl, config);
     } else if (
             dynamic_cast<const IndexScalarQuantizer*>(index) &&
@@ -148,7 +148,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         config.device = device;
         config.useFloat16 = true;
         FAISS_THROW_IF_NOT_MSG(
-                !use_raft, "this type of index is not implemented for RAFT");
+                !use_cuvs, "this type of index is not implemented for CUVS");
         GpuIndexFlat* gif = new GpuIndexFlat(
                 provider, index->d, index->metric_type, config);
         // transfer data by blocks
@@ -166,7 +166,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         config.device = device;
         config.indicesOptions = indicesOptions;
         config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
-        config.use_raft = use_raft;
+        config.use_cuvs = use_cuvs;
         config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer;
 
         GpuIndexIVFFlat* res = new GpuIndexIVFFlat(
@@ -185,7 +185,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         config.indicesOptions = indicesOptions;
         config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
         FAISS_THROW_IF_NOT_MSG(
-                !use_raft, "this type of index is not implemented for RAFT");
+                !use_cuvs, "this type of index is not implemented for CUVS");
 
         GpuIndexIVFScalarQuantizer* res = new GpuIndexIVFScalarQuantizer(
                 provider,
@@ -218,8 +218,8 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
         config.useFloat16LookupTables = useFloat16;
         config.usePrecomputedTables = usePrecomputed;
-        config.use_raft = use_raft;
-        config.interleavedLayout = use_raft;
+        config.use_cuvs = use_cuvs;
+        config.interleavedLayout = use_cuvs;
         config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer;
 
         GpuIndexIVFPQ* res = new GpuIndexIVFPQ(provider, ipq, config);
@@ -230,7 +230,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
 
         return res;
     }
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     else if (auto icg = dynamic_cast<const faiss::IndexHNSWCagra*>(index)) {
         GpuIndexCagraConfig config;
         config.device = device;
@@ -541,7 +541,7 @@ faiss::IndexBinary* index_binary_cpu_to_gpu(
         GpuIndexBinaryFlatConfig config;
         config.device = device;
         if (options) {
-            config.use_raft = options->use_raft;
+            config.use_cuvs = options->use_cuvs;
         }
         return new GpuIndexBinaryFlat(provider, ii, config);
     } else {
diff --git a/faiss/gpu/GpuClonerOptions.h b/faiss/gpu/GpuClonerOptions.h
index e643e848fb..10bfa4e9ca 100644
--- a/faiss/gpu/GpuClonerOptions.h
+++ b/faiss/gpu/GpuClonerOptions.h
@@ -37,11 +37,11 @@ struct GpuClonerOptions {
     /// Set verbose options on the index
     bool verbose = false;
 
-    /// use the RAFT implementation
-#if defined USE_NVIDIA_RAFT
-    bool use_raft = true;
+    /// use the CUVS implementation
+#if defined USE_NVIDIA_RAPIDS
+    bool use_cuvs = true;
 #else
-    bool use_raft = false;
+    bool use_cuvs = false;
 #endif
 
     /// This flag controls the CPU fallback logic for coarse quantizer
diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu
index 38a62f03bb..1a51cf53e3 100644
--- a/faiss/gpu/GpuDistance.cu
+++ b/faiss/gpu/GpuDistance.cu
@@ -29,9 +29,12 @@
 #include <faiss/gpu/utils/ConversionOperators.cuh>
 #include <faiss/gpu/utils/CopyUtils.cuh>
 #include <faiss/gpu/utils/DeviceTensor.cuh>
+#include <optional>
 
-#if defined USE_NVIDIA_RAFT
-#include <faiss/gpu/utils/RaftUtils.h>
+#if defined USE_NVIDIA_RAPIDS
+#include <raft/neighbors/brute_force.cuh>
+#include <cuvs/neighbors/brute_force.hpp>
+#include <faiss/gpu/utils/CuvsUtils.h>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/device_resources.hpp>
 #include <raft/core/error.hpp>
@@ -39,19 +42,17 @@
 #include <raft/core/operators.hpp>
 #include <raft/core/temporary_device_buffer.hpp>
 #include <raft/linalg/unary_op.cuh>
-#include <raft/neighbors/brute_force.cuh>
-#define RAFT_NAME "raft"
+// #define RAFT_NAME "raft"
 #endif
 
 namespace faiss {
 namespace gpu {
 
-#if defined USE_NVIDIA_RAFT
-using namespace raft::distance;
-using namespace raft::neighbors;
-#endif
+// #if defined USE_NVIDIA_RAPIDS
+// using namespace cuvs::neighbors;
+// #endif
 
-bool should_use_raft(GpuDistanceParams args) {
+bool should_use_cuvs(GpuDistanceParams args) {
     cudaDeviceProp prop;
     int dev = args.device >= 0 ? args.device : getCurrentDevice();
     cudaGetDeviceProperties(&prop, dev);
@@ -59,7 +60,7 @@ bool should_use_raft(GpuDistanceParams args) {
     if (prop.major < 7)
         return false;
 
-    return args.use_raft;
+    return args.use_cuvs;
 }
 
 template <typename T>
@@ -237,10 +238,10 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
             "limitation: both vectorType and queryType must currently "
             "be the same (F32 or F16");
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     // Note: For now, RAFT bfknn requires queries and vectors to be same layout
-    if (should_use_raft(args) && args.queriesRowMajor == args.vectorsRowMajor) {
-        DistanceType distance = metricFaissToRaft(args.metric, false);
+    if (should_use_cuvs(args) && args.queriesRowMajor == args.vectorsRowMajor) {
+        cuvsDistanceType distance = metricFaissToCuvs(args.metric, false);
 
         auto resImpl = prov->getResources();
         auto res = resImpl.get();
@@ -299,10 +300,15 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
                         raft::vector_extent<int64_t>(num_queries));
                 norms_view = norms->view();
             }
-            raft::neighbors::brute_force::index idx(
+            cuvs::neighbors::brute_force::index idx(
                     handle, index.view(), norms_view, distance, metric_arg);
-            raft::neighbors::brute_force::search<float, idx_t>(
-                    handle, idx, search.view(), inds.view(), dists.view());
+            cuvs::neighbors::brute_force::search(
+                    handle,
+                    idx,
+                    search.view(),
+                    inds.view(),
+                    dists.view(),
+                    std::nullopt);
         } else {
             auto index = raft::make_readonly_temporary_device_buffer<
                     const float,
@@ -328,13 +334,13 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
                     raft::col_major>>
                     index_vec = {index.view()};
 
-            brute_force::knn(
+            raft::neighbors::brute_force::knn(
                     handle,
                     index_vec,
                     search.view(),
                     inds.view(),
                     dists.view(),
-                    distance,
+                    static_cast<raft::distance::DistanceType>(distance), // NOTE(review): relies on cuVS/RAFT metric enums sharing values - confirm
                     metric_arg);
         }
 
@@ -357,9 +363,9 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
         handle.sync_stream();
     } else
 #else
-    if (should_use_raft(args)) {
+    if (should_use_cuvs(args)) {
         FAISS_THROW_IF_NOT_MSG(
-                !should_use_raft(args),
+                !should_use_cuvs(args),
                 "RAFT has not been compiled into the current version so it cannot be used.");
     } else
 #endif
diff --git a/faiss/gpu/GpuDistance.h b/faiss/gpu/GpuDistance.h
index 17dbee617b..f55e813392 100644
--- a/faiss/gpu/GpuDistance.h
+++ b/faiss/gpu/GpuDistance.h
@@ -108,12 +108,12 @@ struct GpuDistanceParams {
 
     /// Should the index dispatch down to RAFT?
     /// TODO: change default to true if RAFT is enabled
-    bool use_raft = false;
+    bool use_cuvs = false;
 };
 
 /// A function that determines whether RAFT should be used based on various
 /// conditions (such as unsupported architecture)
-bool should_use_raft(GpuDistanceParams args);
+bool should_use_cuvs(GpuDistanceParams args);
 
 /// A wrapper for gpu/impl/Distance.cuh to expose direct brute-force k-nearest
 /// neighbor searches on an externally-provided region of memory (e.g., from a
diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu
index d667ae1494..d29741c111 100644
--- a/faiss/gpu/GpuIndex.cu
+++ b/faiss/gpu/GpuIndex.cu
@@ -42,14 +42,14 @@ constexpr idx_t kAddVecSize = (idx_t)512 * 1024;
 // FIXME: parameterize based on algorithm need
 constexpr idx_t kSearchVecSize = (idx_t)32 * 1024;
 
-bool should_use_raft(GpuIndexConfig config_) {
+bool should_use_cuvs(GpuIndexConfig config_) {
     cudaDeviceProp prop;
     cudaGetDeviceProperties(&prop, config_.device);
 
     if (prop.major < 7)
         return false;
 
-    return config_.use_raft;
+    return config_.use_cuvs;
 }
 
 GpuIndex::GpuIndex(
@@ -142,7 +142,7 @@ void GpuIndex::addPaged_(idx_t n, const float* x, const idx_t* ids) {
     if (n > 0) {
         idx_t totalSize = n * this->d * sizeof(float);
 
-        if (!should_use_raft(config_) &&
+        if (!should_use_cuvs(config_) &&
             (totalSize > kAddPageSize || n > kAddVecSize)) {
             // How many vectors fit into kAddPageSize?
             idx_t maxNumVecsForPageSize =
@@ -534,7 +534,7 @@ extern std::string gpu_compile_options;
 struct InitGpuCompileOptions {
     InitGpuCompileOptions() {
         gpu_compile_options = "GPU ";
-#ifdef USE_NVIDIA_RAFT
+#ifdef USE_NVIDIA_RAPIDS
         gpu_compile_options += "NVIDIA_RAFT ";
 #endif
     }
diff --git a/faiss/gpu/GpuIndex.h b/faiss/gpu/GpuIndex.h
index cc10f21589..4b73ba5051 100644
--- a/faiss/gpu/GpuIndex.h
+++ b/faiss/gpu/GpuIndex.h
@@ -38,16 +38,16 @@ struct GpuIndexConfig {
     MemorySpace memorySpace = MemorySpace::Device;
 
     /// Should the index dispatch down to RAFT?
-#if defined USE_NVIDIA_RAFT
-    bool use_raft = true;
+#if defined USE_NVIDIA_RAPIDS
+    bool use_cuvs = true;
 #else
-    bool use_raft = false;
+    bool use_cuvs = false;
 #endif
 };
 
 /// A centralized function that determines whether RAFT should
 /// be used based on various conditions (such as unsupported architecture)
-bool should_use_raft(GpuIndexConfig config_);
+bool should_use_cuvs(GpuIndexConfig config_);
 
 class GpuIndex : public faiss::Index {
    public:
diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index 4ae56df10d..c26e77e5e1 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -23,7 +23,7 @@
 #include <faiss/IndexHNSW.h>
 #include <faiss/gpu/GpuIndexCagra.h>
 #include <cstddef>
-#include <faiss/gpu/impl/RaftCagra.cuh>
+#include <faiss/gpu/impl/CuvsCagra.cuh>
 #include <optional>
 
 namespace faiss {
@@ -47,13 +47,13 @@ void GpuIndexCagra::train(idx_t n, const float* x) {
 
     FAISS_ASSERT(!index_);
 
-    std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_params =
+    std::optional<cuvs::neighbors::ivf_pq::index_params> ivf_pq_params =
             std::nullopt;
-    std::optional<raft::neighbors::ivf_pq::search_params> ivf_pq_search_params =
+    std::optional<cuvs::neighbors::ivf_pq::search_params> ivf_pq_search_params =
             std::nullopt;
-    if (cagraConfig_.ivf_pq_params != nullptr) {
+    if (cagraConfig_.ivf_pq_params != nullptr) {
         ivf_pq_params =
-                std::make_optional<raft::neighbors::ivf_pq::index_params>();
+                std::make_optional<cuvs::neighbors::ivf_pq::index_params>();
         ivf_pq_params->n_lists = cagraConfig_.ivf_pq_params->n_lists;
         ivf_pq_params->kmeans_n_iters =
                 cagraConfig_.ivf_pq_params->kmeans_n_iters;
@@ -62,7 +62,7 @@ void GpuIndexCagra::train(idx_t n, const float* x) {
         ivf_pq_params->pq_bits = cagraConfig_.ivf_pq_params->pq_bits;
         ivf_pq_params->pq_dim = cagraConfig_.ivf_pq_params->pq_dim;
         ivf_pq_params->codebook_kind =
-                static_cast<raft::neighbors::ivf_pq::codebook_gen>(
+                static_cast<cuvs::neighbors::ivf_pq::codebook_gen>(
                         cagraConfig_.ivf_pq_params->codebook_kind);
         ivf_pq_params->force_random_rotation =
                 cagraConfig_.ivf_pq_params->force_random_rotation;
@@ -71,7 +71,7 @@ void GpuIndexCagra::train(idx_t n, const float* x) {
     }
     if (cagraConfig_.ivf_pq_search_params != nullptr) {
         ivf_pq_search_params =
-                std::make_optional<raft::neighbors::ivf_pq::search_params>();
+                std::make_optional<cuvs::neighbors::ivf_pq::search_params>();
         ivf_pq_search_params->n_probes =
                 cagraConfig_.ivf_pq_search_params->n_probes;
         ivf_pq_search_params->lut_dtype =
@@ -79,7 +79,7 @@ void GpuIndexCagra::train(idx_t n, const float* x) {
         ivf_pq_search_params->preferred_shmem_carveout =
                 cagraConfig_.ivf_pq_search_params->preferred_shmem_carveout;
     }
-    index_ = std::make_shared<RaftCagra>(
+    index_ = std::make_shared<CuvsCagra>(
             this->resources_.get(),
             this->d,
             cagraConfig_.intermediate_graph_degree,
@@ -179,7 +179,7 @@ void GpuIndexCagra::copyFrom(const faiss::IndexHNSWCagra* index) {
         }
     }
 
-    index_ = std::make_shared<RaftCagra>(
+    index_ = std::make_shared<CuvsCagra>(
             this->resources_.get(),
             this->d,
             index->ntotal,
diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
index 6ecee3ae03..5a73f16ba9 100644
--- a/faiss/gpu/GpuIndexCagra.h
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -33,7 +33,7 @@ struct IndexHNSWCagra;
 namespace faiss {
 namespace gpu {
 
-class RaftCagra;
+class CuvsCagra;
 
 enum class graph_build_algo {
     /// Use IVF-PQ to build all-neighbors knn graph
@@ -275,7 +275,7 @@ struct GpuIndexCagra : public GpuIndex {
     const GpuIndexCagraConfig cagraConfig_;
 
     /// Instance that we own; contains the inverted lists
-    std::shared_ptr<RaftCagra> index_;
+    std::shared_ptr<CuvsCagra> index_;
 };
 
 } // namespace gpu
diff --git a/faiss/gpu/GpuIndexFlat.cu b/faiss/gpu/GpuIndexFlat.cu
index d361a7182a..7785fcd763 100644
--- a/faiss/gpu/GpuIndexFlat.cu
+++ b/faiss/gpu/GpuIndexFlat.cu
@@ -18,8 +18,8 @@
 #include <faiss/gpu/utils/Float16.cuh>
 #include <limits>
 
-#if defined USE_NVIDIA_RAFT
-#include <faiss/gpu/impl/RaftFlatIndex.cuh>
+#if defined USE_NVIDIA_RAPIDS
+#include <faiss/gpu/impl/CuvsFlatIndex.cuh>
 #endif
 
 namespace faiss {
@@ -93,17 +93,17 @@ GpuIndexFlat::GpuIndexFlat(
 GpuIndexFlat::~GpuIndexFlat() {}
 
 void GpuIndexFlat::resetIndex_(int dims) {
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 
-    if (should_use_raft(config_)) {
-        data_.reset(new RaftFlatIndex(
+    if (should_use_cuvs(config_)) {
+        data_.reset(new CuvsFlatIndex(
                 resources_.get(),
                 dims,
                 flatConfig_.useFloat16,
                 config_.memorySpace));
     } else
 #else
-    if (should_use_raft(config_)) {
+    if (should_use_cuvs(config_)) {
         FAISS_THROW_MSG(
                 "RAFT has not been compiled into the current version so it cannot be used.");
     } else
diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu
index 40129a54c5..1b0af9a990 100644
--- a/faiss/gpu/GpuIndexIVF.cu
+++ b/faiss/gpu/GpuIndexIVF.cu
@@ -92,7 +92,7 @@ void GpuIndexIVF::init_() {
         GpuIndexFlatConfig config = ivfConfig_.flatConfig;
         // inherit our same device
         config.device = config_.device;
-        config.use_raft = config_.use_raft;
+        config.use_cuvs = config_.use_cuvs;
 
         if (metric_type == faiss::METRIC_L2) {
             quantizer = new GpuIndexFlatL2(resources_, d, config);
diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu
index 884b5b0fc0..83c5f1dac3 100644
--- a/faiss/gpu/GpuIndexIVFFlat.cu
+++ b/faiss/gpu/GpuIndexIVFFlat.cu
@@ -15,10 +15,10 @@
 #include <faiss/gpu/utils/CopyUtils.cuh>
 #include <faiss/gpu/utils/Float16.cuh>
 
-#if defined USE_NVIDIA_RAFT
-#include <faiss/gpu/utils/RaftUtils.h>
-#include <faiss/gpu/impl/RaftIVFFlat.cuh>
-#include <raft/neighbors/ivf_flat.cuh>
+#if defined USE_NVIDIA_RAPIDS
+#include <cuvs/neighbors/ivf_flat.hpp>
+#include <faiss/gpu/utils/CuvsUtils.h>
+#include <faiss/gpu/impl/CuvsIVFFlat.cuh>
 #endif
 
 #include <limits>
@@ -73,7 +73,7 @@ GpuIndexIVFFlat::GpuIndexIVFFlat(
           ivfFlatConfig_(config),
           reserveMemoryVecs_(0) {
     FAISS_THROW_IF_NOT_MSG(
-            !should_use_raft(config),
+            !should_use_cuvs(config),
             "GpuIndexIVFFlat: RAFT does not support separate coarseQuantizer");
     // We could have been passed an already trained coarse quantizer. There is
     // no other quantizer that we need to train, so this is sufficient
@@ -100,9 +100,9 @@ GpuIndexIVFFlat::~GpuIndexIVFFlat() {}
 void GpuIndexIVFFlat::reserveMemory(size_t numVecs) {
     DeviceScope scope(config_.device);
 
-    if (should_use_raft(config_)) {
+    if (should_use_cuvs(config_)) {
         FAISS_THROW_MSG(
-                "Pre-allocation of IVF lists is not supported with RAFT enabled.");
+                "Pre-allocation of IVF lists is not supported with CUVS enabled.");
     }
 
     reserveMemoryVecs_ = numVecs;
@@ -121,7 +121,7 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) {
     index_.reset();
 
     // skip base class allocations if RAFT is enabled
-    if (!should_use_raft(config_)) {
+    if (!should_use_cuvs(config_)) {
         baseIndex_.reset();
     }
 
@@ -213,12 +213,12 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) {
 
     if (this->is_trained) {
         FAISS_ASSERT(index_);
-        if (should_use_raft(config_)) {
-            // if RAFT is enabled, copy the IVF centroids to the RAFT index in
-            // case it has been reset. This is because reset clears the RAFT
-            // index and its centroids.
+        if (should_use_cuvs(config_)) {
+            // copy the IVF centroids to the CUVS index
+            // in case it has been reset. This is because `reset` clears the
+            // CUVS index and its centroids.
             // TODO: change this once the coarse quantizer is separated from
-            // RAFT index
+            // CUVS index
             updateQuantizer();
         };
         return;
@@ -226,8 +226,8 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) {
 
     FAISS_ASSERT(!index_);
 
-    if (should_use_raft(config_)) {
-#if defined USE_NVIDIA_RAFT
+    if (should_use_cuvs(config_)) {
+#if defined USE_NVIDIA_RAPIDS
         setIndex_(
                 resources_.get(),
                 this->d,
@@ -242,27 +242,37 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) {
         const raft::device_resources& raft_handle =
                 resources_->getRaftHandleCurrentDevice();
 
-        raft::neighbors::ivf_flat::index_params raft_idx_params;
+        cuvs::neighbors::ivf_flat::index_params raft_idx_params;
         raft_idx_params.n_lists = nlist;
-        raft_idx_params.metric = metricFaissToRaft(metric_type, false);
+        raft_idx_params.metric = metricFaissToCuvs(metric_type, false);
         raft_idx_params.add_data_on_build = false;
         raft_idx_params.kmeans_trainset_fraction =
                 static_cast<double>(cp.max_points_per_centroid * nlist) /
                 static_cast<double>(n);
         raft_idx_params.kmeans_n_iters = cp.niter;
 
-        auto raftIndex_ =
-                std::static_pointer_cast<RaftIVFFlat, IVFFlat>(index_);
-
-        raft::neighbors::ivf_flat::index<float, idx_t> raft_ivfflat_index =
-                raft::neighbors::ivf_flat::build<float, idx_t>(
-                        raft_handle, raft_idx_params, x, n, (idx_t)d);
+        auto cuvsIndex_ =
+                std::static_pointer_cast<CuvsIVFFlat, IVFFlat>(index_);
+
+        std::optional<cuvs::neighbors::ivf_flat::index<float, idx_t>> cuvs_ivfflat_index;
+
+        if (getDeviceForAddress(x) >= 0) {
+            auto dataset_d =
+                    raft::make_device_matrix_view<const float, idx_t>(x, n, d);
+            cuvs_ivfflat_index = cuvs::neighbors::ivf_flat::build(
+                    raft_handle, raft_idx_params, dataset_d);
+        } else {
+            auto x_view =
+                    raft::make_host_matrix_view<const float, idx_t>(x, n, d);
+            cuvs_ivfflat_index = cuvs::neighbors::ivf_flat::build(
+                    raft_handle, raft_idx_params, x_view);
+        }
 
-        quantizer->train(nlist, raft_ivfflat_index.centers().data_handle());
-        quantizer->add(nlist, raft_ivfflat_index.centers().data_handle());
+        quantizer->train(nlist, cuvs_ivfflat_index.value().centers().data_handle());
+        quantizer->add(nlist, cuvs_ivfflat_index.value().centers().data_handle());
         raft_handle.sync_stream();
 
-        raftIndex_->setRaftIndex(std::move(raft_ivfflat_index));
+        cuvsIndex_->setCuvsIndex(std::move(cuvs_ivfflat_index.value()));
 #else
         FAISS_THROW_MSG(
                 "RAFT has not been compiled into the current version so it cannot be used.");
@@ -295,9 +305,9 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) {
     baseIndex_ = std::static_pointer_cast<IVFBase, IVFFlat>(index_);
 
     if (reserveMemoryVecs_) {
-        if (should_use_raft(config_)) {
+        if (should_use_cuvs(config_)) {
             FAISS_THROW_MSG(
-                    "Pre-allocation of IVF lists is not supported with RAFT enabled.");
+                    "Pre-allocation of IVF lists is not supported with CUVS enabled.");
         } else
             index_->reserveMemory(reserveMemoryVecs_);
     }
@@ -317,16 +327,16 @@ void GpuIndexIVFFlat::setIndex_(
         bool interleavedLayout,
         IndicesOptions indicesOptions,
         MemorySpace space) {
-    if (should_use_raft(config_)) {
-#if defined USE_NVIDIA_RAFT
+    if (should_use_cuvs(config_)) {
+#if defined USE_NVIDIA_RAPIDS
         FAISS_THROW_IF_NOT_MSG(
                 ivfFlatConfig_.indicesOptions == INDICES_64_BIT,
                 "RAFT only supports INDICES_64_BIT");
         if (!ivfFlatConfig_.interleavedLayout) {
             fprintf(stderr,
-                    "WARN: interleavedLayout is set to False with RAFT enabled. This will be ignored.\n");
+                    "WARN: interleavedLayout is set to False with CUVS enabled. This will be ignored.\n");
         }
-        index_.reset(new RaftIVFFlat(
+        index_.reset(new CuvsIVFFlat(
                 resources,
                 dim,
                 nlist,
diff --git a/faiss/gpu/GpuIndexIVFPQ.cu b/faiss/gpu/GpuIndexIVFPQ.cu
index d39f036b89..8d1fa504ce 100644
--- a/faiss/gpu/GpuIndexIVFPQ.cu
+++ b/faiss/gpu/GpuIndexIVFPQ.cu
@@ -15,11 +15,10 @@
 #include <faiss/gpu/impl/IVFPQ.cuh>
 #include <faiss/gpu/utils/CopyUtils.cuh>
 
-#if defined USE_NVIDIA_RAFT
-#include <faiss/gpu/utils/RaftUtils.h>
-#include <faiss/gpu/impl/RaftIVFPQ.cuh>
-#include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/ivf_pq_helpers.cuh>
+#if defined USE_NVIDIA_RAPIDS
+#include <cuvs/neighbors/ivf_pq.hpp>
+#include <faiss/gpu/utils/CuvsUtils.h>
+#include <faiss/gpu/impl/CuvsIVFPQ.cuh>
 #endif
 
 #include <limits>
@@ -95,7 +94,7 @@ GpuIndexIVFPQ::GpuIndexIVFPQ(
     this->is_trained = false;
 
     FAISS_THROW_IF_NOT_MSG(
-            !config.use_raft,
+            !config.use_cuvs,
             "GpuIndexIVFPQ: RAFT does not support separate coarseQuantizer");
 
     verifyPQSettings_();
@@ -113,7 +112,7 @@ void GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) {
     index_.reset();
 
     // skip base class allocations if RAFT is enabled
-    if (!should_use_raft(config_)) {
+    if (!should_use_cuvs(config_)) {
         baseIndex_.reset();
     }
 
@@ -323,7 +322,7 @@ void GpuIndexIVFPQ::trainResidualQuantizer_(idx_t n, const float* x) {
         try {
             GpuIndexFlatConfig config;
             config.device = ivfpqConfig_.device;
-            config.use_raft = false;
+            config.use_cuvs = false;
             GpuIndexFlatL2 pqIndex(resources_, pq.dsub, config);
 
             pq.assign_index = &pqIndex;
@@ -349,7 +348,7 @@ void GpuIndexIVFPQ::train(idx_t n, const float* x) {
 
     if (this->is_trained) {
         FAISS_ASSERT(index_);
-        if (should_use_raft(config_)) {
+        if (should_use_cuvs(config_)) {
             // if RAFT is enabled, copy the IVF centroids to the RAFT index in
             // case it has been reset. This is because reset clears the RAFT
             // index and its centroids.
@@ -364,11 +363,11 @@ void GpuIndexIVFPQ::train(idx_t n, const float* x) {
 
     // RAFT does not support using an external index for assignment. Fall back
     // to the classical GPU impl
-    if (should_use_raft(config_)) {
-#if defined USE_NVIDIA_RAFT
+    if (should_use_cuvs(config_)) {
+#if defined USE_NVIDIA_RAPIDS
         if (pq.assign_index) {
             fprintf(stderr,
-                    "WARN: The Product Quantizer's assign_index will be ignored with RAFT enabled.\n");
+                    "WARN: The Product Quantizer's assign_index will be ignored with CUVS enabled.\n");
         }
         // first initialize the index. The PQ centroids will be updated
         // retroactively.
@@ -390,41 +389,51 @@ void GpuIndexIVFPQ::train(idx_t n, const float* x) {
         const raft::device_resources& raft_handle =
                 resources_->getRaftHandleCurrentDevice();
 
-        raft::neighbors::ivf_pq::index_params raft_idx_params;
-        raft_idx_params.n_lists = nlist;
-        raft_idx_params.metric = metricFaissToRaft(metric_type, false);
-        raft_idx_params.kmeans_trainset_fraction =
+        cuvs::neighbors::ivf_pq::index_params cuvs_index_params;
+        cuvs_index_params.n_lists = nlist;
+        cuvs_index_params.metric = metricFaissToCuvs(metric_type, false);
+        cuvs_index_params.kmeans_trainset_fraction =
                 static_cast<double>(cp.max_points_per_centroid * nlist) /
                 static_cast<double>(n);
-        raft_idx_params.kmeans_n_iters = cp.niter;
-        raft_idx_params.pq_bits = bitsPerCode_;
-        raft_idx_params.pq_dim = subQuantizers_;
-        raft_idx_params.conservative_memory_allocation = false;
-        raft_idx_params.add_data_on_build = false;
-
-        auto raftIndex_ = std::static_pointer_cast<RaftIVFPQ, IVFPQ>(index_);
-
-        raft::neighbors::ivf_pq::index<idx_t> raft_ivfpq_index =
-                raft::neighbors::ivf_pq::build<float, idx_t>(
-                        raft_handle, raft_idx_params, x, n, (idx_t)d);
+        cuvs_index_params.kmeans_n_iters = cp.niter;
+        cuvs_index_params.pq_bits = bitsPerCode_;
+        cuvs_index_params.pq_dim = subQuantizers_;
+        cuvs_index_params.conservative_memory_allocation = false;
+        cuvs_index_params.add_data_on_build = false;
+
+        auto cuvsIndex_ = std::static_pointer_cast<CuvsIVFPQ, IVFPQ>(index_);
+        
+        std::optional<cuvs::neighbors::ivf_pq::index<idx_t>> cuvs_ivfpq_index;
+
+        if (getDeviceForAddress(x) >= 0) {
+            auto dataset_d =
+                    raft::make_device_matrix_view<const float, idx_t>(x, n, d);
+            cuvs_ivfpq_index = cuvs::neighbors::ivf_pq::build(
+                    raft_handle, cuvs_index_params, dataset_d);
+        } else {
+            auto dataset_h =
+                    raft::make_host_matrix_view<const float, idx_t>(x, n, d);
+            cuvs_ivfpq_index = cuvs::neighbors::ivf_pq::build(
+                    raft_handle, cuvs_index_params, dataset_h);
+        }
 
-        auto raft_centers = raft::make_device_matrix<float>(
+        auto cluster_centers = raft::make_device_matrix<float>(
                 raft_handle,
-                raft_ivfpq_index.n_lists(),
-                raft_ivfpq_index.dim());
-        raft::neighbors::ivf_pq::helpers::extract_centers(
-                raft_handle, raft_ivfpq_index, raft_centers.view());
+                cuvs_ivfpq_index.value().n_lists(),
+                cuvs_ivfpq_index.value().dim());
+        cuvs::neighbors::ivf_pq::helpers::extract_centers(
+                raft_handle, cuvs_ivfpq_index.value(), cluster_centers.view());
 
-        quantizer->train(nlist, raft_centers.data_handle());
-        quantizer->add(nlist, raft_centers.data_handle());
+        quantizer->train(nlist, cluster_centers.data_handle());
+        quantizer->add(nlist, cluster_centers.data_handle());
 
         raft::copy(
                 pq.get_centroids(0, 0),
-                raft_ivfpq_index.pq_centers().data_handle(),
-                raft_ivfpq_index.pq_centers().size(),
+                cuvs_ivfpq_index.value().pq_centers().data_handle(),
+                cuvs_ivfpq_index.value().pq_centers().size(),
                 raft_handle.get_stream());
         raft_handle.sync_stream();
-        raftIndex_->setRaftIndex(std::move(raft_ivfpq_index));
+        cuvsIndex_->setCuvsIndex(std::move(cuvs_ivfpq_index.value()));
 #else
         FAISS_THROW_MSG(
                 "RAFT has not been compiled into the current version so it cannot be used.");
@@ -484,9 +493,9 @@ void GpuIndexIVFPQ::setIndex_(
         float* pqCentroidData,
         IndicesOptions indicesOptions,
         MemorySpace space) {
-    if (should_use_raft(config_)) {
-#if defined USE_NVIDIA_RAFT
-        index_.reset(new RaftIVFPQ(
+    if (should_use_cuvs(config_)) {
+#if defined USE_NVIDIA_RAPIDS
+        index_.reset(new CuvsIVFPQ(
                 resources,
                 dim,
                 nlist,
@@ -529,10 +538,10 @@ void GpuIndexIVFPQ::verifyPQSettings_() const {
     FAISS_THROW_IF_NOT_MSG(nlist > 0, "nlist must be >0");
 
     // up to a single byte per code
-    if (should_use_raft(config_)) {
+    if (should_use_cuvs(config_)) {
         if (!ivfpqConfig_.interleavedLayout) {
             fprintf(stderr,
-                    "WARN: interleavedLayout is set to False with RAFT enabled. This will be ignored.\n");
+                    "WARN: interleavedLayout is set to False with CUVS enabled. This will be ignored.\n");
         }
         FAISS_THROW_IF_NOT_FMT(
                 bitsPerCode_ >= 4 && bitsPerCode_ <= 8,
@@ -567,7 +576,7 @@ void GpuIndexIVFPQ::verifyPQSettings_() const {
             "is not supported",
             subQuantizers_);
 
-    if (!should_use_raft(config_)) {
+    if (!should_use_cuvs(config_)) {
         // Sub-quantizers must evenly divide dimensions available
         FAISS_THROW_IF_NOT_FMT(
                 this->d % subQuantizers_ == 0,
diff --git a/faiss/gpu/GpuIndexIVFPQ.h b/faiss/gpu/GpuIndexIVFPQ.h
index 1084d4d0d2..ce3bfa6c1e 100644
--- a/faiss/gpu/GpuIndexIVFPQ.h
+++ b/faiss/gpu/GpuIndexIVFPQ.h
@@ -34,7 +34,7 @@ struct GpuIndexIVFPQConfig : public GpuIndexIVFConfig {
 
     /// Use the alternative memory layout for the IVF lists
     /// WARNING: this is a feature under development, and is only supported with
-    /// RAFT enabled for the index. Do not use if RAFT is not enabled.
+    /// CUVS enabled for the index. Do not use if CUVS is not enabled.
     bool interleavedLayout = false;
 
     /// Use GEMM-backed computation of PQ code distances for the no precomputed
diff --git a/faiss/gpu/GpuResources.cpp b/faiss/gpu/GpuResources.cpp
index 1ed3a6ddd5..428c756f20 100644
--- a/faiss/gpu/GpuResources.cpp
+++ b/faiss/gpu/GpuResources.cpp
@@ -168,7 +168,7 @@ cudaStream_t GpuResources::getDefaultStreamCurrentDevice() {
     return getDefaultStream(getCurrentDevice());
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 raft::device_resources& GpuResources::getRaftHandleCurrentDevice() {
     return getRaftHandle(getCurrentDevice());
 }
diff --git a/faiss/gpu/GpuResources.h b/faiss/gpu/GpuResources.h
index fc6dd591b4..6c4710e0bf 100644
--- a/faiss/gpu/GpuResources.h
+++ b/faiss/gpu/GpuResources.h
@@ -30,7 +30,7 @@
 #include <utility>
 #include <vector>
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 #include <raft/core/device_resources.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 #endif
@@ -161,7 +161,7 @@ struct AllocRequest : public AllocInfo {
     /// The size in bytes of the allocation
     size_t size = 0;
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     rmm::mr::device_memory_resource* mr = nullptr;
 #endif
 };
@@ -211,7 +211,7 @@ class GpuResources {
     /// given device
     virtual cudaStream_t getDefaultStream(int device) = 0;
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     /// Returns the raft handle for the given device which can be used to
     /// make calls to other raft primitives.
     virtual raft::device_resources& getRaftHandle(int device) = 0;
diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp
index 78336b4994..178face71f 100644
--- a/faiss/gpu/StandardGpuResources.cpp
+++ b/faiss/gpu/StandardGpuResources.cpp
@@ -20,7 +20,7 @@
  * limitations under the License.
  */
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 #include <raft/core/device_resources.hpp>
 #include <rmm/mr/device/managed_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
@@ -90,7 +90,7 @@ std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
 
 StandardGpuResourcesImpl::StandardGpuResourcesImpl()
         :
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
           mmr_(new rmm::mr::managed_memory_resource),
           pmr_(new rmm::mr::pinned_memory_resource),
 #endif
@@ -158,7 +158,7 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
     }
 
     if (pinnedMemAlloc_) {
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
         pmr_->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_);
 #else
         auto err = cudaFreeHost(pinnedMemAlloc_);
@@ -257,7 +257,7 @@ void StandardGpuResourcesImpl::setDefaultStream(
         if (prevStream != stream) {
             streamWait({stream}, {prevStream});
         }
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
         // delete the raft handle for this device, which will be initialized
         // with the updated stream during any subsequent calls to getRaftHandle
         auto it2 = raftHandles_.find(device);
@@ -283,7 +283,7 @@ void StandardGpuResourcesImpl::revertDefaultStream(int device) {
 
             streamWait({newStream}, {prevStream});
         }
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
         // delete the raft handle for this device, which will be initialized
         // with the updated stream during any subsequent calls to getRaftHandle
         auto it2 = raftHandles_.find(device);
@@ -323,7 +323,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
     // If this is the first device that we're initializing, create our
     // pinned memory allocation
     if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
         // If this is the first device that we're initializing, create our
         // pinned memory allocation
         if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
@@ -376,7 +376,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
 
     defaultStreams_[device] = defaultStream;
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     raftHandles_.emplace(std::make_pair(device, defaultStream));
 #endif
 
@@ -442,7 +442,7 @@ cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) {
     return defaultStreams_[device];
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 raft::device_resources& StandardGpuResourcesImpl::getRaftHandle(int device) {
     initializeForDevice(device);
 
@@ -513,7 +513,7 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
         // Otherwise, we can handle this locally
         p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
     } else if (adjReq.space == MemorySpace::Device) {
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
         try {
             rmm::mr::device_memory_resource* current_mr =
                     rmm::mr::get_per_device_resource(
@@ -547,7 +547,7 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
         }
 #endif
     } else if (adjReq.space == MemorySpace::Unified) {
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
         try {
             // for now, use our own managed MR to do Unified Memory allocations.
             // TODO: change this to use the current device resource once RMM has
@@ -616,7 +616,7 @@ void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
     } else if (
             req.space == MemorySpace::Device ||
             req.space == MemorySpace::Unified) {
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
         req.mr->deallocate_async(p, req.size, req.stream);
 #else
         auto err = cudaFree(p);
@@ -710,7 +710,7 @@ cudaStream_t StandardGpuResources::getDefaultStream(int device) {
     return res_->getDefaultStream(device);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 raft::device_resources& StandardGpuResources::getRaftHandle(int device) {
     return res_->getRaftHandle(device);
 }
diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h
index 661c784aee..7badad5290 100644
--- a/faiss/gpu/StandardGpuResources.h
+++ b/faiss/gpu/StandardGpuResources.h
@@ -22,7 +22,7 @@
 
 #pragma once
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 #include <raft/core/device_resources.hpp>
 #include <rmm/mr/host/pinned_memory_resource.hpp>
 #endif
@@ -79,7 +79,7 @@ class StandardGpuResourcesImpl : public GpuResources {
     /// this stream upon exit from an index or other Faiss GPU call.
     cudaStream_t getDefaultStream(int device) override;
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     /// Returns the raft handle for the given device which can be used to
     /// make calls to other raft primitives.
     raft::device_resources& getRaftHandle(int device) override;
@@ -151,7 +151,7 @@ class StandardGpuResourcesImpl : public GpuResources {
     /// cuBLAS handle for each device
     std::unordered_map<int, cublasHandle_t> blasHandles_;
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     /// raft handle for each device
     std::unordered_map<int, raft::device_resources> raftHandles_;
 
@@ -234,7 +234,7 @@ class StandardGpuResources : public GpuResourcesProvider {
     /// Returns the current default stream
     cudaStream_t getDefaultStream(int device);
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     /// Returns the raft handle for the given device which can be used to
     /// make calls to other raft primitives.
     raft::device_resources& getRaftHandle(int device);
diff --git a/faiss/gpu/impl/RaftCagra.cu b/faiss/gpu/impl/CuvsCagra.cu
similarity index 62%
rename from faiss/gpu/impl/RaftCagra.cu
rename to faiss/gpu/impl/CuvsCagra.cu
index 292079321d..72af0b9dd4 100644
--- a/faiss/gpu/impl/RaftCagra.cu
+++ b/faiss/gpu/impl/CuvsCagra.cu
@@ -23,19 +23,19 @@
 #include <faiss/gpu/utils/DeviceUtils.h>
 #include <cstddef>
 #include <cstdint>
-#include <faiss/gpu/impl/RaftCagra.cuh>
+#include <faiss/gpu/impl/CuvsCagra.cuh>
 
+#include <cuvs/neighbors/cagra.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/device_resources.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
 #include <raft_runtime/neighbors/cagra.hpp>
 #include <optional>
-#include <raft/neighbors/cagra.cuh>
 
 namespace faiss {
 namespace gpu {
 
-RaftCagra::RaftCagra(
+CuvsCagra::CuvsCagra(
         GpuResources* resources,
         int dim,
         idx_t intermediate_graph_degree,
@@ -45,8 +45,8 @@ RaftCagra::RaftCagra(
         faiss::MetricType metric,
         float metricArg,
         IndicesOptions indicesOptions,
-        std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_params,
-        std::optional<raft::neighbors::ivf_pq::search_params>
+        std::optional<cuvs::neighbors::ivf_pq::index_params> ivf_pq_params,
+        std::optional<cuvs::neighbors::ivf_pq::search_params>
                 ivf_pq_search_params)
         : resources_(resources),
           dim_(dim),
@@ -65,29 +65,29 @@ RaftCagra::RaftCagra(
     index_params_.intermediate_graph_degree = intermediate_graph_degree;
     index_params_.graph_degree = graph_degree;
     index_params_.build_algo =
-            static_cast<raft::neighbors::cagra::graph_build_algo>(
+            static_cast<cuvs::neighbors::cagra::graph_build_algo>(
                     graph_build_algo);
     index_params_.nn_descent_niter = nn_descent_niter;
 
     if (!ivf_pq_params_) {
         ivf_pq_params_ =
-                std::make_optional<raft::neighbors::ivf_pq::index_params>();
+                std::make_optional<cuvs::neighbors::ivf_pq::index_params>();
     }
     if (!ivf_pq_search_params_) {
         ivf_pq_search_params_ =
-                std::make_optional<raft::neighbors::ivf_pq::search_params>();
+                std::make_optional<cuvs::neighbors::ivf_pq::search_params>();
     }
     index_params_.metric = metric_ == faiss::METRIC_L2
-            ? raft::distance::DistanceType::L2Expanded
-            : raft::distance::DistanceType::InnerProduct;
+            ? cuvsDistanceType::L2Expanded
+            : cuvsDistanceType::InnerProduct;
     ivf_pq_params_->metric = metric_ == faiss::METRIC_L2
-            ? raft::distance::DistanceType::L2Expanded
-            : raft::distance::DistanceType::InnerProduct;
+            ? cuvsDistanceType::L2Expanded
+            : cuvsDistanceType::InnerProduct;
 
     reset();
 }
 
-RaftCagra::RaftCagra(
+CuvsCagra::CuvsCagra(
         GpuResources* resources,
         int dim,
         idx_t n,
@@ -118,7 +118,7 @@ RaftCagra::RaftCagra(
 
     if (distances_on_gpu && knn_graph_on_gpu) {
         raft_handle.sync_stream();
-        // Copying to host so that raft::neighbors::cagra::index
+        // Copying to host so that cuvs::neighbors::cagra::index
         // creates an owning copy of the knn graph on device
         auto knn_graph_copy =
                 raft::make_host_matrix<uint32_t, int64_t>(n, graph_degree);
@@ -131,11 +131,10 @@ RaftCagra::RaftCagra(
                 raft::make_device_matrix_view<const float, int64_t>(
                         distances, n, dim);
 
-        raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
+        cuvs_index = cuvs::neighbors::cagra::index<float, uint32_t>(
                 raft_handle,
-                metric_ == faiss::METRIC_L2
-                        ? raft::distance::DistanceType::L2Expanded
-                        : raft::distance::DistanceType::InnerProduct,
+                metric_ == faiss::METRIC_L2 ? cuvsDistanceType::L2Expanded
+                                            : cuvsDistanceType::InnerProduct,
                 distances_mds,
                 raft::make_const_mdspan(knn_graph_copy.view()));
     } else if (!distances_on_gpu && !knn_graph_on_gpu) {
@@ -150,11 +149,10 @@ RaftCagra::RaftCagra(
         auto distances_mds = raft::make_host_matrix_view<const float, int64_t>(
                 distances, n, dim);
 
-        raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
+        cuvs_index = cuvs::neighbors::cagra::index<float, uint32_t>(
                 raft_handle,
-                metric_ == faiss::METRIC_L2
-                        ? raft::distance::DistanceType::L2Expanded
-                        : raft::distance::DistanceType::InnerProduct,
+                metric_ == faiss::METRIC_L2 ? cuvsDistanceType::L2Expanded
+                                            : cuvsDistanceType::InnerProduct,
                 distances_mds,
                 raft::make_const_mdspan(knn_graph_copy.view()));
     } else {
@@ -163,86 +161,24 @@ RaftCagra::RaftCagra(
     }
 }
 
-void RaftCagra::train(idx_t n, const float* x) {
+void CuvsCagra::train(idx_t n, const float* x) {
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
-    if (index_params_.build_algo ==
-        raft::neighbors::cagra::graph_build_algo::IVF_PQ) {
-        std::optional<raft::host_matrix<uint32_t, int64_t>> knn_graph(
-                raft::make_host_matrix<uint32_t, int64_t>(
-                        n, index_params_.intermediate_graph_degree));
-        if (getDeviceForAddress(x) >= 0) {
-            auto dataset_d =
-                    raft::make_device_matrix_view<const float, int64_t>(
-                            x, n, dim_);
-            raft::neighbors::cagra::build_knn_graph(
-                    raft_handle,
-                    dataset_d,
-                    knn_graph->view(),
-                    1.0f,
-                    ivf_pq_params_,
-                    ivf_pq_search_params_);
-        } else {
-            auto dataset_h = raft::make_host_matrix_view<const float, int64_t>(
-                    x, n, dim_);
-            raft::neighbors::cagra::build_knn_graph(
-                    raft_handle,
-                    dataset_h,
-                    knn_graph->view(),
-                    1.0f,
-                    ivf_pq_params_,
-                    ivf_pq_search_params_);
-        }
-        auto cagra_graph = raft::make_host_matrix<uint32_t, int64_t>(
-                n, index_params_.graph_degree);
-
-        raft::neighbors::cagra::optimize<uint32_t>(
-                raft_handle, knn_graph->view(), cagra_graph.view());
-
-        // free intermediate graph before trying to create the index
-        knn_graph.reset();
-
-        if (getDeviceForAddress(x) >= 0) {
-            auto dataset_d =
-                    raft::make_device_matrix_view<const float, int64_t>(
-                            x, n, dim_);
-            raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
-                    raft_handle,
-                    metric_ == faiss::METRIC_L2
-                            ? raft::distance::DistanceType::L2Expanded
-                            : raft::distance::DistanceType::InnerProduct,
-                    dataset_d,
-                    raft::make_const_mdspan(cagra_graph.view()));
-        } else {
-            auto dataset_h = raft::make_host_matrix_view<const float, int64_t>(
-                    x, n, dim_);
-            raft_knn_index = raft::neighbors::cagra::index<float, uint32_t>(
-                    raft_handle,
-                    metric_ == faiss::METRIC_L2
-                            ? raft::distance::DistanceType::L2Expanded
-                            : raft::distance::DistanceType::InnerProduct,
-                    dataset_h,
-                    raft::make_const_mdspan(cagra_graph.view()));
-        }
-
+    if (getDeviceForAddress(x) >= 0) {
+        cuvs_index = raft::runtime::neighbors::cagra::build(
+                raft_handle,
+                index_params_,
+                raft::make_device_matrix_view<const float, int64_t>(
+                        x, n, dim_));
     } else {
-        if (getDeviceForAddress(x) >= 0) {
-            raft_knn_index = raft::runtime::neighbors::cagra::build(
-                    raft_handle,
-                    index_params_,
-                    raft::make_device_matrix_view<const float, int64_t>(
-                            x, n, dim_));
-        } else {
-            raft_knn_index = raft::runtime::neighbors::cagra::build(
-                    raft_handle,
-                    index_params_,
-                    raft::make_host_matrix_view<const float, int64_t>(
-                            x, n, dim_));
-        }
+        cuvs_index = raft::runtime::neighbors::cagra::build(
+                raft_handle,
+                index_params_,
+                raft::make_host_matrix_view<const float, int64_t>(x, n, dim_));
     }
 }
 
-void RaftCagra::search(
+void CuvsCagra::search(
         Tensor<float, 2, true>& queries,
         int k,
         Tensor<float, 2, true>& outDistances,
@@ -266,7 +202,7 @@ void RaftCagra::search(
     idx_t cols = queries.getSize(1);
     idx_t k_ = k;
 
-    FAISS_ASSERT(raft_knn_index.has_value());
+    FAISS_ASSERT(cuvs_index.has_value());
     FAISS_ASSERT(numQueries > 0);
     FAISS_ASSERT(cols == dim_);
 
@@ -277,18 +213,18 @@ void RaftCagra::search(
     auto indices_view = raft::make_device_matrix_view<idx_t, int64_t>(
             outIndices.data(), numQueries, k_);
 
-    raft::neighbors::cagra::search_params search_pams;
+    cuvs::neighbors::cagra::search_params search_pams;
     search_pams.max_queries = max_queries;
     search_pams.itopk_size = itopk_size;
     search_pams.max_iterations = max_iterations;
     search_pams.algo =
-            static_cast<raft::neighbors::cagra::search_algo>(graph_search_algo);
+            static_cast<cuvs::neighbors::cagra::search_algo>(graph_search_algo);
     search_pams.team_size = team_size;
     search_pams.search_width = search_width;
     search_pams.min_iterations = min_iterations;
     search_pams.thread_block_size = thread_block_size;
     search_pams.hashmap_mode =
-            static_cast<raft::neighbors::cagra::hash_mode>(hash_mode);
+            static_cast<cuvs::neighbors::cagra::hash_mode>(hash_mode);
     search_pams.hashmap_min_bitlen = hashmap_min_bitlen;
     search_pams.hashmap_max_fill_rate = hashmap_max_fill_rate;
     search_pams.num_random_samplings = num_random_samplings;
@@ -300,7 +236,7 @@ void RaftCagra::search(
     raft::runtime::neighbors::cagra::search(
             raft_handle,
             search_pams,
-            raft_knn_index.value(),
+            *cuvs_index,
             queries_view,
             indices_copy.view(),
             distances_view);
@@ -311,22 +247,22 @@ void RaftCagra::search(
             indices_view.data_handle());
 }
 
-void RaftCagra::reset() {
-    raft_knn_index.reset();
+void CuvsCagra::reset() {
+    cuvs_index.reset();
 }
 
-idx_t RaftCagra::get_knngraph_degree() const {
-    FAISS_ASSERT(raft_knn_index.has_value());
-    return static_cast<idx_t>(raft_knn_index.value().graph_degree());
+idx_t CuvsCagra::get_knngraph_degree() const {
+    FAISS_ASSERT(cuvs_index.has_value());
+    return static_cast<idx_t>(cuvs_index->graph_degree());
 }
 
-std::vector<idx_t> RaftCagra::get_knngraph() const {
-    FAISS_ASSERT(raft_knn_index.has_value());
+std::vector<idx_t> CuvsCagra::get_knngraph() const {
+    FAISS_ASSERT(cuvs_index.has_value());
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
     auto stream = raft_handle.get_stream();
 
-    auto device_graph = raft_knn_index.value().graph();
+    auto device_graph = cuvs_index->graph();
 
     std::vector<idx_t> host_graph(
             device_graph.extent(0) * device_graph.extent(1));
@@ -342,13 +278,13 @@ std::vector<idx_t> RaftCagra::get_knngraph() const {
     return host_graph;
 }
 
-std::vector<float> RaftCagra::get_training_dataset() const {
-    FAISS_ASSERT(raft_knn_index.has_value());
+std::vector<float> CuvsCagra::get_training_dataset() const {
+    FAISS_ASSERT(cuvs_index.has_value());
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
     auto stream = raft_handle.get_stream();
 
-    auto device_dataset = raft_knn_index.value().dataset();
+    auto device_dataset = cuvs_index->dataset();
 
     std::vector<float> host_dataset(
             device_dataset.extent(0) * device_dataset.extent(1));
diff --git a/faiss/gpu/impl/RaftCagra.cuh b/faiss/gpu/impl/CuvsCagra.cuh
similarity index 85%
rename from faiss/gpu/impl/RaftCagra.cuh
rename to faiss/gpu/impl/CuvsCagra.cuh
index 95f6c03fca..d4d8266673 100644
--- a/faiss/gpu/impl/RaftCagra.cuh
+++ b/faiss/gpu/impl/CuvsCagra.cuh
@@ -30,8 +30,8 @@
 
 #include <faiss/MetricType.h>
 
-#include <raft/neighbors/cagra_types.hpp>
-#include <raft/neighbors/ivf_pq_types.hpp>
+#include <cuvs/neighbors/cagra.hpp>
+#include <cuvs/neighbors/ivf_pq.hpp>
 
 namespace faiss {
 
@@ -44,9 +44,9 @@ enum class cagra_hash_mode { HASH, SMALL, AUTO };
 
 namespace gpu {
 
-class RaftCagra {
+class CuvsCagra {
    public:
-    RaftCagra(
+    CuvsCagra(
             GpuResources* resources,
             int dim,
             idx_t intermediate_graph_degree,
@@ -56,12 +56,12 @@ class RaftCagra {
             faiss::MetricType metric,
             float metricArg,
             IndicesOptions indicesOptions,
-            std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_params =
+            std::optional<cuvs::neighbors::ivf_pq::index_params> ivf_pq_params =
                     std::nullopt,
-            std::optional<raft::neighbors::ivf_pq::search_params>
+            std::optional<cuvs::neighbors::ivf_pq::search_params>
                     ivf_pq_search_params = std::nullopt);
 
-    RaftCagra(
+    CuvsCagra(
             GpuResources* resources,
             int dim,
             idx_t n,
@@ -72,7 +72,7 @@ class RaftCagra {
             float metricArg,
             IndicesOptions indicesOptions);
 
-    ~RaftCagra() = default;
+    ~CuvsCagra() = default;
 
     void train(idx_t n, const float* x);
 
@@ -117,15 +117,15 @@ class RaftCagra {
     float metricArg_;
 
     /// Parameters to build RAFT CAGRA index
-    raft::neighbors::cagra::index_params index_params_;
+    cuvs::neighbors::cagra::index_params index_params_;
 
     /// Parameters to build CAGRA graph using IVF PQ
-    std::optional<raft::neighbors::ivf_pq::index_params> ivf_pq_params_;
-    std::optional<raft::neighbors::ivf_pq::search_params> ivf_pq_search_params_;
+    std::optional<cuvs::neighbors::ivf_pq::index_params> ivf_pq_params_;
+    std::optional<cuvs::neighbors::ivf_pq::search_params> ivf_pq_search_params_;
 
     /// Instance of trained RAFT CAGRA index
-    std::optional<raft::neighbors::cagra::index<float, uint32_t>>
-            raft_knn_index{std::nullopt};
+    std::optional<cuvs::neighbors::cagra::index<float, uint32_t>>
+            cuvs_index{std::nullopt};
 };
 
 } // namespace gpu
diff --git a/faiss/gpu/impl/RaftFlatIndex.cu b/faiss/gpu/impl/CuvsFlatIndex.cu
similarity index 88%
rename from faiss/gpu/impl/RaftFlatIndex.cu
rename to faiss/gpu/impl/CuvsFlatIndex.cu
index 24a6d39604..da990f848f 100644
--- a/faiss/gpu/impl/RaftFlatIndex.cu
+++ b/faiss/gpu/impl/CuvsFlatIndex.cu
@@ -20,33 +20,34 @@
  * limitations under the License.
  */
 
-#include <faiss/gpu/utils/RaftUtils.h>
-#include <faiss/gpu/impl/RaftFlatIndex.cuh>
+#include <faiss/gpu/utils/CuvsUtils.h>
+#include <faiss/gpu/impl/CuvsFlatIndex.cuh>
 #include <faiss/gpu/utils/ConversionOperators.cuh>
 
 #include <vector>
 
+#include <cuvs/neighbors/brute_force.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance_types.hpp>
-#include <raft/neighbors/brute_force.cuh>
+#include <raft/linalg/unary_op.cuh>
 
-#define RAFT_NAME "raft"
+// #define RAFT_NAME "raft"
 
 namespace faiss {
 namespace gpu {
 
-using namespace raft::distance;
-using namespace raft::neighbors;
+using namespace cuvs::distance;
+using namespace cuvs::neighbors;
 
-RaftFlatIndex::RaftFlatIndex(
+CuvsFlatIndex::CuvsFlatIndex(
         GpuResources* res,
         int dim,
         bool useFloat16,
         MemorySpace space)
         : FlatIndex(res, dim, useFloat16, space) {}
 
-void RaftFlatIndex::query(
+void CuvsFlatIndex::query(
         Tensor<float, 2, true>& input,
         int k,
         faiss::MetricType metric,
@@ -91,16 +92,15 @@ void RaftFlatIndex::query(
                 outDistances.getSize(0),
                 outDistances.getSize(1));
 
-        DistanceType distance = metricFaissToRaft(metric, exactDistance);
+        cuvsDistanceType distance = metricFaissToCuvs(metric, exactDistance);
 
         std::optional<raft::device_vector_view<const float, int64_t>>
                 norms_view = raft::make_device_vector_view(
                         norms_.data(), norms_.getSize(0));
 
-        raft::neighbors::brute_force::index idx(
+        cuvs::neighbors::brute_force::index idx(
                 handle, index, norms_view, distance, metricArg);
-        raft::neighbors::brute_force::search<float, int64_t>(
-                handle, idx, search, inds, dists);
+        cuvs::neighbors::brute_force::search(handle, idx, search, inds, dists, std::nullopt);
 
         if (metric == MetricType::METRIC_Lp) {
             raft::linalg::unary_op(
@@ -120,7 +120,7 @@ void RaftFlatIndex::query(
     }
 }
 
-void RaftFlatIndex::query(
+void CuvsFlatIndex::query(
         Tensor<half, 2, true>& vecs,
         int k,
         faiss::MetricType metric,
diff --git a/faiss/gpu/impl/RaftFlatIndex.cuh b/faiss/gpu/impl/CuvsFlatIndex.cuh
similarity index 97%
rename from faiss/gpu/impl/RaftFlatIndex.cuh
rename to faiss/gpu/impl/CuvsFlatIndex.cuh
index d3823bbf58..225d369be0 100644
--- a/faiss/gpu/impl/RaftFlatIndex.cuh
+++ b/faiss/gpu/impl/CuvsFlatIndex.cuh
@@ -39,9 +39,9 @@ class GpuResources;
 /// the vectors in float32.
 /// If float16, we store the vectors in both float16 and float32, where float32
 /// data is possibly needed for certain residual operations
-class RaftFlatIndex : public FlatIndex {
+class CuvsFlatIndex : public FlatIndex {
    public:
-    RaftFlatIndex(
+    CuvsFlatIndex(
             GpuResources* res,
             int dim,
             bool useFloat16,
diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/CuvsIVFFlat.cu
similarity index 78%
rename from faiss/gpu/impl/RaftIVFFlat.cu
rename to faiss/gpu/impl/CuvsIVFFlat.cu
index 0906a60f46..364c09e0bc 100644
--- a/faiss/gpu/impl/RaftIVFFlat.cu
+++ b/faiss/gpu/impl/CuvsIVFFlat.cu
@@ -24,15 +24,17 @@
 #include <cstdint>
 
 #include <faiss/gpu/GpuIndexFlat.h>
-#include <faiss/gpu/utils/RaftUtils.h>
+#include <faiss/gpu/utils/CuvsUtils.h>
+#include <faiss/gpu/impl/CuvsIVFFlat.cuh>
 #include <faiss/gpu/impl/FlatIndex.cuh>
 #include <faiss/gpu/impl/IVFFlat.cuh>
-#include <faiss/gpu/impl/RaftIVFFlat.cuh>
 #include <faiss/gpu/utils/Transpose.cuh>
 
-#include <raft/neighbors/ivf_flat_codepacker.hpp>
-#include <raft/neighbors/ivf_flat.cuh>
-#include <raft/neighbors/ivf_flat_helpers.cuh>
+// #include <cuvs/neighbors/ivf_flat_codepacker.hpp>
+#include <cuvs/neighbors/ivf_flat.hpp>
+// #include <cuvs/neighbors/ivf_flat_helpers.cuh>
+#include <raft/linalg/map.cuh>
+#include <raft/linalg/norm.cuh>
 
 #include <limits>
 #include <memory>
@@ -40,7 +42,7 @@
 namespace faiss {
 namespace gpu {
 
-RaftIVFFlat::RaftIVFFlat(
+CuvsIVFFlat::CuvsIVFFlat(
         GpuResources* res,
         int dim,
         int nlist,
@@ -68,23 +70,23 @@ RaftIVFFlat::RaftIVFFlat(
             "only INDICES_64_BIT is supported for RAFT index");
 }
 
-RaftIVFFlat::~RaftIVFFlat() {}
+CuvsIVFFlat::~CuvsIVFFlat() {}
 
-void RaftIVFFlat::reserveMemory(idx_t numVecs) {
+void CuvsIVFFlat::reserveMemory(idx_t numVecs) {
     fprintf(stderr,
-            "WARN: reserveMemory is NOP. Pre-allocation of IVF lists is not supported with RAFT enabled.\n");
+            "WARN: reserveMemory is NOP. Pre-allocation of IVF lists is not supported with CUVS enabled.\n");
 }
 
-void RaftIVFFlat::reset() {
-    raft_knn_index.reset();
+void CuvsIVFFlat::reset() {
+    cuvs_index.reset();
 }
 
-void RaftIVFFlat::setRaftIndex(
-        raft::neighbors::ivf_flat::index<float, idx_t>&& idx) {
-    raft_knn_index.emplace(std::move(idx));
+void CuvsIVFFlat::setCuvsIndex(
+        cuvs::neighbors::ivf_flat::index<float, idx_t>&& idx) {
+    cuvs_index = std::make_shared<cuvs::neighbors::ivf_flat::index<float, idx_t>>(std::move(idx));
 }
 
-void RaftIVFFlat::search(
+void CuvsIVFFlat::search(
         Index* coarseQuantizer,
         Tensor<float, 2, true>& queries,
         int nprobe,
@@ -100,14 +102,14 @@ void RaftIVFFlat::search(
     uint32_t k_ = k;
 
     // Device is already set in GpuIndex::search
-    FAISS_ASSERT(raft_knn_index.has_value());
+    FAISS_ASSERT(cuvs_index);
     FAISS_ASSERT(numQueries > 0);
     FAISS_ASSERT(cols == dim_);
     FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= numLists_);
 
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
-    raft::neighbors::ivf_flat::search_params pams;
+    cuvs::neighbors::ivf_flat::search_params pams;
     pams.n_probes = nprobe;
 
     auto queries_view = raft::make_device_matrix_view<const float, idx_t>(
@@ -117,10 +119,10 @@ void RaftIVFFlat::search(
     auto out_dists_view = raft::make_device_matrix_view<float, idx_t>(
             outDistances.data(), (idx_t)numQueries, (idx_t)k_);
 
-    raft::neighbors::ivf_flat::search<float, idx_t>(
+    cuvs::neighbors::ivf_flat::search(
             raft_handle,
             pams,
-            raft_knn_index.value(),
+            *cuvs_index,
             queries_view,
             out_inds_view,
             out_dists_view);
@@ -157,7 +159,7 @@ void RaftIVFFlat::search(
             });
 }
 
-idx_t RaftIVFFlat::addVectors(
+idx_t CuvsIVFFlat::addVectors(
         Index* coarseQuantizer,
         Tensor<float, 2, true>& vecs,
         Tensor<idx_t, 1, true>& indices) {
@@ -165,7 +167,7 @@ idx_t RaftIVFFlat::addVectors(
     /// called updateQuantizer() to update the RAFT index if the quantizer was
     /// modified externally
 
-    FAISS_ASSERT(raft_knn_index.has_value());
+    FAISS_ASSERT(cuvs_index);
 
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
@@ -173,27 +175,30 @@ idx_t RaftIVFFlat::addVectors(
     /// Remove rows containing NaNs
     idx_t n_rows_valid = inplaceGatherFilteredRows(resources_, vecs, indices);
 
-    raft_knn_index.emplace(raft::neighbors::ivf_flat::extend(
-            raft_handle,
-            raft::make_device_matrix_view<const float, idx_t>(
-                    vecs.data(), n_rows_valid, dim_),
-            std::make_optional<raft::device_vector_view<const idx_t, idx_t>>(
-                    raft::make_device_vector_view<const idx_t, idx_t>(
-                            indices.data(), n_rows_valid)),
-            raft_knn_index.value()));
+    cuvs_index = std::make_shared<
+            cuvs::neighbors::ivf_flat::index<float, idx_t>>(
+            cuvs::neighbors::ivf_flat::extend(
+                    raft_handle,
+                    raft::make_device_matrix_view<const float, idx_t>(
+                            vecs.data(), n_rows_valid, dim_),
+                    std::make_optional<
+                            raft::device_vector_view<const idx_t, idx_t>>(
+                            raft::make_device_vector_view<const idx_t, idx_t>(
+                                    indices.data(), n_rows_valid)),
+                    *cuvs_index));
 
     return n_rows_valid;
 }
 
-idx_t RaftIVFFlat::getListLength(idx_t listId) const {
-    FAISS_ASSERT(raft_knn_index.has_value());
+idx_t CuvsIVFFlat::getListLength(idx_t listId) const {
+    FAISS_ASSERT(cuvs_index);
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
 
     uint32_t size;
     raft::update_host(
             &size,
-            raft_knn_index.value().list_sizes().data_handle() + listId,
+            cuvs_index->list_sizes().data_handle() + listId,
             1,
             raft_handle.get_stream());
     raft_handle.sync_stream();
@@ -202,8 +207,8 @@ idx_t RaftIVFFlat::getListLength(idx_t listId) const {
 }
 
 /// Return the list indices of a particular list back to the CPU
-std::vector<idx_t> RaftIVFFlat::getListIndices(idx_t listId) const {
-    FAISS_ASSERT(raft_knn_index.has_value());
+std::vector<idx_t> CuvsIVFFlat::getListIndices(idx_t listId) const {
+    FAISS_ASSERT(cuvs_index);
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
     auto stream = raft_handle.get_stream();
@@ -217,9 +222,7 @@ std::vector<idx_t> RaftIVFFlat::getListIndices(idx_t listId) const {
 
     raft::update_host(
             &list_indices_ptr,
-            const_cast<idx_t**>(
-                    raft_knn_index.value().inds_ptrs().data_handle()) +
-                    listId,
+            const_cast<idx_t**>(cuvs_index->inds_ptrs().data_handle()) + listId,
             1,
             stream);
     raft_handle.sync_stream();
@@ -231,13 +234,13 @@ std::vector<idx_t> RaftIVFFlat::getListIndices(idx_t listId) const {
 }
 
 /// Return the encoded vectors of a particular list back to the CPU
-std::vector<uint8_t> RaftIVFFlat::getListVectorData(
+std::vector<uint8_t> CuvsIVFFlat::getListVectorData(
         idx_t listId,
         bool gpuFormat) const {
     if (gpuFormat) {
         FAISS_THROW_MSG("gpuFormat should be false for RAFT indices");
     }
-    FAISS_ASSERT(raft_knn_index.has_value());
+    FAISS_ASSERT(cuvs_index);
 
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
@@ -258,7 +261,7 @@ std::vector<uint8_t> RaftIVFFlat::getListVectorData(
     // fetch the list data ptr on host
     raft::update_host(
             &list_data_ptr,
-            raft_knn_index.value().data_ptrs().data_handle() + listId,
+            cuvs_index->data_ptrs().data_handle() + listId,
             1,
             stream);
     raft_handle.sync_stream();
@@ -270,15 +273,15 @@ std::vector<uint8_t> RaftIVFFlat::getListVectorData(
             stream);
     raft_handle.sync_stream();
 
-    RaftIVFFlatCodePackerInterleaved packer(
-            (size_t)listSize, dim_, raft_knn_index.value().veclen());
+    CuvsIVFFlatCodePackerInterleaved packer(
+            (size_t)listSize, dim_, cuvs_index->veclen());
     packer.unpack_all(interleaved_codes.data(), flat_codes.data());
     return flat_codes;
 }
 
 /// Performs search when we are already given the IVF cells to look at
 /// (GpuIndexIVF::search_preassigned implementation)
-void RaftIVFFlat::searchPreassigned(
+void CuvsIVFFlat::searchPreassigned(
         Index* coarseQuantizer,
         Tensor<float, 2, true>& vecs,
         Tensor<float, 2, true>& ivfDistances,
@@ -290,7 +293,7 @@ void RaftIVFFlat::searchPreassigned(
     // TODO: Fill this in!
 }
 
-void RaftIVFFlat::updateQuantizer(Index* quantizer) {
+void CuvsIVFFlat::updateQuantizer(Index* quantizer) {
     FAISS_THROW_IF_NOT(quantizer->is_trained);
 
     // Must match our basic IVF parameters
@@ -303,14 +306,14 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) {
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
 
-    raft::neighbors::ivf_flat::index_params pams;
+    cuvs::neighbors::ivf_flat::index_params pams;
     pams.add_data_on_build = false;
-    pams.metric = metricFaissToRaft(metric_, false);
+    pams.metric = metricFaissToCuvs(metric_, false);
     pams.n_lists = numLists_;
-    raft_knn_index.emplace(raft_handle, pams, static_cast<uint32_t>(dim_));
+    cuvs_index = std::make_shared<cuvs::neighbors::ivf_flat::index<float, idx_t>>(raft_handle, pams, static_cast<uint32_t>(dim_));
 
-    raft::neighbors::ivf_flat::helpers::reset_index(
-            raft_handle, &raft_knn_index.value());
+    cuvs::neighbors::ivf_flat::helpers::reset_index(
+            raft_handle, cuvs_index.get());
 
     // If the index instance is a GpuIndexFlat, then we can use direct access to
     // the centroids within.
@@ -329,7 +332,7 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) {
             gpuData->reconstruct(0, gpuData->getSize(), centroids);
 
             raft::update_device(
-                    raft_knn_index.value().centers().data_handle(),
+                    cuvs_index->centers().data_handle(),
                     centroids.data(),
                     total_elems,
                     stream);
@@ -338,7 +341,7 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) {
             auto centroids = gpuData->getVectorsFloat32Ref();
 
             raft::update_device(
-                    raft_knn_index.value().centers().data_handle(),
+                    cuvs_index->centers().data_handle(),
                     centroids.data(),
                     total_elems,
                     stream);
@@ -351,14 +354,14 @@ void RaftIVFFlat::updateQuantizer(Index* quantizer) {
         quantizer->reconstruct_n(0, quantizer->ntotal, vecs.data());
 
         raft::update_device(
-                raft_knn_index.value().centers().data_handle(),
+                cuvs_index->centers().data_handle(),
                 vecs.data(),
                 total_elems,
                 stream);
     }
 }
 
-void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) {
+void CuvsIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) {
     size_t nlist = ivf ? ivf->nlist : 0;
     size_t ntotal = ivf ? ivf->compute_ntotal() : 0;
 
@@ -369,12 +372,12 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) {
     std::vector<idx_t> indices_(ntotal);
 
     // the index must already exist
-    FAISS_ASSERT(raft_knn_index.has_value());
+    FAISS_ASSERT(cuvs_index);
 
-    auto& raft_lists = raft_knn_index.value().lists();
+    auto& raft_lists = cuvs_index->lists();
 
     // conservative memory alloc for cloning cpu inverted lists
-    raft::neighbors::ivf_flat::list_spec<uint32_t, float, idx_t> raft_list_spec{
+    cuvs::neighbors::ivf_flat::list_spec<uint32_t, float, idx_t> raft_list_spec{
             static_cast<uint32_t>(dim_), true};
 
     for (size_t i = 0; i < nlist; ++i) {
@@ -394,7 +397,7 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) {
         // This RAFT list must currently be empty
         FAISS_ASSERT(getListLength(i) == 0);
 
-        raft::neighbors::ivf::resize_list(
+        cuvs::neighbors::ivf::resize_list(
                 raft_handle,
                 raft_lists[i],
                 raft_list_spec,
@@ -403,8 +406,8 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) {
     }
 
     // Update the pointers and the sizes
-    raft::neighbors::ivf_flat::helpers::recompute_internal_state(
-            raft_handle, &(raft_knn_index.value()));
+    cuvs::neighbors::ivf_flat::helpers::recompute_internal_state(
+            raft_handle, cuvs_index.get());
 
     for (size_t i = 0; i < nlist; ++i) {
         size_t listSize = ivf->list_size(i);
@@ -413,18 +416,18 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) {
     }
 
     raft::update_device(
-            raft_knn_index.value().list_sizes().data_handle(),
+            cuvs_index->list_sizes().data_handle(),
             list_sizes_.data(),
             nlist,
             raft_handle.get_stream());
 
     // Precompute the centers vector norms for L2Expanded distance
     if (this->metric_ == faiss::METRIC_L2) {
-        raft_knn_index.value().allocate_center_norms(raft_handle);
+        cuvs_index->allocate_center_norms(raft_handle);
         raft::linalg::rowNorm(
-                raft_knn_index.value().center_norms().value().data_handle(),
-                raft_knn_index.value().centers().data_handle(),
-                raft_knn_index.value().dim(),
+                cuvs_index->center_norms().value().data_handle(),
+                cuvs_index->centers().data_handle(),
+                cuvs_index->dim(),
                 (uint32_t)nlist,
                 raft::linalg::L2Norm,
                 true,
@@ -432,7 +435,7 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) {
     }
 }
 
-size_t RaftIVFFlat::getGpuVectorsEncodingSize_(idx_t numVecs) const {
+size_t CuvsIVFFlat::getGpuVectorsEncodingSize_(idx_t numVecs) const {
     idx_t bits = 32 /* float */;
 
     // bytes to encode a block of 32 vectors (single dimension)
@@ -443,13 +446,13 @@ size_t RaftIVFFlat::getGpuVectorsEncodingSize_(idx_t numVecs) const {
 
     // number of blocks of 32 vectors we have
     idx_t numBlocks =
-            utils::divUp(numVecs, raft::neighbors::ivf_flat::kIndexGroupSize);
+            utils::divUp(numVecs, cuvs::neighbors::ivf_flat::kIndexGroupSize);
 
     // total size to encode numVecs
     return bytesPerBlock * numBlocks;
 }
 
-void RaftIVFFlat::addEncodedVectorsToList_(
+void CuvsIVFFlat::addEncodedVectorsToList_(
         idx_t listId,
         const void* codes,
         const idx_t* indices,
@@ -469,8 +472,8 @@ void RaftIVFFlat::addEncodedVectorsToList_(
     FAISS_ASSERT(gpuListSizeInBytes <= (size_t)std::numeric_limits<int>::max());
 
     std::vector<uint8_t> interleaved_codes(gpuListSizeInBytes);
-    RaftIVFFlatCodePackerInterleaved packer(
-            (size_t)numVecs, (uint32_t)dim_, raft_knn_index.value().veclen());
+    CuvsIVFFlatCodePackerInterleaved packer(
+            (size_t)numVecs, (uint32_t)dim_, cuvs_index->veclen());
 
     packer.pack_all(
             reinterpret_cast<const uint8_t*>(codes), interleaved_codes.data());
@@ -482,7 +485,7 @@ void RaftIVFFlat::addEncodedVectorsToList_(
     /// fetch the list data ptr on host
     raft::update_host(
             &list_data_ptr,
-            raft_knn_index.value().data_ptrs().data_handle() + listId,
+            cuvs_index->data_ptrs().data_handle() + listId,
             1,
             stream);
     raft_handle.sync_stream();
@@ -499,7 +502,7 @@ void RaftIVFFlat::addEncodedVectorsToList_(
     // fetch the list indices ptr on host
     raft::update_host(
             &list_indices_ptr,
-            raft_knn_index.value().inds_ptrs().data_handle() + listId,
+            cuvs_index->inds_ptrs().data_handle() + listId,
             1,
             stream);
     raft_handle.sync_stream();
@@ -507,7 +510,7 @@ void RaftIVFFlat::addEncodedVectorsToList_(
     raft::update_device(list_indices_ptr, indices, numVecs, stream);
 }
 
-RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved(
+CuvsIVFFlatCodePackerInterleaved::CuvsIVFFlatCodePackerInterleaved(
         size_t list_size,
         uint32_t dim,
         uint32_t chunk_size) {
@@ -518,14 +521,14 @@ RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved(
     nvec = list_size;
     code_size = dim * 4;
     block_size =
-            utils::roundUp(nvec, raft::neighbors::ivf_flat::kIndexGroupSize);
+            utils::roundUp(nvec, cuvs::neighbors::ivf_flat::kIndexGroupSize);
 }
 
-void RaftIVFFlatCodePackerInterleaved::pack_1(
+void CuvsIVFFlatCodePackerInterleaved::pack_1(
         const uint8_t* flat_code,
         size_t offset,
         uint8_t* block) const {
-    raft::neighbors::ivf_flat::codepacker::pack_1(
+    cuvs::neighbors::ivf_flat::helpers::codepacker::pack_1(
             reinterpret_cast<const uint32_t*>(flat_code),
             reinterpret_cast<uint32_t*>(block),
             dim,
@@ -533,11 +536,11 @@ void RaftIVFFlatCodePackerInterleaved::pack_1(
             static_cast<uint32_t>(offset));
 }
 
-void RaftIVFFlatCodePackerInterleaved::unpack_1(
+void CuvsIVFFlatCodePackerInterleaved::unpack_1(
         const uint8_t* block,
         size_t offset,
         uint8_t* flat_code) const {
-    raft::neighbors::ivf_flat::codepacker::unpack_1(
+    cuvs::neighbors::ivf_flat::helpers::codepacker::unpack_1(
             reinterpret_cast<const uint32_t*>(block),
             reinterpret_cast<uint32_t*>(flat_code),
             dim,
diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/CuvsIVFFlat.cuh
similarity index 91%
rename from faiss/gpu/impl/RaftIVFFlat.cuh
rename to faiss/gpu/impl/CuvsIVFFlat.cuh
index 4f8c89ecb0..e0e34a7dbb 100644
--- a/faiss/gpu/impl/RaftIVFFlat.cuh
+++ b/faiss/gpu/impl/CuvsIVFFlat.cuh
@@ -26,7 +26,7 @@
 #include <faiss/gpu/impl/GpuScalarQuantizer.cuh>
 #include <faiss/gpu/impl/IVFFlat.cuh>
 
-#include <raft/neighbors/ivf_flat.cuh>
+#include <cuvs/neighbors/ivf_flat.hpp>
 
 #include <optional>
 
@@ -34,9 +34,9 @@
 namespace faiss {
 namespace gpu {
 
-class RaftIVFFlat : public IVFFlat {
+class CuvsIVFFlat : public IVFFlat {
    public:
-    RaftIVFFlat(
+    CuvsIVFFlat(
             GpuResources* resources,
             int dim,
             int nlist,
@@ -49,7 +49,7 @@ class RaftIVFFlat : public IVFFlat {
             IndicesOptions indicesOptions,
             MemorySpace space);
 
-    ~RaftIVFFlat() override;
+    ~CuvsIVFFlat() override;
 
     /// Reserve GPU memory in our inverted lists for this number of vectors
     void reserveMemory(idx_t numVecs) override;
@@ -106,8 +106,8 @@ class RaftIVFFlat : public IVFFlat {
     /// Copy all inverted lists from a CPU representation to ourselves
     void copyInvertedListsFrom(const InvertedLists* ivf) override;
 
-    /// Replace the Raft index
-    void setRaftIndex(raft::neighbors::ivf_flat::index<float, idx_t>&& idx);
+    /// Replace the CUVS index
+    void setCuvsIndex(cuvs::neighbors::ivf_flat::index<float, idx_t>&& idx);
 
    private:
     /// Adds a set of codes and indices to a list, with the representation
@@ -126,12 +126,12 @@ class RaftIVFFlat : public IVFFlat {
     /// this is the size for an entire IVF list
     size_t getGpuVectorsEncodingSize_(idx_t numVecs) const override;
 
-    std::optional<raft::neighbors::ivf_flat::index<float, idx_t>>
-            raft_knn_index{std::nullopt};
+    std::shared_ptr<cuvs::neighbors::ivf_flat::index<float, idx_t>> cuvs_index{
+            nullptr};
 };
 
-struct RaftIVFFlatCodePackerInterleaved : CodePacker {
-    RaftIVFFlatCodePackerInterleaved(
+struct CuvsIVFFlatCodePackerInterleaved : CodePacker {
+    CuvsIVFFlatCodePackerInterleaved(
             size_t list_size,
             uint32_t dim,
             uint32_t chuk_size);
diff --git a/faiss/gpu/impl/RaftIVFPQ.cu b/faiss/gpu/impl/CuvsIVFPQ.cu
similarity index 81%
rename from faiss/gpu/impl/RaftIVFPQ.cu
rename to faiss/gpu/impl/CuvsIVFPQ.cu
index 3a2a0a4218..2f3c3e089f 100644
--- a/faiss/gpu/impl/RaftIVFPQ.cu
+++ b/faiss/gpu/impl/CuvsIVFPQ.cu
@@ -21,13 +21,13 @@
  */
 
 #include <faiss/gpu/GpuIndexFlat.h>
-#include <faiss/gpu/utils/RaftUtils.h>
+#include <faiss/gpu/utils/CuvsUtils.h>
 #include <faiss/gpu/impl/FlatIndex.cuh>
-#include <faiss/gpu/impl/RaftIVFPQ.cuh>
+#include <faiss/gpu/impl/CuvsIVFPQ.cuh>
 #include <faiss/gpu/utils/Transpose.cuh>
 
-#include <raft/neighbors/ivf_pq.cuh>
-#include <raft/neighbors/ivf_pq_helpers.cuh>
+#include <cuvs/neighbors/ivf_pq.hpp>
+#include <raft/linalg/map.cuh>
 
 #include <limits>
 #include <memory>
@@ -35,7 +35,7 @@
 namespace faiss {
 namespace gpu {
 
-RaftIVFPQ::RaftIVFPQ(
+CuvsIVFPQ::CuvsIVFPQ(
         GpuResources* resources,
         int dim,
         idx_t nlist,
@@ -69,34 +69,34 @@ RaftIVFPQ::RaftIVFPQ(
             "only INDICES_64_BIT is supported for RAFT index");
 }
 
-RaftIVFPQ::~RaftIVFPQ() {}
+CuvsIVFPQ::~CuvsIVFPQ() {}
 
-void RaftIVFPQ::reserveMemory(idx_t numVecs) {
+void CuvsIVFPQ::reserveMemory(idx_t numVecs) {
     fprintf(stderr,
-            "WARN: reserveMemory is NOP. Pre-allocation of IVF lists is not supported with RAFT enabled.\n");
+            "WARN: reserveMemory is NOP. Pre-allocation of IVF lists is not supported with CUVS enabled.\n");
 }
 
-void RaftIVFPQ::reset() {
-    raft_knn_index.reset();
+void CuvsIVFPQ::reset() {
+    cuvs_index.reset();
 }
 
-size_t RaftIVFPQ::reclaimMemory() {
+size_t CuvsIVFPQ::reclaimMemory() {
     fprintf(stderr,
-            "WARN: reclaimMemory is NOP. reclaimMemory is not supported with RAFT enabled.\n");
+            "WARN: reclaimMemory is NOP. reclaimMemory is not supported with CUVS enabled.\n");
     return 0;
 }
 
-void RaftIVFPQ::setPrecomputedCodes(Index* quantizer, bool enable) {}
+void CuvsIVFPQ::setPrecomputedCodes(Index* quantizer, bool enable) {}
 
-idx_t RaftIVFPQ::getListLength(idx_t listId) const {
-    FAISS_ASSERT(raft_knn_index.has_value());
+idx_t CuvsIVFPQ::getListLength(idx_t listId) const {
+    FAISS_ASSERT(cuvs_index);
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
 
     uint32_t size;
     raft::update_host(
             &size,
-            raft_knn_index.value().list_sizes().data_handle() + listId,
+            cuvs_index->list_sizes().data_handle() + listId,
             1,
             raft_handle.get_stream());
     raft_handle.sync_stream();
@@ -104,7 +104,7 @@ idx_t RaftIVFPQ::getListLength(idx_t listId) const {
     return static_cast<int>(size);
 }
 
-void RaftIVFPQ::updateQuantizer(Index* quantizer) {
+void CuvsIVFPQ::updateQuantizer(Index* quantizer) {
     FAISS_THROW_IF_NOT(quantizer->is_trained);
 
     // Must match our basic IVF parameters
@@ -115,18 +115,18 @@ void RaftIVFPQ::updateQuantizer(Index* quantizer) {
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
 
-    raft::neighbors::ivf_pq::index_params pams;
-    pams.metric = metricFaissToRaft(metric_, false);
-    pams.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_SUBSPACE;
+    cuvs::neighbors::ivf_pq::index_params pams;
+    pams.metric = metricFaissToCuvs(metric_, false);
+    pams.codebook_kind = cuvs::neighbors::ivf_pq::codebook_gen::PER_SUBSPACE;
     pams.n_lists = numLists_;
     pams.pq_bits = bitsPerSubQuantizer_;
     pams.pq_dim = numSubQuantizers_;
-    raft_knn_index.emplace(raft_handle, pams, static_cast<uint32_t>(dim_));
+    cuvs_index = std::make_shared<decltype(cuvs_index)::element_type>(raft_handle, pams, static_cast<uint32_t>(dim_));
 
-    raft::neighbors::ivf_pq::helpers::reset_index(
-            raft_handle, &raft_knn_index.value());
-    raft::neighbors::ivf_pq::helpers::make_rotation_matrix(
-            raft_handle, &(raft_knn_index.value()), false);
+    cuvs::neighbors::ivf_pq::helpers::reset_index(
+            raft_handle, cuvs_index.get());
+    cuvs::neighbors::ivf_pq::helpers::make_rotation_matrix(
+            raft_handle, cuvs_index.get(), false);
 
     // If the index instance is a GpuIndexFlat, then we can use direct access to
     // the centroids within.
@@ -145,9 +145,9 @@ void RaftIVFPQ::updateQuantizer(Index* quantizer) {
             // as float32 and store locally
             gpuData->reconstruct(0, gpuData->getSize(), centroids);
 
-            raft::neighbors::ivf_pq::helpers::set_centers(
+            cuvs::neighbors::ivf_pq::helpers::set_centers(
                     raft_handle,
-                    &(raft_knn_index.value()),
+                    cuvs_index.get(),
                     raft::make_device_matrix_view<float, uint32_t>(
                             centroids.data(), numLists_, dim_));
         } else {
@@ -156,9 +156,9 @@ void RaftIVFPQ::updateQuantizer(Index* quantizer) {
             // reference it
             auto centroids = gpuData->getVectorsFloat32Ref();
 
-            raft::neighbors::ivf_pq::helpers::set_centers(
+            cuvs::neighbors::ivf_pq::helpers::set_centers(
                     raft_handle,
-                    &(raft_knn_index.value()),
+                    cuvs_index.get(),
                     raft::make_device_matrix_view<float, uint32_t>(
                             centroids.data(), numLists_, dim_));
         }
@@ -176,9 +176,9 @@ void RaftIVFPQ::updateQuantizer(Index* quantizer) {
 
         centroids.copyFrom(vecs, stream);
 
-        raft::neighbors::ivf_pq::helpers::set_centers(
+        cuvs::neighbors::ivf_pq::helpers::set_centers(
                 raft_handle,
-                &(raft_knn_index.value()),
+                cuvs_index.get(),
                 raft::make_device_matrix_view<float, uint32_t>(
                         centroids.data(), numLists_, dim_));
     }
@@ -187,8 +187,8 @@ void RaftIVFPQ::updateQuantizer(Index* quantizer) {
 }
 
 /// Return the list indices of a particular list back to the CPU
-std::vector<idx_t> RaftIVFPQ::getListIndices(idx_t listId) const {
-    FAISS_ASSERT(raft_knn_index.has_value());
+std::vector<idx_t> CuvsIVFPQ::getListIndices(idx_t listId) const {
+    FAISS_ASSERT(cuvs_index);
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
     auto stream = raft_handle.get_stream();
@@ -203,7 +203,7 @@ std::vector<idx_t> RaftIVFPQ::getListIndices(idx_t listId) const {
     raft::update_host(
             &list_indices_ptr,
             const_cast<idx_t**>(
-                    raft_knn_index.value().inds_ptrs().data_handle()) +
+                    cuvs_index->inds_ptrs().data_handle()) +
                     listId,
             1,
             stream);
@@ -217,7 +217,7 @@ std::vector<idx_t> RaftIVFPQ::getListIndices(idx_t listId) const {
 
 /// Performs search when we are already given the IVF cells to look at
 /// (GpuIndexIVF::search_preassigned implementation)
-void RaftIVFPQ::searchPreassigned(
+void CuvsIVFPQ::searchPreassigned(
         Index* coarseQuantizer,
         Tensor<float, 2, true>& vecs,
         Tensor<float, 2, true>& ivfDistances,
@@ -229,19 +229,19 @@ void RaftIVFPQ::searchPreassigned(
     // TODO: Fill this in!
 }
 
-size_t RaftIVFPQ::getGpuListEncodingSize_(idx_t listId) {
+size_t CuvsIVFPQ::getGpuListEncodingSize_(idx_t listId) {
     return static_cast<size_t>(
-            raft_knn_index.value().get_list_size_in_bytes(listId));
+            cuvs_index->get_list_size_in_bytes(listId));
 }
 
 /// Return the encoded vectors of a particular list back to the CPU
-std::vector<uint8_t> RaftIVFPQ::getListVectorData(idx_t listId, bool gpuFormat)
+std::vector<uint8_t> CuvsIVFPQ::getListVectorData(idx_t listId, bool gpuFormat)
         const {
     if (gpuFormat) {
         FAISS_THROW_MSG(
                 "gpuFormat should be false for RAFT indices. Unpacked codes are flat.");
     }
-    FAISS_ASSERT(raft_knn_index.has_value());
+    FAISS_ASSERT(cuvs_index);
 
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
@@ -264,9 +264,9 @@ std::vector<uint8_t> RaftIVFPQ::getListVectorData(idx_t listId, bool gpuFormat)
         auto codes_d = raft::make_device_vector<uint8_t>(
                 raft_handle, static_cast<uint32_t>(bufferSize));
 
-        raft::neighbors::ivf_pq::helpers::unpack_contiguous_list_data(
+        cuvs::neighbors::ivf_pq::helpers::unpack_contiguous_list_data(
                 raft_handle,
-                raft_knn_index.value(),
+                *cuvs_index,
                 codes_d.data_handle(),
                 batchSize,
                 listId,
@@ -286,7 +286,7 @@ std::vector<uint8_t> RaftIVFPQ::getListVectorData(idx_t listId, bool gpuFormat)
 
 /// Find the approximate k nearest neighbors for `queries` against
 /// our database
-void RaftIVFPQ::search(
+void CuvsIVFPQ::search(
         Index* coarseQuantizer,
         Tensor<float, 2, true>& queries,
         int nprobe,
@@ -295,17 +295,17 @@ void RaftIVFPQ::search(
         Tensor<idx_t, 2, true>& outIndices) {
     uint32_t numQueries = queries.getSize(0);
     uint32_t cols = queries.getSize(1);
-    idx_t k_ = std::min(static_cast<idx_t>(k), raft_knn_index.value().size());
+    idx_t k_ = std::min(static_cast<idx_t>(k), cuvs_index->size());
 
     // Device is already set in GpuIndex::search
-    FAISS_ASSERT(raft_knn_index.has_value());
+    FAISS_ASSERT(cuvs_index);
     FAISS_ASSERT(numQueries > 0);
     FAISS_ASSERT(cols == dim_);
     FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= numLists_);
 
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
-    raft::neighbors::ivf_pq::search_params pams;
+    cuvs::neighbors::ivf_pq::search_params pams;
     pams.n_probes = nprobe;
     pams.lut_dtype = useFloat16LookupTables_ ? CUDA_R_16F : CUDA_R_32F;
 
@@ -316,10 +316,10 @@ void RaftIVFPQ::search(
     auto out_dists_view = raft::make_device_matrix_view<float, idx_t>(
             outDistances.data(), (idx_t)numQueries, (idx_t)k_);
 
-    raft::neighbors::ivf_pq::search<float, idx_t>(
+    cuvs::neighbors::ivf_pq::search(
             raft_handle,
             pams,
-            raft_knn_index.value(),
+            *cuvs_index,
             queries_view,
             out_inds_view,
             out_dists_view);
@@ -357,7 +357,7 @@ void RaftIVFPQ::search(
     raft_handle.sync_stream();
 }
 
-idx_t RaftIVFPQ::addVectors(
+idx_t CuvsIVFPQ::addVectors(
         Index* coarseQuantizer,
         Tensor<float, 2, true>& vecs,
         Tensor<idx_t, 1, true>& indices) {
@@ -365,7 +365,7 @@ idx_t RaftIVFPQ::addVectors(
     /// called updateQuantizer() to update the RAFT index if the quantizer was
     /// modified externally
 
-    FAISS_ASSERT(raft_knn_index.has_value());
+    FAISS_ASSERT(cuvs_index);
 
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
@@ -373,19 +373,19 @@ idx_t RaftIVFPQ::addVectors(
     /// Remove rows containing NaNs
     idx_t n_rows_valid = inplaceGatherFilteredRows(resources_, vecs, indices);
 
-    raft_knn_index.emplace(raft::neighbors::ivf_pq::extend(
+    cuvs::neighbors::ivf_pq::extend(
             raft_handle,
             raft::make_device_matrix_view<const float, idx_t>(
                     vecs.data(), n_rows_valid, dim_),
             std::make_optional<raft::device_vector_view<const idx_t, idx_t>>(
                     raft::make_device_vector_view<const idx_t, idx_t>(
                             indices.data(), n_rows_valid)),
-            raft_knn_index.value()));
+            cuvs_index.get());
 
     return n_rows_valid;
 }
 
-void RaftIVFPQ::copyInvertedListsFrom(const InvertedLists* ivf) {
+void CuvsIVFPQ::copyInvertedListsFrom(const InvertedLists* ivf) {
     size_t nlist = ivf ? ivf->nlist : 0;
     size_t ntotal = ivf ? ivf->compute_ntotal() : 0;
 
@@ -396,12 +396,12 @@ void RaftIVFPQ::copyInvertedListsFrom(const InvertedLists* ivf) {
     std::vector<idx_t> indices_(ntotal);
 
     // the index must already exist
-    FAISS_ASSERT(raft_knn_index.has_value());
+    FAISS_ASSERT(cuvs_index);
 
-    auto& raft_lists = raft_knn_index.value().lists();
+    auto& raft_lists = cuvs_index->lists();
 
     // conservative memory alloc for cloning cpu inverted lists
-    raft::neighbors::ivf_pq::list_spec<uint32_t, idx_t> raft_list_spec{
+    cuvs::neighbors::ivf_pq::list_spec<uint32_t, idx_t> raft_list_spec{
             static_cast<uint32_t>(bitsPerSubQuantizer_),
             static_cast<uint32_t>(numSubQuantizers_),
             true};
@@ -423,7 +423,7 @@ void RaftIVFPQ::copyInvertedListsFrom(const InvertedLists* ivf) {
         // This RAFT list must currently be empty
         FAISS_ASSERT(getListLength(i) == 0);
 
-        raft::neighbors::ivf::resize_list(
+        cuvs::neighbors::ivf::resize_list(
                 raft_handle,
                 raft_lists[i],
                 raft_list_spec,
@@ -432,14 +432,14 @@ void RaftIVFPQ::copyInvertedListsFrom(const InvertedLists* ivf) {
     }
 
     raft::update_device(
-            raft_knn_index.value().list_sizes().data_handle(),
+            cuvs_index->list_sizes().data_handle(),
             list_sizes_.data(),
             nlist,
             raft_handle.get_stream());
 
     //     Update the pointers and the sizes
-    raft::neighbors::ivf_pq::helpers::recompute_internal_state(
-            raft_handle, &(raft_knn_index.value()));
+    cuvs::neighbors::ivf_pq::helpers::recompute_internal_state(
+            raft_handle, cuvs_index.get());
 
     for (size_t i = 0; i < nlist; ++i) {
         size_t listSize = ivf->list_size(i);
@@ -448,12 +448,12 @@ void RaftIVFPQ::copyInvertedListsFrom(const InvertedLists* ivf) {
     }
 }
 
-void RaftIVFPQ::setRaftIndex(raft::neighbors::ivf_pq::index<idx_t>&& idx) {
-    raft_knn_index.emplace(std::move(idx));
+void CuvsIVFPQ::setCuvsIndex(cuvs::neighbors::ivf_pq::index<idx_t>&& idx) {
+    cuvs_index = std::make_shared<cuvs::neighbors::ivf_pq::index<idx_t>>(std::move(idx));
     setBasePQCentroids_();
 }
 
-void RaftIVFPQ::addEncodedVectorsToList_(
+void CuvsIVFPQ::addEncodedVectorsToList_(
         idx_t listId,
         const void* codes,
         const idx_t* indices,
@@ -489,9 +489,9 @@ void RaftIVFPQ::addEncodedVectorsToList_(
                 bufferSize,
                 stream);
 
-        raft::neighbors::ivf_pq::helpers::pack_contiguous_list_data(
+        cuvs::neighbors::ivf_pq::helpers::pack_contiguous_list_data(
                 raft_handle,
-                &(raft_knn_index.value()),
+                cuvs_index.get(),
                 codes_d.data_handle(),
                 batchSize,
                 listId,
@@ -504,7 +504,7 @@ void RaftIVFPQ::addEncodedVectorsToList_(
     // fetch the list indices ptr on host
     raft::update_host(
             &list_indices_ptr,
-            raft_knn_index.value().inds_ptrs().data_handle() + listId,
+            cuvs_index->inds_ptrs().data_handle() + listId,
             1,
             stream);
     raft_handle.sync_stream();
@@ -512,23 +512,23 @@ void RaftIVFPQ::addEncodedVectorsToList_(
     raft::update_device(list_indices_ptr, indices, numVecs, stream);
 }
 
-void RaftIVFPQ::setPQCentroids_() {
+void CuvsIVFPQ::setPQCentroids_() {
     auto stream = resources_->getDefaultStreamCurrentDevice();
 
     raft::copy(
-            raft_knn_index.value().pq_centers().data_handle(),
+            cuvs_index->pq_centers().data_handle(),
             pqCentroidsInnermostCode_.data(),
             pqCentroidsInnermostCode_.numElements(),
             stream);
 }
 
-void RaftIVFPQ::setBasePQCentroids_() {
+void CuvsIVFPQ::setBasePQCentroids_() {
     auto stream = resources_->getDefaultStreamCurrentDevice();
 
     raft::copy(
             pqCentroidsInnermostCode_.data(),
-            raft_knn_index.value().pq_centers().data_handle(),
-            raft_knn_index.value().pq_centers().size(),
+            cuvs_index->pq_centers().data_handle(),
+            cuvs_index->pq_centers().size(),
             stream);
 
     DeviceTensor<float, 3, true> pqCentroidsMiddleCode(
diff --git a/faiss/gpu/impl/RaftIVFPQ.cuh b/faiss/gpu/impl/CuvsIVFPQ.cuh
similarity index 93%
rename from faiss/gpu/impl/RaftIVFPQ.cuh
rename to faiss/gpu/impl/CuvsIVFPQ.cuh
index a79db3c40d..c5c4cf64e5 100644
--- a/faiss/gpu/impl/RaftIVFPQ.cuh
+++ b/faiss/gpu/impl/CuvsIVFPQ.cuh
@@ -25,17 +25,18 @@
 #include <faiss/gpu/impl/GpuScalarQuantizer.cuh>
 #include <faiss/gpu/impl/IVFPQ.cuh>
 
-#include <raft/neighbors/ivf_pq.cuh>
+#include <cuvs/neighbors/ivf_pq.hpp>
 
+#include <memory>
 #include <optional>
 
 #pragma GCC visibility push(default)
 namespace faiss {
 namespace gpu {
 /// Implementing class for IVFPQ on the GPU
-class RaftIVFPQ : public IVFPQ {
+class CuvsIVFPQ : public IVFPQ {
    public:
-    RaftIVFPQ(
+    CuvsIVFPQ(
             GpuResources* resources,
             int dim,
             idx_t nlist,
@@ -50,7 +51,7 @@ class RaftIVFPQ : public IVFPQ {
             IndicesOptions indicesOptions,
             MemorySpace space);
 
-    ~RaftIVFPQ() override;
+    ~CuvsIVFPQ() override;
 
     /// Reserve GPU memory in our inverted lists for this number of vectors
     void reserveMemory(idx_t numVecs) override;
@@ -100,7 +101,7 @@ class RaftIVFPQ : public IVFPQ {
     void copyInvertedListsFrom(const InvertedLists* ivf) override;
 
     /// Replace the Raft index
-    void setRaftIndex(raft::neighbors::ivf_pq::index<idx_t>&& idx);
+    void setCuvsIndex(cuvs::neighbors::ivf_pq::index<idx_t>&& idx);
 
     /// Classify and encode/add vectors to our IVF lists.
     /// The input data must be on our current device.
@@ -140,9 +141,8 @@ class RaftIVFPQ : public IVFPQ {
     /// Used when the RAFT index was updated externally.
     void setBasePQCentroids_();
 
-    /// optional around the Raft IVF-PQ index
-    std::optional<raft::neighbors::ivf_pq::index<idx_t>> raft_knn_index{
-            std::nullopt};
+    /// CUVS IVF-PQ index
+    std::shared_ptr<cuvs::neighbors::ivf_pq::index<idx_t>> cuvs_index{nullptr};
 };
 
 } // namespace gpu
diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt
index 60f78ef74f..b2b4c22831 100644
--- a/faiss/gpu/test/CMakeLists.txt
+++ b/faiss/gpu/test/CMakeLists.txt
@@ -22,7 +22,7 @@ find_package(CUDAToolkit REQUIRED)
 # Defines `gtest_discover_tests()`.
 include(GoogleTest)
 add_library(faiss_gpu_test_helper TestUtils.cpp)
-target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled>)
+target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft>)
 
 macro(faiss_gpu_test file)
   get_filename_component(test_name ${file} NAME_WE)
@@ -41,7 +41,7 @@ faiss_gpu_test(TestGpuIndexIVFPQ.cpp)
 faiss_gpu_test(TestGpuIndexIVFScalarQuantizer.cpp)
 faiss_gpu_test(TestGpuDistance.cu)
 faiss_gpu_test(TestGpuSelect.cu)
-if(FAISS_ENABLE_RAFT)
+if(FAISS_ENABLE_CUVS)
   faiss_gpu_test(TestGpuIndexCagra.cu)
 endif()
 
diff --git a/faiss/gpu/test/TestGpuDistance.cu b/faiss/gpu/test/TestGpuDistance.cu
index 3c59cc1a5f..b30dc6a4ee 100644
--- a/faiss/gpu/test/TestGpuDistance.cu
+++ b/faiss/gpu/test/TestGpuDistance.cu
@@ -48,7 +48,7 @@ void evaluate_bfknn(
     bfKnn(res, args);
 
     std::stringstream str;
-    str << "using raft " << args.use_raft << "metric " << metric
+    str << "using cuvs " << args.use_cuvs << " metric " << metric
         << " colMajorVecs " << colMajorVecs << " colMajorQueries "
         << colMajorQueries;
 
@@ -72,7 +72,7 @@ void testTransposition(
         bool colMajorVecs,
         bool colMajorQueries,
         faiss::MetricType metric,
-        bool use_raft = false,
+        bool use_cuvs = false,
         float metricArg = 0) {
     using namespace faiss::gpu;
 
@@ -168,11 +168,11 @@ void testTransposition(
     args.outIndices = gpuIndices.data();
     args.device = device;
 
-#if defined USE_NVIDIA_RAFT
-    args.use_raft = use_raft;
+#if defined USE_NVIDIA_RAPIDS
+    args.use_cuvs = use_cuvs;
 #else
     FAISS_THROW_IF_NOT_MSG(
-            !use_raft,
+            !use_cuvs,
             "RAFT has not been compiled into the current version so it cannot be used.");
 #endif
 
@@ -196,7 +196,7 @@ TEST(TestGpuDistance, Transposition_RR) {
     testTransposition(false, false, faiss::MetricType::METRIC_INNER_PRODUCT);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 TEST(TestRaftGpuDistance, Transposition_RR) {
     testTransposition(false, false, faiss::MetricType::METRIC_L2, true);
     testTransposition(
@@ -208,7 +208,7 @@ TEST(TestGpuDistance, Transposition_RC) {
     testTransposition(false, true, faiss::MetricType::METRIC_L2);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 TEST(TestRaftGpuDistance, Transposition_RC) {
     testTransposition(false, true, faiss::MetricType::METRIC_L2, true);
 }
@@ -218,7 +218,7 @@ TEST(TestGpuDistance, Transposition_CR) {
     testTransposition(true, false, faiss::MetricType::METRIC_L2);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 TEST(TestRaftGpuDistance, Transposition_CR) {
     testTransposition(true, false, faiss::MetricType::METRIC_L2, true);
 }
@@ -228,7 +228,7 @@ TEST(TestGpuDistance, Transposition_CC) {
     testTransposition(true, true, faiss::MetricType::METRIC_L2);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 TEST(TestRaftGpuDistance, Transposition_CC) {
     testTransposition(true, true, faiss::MetricType::METRIC_L2, true);
 }
@@ -238,7 +238,7 @@ TEST(TestGpuDistance, L1) {
     testTransposition(false, false, faiss::MetricType::METRIC_L1);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 TEST(TestRaftGpuDistance, L1) {
     testTransposition(false, false, faiss::MetricType::METRIC_L1, true);
 }
@@ -249,7 +249,7 @@ TEST(TestGpuDistance, L1_RC) {
     testTransposition(false, true, faiss::MetricType::METRIC_L1);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 // Test other transpositions with the general distance kernel
 TEST(TestRaftGpuDistance, L1_RC) {
     testTransposition(false, true, faiss::MetricType::METRIC_L1, true);
@@ -260,7 +260,7 @@ TEST(TestGpuDistance, L1_CR) {
     testTransposition(true, false, faiss::MetricType::METRIC_L1);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 TEST(TestRaftGpuDistance, L1_CR) {
     testTransposition(true, false, faiss::MetricType::METRIC_L1, true);
 }
@@ -270,7 +270,7 @@ TEST(TestGpuDistance, L1_CC) {
     testTransposition(true, true, faiss::MetricType::METRIC_L1);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 TEST(TestRaftGpuDistance, L1_CC) {
     testTransposition(true, true, faiss::MetricType::METRIC_L1, true);
 }
@@ -281,7 +281,7 @@ TEST(TestGpuDistance, Linf) {
     testTransposition(false, false, faiss::MetricType::METRIC_Linf);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 // Test remainder of metric types
 TEST(TestRaftGpuDistance, Linf) {
     testTransposition(false, false, faiss::MetricType::METRIC_Linf, true);
@@ -292,7 +292,7 @@ TEST(TestGpuDistance, Lp) {
     testTransposition(false, false, faiss::MetricType::METRIC_Lp, false, 3);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 TEST(TestRaftGpuDistance, Lp) {
     testTransposition(false, false, faiss::MetricType::METRIC_Lp, true, 3);
 }
@@ -302,7 +302,7 @@ TEST(TestGpuDistance, Canberra) {
     testTransposition(false, false, faiss::MetricType::METRIC_Canberra);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 TEST(TestRaftGpuDistance, Canberra) {
     testTransposition(false, false, faiss::MetricType::METRIC_Canberra, true);
 }
@@ -316,7 +316,7 @@ TEST(TestGpuDistance, JensenShannon) {
     testTransposition(false, false, faiss::MetricType::METRIC_JensenShannon);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 TEST(TestRaftGpuDistance, JensenShannon) {
     testTransposition(
             false, false, faiss::MetricType::METRIC_JensenShannon, true);
diff --git a/faiss/gpu/test/TestGpuIndexFlat.cpp b/faiss/gpu/test/TestGpuIndexFlat.cpp
index 06b860ded4..c4ae04282c 100644
--- a/faiss/gpu/test/TestGpuIndexFlat.cpp
+++ b/faiss/gpu/test/TestGpuIndexFlat.cpp
@@ -29,7 +29,7 @@ struct TestFlatOptions {
               numQueriesOverride(-1),
               kOverride(-1),
               dimOverride(-1),
-              use_raft(false) {}
+              use_cuvs(false) {}
 
     faiss::MetricType metric;
     float metricArg;
@@ -39,7 +39,7 @@ struct TestFlatOptions {
     int numQueriesOverride;
     int kOverride;
     int dimOverride;
-    bool use_raft;
+    bool use_cuvs;
 };
 
 void testFlat(const TestFlatOptions& opt) {
@@ -75,7 +75,7 @@ void testFlat(const TestFlatOptions& opt) {
     faiss::gpu::GpuIndexFlatConfig config;
     config.device = device;
     config.useFloat16 = opt.useFloat16;
-    config.use_raft = opt.use_raft;
+    config.use_cuvs = opt.use_cuvs;
 
     faiss::gpu::GpuIndexFlat gpuIndex(&res, dim, opt.metric, config);
     gpuIndex.metric_arg = opt.metricArg;
@@ -114,8 +114,8 @@ TEST(TestGpuIndexFlat, IP_Float32) {
 
         testFlat(opt);
 
-#if defined USE_NVIDIA_RAFT
-        opt.use_raft = true;
+#if defined USE_NVIDIA_RAPIDS
+        opt.use_cuvs = true;
         testFlat(opt);
 #endif
     }
@@ -128,8 +128,8 @@ TEST(TestGpuIndexFlat, L1_Float32) {
 
     testFlat(opt);
 
-#if defined USE_NVIDIA_RAFT
-    opt.use_raft = true;
+#if defined USE_NVIDIA_RAPIDS
+    opt.use_cuvs = true;
     testFlat(opt);
 #endif
 }
@@ -141,8 +141,8 @@ TEST(TestGpuIndexFlat, Lp_Float32) {
     opt.useFloat16 = false;
 
     testFlat(opt);
-#if defined USE_NVIDIA_RAFT
-    opt.use_raft = true;
+#if defined USE_NVIDIA_RAPIDS
+    opt.use_cuvs = true;
     testFlat(opt);
 #endif
 }
@@ -155,8 +155,8 @@ TEST(TestGpuIndexFlat, L2_Float32) {
         opt.useFloat16 = false;
 
         testFlat(opt);
-#if defined USE_NVIDIA_RAFT
-        opt.use_raft = true;
+#if defined USE_NVIDIA_RAPIDS
+        opt.use_cuvs = true;
         testFlat(opt);
 #endif
     }
@@ -173,8 +173,8 @@ TEST(TestGpuIndexFlat, L2_k_2048) {
         opt.numVecsOverride = 10000;
 
         testFlat(opt);
-#if defined USE_NVIDIA_RAFT
-        opt.use_raft = true;
+#if defined USE_NVIDIA_RAPIDS
+        opt.use_cuvs = true;
         testFlat(opt);
 #endif
     }
@@ -189,8 +189,8 @@ TEST(TestGpuIndexFlat, L2_Float32_K1) {
         opt.kOverride = 1;
 
         testFlat(opt);
-#if defined USE_NVIDIA_RAFT
-        opt.use_raft = true;
+#if defined USE_NVIDIA_RAPIDS
+        opt.use_cuvs = true;
         testFlat(opt);
 #endif
     }
@@ -203,8 +203,8 @@ TEST(TestGpuIndexFlat, IP_Float16) {
         opt.useFloat16 = true;
 
         testFlat(opt);
-#if defined USE_NVIDIA_RAFT
-        opt.use_raft = true;
+#if defined USE_NVIDIA_RAPIDS
+        opt.use_cuvs = true;
         testFlat(opt);
 #endif
     }
@@ -217,8 +217,8 @@ TEST(TestGpuIndexFlat, L2_Float16) {
         opt.useFloat16 = true;
 
         testFlat(opt);
-#if defined USE_NVIDIA_RAFT
-        opt.use_raft = true;
+#if defined USE_NVIDIA_RAPIDS
+        opt.use_cuvs = true;
         testFlat(opt);
 #endif
     }
@@ -233,8 +233,8 @@ TEST(TestGpuIndexFlat, L2_Float16_K1) {
         opt.kOverride = 1;
 
         testFlat(opt);
-#if defined USE_NVIDIA_RAFT
-        opt.use_raft = true;
+#if defined USE_NVIDIA_RAPIDS
+        opt.use_cuvs = true;
         testFlat(opt);
 #endif
     }
@@ -254,8 +254,8 @@ TEST(TestGpuIndexFlat, L2_Tiling) {
         opt.kOverride = 64;
 
         testFlat(opt);
-#if defined USE_NVIDIA_RAFT
-        opt.use_raft = true;
+#if defined USE_NVIDIA_RAPIDS
+        opt.use_cuvs = true;
         testFlat(opt);
 #endif
     }
@@ -268,7 +268,7 @@ TEST(TestGpuIndexFlat, QueryEmpty) {
     faiss::gpu::GpuIndexFlatConfig config;
     config.device = 0;
     config.useFloat16 = false;
-    config.use_raft = false;
+    config.use_cuvs = false;
     int dim = 128;
     faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config);
 
@@ -292,7 +292,7 @@ TEST(TestGpuIndexFlat, QueryEmpty) {
     }
 }
 
-void testCopyFrom(bool use_raft) {
+void testCopyFrom(bool use_cuvs) {
     int numVecs = faiss::gpu::randVal(100, 200);
     int dim = faiss::gpu::randVal(1, 1000);
 
@@ -310,7 +310,7 @@ void testCopyFrom(bool use_raft) {
         faiss::gpu::GpuIndexFlatConfig config;
         config.device = device;
         config.useFloat16 = useFloat16;
-        config.use_raft = use_raft;
+        config.use_cuvs = use_cuvs;
 
         // Fill with garbage values
         faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, 2000, config);
@@ -343,13 +343,13 @@ TEST(TestGpuIndexFlat, CopyFrom) {
     testCopyFrom(false);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 TEST(TestRaftGpuIndexFlat, CopyFrom) {
     testCopyFrom(true);
 }
 #endif
 
-void testCopyTo(bool use_raft) {
+void testCopyTo(bool use_cuvs) {
     faiss::gpu::StandardGpuResources res;
     res.noTempMemory();
 
@@ -363,7 +363,7 @@ void testCopyTo(bool use_raft) {
         faiss::gpu::GpuIndexFlatConfig config;
         config.device = device;
         config.useFloat16 = useFloat16;
-        config.use_raft = use_raft;
+        config.use_cuvs = use_cuvs;
 
         faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config);
         gpuIndex.add(numVecs, vecs.data());
@@ -394,13 +394,13 @@ TEST(TestGpuIndexFlat, CopyTo) {
     testCopyTo(false);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 TEST(TestRaftGpuIndexFlat, CopyTo) {
     testCopyTo(true);
 }
 #endif
 
-void testUnifiedMemory(bool use_raft) {
+void testUnifiedMemory(bool use_cuvs) {
     // Construct on a random device to test multi-device, if we have
     // multiple devices
     int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
@@ -426,7 +426,7 @@ void testUnifiedMemory(bool use_raft) {
     faiss::gpu::GpuIndexFlatConfig config;
     config.device = device;
     config.memorySpace = faiss::gpu::MemorySpace::Unified;
-    config.use_raft = use_raft;
+    config.use_cuvs = use_cuvs;
 
     faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config);
 
@@ -452,13 +452,13 @@ TEST(TestGpuIndexFlat, UnifiedMemory) {
     testUnifiedMemory(false);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 TEST(TestRaftGpuIndexFlat, UnifiedMemory) {
     testUnifiedMemory(true);
 }
 #endif
 
-void testLargeIndex(bool use_raft) {
+void testLargeIndex(bool use_cuvs) {
     // Construct on a random device to test multi-device, if we have
     // multiple devices
     int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
@@ -489,7 +489,7 @@ void testLargeIndex(bool use_raft) {
 
     faiss::gpu::GpuIndexFlatConfig config;
     config.device = device;
-    config.use_raft = use_raft;
+    config.use_cuvs = use_cuvs;
     faiss::gpu::GpuIndexFlatL2 gpuIndexL2(&res, dim, config);
 
     cpuIndexL2.add(nb, xb.data());
@@ -513,13 +513,13 @@ TEST(TestGpuIndexFlat, LargeIndex) {
     testLargeIndex(false);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 TEST(TestRaftGpuIndexFlat, LargeIndex) {
     testLargeIndex(true);
 }
 #endif
 
-void testResidual(bool use_raft) {
+void testResidual(bool use_cuvs) {
     // Construct on a random device to test multi-device, if we have
     // multiple devices
     int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
@@ -529,7 +529,7 @@ void testResidual(bool use_raft) {
 
     faiss::gpu::GpuIndexFlatConfig config;
     config.device = device;
-    config.use_raft = use_raft;
+    config.use_cuvs = use_cuvs;
 
     int dim = 32;
     faiss::IndexFlat cpuIndex(dim, faiss::MetricType::METRIC_L2);
@@ -566,13 +566,13 @@ TEST(TestGpuIndexFlat, Residual) {
     testResidual(false);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 TEST(TestRaftGpuIndexFlat, Residual) {
     testResidual(true);
 }
 #endif
 
-void testReconstruct(bool use_raft) {
+void testReconstruct(bool use_cuvs) {
     // Construct on a random device to test multi-device, if we have
     // multiple devices
     int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
@@ -589,7 +589,7 @@ void testReconstruct(bool use_raft) {
         faiss::gpu::GpuIndexFlatConfig config;
         config.device = device;
         config.useFloat16 = useFloat16;
-        config.use_raft = use_raft;
+        config.use_cuvs = use_cuvs;
 
         faiss::gpu::GpuIndexFlat gpuIndex(
                 &res, dim, faiss::MetricType::METRIC_L2, config);
@@ -657,13 +657,13 @@ void testReconstruct(bool use_raft) {
 TEST(TestGpuIndexFlat, Reconstruct) {
     testReconstruct(false);
 }
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 TEST(TestRaftGpuIndexFlat, Reconstruct) {
     testReconstruct(true);
 }
 #endif
 
-void testSearchAndReconstruct(bool use_raft) {
+void testSearchAndReconstruct(bool use_cuvs) {
     // Construct on a random device to test multi-device, if we have
     // multiple devices
     int device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
@@ -683,7 +683,7 @@ void testSearchAndReconstruct(bool use_raft) {
 
     faiss::gpu::GpuIndexFlatConfig config;
     config.device = device;
-    config.use_raft = use_raft;
+    config.use_cuvs = use_cuvs;
     faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config);
 
     cpuIndex.add(nb, xb.data());
@@ -754,7 +754,7 @@ TEST(TestGpuIndexFlat, SearchAndReconstruct) {
     testSearchAndReconstruct(false);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 TEST(TestRaftGpuIndexFlat, SearchAndReconstruct) {
     testSearchAndReconstruct(true);
 }
diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp
index 28eefec308..ecd7004547 100644
--- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp
+++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp
@@ -110,7 +110,7 @@ void queryTest(
         config.device = opt.device;
         config.indicesOptions = opt.indicesOpt;
         config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
-        config.use_raft = opt.useRaft;
+        config.use_cuvs = opt.useRaft;
 
         faiss::gpu::GpuIndexIVFFlat gpuIndex(
                 &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config);
@@ -164,7 +164,7 @@ void addTest(
         config.indicesOptions =
                 useRaft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt;
         config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
-        config.use_raft = useRaft;
+        config.use_cuvs = useRaft;
 
         faiss::gpu::GpuIndexIVFFlat gpuIndex(
                 &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config);
@@ -201,7 +201,7 @@ void copyToTest(bool useFloat16CoarseQuantizer, bool useRaft) {
     config.indicesOptions =
             useRaft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt;
     config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
-    config.use_raft = useRaft;
+    config.use_cuvs = useRaft;
 
     faiss::gpu::GpuIndexIVFFlat gpuIndex(
             &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
@@ -262,7 +262,7 @@ void copyFromTest(bool useFloat16CoarseQuantizer, bool useRaft) {
     config.indicesOptions =
             useRaft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt;
     config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
-    config.use_raft = useRaft;
+    config.use_cuvs = useRaft;
 
     faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, 1, 1, faiss::METRIC_L2, config);
     gpuIndex.nprobe = 1;
@@ -296,7 +296,7 @@ void copyFromTest(bool useFloat16CoarseQuantizer, bool useRaft) {
 TEST(TestGpuIndexIVFFlat, Float32_32_Add_L2) {
     addTest(faiss::METRIC_L2, false, false);
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     addTest(faiss::METRIC_L2, false, true);
 #endif
 }
@@ -304,7 +304,7 @@ TEST(TestGpuIndexIVFFlat, Float32_32_Add_L2) {
 TEST(TestGpuIndexIVFFlat, Float32_32_Add_IP) {
     addTest(faiss::METRIC_INNER_PRODUCT, false, false);
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     addTest(faiss::METRIC_INNER_PRODUCT, false, true);
 #endif
 }
@@ -312,7 +312,7 @@ TEST(TestGpuIndexIVFFlat, Float32_32_Add_IP) {
 TEST(TestGpuIndexIVFFlat, Float16_32_Add_L2) {
     addTest(faiss::METRIC_L2, true, false);
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     addTest(faiss::METRIC_L2, true, true);
 #endif
 }
@@ -320,7 +320,7 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Add_L2) {
 TEST(TestGpuIndexIVFFlat, Float16_32_Add_IP) {
     addTest(faiss::METRIC_INNER_PRODUCT, true, false);
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     addTest(faiss::METRIC_INNER_PRODUCT, true, true);
 #endif
 }
@@ -333,7 +333,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2) {
     Options opt;
     queryTest(opt, faiss::METRIC_L2, false);
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     opt.useRaft = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, false);
@@ -344,7 +344,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP) {
     Options opt;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     opt.useRaft = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
@@ -357,7 +357,7 @@ TEST(TestGpuIndexIVFFlat, LargeBatch) {
     opt.numQuery = 100000;
     queryTest(opt, faiss::METRIC_L2, false);
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     opt.useRaft = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, false);
@@ -370,7 +370,7 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Query_L2) {
     Options opt;
     queryTest(opt, faiss::METRIC_L2, true);
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     opt.useRaft = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, true);
@@ -381,7 +381,7 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Query_IP) {
     Options opt;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, true);
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     opt.useRaft = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, true);
@@ -398,7 +398,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2_64) {
     opt.dim = 64;
     queryTest(opt, faiss::METRIC_L2, false);
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     opt.useRaft = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, false);
@@ -410,7 +410,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_64) {
     opt.dim = 64;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     opt.useRaft = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
@@ -422,7 +422,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2_128) {
     opt.dim = 128;
     queryTest(opt, faiss::METRIC_L2, false);
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     opt.useRaft = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, false);
@@ -434,7 +434,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_128) {
     opt.dim = 128;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     opt.useRaft = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
@@ -448,7 +448,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_128) {
 TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) {
     copyToTest(false, false);
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     copyToTest(false, true);
 #endif
 }
@@ -456,7 +456,7 @@ TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) {
 TEST(TestGpuIndexIVFFlat, Float32_32_CopyFrom) {
     copyFromTest(false, false);
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     copyFromTest(false, true);
 #endif
 }
@@ -499,7 +499,7 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) {
     faiss::gpu::GpuIndexIVFFlatConfig config;
     config.device = opt.device;
     config.indicesOptions = opt.indicesOpt;
-    config.use_raft = false;
+    config.use_cuvs = false;
 
     faiss::gpu::GpuIndexIVFFlat gpuIndex(
             &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config);
@@ -522,8 +522,8 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) {
             compFloat16 ? 0.99f : 0.1f,
             compFloat16 ? 0.65f : 0.015f);
 
-#if defined USE_NVIDIA_RAFT
-    config.use_raft = true;
+#if defined USE_NVIDIA_RAPIDS
+    config.use_cuvs = true;
     config.indicesOptions = faiss::gpu::INDICES_64_BIT;
 
     faiss::gpu::GpuIndexIVFFlat raftGpuIndex(
@@ -572,7 +572,7 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) {
     config.device = opt.device;
     config.indicesOptions = opt.indicesOpt;
     config.flatConfig.useFloat16 = faiss::gpu::randBool();
-    config.use_raft = false;
+    config.use_cuvs = false;
 
     faiss::gpu::GpuIndexIVFFlat gpuIndex(
             &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
@@ -593,8 +593,8 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) {
         }
     }
 
-#if defined USE_NVIDIA_RAFT
-    config.use_raft = true;
+#if defined USE_NVIDIA_RAPIDS
+    config.use_cuvs = true;
     config.indicesOptions = faiss::gpu::INDICES_64_BIT;
     std::fill(distances.begin(), distances.end(), 0);
     std::fill(indices.begin(), indices.end(), 0);
@@ -641,7 +641,7 @@ TEST(TestGpuIndexIVFFlat, AddNaN) {
     config.device = opt.device;
     config.indicesOptions = opt.indicesOpt;
     config.flatConfig.useFloat16 = faiss::gpu::randBool();
-    config.use_raft = false;
+    config.use_cuvs = false;
     faiss::gpu::GpuIndexIVFFlat gpuIndex(
             &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
     gpuIndex.nprobe = opt.nprobe;
@@ -663,8 +663,8 @@ TEST(TestGpuIndexIVFFlat, AddNaN) {
             distance.data(),
             indices.data());
 
-#if defined USE_NVIDIA_RAFT
-    config.use_raft = true;
+#if defined USE_NVIDIA_RAPIDS
+    config.use_cuvs = true;
     config.indicesOptions = faiss::gpu::INDICES_64_BIT;
     faiss::gpu::GpuIndexIVFFlat raftGpuIndex(
             &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
@@ -723,7 +723,7 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) {
     faiss::gpu::GpuIndexIVFFlatConfig config;
     config.device = device;
     config.memorySpace = faiss::gpu::MemorySpace::Unified;
-    config.use_raft = false;
+    config.use_cuvs = false;
 
     faiss::gpu::GpuIndexIVFFlat gpuIndex(
             &res, dim, numCentroids, faiss::METRIC_L2, config);
@@ -741,8 +741,8 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) {
             0.1f,
             0.015f);
 
-#if defined USE_NVIDIA_RAFT
-    config.use_raft = true;
+#if defined USE_NVIDIA_RAPIDS
+    config.use_cuvs = true;
     config.indicesOptions = faiss::gpu::INDICES_64_BIT;
     faiss::gpu::GpuIndexIVFFlat raftGpuIndex(
             &res, dim, numCentroids, faiss::METRIC_L2, config);
@@ -801,7 +801,7 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) {
 
     faiss::gpu::GpuIndexIVFFlatConfig config;
     config.device = device;
-    config.use_raft = false;
+    config.use_cuvs = false;
 
     faiss::gpu::GpuIndexIVFFlat gpuIndex(
             &res, dim, numCentroids, faiss::METRIC_L2, config);
@@ -820,8 +820,8 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) {
             0.1f,
             0.015f);
 
-#if defined USE_NVIDIA_RAFT
-    config.use_raft = true;
+#if defined USE_NVIDIA_RAPIDS
+    config.use_cuvs = true;
     config.indicesOptions = faiss::gpu::INDICES_64_BIT;
     faiss::gpu::GpuIndexIVFFlat raftGpuIndex(
             &res, dim, numCentroids, faiss::METRIC_L2, config);
@@ -861,7 +861,7 @@ TEST(TestGpuIndexIVFFlat, Reconstruct_n) {
     faiss::gpu::GpuIndexIVFFlatConfig config;
     config.device = opt.device;
     config.indicesOptions = faiss::gpu::INDICES_64_BIT;
-    config.use_raft = false;
+    config.use_cuvs = false;
 
     faiss::gpu::GpuIndexIVFFlat gpuIndex(
             &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
diff --git a/faiss/gpu/test/TestGpuIndexIVFPQ.cpp b/faiss/gpu/test/TestGpuIndexIVFPQ.cpp
index 9cc52bc788..e0f246ed11 100644
--- a/faiss/gpu/test/TestGpuIndexIVFPQ.cpp
+++ b/faiss/gpu/test/TestGpuIndexIVFPQ.cpp
@@ -156,7 +156,7 @@ void queryTest(Options opt, faiss::MetricType metricType) {
     config.indicesOptions = opt.indicesOpt;
     config.useFloat16LookupTables = opt.useFloat16;
     config.interleavedLayout = opt.interleavedLayout;
-    config.use_raft = opt.useRaft;
+    config.use_cuvs = opt.useRaft;
 
     faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config);
     gpuIndex.nprobe = opt.nprobe;
@@ -235,7 +235,7 @@ void testMMCodeDistance(faiss::MetricType mt) {
         config.usePrecomputedTables = false;
         config.useMMCodeDistance = true;
         config.indicesOptions = opt.indicesOpt;
-        config.use_raft = false;
+        config.use_cuvs = false;
 
         // Make sure that the float16 version works as well
         config.useFloat16LookupTables = (tries % 2 == 0);
@@ -286,7 +286,7 @@ void testMMCodeDistance(faiss::MetricType mt) {
         config.device = opt.device;
         config.usePrecomputedTables = false;
         config.indicesOptions = opt.indicesOpt;
-        config.use_raft = false;
+        config.use_cuvs = false;
 
         // Make sure that the float16 version works as well
         config.useFloat16LookupTables = (dimPerSubQ == 7);
@@ -340,7 +340,7 @@ TEST(TestGpuIndexIVFPQ, Float16Coarse) {
     config.usePrecomputedTables = opt.usePrecomputed;
     config.indicesOptions = opt.indicesOpt;
     config.useFloat16LookupTables = opt.useFloat16;
-    config.use_raft = false;
+    config.use_cuvs = false;
 
     faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config);
     gpuIndex.nprobe = opt.nprobe;
@@ -386,7 +386,7 @@ void addTest(Options opt, faiss::MetricType metricType) {
     config.indicesOptions = opt.indicesOpt;
     config.useFloat16LookupTables = opt.useFloat16;
     config.interleavedLayout = opt.interleavedLayout;
-    config.use_raft = opt.useRaft;
+    config.use_cuvs = opt.useRaft;
 
     faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config);
     gpuIndex.nprobe = opt.nprobe;
@@ -436,7 +436,7 @@ void copyToTest(Options opt) {
         config.indicesOptions = opt.indicesOpt;
         config.useFloat16LookupTables = opt.useFloat16;
         config.interleavedLayout = opt.interleavedLayout;
-        config.use_raft = opt.useRaft;
+        config.use_cuvs = opt.useRaft;
 
         faiss::gpu::GpuIndexIVFPQ gpuIndex(
                 &res,
@@ -513,7 +513,7 @@ void copyFromTest(Options opt) {
     config.indicesOptions = opt.indicesOpt;
     config.useFloat16LookupTables = opt.useFloat16;
     config.interleavedLayout = opt.interleavedLayout;
-    config.use_raft = opt.useRaft;
+    config.use_cuvs = opt.useRaft;
 
     // Use garbage values to see if we overwrite them
     faiss::gpu::GpuIndexIVFPQ gpuIndex(
@@ -567,7 +567,7 @@ void queryNaNTest(Options opt) {
     config.usePrecomputedTables = opt.usePrecomputed;
     config.indicesOptions = opt.indicesOpt;
     config.useFloat16LookupTables = opt.useFloat16;
-    config.use_raft = opt.useRaft;
+    config.use_cuvs = opt.useRaft;
     config.interleavedLayout = opt.useRaft ? true : opt.interleavedLayout;
 
     faiss::gpu::GpuIndexIVFPQ gpuIndex(
@@ -620,7 +620,7 @@ void addNaNTest(Options opt) {
     config.indicesOptions = opt.indicesOpt;
     config.useFloat16LookupTables = opt.useFloat16;
     config.interleavedLayout = opt.interleavedLayout;
-    config.use_raft = opt.useRaft;
+    config.use_cuvs = opt.useRaft;
 
     faiss::gpu::GpuIndexIVFPQ gpuIndex(
             &res,
@@ -668,7 +668,7 @@ TEST(TestGpuIndexIVFPQ, AddNaN) {
     addNaNTest(opt);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
 TEST(TestGpuIndexIVFPQ, Query_L2_Raft) {
     for (int tries = 0; tries < 2; ++tries) {
         Options opt;
@@ -824,7 +824,7 @@ TEST(TestGpuIndexIVFPQ, UnifiedMemory) {
     faiss::gpu::GpuIndexIVFPQConfig config;
     config.device = device;
     config.memorySpace = faiss::gpu::MemorySpace::Unified;
-    config.use_raft = false;
+    config.use_cuvs = false;
 
     faiss::gpu::GpuIndexIVFPQ gpuIndex(
             &res,
@@ -848,9 +848,9 @@ TEST(TestGpuIndexIVFPQ, UnifiedMemory) {
             0.1f,
             0.015f);
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_RAPIDS
     config.interleavedLayout = true;
-    config.use_raft = true;
+    config.use_cuvs = true;
     config.indicesOptions = faiss::gpu::INDICES_64_BIT;
 
     faiss::gpu::GpuIndexIVFPQ raftGpuIndex(
diff --git a/faiss/gpu/test/TestGpuMemoryException.cpp b/faiss/gpu/test/TestGpuMemoryException.cpp
index ff4be0893e..c4e87eb3b2 100644
--- a/faiss/gpu/test/TestGpuMemoryException.cpp
+++ b/faiss/gpu/test/TestGpuMemoryException.cpp
@@ -31,7 +31,7 @@ TEST(TestGpuMemoryException, AddException) {
 
     faiss::gpu::GpuIndexFlatConfig config;
     config.device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
-    config.use_raft = false;
+    config.use_cuvs = false;
 
     faiss::gpu::GpuIndexFlatL2 gpuIndexL2Broken(
             &res, (int)brokenAddDims, config);
diff --git a/faiss/gpu/test/test_gpu_index.py b/faiss/gpu/test/test_gpu_index.py
index 28572ebcb4..da9b056994 100755
--- a/faiss/gpu/test/test_gpu_index.py
+++ b/faiss/gpu/test/test_gpu_index.py
@@ -25,7 +25,7 @@ def test_ivfflat_search_preassigned(self):
         k = 50
 
         config = faiss.GpuIndexIVFFlatConfig()
-        config.use_raft = False
+        config.use_cuvs = False
         idx_gpu = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config)
         idx_gpu.nprobe = nprobe
 
@@ -59,7 +59,7 @@ def test_ivfpq_search_preassigned(self):
         k = 50
 
         config = faiss.GpuIndexIVFPQConfig()
-        config.use_raft = False
+        config.use_cuvs = False
         idx_gpu = faiss.GpuIndexIVFPQ(res, d, nlist, 4, 8, faiss.METRIC_L2, config)
         idx_gpu.nprobe = nprobe
 
@@ -141,7 +141,7 @@ def test_ivfflat_cpu_coarse(self):
         # construct a GPU index using the same trained coarse quantizer
         # from the CPU index
         config = faiss.GpuIndexIVFFlatConfig()
-        config.use_raft = False
+        config.use_cuvs = False
         idx_gpu = faiss.GpuIndexIVFFlat(res, q, d, nlist, faiss.METRIC_L2, config)
         assert(idx_gpu.is_trained)
         idx_gpu.add(xb)
@@ -233,7 +233,7 @@ def test_ivfpq_cpu_coarse(self):
         # construct a GPU index using the same trained coarse quantizer
         # from the CPU index
         config = faiss.GpuIndexIVFPQConfig()
-        config.use_raft = False
+        config.use_cuvs = False
         idx_gpu = faiss.GpuIndexIVFPQ(
             res, idx_coarse_cpu, d, nlist_lvl_2, 4, 8, faiss.METRIC_L2, config)
         assert(not idx_gpu.is_trained)
@@ -414,7 +414,7 @@ def test_indices_ivfflat(self):
 
         # Store values using 32-bit indices instead
         config.indicesOptions = faiss.INDICES_32_BIT
-        config.use_raft = False
+        config.use_cuvs = False
         idx = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config)
         idx.train(xb)
         idx.add_with_ids(xb, xb_indices)
@@ -439,7 +439,7 @@ def test_indices_ivfpq(self):
         xb_indices = (xb_indices_base + 4294967296).astype('int64')
 
         config = faiss.GpuIndexIVFPQConfig()
-        config.use_raft = False
+        config.use_cuvs = False
         idx = faiss.GpuIndexIVFPQ(res, d, nlist, M, nbits,
                                   faiss.METRIC_L2, config)
         idx.train(xb)
@@ -501,7 +501,7 @@ def test_sq_cpu_to_gpu(self):
         index = faiss.index_factory(32, "SQfp16")
         index.add(np.random.rand(1000, 32).astype(np.float32))
         config = faiss.GpuClonerOptions()
-        config.use_raft = False
+        config.use_cuvs = False
         gpu_index = faiss.index_cpu_to_gpu(res, 0, index, config)
         self.assertIsInstance(gpu_index, faiss.GpuIndexFlat)
 
diff --git a/faiss/gpu/test/test_gpu_index_ivfflat.py b/faiss/gpu/test/test_gpu_index_ivfflat.py
index 099615aff5..408136d990 100644
--- a/faiss/gpu/test/test_gpu_index_ivfflat.py
+++ b/faiss/gpu/test/test_gpu_index_ivfflat.py
@@ -18,7 +18,7 @@ def test_reconstruct_n(self):
         res = faiss.StandardGpuResources()
         res.noTempMemory()
         config = faiss.GpuIndexIVFFlatConfig()
-        config.use_raft = False
+        config.use_cuvs = False
         index2 = faiss.GpuIndexIVFFlat(res, index, config)
         recons = index2.reconstruct_n(0, 10)
 
diff --git a/faiss/gpu/test/test_gpu_index_ivfsq.py b/faiss/gpu/test/test_gpu_index_ivfsq.py
index 09dcdae079..e977fd65ae 100755
--- a/faiss/gpu/test/test_gpu_index_ivfsq.py
+++ b/faiss/gpu/test/test_gpu_index_ivfsq.py
@@ -28,7 +28,7 @@ def make_indices_copy_from_cpu(nlist, d, qtype, by_residual, metric, clamp):
     res = faiss.StandardGpuResources()
     res.noTempMemory()
     config = faiss.GpuIndexIVFScalarQuantizerConfig()
-    config.use_raft = False
+    config.use_cuvs = False
     idx_gpu = faiss.GpuIndexIVFScalarQuantizer(res, idx_cpu, config)
 
     return idx_cpu, idx_gpu
@@ -40,7 +40,7 @@ def make_indices_copy_from_gpu(nlist, d, qtype, by_residual, metric, clamp):
     res = faiss.StandardGpuResources()
     res.noTempMemory()
     config = faiss.GpuIndexIVFScalarQuantizerConfig()
-    config.use_raft = False
+    config.use_cuvs = False
     idx_gpu = faiss.GpuIndexIVFScalarQuantizer(res, d, nlist,
                                                qtype, metric, by_residual, config)
     idx_gpu.train(to_train)
@@ -68,7 +68,7 @@ def make_indices_train(nlist, d, qtype, by_residual, metric, clamp):
     res = faiss.StandardGpuResources()
     res.noTempMemory()
     config = faiss.GpuIndexIVFScalarQuantizerConfig()
-    config.use_raft = False
+    config.use_cuvs = False
     idx_gpu = faiss.GpuIndexIVFScalarQuantizer(res, d, nlist,
                                                qtype, metric, by_residual, config)
     assert(by_residual == idx_gpu.by_residual)
diff --git a/faiss/gpu/test/test_gpu_index_serialize.py b/faiss/gpu/test/test_gpu_index_serialize.py
index 49e51af8b4..06a11a4f0e 100644
--- a/faiss/gpu/test/test_gpu_index_serialize.py
+++ b/faiss/gpu/test/test_gpu_index_serialize.py
@@ -35,7 +35,7 @@ def test_serialize(self):
 
         # IVFSQ
         config = faiss.GpuIndexIVFScalarQuantizerConfig()
-        config.use_raft = False
+        config.use_cuvs = False
         indexes.append(faiss.GpuIndexIVFScalarQuantizer(res, d, nlist, faiss.ScalarQuantizer.QT_fp16, faiss.METRIC_L2, True, config))
 
         # IVFPQ
@@ -52,7 +52,7 @@ def test_serialize(self):
              
             gpu_cloner_options = faiss.GpuClonerOptions()
             if isinstance(index, faiss.GpuIndexIVFScalarQuantizer):
-                gpu_cloner_options.use_raft = False
+                gpu_cloner_options.use_cuvs = False
             gpu_index_restore = faiss.index_cpu_to_gpu(res, 0, cpu_index, gpu_cloner_options)
 
             restore_d, restore_i = gpu_index_restore.search(query, k)
diff --git a/faiss/gpu/test/test_index_cpu_to_gpu.py b/faiss/gpu/test/test_index_cpu_to_gpu.py
index 088ea2bf74..79b58cb636 100644
--- a/faiss/gpu/test/test_index_cpu_to_gpu.py
+++ b/faiss/gpu/test/test_index_cpu_to_gpu.py
@@ -23,13 +23,13 @@ def create_index(self, factory_string):
 
     def create_and_clone(self, factory_string,
                          allowCpuCoarseQuantizer=None,
-                         use_raft=None):
+                         use_cuvs=None):
         idx = self.create_index(factory_string)
         config = faiss.GpuClonerOptions()
         if allowCpuCoarseQuantizer is not None:
             config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer
-        if use_raft is not None:
-            config.use_raft = use_raft
+        if use_cuvs is not None:
+            config.use_cuvs = use_cuvs
         faiss.index_cpu_to_gpu(self.res, 0, idx, config)
 
     def verify_throws_not_implemented_exception(self, factory_string):
@@ -42,12 +42,12 @@ def verify_throws_not_implemented_exception(self, factory_string):
 
     def verify_clones_successfully(self, factory_string,
                                    allowCpuCoarseQuantizer=None,
-                                   use_raft=None):
+                                   use_cuvs=None):
         try:
             self.create_and_clone(
                 factory_string,
                 allowCpuCoarseQuantizer=allowCpuCoarseQuantizer,
-                use_raft=use_raft)
+                use_cuvs=use_cuvs)
         except Exception as e:
             self.fail("Unexpected exception thrown factory_string: "
                       "%s; error message: %s." % (factory_string, str(e)))
@@ -69,10 +69,10 @@ def test_implemented_indices(self):
         self.verify_clones_successfully("PCA32,IVF32,PQ8")
         self.verify_clones_successfully("PCA32,IVF32,PQ8np")
 
-        # set use_raft to false, these index types are not supported on RAFT
-        self.verify_clones_successfully("IVF32,SQ8", use_raft=False)
+        # set use_cuvs to false, these index types are not supported on RAFT
+        self.verify_clones_successfully("IVF32,SQ8", use_cuvs=False)
         self.verify_clones_successfully(
-            "PCA32,IVF32,SQ8", use_raft=False)
+            "PCA32,IVF32,SQ8", use_cuvs=False)
 
     def test_with_flag(self):
         self.verify_clones_successfully("IVF32_HNSW,Flat",
diff --git a/faiss/gpu/test/test_multi_gpu.py b/faiss/gpu/test/test_multi_gpu.py
index e341f5715a..ac08d84bcd 100644
--- a/faiss/gpu/test/test_multi_gpu.py
+++ b/faiss/gpu/test/test_multi_gpu.py
@@ -29,7 +29,7 @@ def test_sharded(self):
 
         co = faiss.GpuMultipleClonerOptions()
         co.shard = True
-        co.use_raft = False
+        co.use_cuvs = False
         index = faiss.index_cpu_to_all_gpus(index_cpu, co, ngpu=2)
 
         index.add(xb)
@@ -72,7 +72,7 @@ def do_test_sharded_ivf(self, index_key):
         co = faiss.GpuMultipleClonerOptions()
         co.shard = True
         co.common_ivf_quantizer = True
-        co.use_raft = False
+        co.use_cuvs = False
         index = faiss.index_cpu_to_all_gpus(index, co, ngpu=2)
 
         index.quantizer  # make sure there is indeed a quantizer
@@ -113,7 +113,7 @@ def test_binary_clone(self, ngpu=1, shard=False):
 
         co = faiss.GpuMultipleClonerOptions()
         co.shard = shard
-        co.use_raft = False
+        co.use_cuvs = False
 
         # index2 = faiss.index_cpu_to_all_gpus(index, ngpu=ngpu)
         res = faiss.StandardGpuResources()
@@ -192,7 +192,7 @@ def do_cpu_to_gpu(self, index_key):
 
         res = faiss.StandardGpuResources()
         co = faiss.GpuClonerOptions()
-        co.use_raft = False
+        co.use_cuvs = False
         gpu_index = faiss.index_cpu_to_gpu(res, 0, index, co)
         ts.append(time.time())
 
@@ -222,7 +222,7 @@ def do_cpu_to_gpu(self, index_key):
             res = [faiss.StandardGpuResources() for i in range(2)]
             co = faiss.GpuMultipleClonerOptions()
             co.shard = shard
-            co.use_raft = False
+            co.use_cuvs = False
 
             gpu_index = faiss.index_cpu_to_gpu_multiple_py(res, index, co)
 
diff --git a/faiss/gpu/test/test_raft.py b/faiss/gpu/test/test_raft.py
index 37ae2ef003..663fce6fdb 100644
--- a/faiss/gpu/test/test_raft.py
+++ b/faiss/gpu/test/test_raft.py
@@ -25,13 +25,13 @@ def test_bfKnn(self):
 
         # Faiss internal implementation
         Dnew, Inew = faiss.knn_gpu(
-            res, ds.get_queries(), ds.get_database(), 12, use_raft=False)
+            res, ds.get_queries(), ds.get_database(), 12, use_cuvs=False)
         np.testing.assert_allclose(Dref, Dnew, atol=1e-5)
         np.testing.assert_array_equal(Iref, Inew)
 
         # RAFT version
         Dnew, Inew = faiss.knn_gpu(
-            res, ds.get_queries(), ds.get_database(), 12, use_raft=True)
+            res, ds.get_queries(), ds.get_database(), 12, use_cuvs=True)
         np.testing.assert_allclose(Dref, Dnew, atol=1e-5)
         np.testing.assert_array_equal(Iref, Inew)
 
@@ -46,7 +46,7 @@ def test_IndexFlat(self):
 
         res = faiss.StandardGpuResources()
         co = faiss.GpuClonerOptions()
-        co.use_raft = True
+        co.use_cuvs = True
         index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
         Dnew, Inew = index_gpu.search(ds.get_queries(), 13)
         np.testing.assert_allclose(Dref, Dnew, atol=1e-5)
diff --git a/faiss/gpu/test/torch_test_contrib_gpu.py b/faiss/gpu/test/torch_test_contrib_gpu.py
index f7444337f1..7353ea8b33 100644
--- a/faiss/gpu/test/torch_test_contrib_gpu.py
+++ b/faiss/gpu/test/torch_test_contrib_gpu.py
@@ -163,7 +163,7 @@ def test_ivfflat_reconstruct(self):
         res = faiss.StandardGpuResources()
         res.noTempMemory()
         config = faiss.GpuIndexIVFFlatConfig()
-        config.use_raft = False
+        config.use_cuvs = False
 
         index = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config)
 
@@ -249,7 +249,7 @@ def test_sa_encode_decode(self):
         return
 
 class TestTorchUtilsKnnGpu(unittest.TestCase):
-    def test_knn_gpu(self, use_raft=False):
+    def test_knn_gpu(self, use_cuvs=False):
         torch.manual_seed(10)
         d = 32
         nb = 1024
@@ -286,7 +286,7 @@ def test_knn_gpu(self, use_raft=False):
                     else:
                         xb_c = xb_np
 
-                    D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_raft=use_raft)
+                    D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_cuvs=use_cuvs)
 
                     self.assertTrue(torch.equal(torch.from_numpy(I), gt_I))
                     self.assertLess((torch.from_numpy(D) - gt_D).abs().max(), 1e-4)
@@ -312,7 +312,7 @@ def test_knn_gpu(self, use_raft=False):
                             xb_c = to_column_major_torch(xb)
                             assert not xb_c.is_contiguous()
 
-                        D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_raft=use_raft)
+                        D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_cuvs=use_cuvs)
 
                         self.assertTrue(torch.equal(I.cpu(), gt_I))
                         self.assertLess((D.cpu() - gt_D).abs().max(), 1e-4)
@@ -320,7 +320,7 @@ def test_knn_gpu(self, use_raft=False):
                         # test on subset
                         try:
                             # This internally uses the current pytorch stream
-                            D, I = faiss.knn_gpu(res, xq_c[6:8], xb_c, k, use_raft=use_raft)
+                            D, I = faiss.knn_gpu(res, xq_c[6:8], xb_c, k, use_cuvs=use_cuvs)
                         except TypeError:
                             if not xq_row_major:
                                 # then it is expected
@@ -335,9 +335,9 @@ def test_knn_gpu(self, use_raft=False):
         "RAFT" in faiss.get_compile_options(),
         "only if RAFT is compiled in")
     def test_knn_gpu_raft(self):
-        self.test_knn_gpu(use_raft=True)
+        self.test_knn_gpu(use_cuvs=True)
 
-    def test_knn_gpu_datatypes(self, use_raft=False):
+    def test_knn_gpu_datatypes(self, use_cuvs=False):
         torch.manual_seed(10)
         d = 10
         nb = 1024
@@ -360,7 +360,7 @@ def test_knn_gpu_datatypes(self, use_raft=False):
         D = torch.zeros(nq, k, device=xb_c.device, dtype=torch.float32)
         I = torch.zeros(nq, k, device=xb_c.device, dtype=torch.int32)
 
-        faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_raft=use_raft)
+        faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_cuvs=use_cuvs)
 
         self.assertTrue(torch.equal(I.long().cpu(), gt_I))
         self.assertLess((D.float().cpu() - gt_D).abs().max(), 1.5e-3)
@@ -372,7 +372,7 @@ def test_knn_gpu_datatypes(self, use_raft=False):
         xb_c = xb.half().numpy()
         xq_c = xq.half().numpy()
 
-        faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_raft=use_raft)
+        faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_cuvs=use_cuvs)
 
         self.assertTrue(torch.equal(torch.from_numpy(I).long(), gt_I))
         self.assertLess((torch.from_numpy(D) - gt_D).abs().max(), 1.5e-3)
diff --git a/faiss/gpu/utils/RaftUtils.cu b/faiss/gpu/utils/CuvsUtils.cu
similarity index 98%
rename from faiss/gpu/utils/RaftUtils.cu
rename to faiss/gpu/utils/CuvsUtils.cu
index ba40c54c26..3f06343525 100644
--- a/faiss/gpu/utils/RaftUtils.cu
+++ b/faiss/gpu/utils/CuvsUtils.cu
@@ -21,7 +21,7 @@
  */
 
 #include <faiss/gpu/GpuIndex.h>
-#include <faiss/gpu/utils/RaftUtils.h>
+#include <faiss/gpu/utils/CuvsUtils.h>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/linalg/coalesced_reduction.cuh>
diff --git a/faiss/gpu/utils/RaftUtils.h b/faiss/gpu/utils/CuvsUtils.h
similarity index 77%
rename from faiss/gpu/utils/RaftUtils.h
rename to faiss/gpu/utils/CuvsUtils.h
index 4dfafa4ec5..09bcec6747 100644
--- a/faiss/gpu/utils/RaftUtils.h
+++ b/faiss/gpu/utils/CuvsUtils.h
@@ -26,32 +26,32 @@
 #include <faiss/gpu/GpuResources.h>
 #include <faiss/gpu/utils/Tensor.cuh>
 
-#include <raft/distance/distance_types.hpp>
+#include <cuvs/distance/distance.h>
 
 #pragma GCC visibility push(default)
 namespace faiss {
 namespace gpu {
 
-inline raft::distance::DistanceType metricFaissToRaft(
+inline cuvsDistanceType metricFaissToCuvs(
         MetricType metric,
         bool exactDistance) {
     switch (metric) {
         case MetricType::METRIC_INNER_PRODUCT:
-            return raft::distance::DistanceType::InnerProduct;
+            return cuvsDistanceType::InnerProduct;
         case MetricType::METRIC_L2:
-            return raft::distance::DistanceType::L2Expanded;
+            return cuvsDistanceType::L2Expanded;
         case MetricType::METRIC_L1:
-            return raft::distance::DistanceType::L1;
+            return cuvsDistanceType::L1;
         case MetricType::METRIC_Linf:
-            return raft::distance::DistanceType::Linf;
+            return cuvsDistanceType::Linf;
         case MetricType::METRIC_Lp:
-            return raft::distance::DistanceType::LpUnexpanded;
+            return cuvsDistanceType::LpUnexpanded;
         case MetricType::METRIC_Canberra:
-            return raft::distance::DistanceType::Canberra;
+            return cuvsDistanceType::Canberra;
         case MetricType::METRIC_BrayCurtis:
-            return raft::distance::DistanceType::BrayCurtis;
+            return cuvsDistanceType::BrayCurtis;
         case MetricType::METRIC_JensenShannon:
-            return raft::distance::DistanceType::JensenShannon;
+            return cuvsDistanceType::JensenShannon;
         default:
             RAFT_FAIL("Distance type not supported");
     }
diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt
index 0073c20e04..574c9a9f32 100644
--- a/faiss/python/CMakeLists.txt
+++ b/faiss/python/CMakeLists.txt
@@ -38,9 +38,9 @@ macro(configure_swigfaiss source)
     set_source_files_properties(${source} PROPERTIES
       COMPILE_DEFINITIONS GPU_WRAPPER
     )
-    if (FAISS_ENABLE_RAFT)
+    if (FAISS_ENABLE_CUVS)
       set_property(SOURCE ${source} APPEND PROPERTY
-        COMPILE_DEFINITIONS FAISS_ENABLE_RAFT
+        COMPILE_DEFINITIONS FAISS_ENABLE_CUVS
       )
     endif()
   endif()
@@ -126,12 +126,12 @@ endif()
 
 if(FAISS_ENABLE_GPU)
   find_package(CUDAToolkit REQUIRED)
-  if(FAISS_ENABLE_RAFT)
+  if(FAISS_ENABLE_CUVS)
     find_package(raft COMPONENTS compiled distributed)
   endif()
-  target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass>)
-  target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass>)
-  target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass>)
+  target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
+  target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
+  target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
 endif()
 
 find_package(OpenMP REQUIRED)
diff --git a/faiss/python/gpu_wrappers.py b/faiss/python/gpu_wrappers.py
index 6e788511d2..4945722f6c 100644
--- a/faiss/python/gpu_wrappers.py
+++ b/faiss/python/gpu_wrappers.py
@@ -56,7 +56,7 @@ def index_cpu_to_gpus_list(index, co=None, gpus=None, ngpu=-1):
 # allows numpy ndarray usage with bfKnn
 
 
-def knn_gpu(res, xq, xb, k, D=None, I=None, metric=METRIC_L2, device=-1, use_raft=False, vectorsMemoryLimit=0, queriesMemoryLimit=0):
+def knn_gpu(res, xq, xb, k, D=None, I=None, metric=METRIC_L2, device=-1, use_cuvs=False, vectorsMemoryLimit=0, queriesMemoryLimit=0):
     """
     Compute the k nearest neighbors of a vector on one GPU without constructing an index
 
@@ -178,7 +178,7 @@ def knn_gpu(res, xq, xb, k, D=None, I=None, metric=METRIC_L2, device=-1, use_raf
     args.outIndices = I_ptr
     args.outIndicesType = I_type
     args.device = device
-    args.use_raft = use_raft
+    args.use_cuvs = use_cuvs
 
     # no stream synchronization needed, inputs and outputs are guaranteed to
     # be on the CPU (numpy arrays)
diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig
index 74a371f6cd..b10fd137e6 100644
--- a/faiss/python/swigfaiss.swig
+++ b/faiss/python/swigfaiss.swig
@@ -558,7 +558,7 @@ struct faiss::simd16uint16 {};
 %include  <faiss/gpu/GpuIndicesOptions.h>
 %include  <faiss/gpu/GpuClonerOptions.h>
 %include  <faiss/gpu/GpuIndex.h>
-#ifdef FAISS_ENABLE_RAFT
+#ifdef FAISS_ENABLE_CUVS
 %include  <faiss/gpu/GpuIndexCagra.h>
 #endif
 %include  <faiss/gpu/GpuIndexFlat.h>
@@ -677,7 +677,7 @@ struct faiss::simd16uint16 {};
     DOWNCAST ( IndexRowwiseMinMax )
     DOWNCAST ( IndexRowwiseMinMaxFP16 )
 #ifdef GPU_WRAPPER
-#ifdef FAISS_ENABLE_RAFT
+#ifdef FAISS_ENABLE_CUVS
     DOWNCAST_GPU ( GpuIndexCagra )
 #endif
     DOWNCAST_GPU ( GpuIndexIVFPQ )
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 3980d7dd7c..faf7429fc8 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -95,7 +95,7 @@ find_package(GTest CONFIG REQUIRED)
 target_link_libraries(faiss_test PRIVATE
   OpenMP::OpenMP_CXX
   GTest::gtest_main
-  $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft>
+  $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>
 )
 
 # Defines `gtest_discover_tests()`.

From 45c4d49320d5efae7ed4110c448ca15ff0b4a96d Mon Sep 17 00:00:00 2001
From: Kumar Saurabh Arora <kuarora@meta.com>
Date: Fri, 21 Jun 2024 09:18:42 -0700
Subject: [PATCH 063/148] Adding faiss bench_fw to bento faiss kernel (#3531)

Summary:
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3531

**In this diff**
1. I have added bench_fw to the bento faiss kernel target
2. The first part of the notebook is changed to analyze sift1M results

Reviewed By: algoriddle

Differential Revision: D58823037

fbshipit-source-id: a67d4638af4368f0575bd289ce7aff8cf1fcd38b
---
 benchs/bench_fw_notebook.ipynb | 1059 ++++++++++++++++----------------
 1 file changed, 531 insertions(+), 528 deletions(-)

diff --git a/benchs/bench_fw_notebook.ipynb b/benchs/bench_fw_notebook.ipynb
index 5752aaf5fb..c38ed11068 100644
--- a/benchs/bench_fw_notebook.ipynb
+++ b/benchs/bench_fw_notebook.ipynb
@@ -1,529 +1,532 @@
 {
-    "cells": [
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "be081589-e1b2-4569-acb7-44203e273899",
-      "metadata": {
-       "tags": []
-      },
-      "outputs": [],
-      "source": [
-       "import matplotlib.pyplot as plt\n",
-       "import itertools\n",
-       "from faiss.contrib.evaluation import OperatingPoints\n",
-       "from enum import Enum\n",
-       "from bench_fw.benchmark_io import BenchmarkIO as BIO\n",
-       "from bench_fw.utils import filter_results, ParetoMode, ParetoMetric\n",
-       "from copy import copy\n",
-       "import numpy as np\n",
-       "import datetime\n",
-       "import glob\n",
-       "import io\n",
-       "import json\n",
-       "from zipfile import ZipFile\n",
-       "import tabulate"
-      ]
-     },
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "a6492e95-24c7-4425-bf0a-27e10e879ca6",
-      "metadata": {
-       "tags": []
-      },
-      "outputs": [],
-      "source": [
-       "root = \"/checkpoint/gsz/bench_fw/optimize/bigann\"\n",
-       "results = BIO(root).read_json(\"result_std_d_bigann10M.json\")\n",
-       "results.keys()"
-      ]
-     },
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "0875d269-aef4-426d-83dd-866970f43777",
-      "metadata": {
-       "tags": []
-      },
-      "outputs": [],
-      "source": [
-       "results['experiments']"
-      ]
-     },
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "f080a6e2-1565-418b-8732-4adeff03a099",
-      "metadata": {
-       "tags": []
-      },
-      "outputs": [],
-      "source": [
-       "def plot_metric(experiments, accuracy_title, cost_title, plot_space=False, plot=None):\n",
-       "    if plot is None:\n",
-       "        plot = plt.subplot()\n",
-       "    x = {}\n",
-       "    y = {}\n",
-       "    for accuracy, space, time, k, v in experiments:\n",
-       "        idx_name = v['index'] + (\"snap\" if 'search_params' in v and v['search_params'][\"snap\"] == 1 else \"\")\n",
-       "        if idx_name not in x:\n",
-       "            x[idx_name] = []\n",
-       "            y[idx_name] = []\n",
-       "        x[idx_name].append(accuracy)\n",
-       "        if plot_space:\n",
-       "            y[idx_name].append(space)\n",
-       "        else:\n",
-       "            y[idx_name].append(time)\n",
-       "\n",
-       "    #plt.figure(figsize=(10,6))\n",
-       "    #plt.title(accuracy_title)\n",
-       "    plot.set_xlabel(accuracy_title)\n",
-       "    plot.set_ylabel(cost_title)\n",
-       "    plot.set_yscale(\"log\")\n",
-       "    marker = itertools.cycle((\"o\", \"v\", \"^\", \"<\", \">\", \"s\", \"p\", \"P\", \"*\", \"h\", \"X\", \"D\"))    \n",
-       "    for index in x.keys():\n",
-       "        plot.plot(x[index], y[index], marker=next(marker), label=index, linewidth=0)\n",
-       "    plot.legend(bbox_to_anchor=(1, 1), loc='upper left')"
-      ]
-     },
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "61007155-5edc-449e-835e-c141a01a2ae5",
-      "metadata": {
-       "tags": []
-      },
-      "outputs": [],
-      "source": [
-       "# index local optima\n",
-       "accuracy_metric = \"knn_intersection\"\n",
-       "fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1, min_accuracy=0.95)\n",
-       "plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 32 cores)\", plot_space=False)"
-      ]
-     },
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "f9f94dcc-5abe-4cad-9619-f5d1d24fb8c1",
-      "metadata": {
-       "tags": []
-      },
-      "outputs": [],
-      "source": [
-       "# global optima\n",
-       "accuracy_metric = \"knn_intersection\"\n",
-       "fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0.90, max_space=64, max_time=0, name_filter=lambda n: not n.startswith(\"Flat\"), pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
-       "plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 32 cores)\", plot_space=False)"
-      ]
-     },
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "0c10f587-26ef-49ec-83a9-88f6a2a433e8",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-       "def pretty_params(p):\n",
-       "    p = copy(p)\n",
-       "    if 'snap' in p and p['snap'] == 0:\n",
-       "        del p['snap']\n",
-       "    return p\n",
-       "    \n",
-       "tabulate.tabulate([(accuracy, space, time, v['factory'], pretty_params(v['construction_params'][1]), pretty_params(v['search_params'])) \n",
-       "                for accuracy, space, time, k, v in fr],\n",
-       "                tablefmt=\"html\",\n",
-       "                headers=[\"accuracy\",\"space\", \"time\", \"factory\", \"quantizer cfg\", \"search cfg\"])"
-      ]
-     },
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "36e82084-18f6-4546-a717-163eb0224ee8",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-       "# index local optima @ precision 0.8\n",
-       "precision = 0.8\n",
-       "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
-       "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
-       "plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")"
-      ]
-     },
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "aff79376-39f7-47c0-8b83-1efe5192bb7e",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-       "# index local optima @ precision 0.2\n",
-       "precision = 0.2\n",
-       "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
-       "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
-       "plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")"
-      ]
-     },
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "b4834f1f-bbbe-4cae-9aa0-a459b0c842d1",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-       "# global optima @ precision 0.8\n",
-       "precision = 0.8\n",
-       "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
-       "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
-       "plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")"
-      ]
-     },
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "9aead830-6209-4956-b7ea-4a5e0029d616",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-       "def plot_range_search_pr_curves(experiments):\n",
-       "    x = {}\n",
-       "    y = {}\n",
-       "    show = {\n",
-       "        'Flat': None,\n",
-       "    }\n",
-       "    for _, _, _, k, v in fr:\n",
-       "        if \".weighted\" in k: # and v['index'] in show:\n",
-       "            x[k] = v['range_search_pr']['recall']\n",
-       "            y[k] = v['range_search_pr']['precision']\n",
-       "    \n",
-       "    plt.title(\"range search recall\")\n",
-       "    plt.xlabel(\"recall\")\n",
-       "    plt.ylabel(\"precision\")\n",
-       "    for index in x.keys():\n",
-       "        plt.plot(x[index], y[index], '.', label=index)\n",
-       "    plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')"
-      ]
-     },
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "92e45502-7a31-4a15-90df-fa3032d7d350",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-       "precision = 0.8\n",
-       "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
-       "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME_SPACE, scaling_factor=1)\n",
-       "plot_range_search_pr_curves(fr)"
-      ]
-     },
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "fdf8148a-0da6-4c5e-8d60-f8f85314574c",
-      "metadata": {
-       "tags": []
-      },
-      "outputs": [],
-      "source": [
-       "root = \"/checkpoint/gsz/bench_fw/ivf/bigann\"\n",
-       "scales = [1, 2, 5, 10, 20, 50]\n",
-       "fig, plots = plt.subplots(len(scales), sharex=True, figsize=(5,25))\n",
-       "fig.tight_layout()\n",
-       "for plot, scale in zip(plots, scales, strict=True):\n",
-       "    results = BIO(root).read_json(f\"result{scale}.json\")\n",
-       "    accuracy_metric = \"knn_intersection\"\n",
-       "    fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0.9, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
-       "    plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 64 cores)\", plot=plot)"
-      ]
-     },
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "e503828c-ee61-45f7-814b-cce6461109bc",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-       "x = {}\n",
-       "y = {}\n",
-       "accuracy=0.9\n",
-       "root = \"/checkpoint/gsz/bench_fw/ivf/bigann\"\n",
-       "scales = [1, 2, 5, 10, 20, 50]\n",
-       "#fig, plots = plt.subplots(len(scales), sharex=True, figsize=(5,25))\n",
-       "#fig.tight_layout()\n",
-       "for scale in scales:\n",
-       "    results = BIO(root).read_json(f\"result{scale}.json\")\n",
-       "    scale *= 1_000_000\n",
-       "    accuracy_metric = \"knn_intersection\"\n",
-       "    fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=accuracy, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
-       "    seen = set()\n",
-       "    print(scale)\n",
-       "    for _, _, _, _, exp in fr:\n",
-       "        fact = exp[\"factory\"]\n",
-       "        # \"HNSW\" in fact or \n",
-       "        if fact in seen or fact in [\"Flat\", \"IVF512,Flat\", \"IVF1024,Flat\", \"IVF2048,Flat\"]:\n",
-       "            continue\n",
-       "        seen.add(fact)\n",
-       "        if fact not in x:\n",
-       "            x[fact] = []\n",
-       "            y[fact] = []\n",
-       "        x[fact].append(scale)\n",
-       "        y[fact].append(exp[\"time\"] + exp[\"quantizer\"][\"time\"])\n",
-       "        if (exp[\"knn_intersection\"] > 0.92):\n",
-       "            print(fact)\n",
-       "            print(exp[\"search_params\"])\n",
-       "            print(exp[\"knn_intersection\"])\n",
-       "\n",
-       "        #plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 64 cores)\", plot=plot)\n",
-       "    \n",
-       "plt.title(f\"recall @ 1 = {accuracy*100}%\")\n",
-       "plt.xlabel(\"database size\")\n",
-       "plt.ylabel(\"time\")\n",
-       "plt.xscale(\"log\")\n",
-       "plt.yscale(\"log\")\n",
-       "\n",
-       "marker = itertools.cycle((\"o\", \"v\", \"^\", \"<\", \">\", \"s\", \"p\", \"P\", \"*\", \"h\", \"X\", \"D\"))    \n",
-       "for index in x.keys():\n",
-       "    if \"HNSW\" in index:\n",
-       "        plt.plot(x[index], y[index], label=index, linewidth=1, marker=next(marker), linestyle=\"dashed\")\n",
-       "    else:\n",
-       "        plt.plot(x[index], y[index], label=index, linewidth=1, marker=next(marker))\n",
-       "plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')"
-      ]
-     },
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "37a99bb2-f998-461b-a345-7cc6e702cb3a",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-       "# global optima\n",
-       "accuracy_metric = \"sym_recall\"\n",
-       "fr = filter_results(results, evaluation=\"rec\", accuracy_metric=accuracy_metric, time_metric=lambda e:e['encode_time'], min_accuracy=0.9, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.SPACE, scaling_factor=1)\n",
-       "plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"space\", plot_space=True)"
-      ]
-     },
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "c973ce4e-3566-4f02-bd93-f113e3e0c791",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-       "def pretty_time(s):\n",
-       "    if s is None:\n",
-       "        return \"None\"\n",
-       "    s = int(s * 1000) / 1000\n",
-       "    m, s = divmod(s, 60)\n",
-       "    h, m = divmod(m, 60)\n",
-       "    d, h = divmod(h, 24)\n",
-       "    r = \"\"\n",
-       "    if d > 0:\n",
-       "        r += f\"{int(d)}d \"\n",
-       "    if h > 0:\n",
-       "        r += f\"{int(h)}h \"\n",
-       "    if m > 0:\n",
-       "        r += f\"{int(m)}m \"\n",
-       "    if s > 0 or len(r) == 0:\n",
-       "        r += f\"{s:.3f}s\"\n",
-       "    return r\n",
-       "\n",
-       "def pretty_size(s):\n",
-       "    if s > 1024 * 1024:\n",
-       "        return f\"{s / 1024 / 1024:.1f}\".rstrip('0').rstrip('.') + \"MB\"\n",
-       "    if s > 1024:\n",
-       "        return f\"{s / 1024:.1f}\".rstrip('0').rstrip('.') + \"KB\"\n",
-       "    return f\"{s}\"\n",
-       "\n",
-       "def pretty_mse(m):\n",
-       "    if m is None:\n",
-       "        return \"None\"\n",
-       "    else:\n",
-       "        return f\"{m:.6f}\""
-      ]
-     },
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "1ddcf226-fb97-4a59-9fc3-3ed8f7d5e703",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-       "data = {}\n",
-       "root = \"/checkpoint/gsz/bench_fw/bigann\"\n",
-       "scales = [1, 2, 5, 10, 20, 50]\n",
-       "for scale in scales:\n",
-       "    results = BIO(root).read_json(f\"result{scale}.json\")\n",
-       "    accuracy_metric = \"knn_intersection\"\n",
-       "    fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
-       "    d = {}\n",
-       "    data[f\"{scale}M\"] = d\n",
-       "    for _, _, _, _, exp in fr:\n",
-       "        fact = exp[\"factory\"]\n",
-       "        # \"HNSW\" in fact or \n",
-       "        if fact in [\"Flat\", \"IVF512,Flat\", \"IVF1024,Flat\", \"IVF2048,Flat\"]:\n",
-       "            continue\n",
-       "        if fact not in d:\n",
-       "            d[fact] = []\n",
-       "        d[fact].append({\n",
-       "            \"nprobe\": exp[\"search_params\"][\"nprobe\"],\n",
-       "            \"recall\": exp[\"knn_intersection\"],\n",
-       "            \"time\": exp[\"time\"] + exp[\"quantizer\"][\"time\"],\n",
-       "        })\n",
-       "data\n",
-       "# with open(\"/checkpoint/gsz/bench_fw/codecs.json\", \"w\") as f:\n",
-       "#    json.dump(data, f)"
-      ]
-     },
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "e54eebb6-0a9f-4a72-84d2-f12c5bd44510",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-       "ds = \"deep1b\"\n",
-       "data = []\n",
-       "jss = []\n",
-       "root = f\"/checkpoint/gsz/bench_fw/codecs/{ds}\"\n",
-       "results = BIO(root).read_json(f\"result.json\")\n",
-       "for k, e in results[\"experiments\"].items():\n",
-       "    if \"rec\" in k and e['factory'] != 'Flat': # and e['sym_recall'] > 0.0: # and \"PRQ\" in e['factory'] and e['sym_recall'] > 0.0:\n",
-       "        code_size = results['indices'][e['codec']]['sa_code_size']\n",
-       "        codec_size = results['indices'][e['codec']]['codec_size']\n",
-       "        training_time = results['indices'][e['codec']]['training_time']\n",
-       "        # training_size = results['indices'][e['codec']]['training_size']\n",
-       "        cpu = e['cpu'] if 'cpu' in e else \"\"\n",
-       "        ps = ', '.join([f\"{k}={v}\" for k,v in e['construction_params'][0].items()]) if e['construction_params'] else \" \"\n",
-       "        eps = ', '.join([f\"{k}={v}\" for k,v in e['reconstruct_params'].items() if k != \"snap\"]) if e['reconstruct_params'] else \" \"\n",
-       "        data.append((code_size, f\"|{e['factory']}|{ps}|{eps}|{code_size}|{pretty_size(codec_size)}|{pretty_time(training_time)}|{training_size}|{pretty_mse(e['mse'])}|{e['sym_recall']}|{e['asym_recall']}|{pretty_time(e['encode_time'])}|{pretty_time(e['decode_time'])}|{cpu}|\"))\n",
-       "        jss.append({\n",
-       "            'factory': e['factory'],\n",
-       "            'parameters': e['construction_params'][0] if e['construction_params'] else \"\",\n",
-       "            'evaluation_params': e['reconstruct_params'],\n",
-       "            'code_size': code_size,\n",
-       "            'codec_size': codec_size,\n",
-       "            'training_time': training_time,\n",
-       "            'training_size': training_size,\n",
-       "            'mse': e['mse'],\n",
-       "            'sym_recall': e['sym_recall'],\n",
-       "            'asym_recall': e['asym_recall'],\n",
-       "            'encode_time': e['encode_time'],\n",
-       "            'decode_time': e['decode_time'],\n",
-       "            'cpu': cpu,\n",
-       "        })\n",
-       "\n",
-       "print(\"|factory key|construction parameters|evaluation parameters|code size|codec size|training time|training size|mean squared error|sym recall @ 1|asym recall @ 1|encode time|decode time|cpu|\")\n",
-       "print(\"|-|-|-|-|-|-|-|-|-|\")\n",
-       "data.sort()\n",
-       "for d in data:\n",
-       "    print(d[1])\n",
-       "\n",
-       "with open(f\"/checkpoint/gsz/bench_fw/codecs_{ds}_test.json\", \"w\") as f:\n",
-       "    json.dump(jss, f)"
-      ]
-     },
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "d1216733-9670-407c-b3d2-5f87bce0321c",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-       "def read_file(filename: str, keys):\n",
-       "    results = []\n",
-       "    with ZipFile(filename, \"r\") as zip_file:\n",
-       "        for key in keys:\n",
-       "            with zip_file.open(key, \"r\") as f:\n",
-       "                if key in [\"D\", \"I\", \"R\", \"lims\"]:\n",
-       "                    results.append(np.load(f))\n",
-       "                elif key in [\"P\"]:\n",
-       "                    t = io.TextIOWrapper(f)\n",
-       "                    results.append(json.load(t))\n",
-       "                else:\n",
-       "                    raise AssertionError()\n",
-       "    return results"
-      ]
-     },
-     {
-      "cell_type": "code",
-      "execution_count": null,
-      "id": "56de051e-22db-4bef-b242-1ddabc9e0bb9",
-      "metadata": {},
-      "outputs": [],
-      "source": [
-       "ds = \"contriever\"\n",
-       "data = []\n",
-       "jss = []\n",
-       "root = f\"/checkpoint/gsz/bench_fw/codecs/{ds}\"\n",
-       "for lf in glob.glob(root + '/*rec*.zip'):\n",
-       "    e, = read_file(lf, ['P'])\n",
-       "    if e['factory'] != 'Flat': # and e['sym_recall'] > 0.0: # and \"PRQ\" in e['factory'] and e['sym_recall'] > 0.0:\n",
-       "        code_size = e['codec_meta']['sa_code_size']\n",
-       "        codec_size = e['codec_meta']['codec_size']\n",
-       "        training_time = e['codec_meta']['training_time']\n",
-       "        training_size = None # e['codec_meta']['training_size']\n",
-       "        cpu = e['cpu'] if 'cpu' in e else \"\"\n",
-       "        ps = ', '.join([f\"{k}={v}\" for k,v in e['construction_params'][0].items()]) if e['construction_params'] else \" \"\n",
-       "        eps = ', '.join([f\"{k}={v}\" for k,v in e['reconstruct_params'].items() if k != \"snap\"]) if e['reconstruct_params'] else \" \"\n",
-       "        if eps in ps and eps != \"encode_ils_iters=16\" and eps != \"max_beam_size=32\":\n",
-       "           eps = \" \"\n",
-       "        data.append((code_size, f\"|{e['factory']}|{ps}|{eps}|{code_size}|{pretty_size(codec_size)}|{pretty_time(training_time)}|{pretty_mse(e['mse'])}|{e['sym_recall']}|{e['asym_recall']}|{pretty_time(e['encode_time'])}|{pretty_time(e['decode_time'])}|{cpu}|\"))\n",
-       "        eps = e['reconstruct_params']\n",
-       "        del eps['snap']\n",
-       "        params = copy(e['construction_params'][0]) if e['construction_params'] else {}\n",
-       "        for k, v in e['reconstruct_params'].items():\n",
-       "            params[k] = v\n",
-       "        jss.append({\n",
-       "            'factory': e['factory'],\n",
-       "            'params': params,\n",
-       "            'construction_params': e['construction_params'][0] if e['construction_params'] else {},\n",
-       "            'evaluation_params': e['reconstruct_params'],\n",
-       "            'code_size': code_size,\n",
-       "            'codec_size': codec_size,\n",
-       "            'training_time': training_time,\n",
-       "            # 'training_size': training_size,\n",
-       "            'mse': e['mse'],\n",
-       "            'sym_recall': e['sym_recall'],\n",
-       "            'asym_recall': e['asym_recall'],\n",
-       "            'encode_time': e['encode_time'],\n",
-       "            'decode_time': e['decode_time'],\n",
-       "            'cpu': cpu,\n",
-       "        })\n",
-       "\n",
-       "print(\"|factory key|construction parameters|encode/decode parameters|code size|codec size|training time|mean squared error|sym recall @ 1|asym recall @ 1|encode time|decode time|cpu|\")\n",
-       "print(\"|-|-|-|-|-|-|-|-|-|\")\n",
-       "data.sort()\n",
-       "# for d in data:\n",
-       "#   print(d[1])\n",
-       "\n",
-       "print(len(data))\n",
-       "\n",
-       "with open(f\"/checkpoint/gsz/bench_fw/codecs_{ds}_5.json\", \"w\") as f:\n",
-       "    json.dump(jss, f)"
-      ]
-     }
-    ],
-    "metadata": {
-     "kernelspec": {
-      "display_name": "Python [conda env:.conda-faiss_from_source] *",
-      "language": "python",
-      "name": "conda-env-.conda-faiss_from_source-py"
-     },
-     "language_info": {
-      "codemirror_mode": {
-       "name": "ipython",
-       "version": 3
-      },
-      "file_extension": ".py",
-      "mimetype": "text/x-python",
-      "name": "python",
-      "nbconvert_exporter": "python",
-      "pygments_lexer": "ipython3",
-      "version": "3.11.5"
-     }
-    },
-    "nbformat": 4,
-    "nbformat_minor": 5
-   }
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "be081589-e1b2-4569-acb7-44203e273899",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import itertools\n",
+    "from faiss.contrib.evaluation import OperatingPoints\n",
+    "from enum import Enum\n",
+    "from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO as BIO\n",
+    "from faiss.benchs.bench_fw.utils import filter_results, ParetoMode, ParetoMetric\n",
+    "from copy import copy\n",
+    "import numpy as np\n",
+    "import datetime\n",
+    "import glob\n",
+    "import io\n",
+    "import json\n",
+    "from zipfile import ZipFile\n",
+    "import tabulate"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a6492e95-24c7-4425-bf0a-27e10e879ca6",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import getpass\n",
+    "username = getpass.getuser()\n",
+    "root = f\"/home/{username}/simsearch/data/ivf/results/sift1M\"\n",
+    "results = BIO(root).read_json(\"result.json\")\n",
+    "results.keys()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0875d269-aef4-426d-83dd-866970f43777",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "results['experiments']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "f080a6e2-1565-418b-8732-4adeff03a099",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "def plot_metric(experiments, accuracy_title, cost_title, plot_space=False, plot=None):\n",
+    "    if plot is None:\n",
+    "        plot = plt.subplot()\n",
+    "    x = {}\n",
+    "    y = {}\n",
+    "    for accuracy, space, time, k, v in experiments:\n",
+    "        idx_name = v['index'] + (\"snap\" if 'search_params' in v and v['search_params'][\"snap\"] == 1 else \"\")\n",
+    "        if idx_name not in x:\n",
+    "            x[idx_name] = []\n",
+    "            y[idx_name] = []\n",
+    "        x[idx_name].append(accuracy)\n",
+    "        if plot_space:\n",
+    "            y[idx_name].append(space)\n",
+    "        else:\n",
+    "            y[idx_name].append(time)\n",
+    "\n",
+    "    #plt.figure(figsize=(10,6))\n",
+    "    #plt.title(accuracy_title)\n",
+    "    plot.set_xlabel(accuracy_title)\n",
+    "    plot.set_ylabel(cost_title)\n",
+    "    plot.set_yscale(\"log\")\n",
+    "    marker = itertools.cycle((\"o\", \"v\", \"^\", \"<\", \">\", \"s\", \"p\", \"P\", \"*\", \"h\", \"X\", \"D\"))    \n",
+    "    for index in x.keys():\n",
+    "        plot.plot(x[index], y[index], marker=next(marker), label=index, linewidth=0)\n",
+    "    plot.legend(bbox_to_anchor=(1, 1), loc='upper left')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "61007155-5edc-449e-835e-c141a01a2ae5",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# index local optima\n",
+    "accuracy_metric = \"knn_intersection\"\n",
+    "fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1, min_accuracy=0.95)\n",
+    "plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 32 cores)\", plot_space=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f9f94dcc-5abe-4cad-9619-f5d1d24fb8c1",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# global optima\n",
+    "accuracy_metric = \"knn_intersection\"\n",
+    "fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0.25, name_filter=lambda n: not n.startswith(\"Flat\"), pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
+    "#fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0.90, max_space=64, max_time=0, name_filter=lambda n: not n.startswith(\"Flat\"), pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
+    "plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 32 cores)\", plot_space=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0c10f587-26ef-49ec-83a9-88f6a2a433e8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def pretty_params(p):\n",
+    "    p = copy(p)\n",
+    "    if 'snap' in p and p['snap'] == 0:\n",
+    "        del p['snap']\n",
+    "    return p\n",
+    "    \n",
+    "tabulate.tabulate([(accuracy, space, time, v['factory'], pretty_params(v['construction_params'][1]), pretty_params(v['search_params'])) \n",
+    "                for accuracy, space, time, k, v in fr],\n",
+    "                tablefmt=\"html\",\n",
+    "                headers=[\"accuracy\",\"space\", \"time\", \"factory\", \"quantizer cfg\", \"search cfg\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "36e82084-18f6-4546-a717-163eb0224ee8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# index local optima @ precision 0.8\n",
+    "precision = 0.8\n",
+    "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
+    "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
+    "plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aff79376-39f7-47c0-8b83-1efe5192bb7e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# index local optima @ precision 0.2\n",
+    "precision = 0.2\n",
+    "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
+    "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
+    "plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b4834f1f-bbbe-4cae-9aa0-a459b0c842d1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# global optima @ precision 0.8\n",
+    "precision = 0.8\n",
+    "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
+    "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
+    "plot_metric(fr, accuracy_title=f\"range recall @ precision {precision}\", cost_title=\"time (seconds, 16 cores)\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9aead830-6209-4956-b7ea-4a5e0029d616",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def plot_range_search_pr_curves(experiments):\n",
+    "    x = {}\n",
+    "    y = {}\n",
+    "    show = {\n",
+    "        'Flat': None,\n",
+    "    }\n",
+    "    for _, _, _, k, v in fr:\n",
+    "        if \".weighted\" in k: # and v['index'] in show:\n",
+    "            x[k] = v['range_search_pr']['recall']\n",
+    "            y[k] = v['range_search_pr']['precision']\n",
+    "    \n",
+    "    plt.title(\"range search recall\")\n",
+    "    plt.xlabel(\"recall\")\n",
+    "    plt.ylabel(\"precision\")\n",
+    "    for index in x.keys():\n",
+    "        plt.plot(x[index], y[index], '.', label=index)\n",
+    "    plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "92e45502-7a31-4a15-90df-fa3032d7d350",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "precision = 0.8\n",
+    "accuracy_metric = lambda exp: range_search_recall_at_precision(exp, precision)\n",
+    "fr = filter_results(results, evaluation=\"weighted\", accuracy_metric=accuracy_metric, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME_SPACE, scaling_factor=1)\n",
+    "plot_range_search_pr_curves(fr)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fdf8148a-0da6-4c5e-8d60-f8f85314574c",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "root = \"/checkpoint/gsz/bench_fw/ivf/bigann\"\n",
+    "scales = [1, 2, 5, 10, 20, 50]\n",
+    "fig, plots = plt.subplots(len(scales), sharex=True, figsize=(5,25))\n",
+    "fig.tight_layout()\n",
+    "for plot, scale in zip(plots, scales, strict=True):\n",
+    "    results = BIO(root).read_json(f\"result{scale}.json\")\n",
+    "    accuracy_metric = \"knn_intersection\"\n",
+    "    fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0.9, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
+    "    plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 64 cores)\", plot=plot)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e503828c-ee61-45f7-814b-cce6461109bc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "x = {}\n",
+    "y = {}\n",
+    "accuracy=0.9\n",
+    "root = \"/checkpoint/gsz/bench_fw/ivf/bigann\"\n",
+    "scales = [1, 2, 5, 10, 20, 50]\n",
+    "#fig, plots = plt.subplots(len(scales), sharex=True, figsize=(5,25))\n",
+    "#fig.tight_layout()\n",
+    "for scale in scales:\n",
+    "    results = BIO(root).read_json(f\"result{scale}.json\")\n",
+    "    scale *= 1_000_000\n",
+    "    accuracy_metric = \"knn_intersection\"\n",
+    "    fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=accuracy, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
+    "    seen = set()\n",
+    "    print(scale)\n",
+    "    for _, _, _, _, exp in fr:\n",
+    "        fact = exp[\"factory\"]\n",
+    "        # \"HNSW\" in fact or \n",
+    "        if fact in seen or fact in [\"Flat\", \"IVF512,Flat\", \"IVF1024,Flat\", \"IVF2048,Flat\"]:\n",
+    "            continue\n",
+    "        seen.add(fact)\n",
+    "        if fact not in x:\n",
+    "            x[fact] = []\n",
+    "            y[fact] = []\n",
+    "        x[fact].append(scale)\n",
+    "        y[fact].append(exp[\"time\"] + exp[\"quantizer\"][\"time\"])\n",
+    "        if (exp[\"knn_intersection\"] > 0.92):\n",
+    "            print(fact)\n",
+    "            print(exp[\"search_params\"])\n",
+    "            print(exp[\"knn_intersection\"])\n",
+    "\n",
+    "        #plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"time (seconds, 64 cores)\", plot=plot)\n",
+    "    \n",
+    "plt.title(f\"recall @ 1 = {accuracy*100}%\")\n",
+    "plt.xlabel(\"database size\")\n",
+    "plt.ylabel(\"time\")\n",
+    "plt.xscale(\"log\")\n",
+    "plt.yscale(\"log\")\n",
+    "\n",
+    "marker = itertools.cycle((\"o\", \"v\", \"^\", \"<\", \">\", \"s\", \"p\", \"P\", \"*\", \"h\", \"X\", \"D\"))    \n",
+    "for index in x.keys():\n",
+    "    if \"HNSW\" in index:\n",
+    "        plt.plot(x[index], y[index], label=index, linewidth=1, marker=next(marker), linestyle=\"dashed\")\n",
+    "    else:\n",
+    "        plt.plot(x[index], y[index], label=index, linewidth=1, marker=next(marker))\n",
+    "plt.legend(bbox_to_anchor=(1.0, 1.0), loc='upper left')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "37a99bb2-f998-461b-a345-7cc6e702cb3a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# global optima\n",
+    "accuracy_metric = \"sym_recall\"\n",
+    "fr = filter_results(results, evaluation=\"rec\", accuracy_metric=accuracy_metric, time_metric=lambda e:e['encode_time'], min_accuracy=0.9, pareto_mode=ParetoMode.GLOBAL, pareto_metric=ParetoMetric.SPACE, scaling_factor=1)\n",
+    "plot_metric(fr, accuracy_title=\"knn intersection\", cost_title=\"space\", plot_space=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c973ce4e-3566-4f02-bd93-f113e3e0c791",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def pretty_time(s):\n",
+    "    if s is None:\n",
+    "        return \"None\"\n",
+    "    s = int(s * 1000) / 1000\n",
+    "    m, s = divmod(s, 60)\n",
+    "    h, m = divmod(m, 60)\n",
+    "    d, h = divmod(h, 24)\n",
+    "    r = \"\"\n",
+    "    if d > 0:\n",
+    "        r += f\"{int(d)}d \"\n",
+    "    if h > 0:\n",
+    "        r += f\"{int(h)}h \"\n",
+    "    if m > 0:\n",
+    "        r += f\"{int(m)}m \"\n",
+    "    if s > 0 or len(r) == 0:\n",
+    "        r += f\"{s:.3f}s\"\n",
+    "    return r\n",
+    "\n",
+    "def pretty_size(s):\n",
+    "    if s > 1024 * 1024:\n",
+    "        return f\"{s / 1024 / 1024:.1f}\".rstrip('0').rstrip('.') + \"MB\"\n",
+    "    if s > 1024:\n",
+    "        return f\"{s / 1024:.1f}\".rstrip('0').rstrip('.') + \"KB\"\n",
+    "    return f\"{s}\"\n",
+    "\n",
+    "def pretty_mse(m):\n",
+    "    if m is None:\n",
+    "        return \"None\"\n",
+    "    else:\n",
+    "        return f\"{m:.6f}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1ddcf226-fb97-4a59-9fc3-3ed8f7d5e703",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = {}\n",
+    "root = \"/checkpoint/gsz/bench_fw/bigann\"\n",
+    "scales = [1, 2, 5, 10, 20, 50]\n",
+    "for scale in scales:\n",
+    "    results = BIO(root).read_json(f\"result{scale}.json\")\n",
+    "    accuracy_metric = \"knn_intersection\"\n",
+    "    fr = filter_results(results, evaluation=\"knn\", accuracy_metric=accuracy_metric, min_accuracy=0, pareto_mode=ParetoMode.INDEX, pareto_metric=ParetoMetric.TIME, scaling_factor=1)\n",
+    "    d = {}\n",
+    "    data[f\"{scale}M\"] = d\n",
+    "    for _, _, _, _, exp in fr:\n",
+    "        fact = exp[\"factory\"]\n",
+    "        # \"HNSW\" in fact or \n",
+    "        if fact in [\"Flat\", \"IVF512,Flat\", \"IVF1024,Flat\", \"IVF2048,Flat\"]:\n",
+    "            continue\n",
+    "        if fact not in d:\n",
+    "            d[fact] = []\n",
+    "        d[fact].append({\n",
+    "            \"nprobe\": exp[\"search_params\"][\"nprobe\"],\n",
+    "            \"recall\": exp[\"knn_intersection\"],\n",
+    "            \"time\": exp[\"time\"] + exp[\"quantizer\"][\"time\"],\n",
+    "        })\n",
+    "data\n",
+    "# with open(\"/checkpoint/gsz/bench_fw/codecs.json\", \"w\") as f:\n",
+    "#    json.dump(data, f)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e54eebb6-0a9f-4a72-84d2-f12c5bd44510",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds = \"deep1b\"\n",
+    "data = []\n",
+    "jss = []\n",
+    "root = f\"/checkpoint/gsz/bench_fw/codecs/{ds}\"\n",
+    "results = BIO(root).read_json(f\"result.json\")\n",
+    "for k, e in results[\"experiments\"].items():\n",
+    "    if \"rec\" in k and e['factory'] != 'Flat': # and e['sym_recall'] > 0.0: # and \"PRQ\" in e['factory'] and e['sym_recall'] > 0.0:\n",
+    "        code_size = results['indices'][e['codec']]['sa_code_size']\n",
+    "        codec_size = results['indices'][e['codec']]['codec_size']\n",
+    "        training_time = results['indices'][e['codec']]['training_time']\n",
+    "        # training_size = results['indices'][e['codec']]['training_size']\n",
+    "        cpu = e['cpu'] if 'cpu' in e else \"\"\n",
+    "        ps = ', '.join([f\"{k}={v}\" for k,v in e['construction_params'][0].items()]) if e['construction_params'] else \" \"\n",
+    "        eps = ', '.join([f\"{k}={v}\" for k,v in e['reconstruct_params'].items() if k != \"snap\"]) if e['reconstruct_params'] else \" \"\n",
+    "        data.append((code_size, f\"|{e['factory']}|{ps}|{eps}|{code_size}|{pretty_size(codec_size)}|{pretty_time(training_time)}|{training_size}|{pretty_mse(e['mse'])}|{e['sym_recall']}|{e['asym_recall']}|{pretty_time(e['encode_time'])}|{pretty_time(e['decode_time'])}|{cpu}|\"))\n",
+    "        jss.append({\n",
+    "            'factory': e['factory'],\n",
+    "            'parameters': e['construction_params'][0] if e['construction_params'] else \"\",\n",
+    "            'evaluation_params': e['reconstruct_params'],\n",
+    "            'code_size': code_size,\n",
+    "            'codec_size': codec_size,\n",
+    "            'training_time': training_time,\n",
+    "            'training_size': training_size,\n",
+    "            'mse': e['mse'],\n",
+    "            'sym_recall': e['sym_recall'],\n",
+    "            'asym_recall': e['asym_recall'],\n",
+    "            'encode_time': e['encode_time'],\n",
+    "            'decode_time': e['decode_time'],\n",
+    "            'cpu': cpu,\n",
+    "        })\n",
+    "\n",
+    "print(\"|factory key|construction parameters|evaluation parameters|code size|codec size|training time|training size|mean squared error|sym recall @ 1|asym recall @ 1|encode time|decode time|cpu|\")\n",
+    "print(\"|-|-|-|-|-|-|-|-|-|\")\n",
+    "data.sort()\n",
+    "for d in data:\n",
+    "    print(d[1])\n",
+    "\n",
+    "with open(f\"/checkpoint/gsz/bench_fw/codecs_{ds}_test.json\", \"w\") as f:\n",
+    "    json.dump(jss, f)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d1216733-9670-407c-b3d2-5f87bce0321c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def read_file(filename: str, keys):\n",
+    "    results = []\n",
+    "    with ZipFile(filename, \"r\") as zip_file:\n",
+    "        for key in keys:\n",
+    "            with zip_file.open(key, \"r\") as f:\n",
+    "                if key in [\"D\", \"I\", \"R\", \"lims\"]:\n",
+    "                    results.append(np.load(f))\n",
+    "                elif key in [\"P\"]:\n",
+    "                    t = io.TextIOWrapper(f)\n",
+    "                    results.append(json.load(t))\n",
+    "                else:\n",
+    "                    raise AssertionError()\n",
+    "    return results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "56de051e-22db-4bef-b242-1ddabc9e0bb9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds = \"contriever\"\n",
+    "data = []\n",
+    "jss = []\n",
+    "root = f\"/checkpoint/gsz/bench_fw/codecs/{ds}\"\n",
+    "for lf in glob.glob(root + '/*rec*.zip'):\n",
+    "    e, = read_file(lf, ['P'])\n",
+    "    if e['factory'] != 'Flat': # and e['sym_recall'] > 0.0: # and \"PRQ\" in e['factory'] and e['sym_recall'] > 0.0:\n",
+    "        code_size = e['codec_meta']['sa_code_size']\n",
+    "        codec_size = e['codec_meta']['codec_size']\n",
+    "        training_time = e['codec_meta']['training_time']\n",
+    "        training_size = None # e['codec_meta']['training_size']\n",
+    "        cpu = e['cpu'] if 'cpu' in e else \"\"\n",
+    "        ps = ', '.join([f\"{k}={v}\" for k,v in e['construction_params'][0].items()]) if e['construction_params'] else \" \"\n",
+    "        eps = ', '.join([f\"{k}={v}\" for k,v in e['reconstruct_params'].items() if k != \"snap\"]) if e['reconstruct_params'] else \" \"\n",
+    "        if eps in ps and eps != \"encode_ils_iters=16\" and eps != \"max_beam_size=32\":\n",
+    "           eps = \" \"\n",
+    "        data.append((code_size, f\"|{e['factory']}|{ps}|{eps}|{code_size}|{pretty_size(codec_size)}|{pretty_time(training_time)}|{pretty_mse(e['mse'])}|{e['sym_recall']}|{e['asym_recall']}|{pretty_time(e['encode_time'])}|{pretty_time(e['decode_time'])}|{cpu}|\"))\n",
+    "        eps = e['reconstruct_params']\n",
+    "        del eps['snap']\n",
+    "        params = copy(e['construction_params'][0]) if e['construction_params'] else {}\n",
+    "        for k, v in e['reconstruct_params'].items():\n",
+    "            params[k] = v\n",
+    "        jss.append({\n",
+    "            'factory': e['factory'],\n",
+    "            'params': params,\n",
+    "            'construction_params': e['construction_params'][0] if e['construction_params'] else {},\n",
+    "            'evaluation_params': e['reconstruct_params'],\n",
+    "            'code_size': code_size,\n",
+    "            'codec_size': codec_size,\n",
+    "            'training_time': training_time,\n",
+    "            # 'training_size': training_size,\n",
+    "            'mse': e['mse'],\n",
+    "            'sym_recall': e['sym_recall'],\n",
+    "            'asym_recall': e['asym_recall'],\n",
+    "            'encode_time': e['encode_time'],\n",
+    "            'decode_time': e['decode_time'],\n",
+    "            'cpu': cpu,\n",
+    "        })\n",
+    "\n",
+    "print(\"|factory key|construction parameters|encode/decode parameters|code size|codec size|training time|mean squared error|sym recall @ 1|asym recall @ 1|encode time|decode time|cpu|\")\n",
+    "print(\"|-|-|-|-|-|-|-|-|-|\")\n",
+    "data.sort()\n",
+    "# for d in data:\n",
+    "#   print(d[1])\n",
+    "\n",
+    "print(len(data))\n",
+    "\n",
+    "with open(f\"/checkpoint/gsz/bench_fw/codecs_{ds}_5.json\", \"w\") as f:\n",
+    "    json.dump(jss, f)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "faiss_binary (local)",
+   "language": "python",
+   "name": "faiss_binary_local"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From c9e23efdd2b72be3ffe4b09e60980e6969ff2683 Mon Sep 17 00:00:00 2001
From: Kumar Saurabh Arora <kuarora@meta.com>
Date: Fri, 21 Jun 2024 13:04:09 -0700
Subject: [PATCH 064/148] Refactor bench_fw to support train, build & search in
 parallel (#3527)

Summary:
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3527

**Context**
Design Doc: [Faiss Benchmarking](https://docs.google.com/document/d/1c7zziITa4RD6jZsbG9_yOgyRjWdyueldSPH6QdZzL98/edit)

**In this diff**
1. Be able to reference codec and index from blobstore (bucket & path) outside the experiment
2. To support #1, naming is moved to descriptors.
3. The built index can be written out as well.
4. You can run the benchmark with train, then refer to the trained codec in the index build, and then refer to the built index in knn search. Index serialization is optional, although this is not yet exposed through the index descriptor.
5. The benchmark can support indices with different dataset sizes.
6. Working with varying datasets now supports multiple ground truths. Some small fixes may be needed before this can be used.
7. Added targets for bench_fw_range, ivf, codecs and optimize.

**Analysis of ivf result**: D58823037

Reviewed By: algoriddle

Differential Revision: D57236543

fbshipit-source-id: ad03b28bae937a35f8c20f12e0a5b0a27c34ff3b
---
 benchs/bench_fw/benchmark.py    | 763 +++++++++++++++++++++++++-------
 benchs/bench_fw/benchmark_io.py |   6 +-
 benchs/bench_fw/descriptors.py  | 215 ++++++++-
 benchs/bench_fw/index.py        | 115 +++--
 benchs/bench_fw/optimize.py     |  16 +-
 benchs/bench_fw_codecs.py       |  10 +-
 benchs/bench_fw_ivf.py          |  20 +-
 benchs/bench_fw_optimize.py     |   6 +-
 benchs/bench_fw_range.py        |  20 +-
 9 files changed, 906 insertions(+), 265 deletions(-)

diff --git a/benchs/bench_fw/benchmark.py b/benchs/bench_fw/benchmark.py
index 8ca68c4cd8..237d08bd9a 100644
--- a/benchs/bench_fw/benchmark.py
+++ b/benchs/bench_fw/benchmark.py
@@ -4,8 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import logging
-from copy import copy
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from operator import itemgetter
 from statistics import mean, median
 from typing import Any, Dict, List, Optional
@@ -16,7 +15,16 @@
 
 from scipy.optimize import curve_fit
 
-from .descriptors import DatasetDescriptor, IndexDescriptor
+from .benchmark_io import BenchmarkIO
+
+from .descriptors import (
+    CodecDescriptor,
+    DatasetDescriptor,
+    IndexDescriptor,
+    IndexDescriptorClassic,
+    KnnDescriptor,
+)
+
 from .index import Index, IndexFromCodec, IndexFromFactory
 
 from .utils import dict_merge
@@ -185,15 +193,9 @@ def sigmoid(x, a, b, c):
 
 
 @dataclass
-class Benchmark:
+class IndexOperator:
     num_threads: int
-    training_vectors: Optional[DatasetDescriptor] = None
-    database_vectors: Optional[DatasetDescriptor] = None
-    query_vectors: Optional[DatasetDescriptor] = None
-    index_descs: Optional[List[IndexDescriptor]] = None
-    range_ref_index_desc: Optional[str] = None
-    k: Optional[int] = None
-    distance_metric: str = "L2"
+    distance_metric: str
 
     def __post_init__(self):
         if self.distance_metric == "IP":
@@ -203,20 +205,167 @@ def __post_init__(self):
         else:
             raise ValueError
 
-    def set_io(self, benchmark_io):
+    def set_io(self, benchmark_io: BenchmarkIO):
         self.io = benchmark_io
         self.io.distance_metric = self.distance_metric
         self.io.distance_metric_type = self.distance_metric_type
 
-    def get_index_desc(self, factory_or_codec: str) -> Optional[IndexDescriptor]:
+
+@dataclass
+class TrainOperator(IndexOperator):
+    codec_descs: List[CodecDescriptor] = field(default_factory=lambda: [])
+
+    def get_desc(self, name: str) -> Optional[CodecDescriptor]:
+        for desc in self.codec_descs:
+            if desc.get_name() == name:
+                return desc
+            elif desc.factory == name:
+                return desc
+        return None
+
+    def get_flat_desc(self, name=None) -> Optional[CodecDescriptor]:
+        for desc in self.codec_descs:
+            desc_name = desc.get_name()
+            if desc_name == name:
+                return desc
+            if desc_name.startswith("Flat"):
+                return desc
+        return None
+
+    def build_index_wrapper(self, codec_desc: CodecDescriptor):
+        if hasattr(codec_desc, "index"):
+            return
+
+        if codec_desc.factory is not None:
+            assert (
+                codec_desc.factory == "Flat" or codec_desc.training_vectors is not None
+            )
+            index = IndexFromFactory(
+                num_threads=self.num_threads,
+                d=codec_desc.d,
+                metric=self.distance_metric,
+                construction_params=codec_desc.construction_params,
+                factory=codec_desc.factory,
+                training_vectors=codec_desc.training_vectors,
+                codec_name=codec_desc.get_name(),
+            )
+            index.set_io(self.io)
+            codec_desc.index = index
+        else:
+            assert codec_desc.is_trained()
+
+    def train(
+        self, codec_desc: CodecDescriptor, results: Dict[str, Any], dry_run=False
+    ):
+        self.build_index_wrapper(codec_desc)
+        if codec_desc.is_trained():
+            return results, None
+
+        if dry_run:
+            meta, requires = codec_desc.index.fetch_meta(dry_run=dry_run)
+        else:
+            codec_desc.index.get_codec()
+            meta, requires = codec_desc.index.fetch_meta(dry_run=dry_run)
+            assert requires is None
+
+        if requires is None:
+            results["indices"][codec_desc.get_name()] = meta
+        return results, requires
+
+
+@dataclass
+class BuildOperator(IndexOperator):
+    index_descs: List[IndexDescriptor] = field(default_factory=lambda: [])
+
+    def get_desc(self, name: str) -> Optional[IndexDescriptor]:
+        for desc in self.index_descs:
+            if desc.get_name() == name:
+                return desc
+        return None
+
+    def get_flat_desc(self, name=None) -> Optional[IndexDescriptor]:
         for desc in self.index_descs:
-            if desc.factory == factory_or_codec:
+            desc_name = desc.get_name()
+            if desc_name == name:
                 return desc
-            if desc.codec_alias == factory_or_codec:
+            if desc_name.startswith("Flat"):
+                return desc
+        return None
+
+    def build_index_wrapper(self, index_desc: IndexDescriptor):
+        if hasattr(index_desc, "index"):
+            return
+
+        if hasattr(index_desc.codec_desc, "index"):
+            index_desc.index = index_desc.codec_desc.index
+            index_desc.index.database_vectors = index_desc.database_desc
+            index_desc.index.index_name = index_desc.get_name()
+            return
+
+        if index_desc.codec_desc is not None:
+            index = IndexFromCodec(
+                num_threads=self.num_threads,
+                d=index_desc.d,
+                metric=self.distance_metric,
+                database_vectors=index_desc.database_desc,
+                bucket=index_desc.codec_desc.bucket,
+                path=index_desc.codec_desc.path,
+                index_name=index_desc.get_name(),
+                codec_name=index_desc.codec_desc.get_name(),
+            )
+            index.set_io(self.io)
+            index_desc.index = index
+        else:
+            assert index_desc.is_built()
+
+    def build(self, index_desc: IndexDescriptor, results: Dict[str, Any]):
+        self.build_index_wrapper(index_desc)
+        if index_desc.is_built():
+            return
+        index_desc.index.get_index()
+
+
+@dataclass
+class SearchOperator(IndexOperator):
+    knn_descs: List[KnnDescriptor] = field(default_factory=lambda: [])
+    range: bool = False
+
+    def get_desc(self, name: str) -> Optional[KnnDescriptor]:
+        for desc in self.knn_descs:
+            if desc.get_name() == name:
+                return desc
+        return None
+
+    def get_flat_desc(self, name=None) -> Optional[KnnDescriptor]:
+        for desc in self.knn_descs:
+            if desc.get_name().startswith("Flat"):
                 return desc
         return None
 
-    def range_search_reference(self, index, parameters, range_metric):
+    def build_index_wrapper(self, knn_desc: KnnDescriptor):
+        if hasattr(knn_desc, "index"):
+            return
+
+        if knn_desc.index_desc.index is not None:
+            knn_desc.index = knn_desc.index_desc.index
+            knn_desc.index.knn_name = knn_desc.get_name()
+            knn_desc.index.search_params = knn_desc.search_params
+        else:
+            index = Index(
+                num_threads=self.num_threads,
+                d=knn_desc.d,
+                metric=self.distance_metric,
+                bucket=knn_desc.index_desc.bucket,
+                index_path=knn_desc.index_desc.path,
+                # knn_name=knn_desc.get_name(),
+                search_params=knn_desc.search_params,
+            )
+            index.set_io(self.io)
+            knn_desc.index = index
+
+        knn_desc.index.get_index()
+
+    def range_search_reference(self, index, parameters, range_metric, query_dataset):
         logger.info("range_search_reference: begin")
         if isinstance(range_metric, list):
             assert len(range_metric) > 0
@@ -233,6 +382,7 @@ def range_search_reference(self, index, parameters, range_metric):
             index,
             parameters,
             radius=m_radius,
+            query_dataset=query_dataset,
         )
         flat = index.is_flat_index()
         (
@@ -253,11 +403,11 @@ def range_search_reference(self, index, parameters, range_metric):
             coefficients_training_data,
         )
 
-    def estimate_range(self, index, parameters, range_scoring_radius):
+    def estimate_range(self, index, parameters, range_scoring_radius, query_dataset):
         D, I, R, P, _ = index.knn_search(
             False,
             parameters,
-            self.query_vectors,
+            query_dataset,
             self.k,
         )
         samples = []
@@ -275,6 +425,7 @@ def range_search(
         dry_run,
         index: Index,
         search_parameters: Optional[Dict[str, int]],
+        query_dataset: DatasetDescriptor,
         radius: Optional[float] = None,
         gt_radius: Optional[float] = None,
         range_search_metric_function=None,
@@ -287,25 +438,21 @@ def range_search(
                 gt_radius
                 if index.is_flat()
                 else self.estimate_range(
-                    index,
-                    search_parameters,
-                    gt_radius,
+                    index, search_parameters, gt_radius, query_dataset
                 )
             )
         logger.info(f"Radius={radius}")
         lims, D, I, R, P, requires = index.range_search(
             dry_run=dry_run,
             search_parameters=search_parameters,
-            query_vectors=self.query_vectors,
+            query_vectors=query_dataset,
             radius=radius,
         )
         if requires is not None:
             return None, None, None, None, None, requires
         if range_search_metric_function is not None:
             range_search_metric = range_search_metric_function(R)
-            range_search_pr = range_search_pr_curve(
-                D, range_search_metric, gt_rsm
-            )
+            range_search_pr = range_search_pr_curve(D, range_search_metric, gt_rsm)
             range_score_sum = np.sum(range_search_metric).item()
             P |= {
                 "range_score_sum": range_score_sum,
@@ -314,23 +461,29 @@ def range_search(
             }
         return lims, D, I, R, P, requires
 
-    def range_ground_truth(self, gt_radius, range_search_metric_function):
+    def range_ground_truth(
+        self, gt_radius, range_search_metric_function, flat_desc=None
+    ):
         logger.info("range_ground_truth: begin")
-        flat_desc = self.get_index_desc("Flat")
+        if flat_desc is None:
+            flat_desc = self.get_flat_desc()
         lims, D, I, R, P, _ = self.range_search(
             False,
             flat_desc.index,
             search_parameters=None,
             radius=gt_radius,
+            query_dataset=flat_desc.query_dataset,
         )
         gt_rsm = np.sum(range_search_metric_function(R)).item()
         logger.info("range_ground_truth: end")
         return gt_rsm
 
-    def knn_ground_truth(self):
+    def knn_ground_truth(self, flat_desc=None):
         logger.info("knn_ground_truth: begin")
-        flat_desc = self.get_index_desc("Flat")
+        if flat_desc is None:
+            flat_desc = self.get_flat_desc()
         self.build_index_wrapper(flat_desc)
+        # TODO(kuarora): Consider moving gt results(gt_knn_D, gt_knn_I) to the index as there can be multiple ground truths.
         (
             self.gt_knn_D,
             self.gt_knn_I,
@@ -340,8 +493,8 @@ def knn_ground_truth(self):
         ) = flat_desc.index.knn_search(
             dry_run=False,
             search_parameters=None,
-            query_vectors=self.query_vectors,
-            k=self.k,
+            query_vectors=flat_desc.query_dataset,
+            k=flat_desc.k,
         )
         assert requires is None
         logger.info("knn_ground_truth: end")
@@ -371,6 +524,7 @@ def experiment(parameters, cost_metric, perf_metric):
                 results["experiments"][key] = metrics
             return metrics[cost_metric], metrics[perf_metric], None
 
+        requires = None
         for cost_metric in cost_metrics:
             for perf_metric in perf_metrics:
                 op = index.get_operating_points()
@@ -386,52 +540,52 @@ def experiment(parameters, cost_metric, perf_metric):
         return results, requires
 
     def knn_search_benchmark(
-        self, dry_run, results: Dict[str, Any], index: Index
+        self, dry_run, results: Dict[str, Any], knn_desc: KnnDescriptor
     ):
         return self.search_benchmark(
             name="knn_search",
-            search_func=lambda parameters: index.knn_search(
+            search_func=lambda parameters: knn_desc.index.knn_search(
                 dry_run,
                 parameters,
-                self.query_vectors,
-                self.k,
+                knn_desc.query_dataset,
+                knn_desc.k,
                 self.gt_knn_I,
                 self.gt_knn_D,
             )[3:],
-            key_func=lambda parameters: index.get_knn_search_name(
+            key_func=lambda parameters: knn_desc.index.get_knn_search_name(
                 search_parameters=parameters,
-                query_vectors=self.query_vectors,
-                k=self.k,
+                query_vectors=knn_desc.query_dataset,
+                k=knn_desc.k,
                 reconstruct=False,
             ),
             cost_metrics=["time"],
             perf_metrics=["knn_intersection", "distance_ratio"],
             results=results,
-            index=index,
+            index=knn_desc.index,
         )
 
     def reconstruct_benchmark(
-        self, dry_run, results: Dict[str, Any], index: Index
+        self, dry_run, results: Dict[str, Any], knn_desc: KnnDescriptor
     ):
         return self.search_benchmark(
             name="reconstruct",
-            search_func=lambda parameters: index.reconstruct(
+            search_func=lambda parameters: knn_desc.index.reconstruct(
                 dry_run,
                 parameters,
-                self.query_vectors,
-                self.k,
+                knn_desc.query_dataset,
+                knn_desc.k,
                 self.gt_knn_I,
             ),
-            key_func=lambda parameters: index.get_knn_search_name(
+            key_func=lambda parameters: knn_desc.index.get_knn_search_name(
                 search_parameters=parameters,
-                query_vectors=self.query_vectors,
-                k=self.k,
+                query_vectors=knn_desc.query_dataset,
+                k=knn_desc.k,
                 reconstruct=True,
             ),
             cost_metrics=["encode_time"],
             perf_metrics=["sym_recall"],
             results=results,
-            index=index,
+            index=knn_desc.index,
         )
 
     def range_search_benchmark(
@@ -444,6 +598,7 @@ def range_search_benchmark(
         gt_radius: float,
         range_search_metric_function,
         gt_rsm: float,
+        query_dataset: DatasetDescriptor,
     ):
         return self.search_benchmark(
             name="range_search",
@@ -455,10 +610,11 @@ def range_search_benchmark(
                 gt_radius=gt_radius,
                 range_search_metric_function=range_search_metric_function,
                 gt_rsm=gt_rsm,
+                query_dataset=query_dataset,
             )[4:],
             key_func=lambda parameters: index.get_range_search_name(
                 search_parameters=parameters,
-                query_vectors=self.query_vectors,
+                query_vectors=query_dataset,
                 radius=radius,
             )
             + metric_key,
@@ -468,69 +624,88 @@ def range_search_benchmark(
             index=index,
         )
 
-    def build_index_wrapper(self, index_desc: IndexDescriptor):
-        if hasattr(index_desc, "index"):
-            return
-        if index_desc.factory is not None:
-            training_vectors = copy(self.training_vectors)
-            if index_desc.training_size is not None:
-                training_vectors.num_vectors = index_desc.training_size
-            index = IndexFromFactory(
-                num_threads=self.num_threads,
-                d=self.d,
-                metric=self.distance_metric,
-                database_vectors=self.database_vectors,
-                search_params=index_desc.search_params,
-                construction_params=index_desc.construction_params,
-                factory=index_desc.factory,
-                training_vectors=training_vectors,
-            )
+
+@dataclass
+class ExecutionOperator:
+    distance_metric: str = "L2"
+    num_threads: int = 1
+    train_op: Optional[TrainOperator] = None
+    build_op: Optional[BuildOperator] = None
+    search_op: Optional[SearchOperator] = None
+
+    def __post_init__(self):
+        if self.distance_metric == "IP":
+            self.distance_metric_type = faiss.METRIC_INNER_PRODUCT
+        elif self.distance_metric == "L2":
+            self.distance_metric_type = faiss.METRIC_L2
         else:
-            index = IndexFromCodec(
-                num_threads=self.num_threads,
-                d=self.d,
-                metric=self.distance_metric,
-                database_vectors=self.database_vectors,
-                search_params=index_desc.search_params,
-                construction_params=index_desc.construction_params,
-                path=index_desc.path,
-                bucket=index_desc.bucket,
-            )
-        index.set_io(self.io)
-        index_desc.index = index
+            raise ValueError
 
-    def clone_one(self, index_desc):
-        benchmark = Benchmark(
-            num_threads=self.num_threads,
-            training_vectors=self.training_vectors,
-            database_vectors=self.database_vectors,
-            query_vectors=self.query_vectors,
-            index_descs=[self.get_index_desc("Flat"), index_desc],
-            range_ref_index_desc=self.range_ref_index_desc,
-            k=self.k,
-            distance_metric=self.distance_metric,
-        )
-        benchmark.set_io(self.io.clone())
-        return benchmark
+    def set_io(self, io: BenchmarkIO):
+        self.io = io
+        self.io.distance_metric = self.distance_metric
+        self.io.distance_metric_type = self.distance_metric_type
+        if self.train_op:
+            self.train_op.set_io(io)
+        if self.build_op:
+            self.build_op.set_io(io)
+        if self.search_op:
+            self.search_op.set_io(io)
+
+    def train_one(self, codec_desc: CodecDescriptor, results: Dict[str, Any], dry_run):
+        faiss.omp_set_num_threads(self.num_threads)
+        assert self.train_op is not None
+        self.train_op.train(codec_desc, results, dry_run)
+
+    def train(self, results, dry_run=False):
+        faiss.omp_set_num_threads(self.num_threads)
+        if self.train_op is None:
+            return
+
+        for codec_desc in self.train_op.codec_descs:
+            self.train_one(codec_desc, results, dry_run)
+
+    def build_one(self, results: Dict[str, Any], index_desc: IndexDescriptor):
+        faiss.omp_set_num_threads(self.num_threads)
+        assert self.build_op is not None
+        self.build_op.build(index_desc, results)
+
+    def build(self, results: Dict[str, Any]):
+        """Build every index descriptor registered on build_op (no-op without one)."""
+        faiss.omp_set_num_threads(self.num_threads)
+        if self.build_op is None:
+            return
+        for index_desc in self.build_op.index_descs:
+            self.build_one(results=results, index_desc=index_desc)
+
+    def search(self):
+        faiss.omp_set_num_threads(self.num_threads)
+        if self.search_op is None:
+            return
 
-    def benchmark_one(
+        for index_desc in self.search_op.knn_descs:
+            self.search_one(index_desc)
+
+    def search_one(
         self,
-        dry_run,
+        knn_desc: KnnDescriptor,
         results: Dict[str, Any],
-        index_desc: IndexDescriptor,
-        train,
-        reconstruct,
-        knn,
-        range,
+        dry_run=False,
+        range=False,
     ):
         faiss.omp_set_num_threads(self.num_threads)
+        assert self.search_op is not None
+
         if not dry_run:
-            self.knn_ground_truth()
-        self.build_index_wrapper(index_desc)
-        meta, requires = index_desc.index.fetch_meta(dry_run=dry_run)
+            self.create_gt_knn(knn_desc)
+            self.create_range_ref_knn(knn_desc)
+
+        self.search_op.build_index_wrapper(knn_desc)
+        meta, requires = knn_desc.index.fetch_meta(dry_run=dry_run)
         if requires is not None:
-            return results, (requires if train else None)
-        results["indices"][index_desc.index.get_codec_name()] = meta
+            # return results, (requires if train else None)
+            return results, requires
+        results["indices"][knn_desc.index.get_codec_name()] = meta
 
         # results, requires = self.reconstruct_benchmark(
         #     dry_run=True,
@@ -547,33 +722,32 @@ def benchmark_one(
         #             index=index_desc.index,
         #         )
         #         assert requires is None
-
-        results, requires = self.knn_search_benchmark(
+        results, requires = self.search_op.knn_search_benchmark(
             dry_run=True,
             results=results,
-            index=index_desc.index,
+            knn_desc=knn_desc,
         )
-        if knn and requires is not None:
+        if requires is not None:
             if dry_run:
                 return results, requires
             else:
-                results, requires = self.knn_search_benchmark(
+                results, requires = self.search_op.knn_search_benchmark(
                     dry_run=False,
                     results=results,
-                    index=index_desc.index,
+                    knn_desc=knn_desc,
                 )
                 assert requires is None
 
         if (
-            self.range_ref_index_desc is None
-            or not index_desc.index.supports_range_search()
+            knn_desc.range_ref_index_desc is None or
+            not knn_desc.index.supports_range_search()
         ):
             return results, None
 
-        ref_index_desc = self.get_index_desc(self.range_ref_index_desc)
+        ref_index_desc = self.search_op.get_desc(knn_desc.range_ref_index_desc)
         if ref_index_desc is None:
             raise ValueError(
-                f"Unknown range index {self.range_ref_index_desc}"
+                f"{knn_desc.get_name()}: Unknown range index {knn_desc.range_ref_index_desc}"
             )
         if ref_index_desc.range_metrics is None:
             raise ValueError(
@@ -585,92 +759,360 @@ def benchmark_one(
                 range_search_metric_function,
                 coefficients,
                 coefficients_training_data,
-            ) = self.range_search_reference(
+            ) = self.search_op.range_search_reference(
                 ref_index_desc.index,
                 ref_index_desc.search_params,
                 range_metric,
             )
-            gt_rsm = self.range_ground_truth(
+            gt_rsm = self.search_op.range_ground_truth(
                 gt_radius, range_search_metric_function
             )
-            results, requires = self.range_search_benchmark(
+            results, requires = self.search_op.range_search_benchmark(
                 dry_run=True,
                 results=results,
-                index=index_desc.index,
+                index=knn_desc.index,
                 metric_key=metric_key,
-                radius=index_desc.radius,
+                radius=knn_desc.radius,
                 gt_radius=gt_radius,
                 range_search_metric_function=range_search_metric_function,
                 gt_rsm=gt_rsm,
+                query_vectors=knn_desc.query_dataset,
             )
             if range and requires is not None:
                 if dry_run:
                     return results, requires
                 else:
-                    results, requires = self.range_search_benchmark(
+                    results, requires = self.search_op.range_search_benchmark(
                         dry_run=False,
                         results=results,
-                        index=index_desc.index,
+                        index=knn_desc.index,
                         metric_key=metric_key,
-                        radius=index_desc.radius,
+                        radius=knn_desc.radius,
                         gt_radius=gt_radius,
                         range_search_metric_function=range_search_metric_function,
                         gt_rsm=gt_rsm,
+                        query_vectors=knn_desc.query_dataset,
                     )
                     assert requires is None
 
         return results, None
 
-    def benchmark(
-        self,
-        result_file=None,
-        local=False,
-        train=False,
-        reconstruct=False,
-        knn=False,
-        range=False,
-    ):
-        logger.info("begin evaluate")
+    def create_gt_codec(
+        self, codec_desc, results, train=True
+    ) -> Optional[CodecDescriptor]:
+        gt_codec_desc = None
+        if self.train_op:
+            gt_codec_desc = self.train_op.get_flat_desc(codec_desc.flat_name())
+            if gt_codec_desc is None:
+                gt_codec_desc = CodecDescriptor(
+                    factory="Flat",
+                    d=codec_desc.d,
+                    metric=codec_desc.metric,
+                    num_threads=self.num_threads,
+                )
+                self.train_op.codec_descs.insert(0, gt_codec_desc)
+            if train:
+                self.train_op.train(gt_codec_desc, results, dry_run=False)
 
-        faiss.omp_set_num_threads(self.num_threads)
-        results = {"indices": {}, "experiments": {}}
-        xq = self.io.get_dataset(self.query_vectors)
-        self.d = xq.shape[1]
-        if self.get_index_desc("Flat") is None:
-            self.index_descs.append(IndexDescriptor(factory="Flat"))
+        return gt_codec_desc
 
-        self.knn_ground_truth()
+    def create_gt_index(
+        self, index_desc: IndexDescriptor, results: Dict[str, Any], build=True
+    ) -> Optional[IndexDescriptor]:
+        gt_index_desc = None
+        if self.build_op:
+            gt_index_desc = self.build_op.get_flat_desc(index_desc.flat_name())
+            if gt_index_desc is None:
+                gt_codec_desc = self.train_op.get_flat_desc(
+                    index_desc.codec_desc.flat_name()
+                )
+                assert gt_codec_desc is not None
+                gt_index_desc = IndexDescriptor(
+                    d=index_desc.d,
+                    metric=index_desc.metric,
+                    num_threads=self.num_threads,
+                    codec_desc=gt_codec_desc,
+                    database_desc=index_desc.database_desc,
+                )
+                self.build_op.index_descs.insert(0, gt_index_desc)
+            if build:
+                self.build_op.build(gt_index_desc, results)
 
-        if self.range_ref_index_desc is not None:
-            index_desc = self.get_index_desc(self.range_ref_index_desc)
-            if index_desc is None:
-                raise ValueError(
-                    f"Unknown range index {self.range_ref_index_desc}"
+        return gt_index_desc
+
+    def create_gt_knn(self, knn_desc, search=True) -> Optional[KnnDescriptor]:
+        gt_knn_desc = None
+        if self.search_op:
+            gt_knn_desc = self.search_op.get_flat_desc(knn_desc.flat_name())
+            if gt_knn_desc is None:
+                gt_index_desc = self.build_op.get_flat_desc(
+                    knn_desc.index_desc.flat_name()
+                )
+                assert gt_index_desc is not None
+                gt_knn_desc = KnnDescriptor(
+                    d=knn_desc.d,
+                    metric=knn_desc.metric,
+                    num_threads=self.num_threads,
+                    index_desc=gt_index_desc,
+                    query_dataset=knn_desc.query_dataset,
+                    k=knn_desc.k,
                 )
-            if index_desc.range_metrics is None:
+                self.search_op.knn_descs.insert(0, gt_knn_desc)
+            if search:
+                self.search_op.build_index_wrapper(gt_knn_desc)
+                self.search_op.knn_ground_truth(gt_knn_desc)
+
+        return gt_knn_desc
+
+    def create_range_ref_knn(self, knn_desc):
+        if (
+            knn_desc.range_ref_index_desc is None or
+            not knn_desc.index.supports_range_search()
+        ):
+            return
+
+        if knn_desc.range_ref_index_desc is not None:
+            ref_index_desc = self.get_desc(knn_desc.range_ref_index_desc)
+            if ref_index_desc is None:
+                raise ValueError(f"Unknown range index {knn_desc.range_ref_index_desc}")
+            if ref_index_desc.range_metrics is None:
                 raise ValueError(
-                    f"Range index {index_desc.factory} has no radius_score"
+                    f"Range index {knn_desc.get_name()} has no radius_score"
                 )
             results["metrics"] = {}
-            self.build_index_wrapper(index_desc)
-            for metric_key, range_metric in index_desc.range_metrics.items():
+            self.build_index_wrapper(ref_index_desc)
+            for metric_key, range_metric in ref_index_desc.range_metrics.items():
                 (
-                    gt_radius,
+                    knn_desc.gt_radius,
                     range_search_metric_function,
                     coefficients,
                     coefficients_training_data,
                 ) = self.range_search_reference(
-                    index_desc.index, index_desc.search_params, range_metric
+                    knn_desc.index, knn_desc.search_params, range_metric
                 )
                 results["metrics"][metric_key] = {
                     "coefficients": coefficients,
                     "training_data": coefficients_training_data,
                 }
-                gt_rsm = self.range_ground_truth(
-                    gt_radius, range_search_metric_function
+                knn_desc.gt_rsm = self.range_ground_truth(
+                    knn_desc.gt_radius, range_search_metric_function
+                )
+
+    def create_ground_truths(self, results: Dict[str, Any]):
+        """Create the Flat ground-truth codec, index and knn descriptors for all operators."""
+        # TODO: Create all ground truth descriptors and put them in index descriptor as reference
+        # The create_gt_* helpers may insert at index 0 of the very lists walked here, so
+        # iterate over snapshots; otherwise the current element is visited a second time.
+        if self.train_op is not None:
+            for codec_desc in list(self.train_op.codec_descs):
+                self.create_gt_codec(codec_desc, results)
+        if self.build_op is not None:
+            for index_desc in list(self.build_op.index_descs):
+                self.create_gt_index(index_desc, results)
+        if self.search_op is not None:
+            for knn_desc in list(self.search_op.knn_descs):
+                # The second parameter of create_gt_knn is the `search` flag, not results.
+                self.create_gt_knn(knn_desc)
+                self.create_range_ref_knn(knn_desc)
+
+    def execute(self, results: Dict[str, Any], dry_run: bool = False):
+        """Run the train, build and search operators in order; return (results, requires)."""
+        if self.train_op is not None:
+            for desc in self.train_op.codec_descs:
+                results, requires = self.train_op.train(desc, results, dry_run=dry_run)
+                # In a dry run the first unmet requirement is handed back to the caller.
+                if dry_run:
+                    if requires is None:
+                        continue
+                    return results, requires
+                assert requires is None
+        if self.build_op is not None:
+            for desc in self.build_op.index_descs:
+                self.build_op.build(desc, results)
+        if self.search_op is not None:
+            for desc in self.search_op.knn_descs:
+                results, requires = self.search_one(
+                    knn_desc=desc, results=results, dry_run=dry_run, range=self.search_op.range
+                )
+                if dry_run:
+                    if requires is None:
+                        continue
+                    return results, requires
+                assert requires is None
+        return results, None
+
+    def execute_2(self, result_file=None):
+        """Run all operators for real (no dry run); write the results to result_file if given."""
+        results, requires = self.execute({"indices": {}, "experiments": {}}, dry_run=False)
+        assert requires is None
+        if result_file is not None:
+            self.io.write_json(results, result_file, overwrite=True)
+
+    def add_index_descs(self, codec_desc, index_desc, knn_desc):
+        if codec_desc is not None:
+            self.train_op.codec_descs.append(codec_desc)
+        if index_desc is not None:
+            self.build_op.index_descs.append(index_desc)
+        if knn_desc is not None:
+            self.search_op.knn_descs.append(knn_desc)
+
+
+@dataclass
+class Benchmark:
+    num_threads: int
+    training_vectors: Optional[DatasetDescriptor] = None
+    database_vectors: Optional[DatasetDescriptor] = None
+    query_vectors: Optional[DatasetDescriptor] = None
+    index_descs: Optional[List[IndexDescriptorClassic]] = None
+    range_ref_index_desc: Optional[str] = None
+    k: int = 1
+    distance_metric: str = "L2"
+
+    def set_io(self, benchmark_io):
+        self.io = benchmark_io
+
+    def get_embedding_dimension(self):
+        """Return the vector dimension of the first dataset that is configured.
+
+        Training, database and query vectors are tried in that order.
+        """
+        candidates = (self.training_vectors, self.database_vectors, self.query_vectors)
+        for dataset_desc in candidates:
+            if dataset_desc is not None:
+                return self.io.get_dataset(dataset_desc).shape[1]
+        # None of the three dataset descriptors is set.
+        raise ValueError("Failed to determine dimension of dataset")
+
+    def create_descriptors(
+        self, ci_desc: IndexDescriptorClassic, train, build, knn, reconstruct, range
+    ):
+        codec_desc = None
+        index_desc = None
+        knn_desc = None
+        dim = self.get_embedding_dimension()
+        if train and ci_desc.factory is not None:
+            codec_desc = CodecDescriptor(
+                d=dim,
+                metric=self.distance_metric,
+                num_threads=self.num_threads,
+                factory=ci_desc.factory,
+                construction_params=ci_desc.construction_params,
+                training_vectors=self.training_vectors,
+            )
+        if build:
+            if codec_desc is None:
+                assert ci_desc.path is not None
+                codec_desc = CodecDescriptor(
+                    d=dim,
+                    metric=self.distance_metric,
+                    num_threads=self.num_threads,
+                    bucket=ci_desc.bucket,
+                    path=ci_desc.path,
+                )
+            index_desc = IndexDescriptor(
+                d=codec_desc.d,
+                metric=self.distance_metric,
+                num_threads=self.num_threads,
+                codec_desc=codec_desc,
+                database_desc=self.database_vectors,
+            )
+        if knn or range:
+            if index_desc is None:
+                assert ci_desc.path is not None
+                index_desc = IndexDescriptor(
+                    d=dim,
+                    metric=self.distance_metric,
+                    num_threads=self.num_threads,
+                    bucket=ci_desc.bucket,
+                    path=ci_desc.path,
                 )
+            knn_desc = KnnDescriptor(
+                d=dim,
+                metric=self.distance_metric,
+                num_threads=self.num_threads,
+                index_desc=index_desc,
+                query_dataset=self.query_vectors,
+                search_params=ci_desc.search_params,
+                range_metrics=ci_desc.range_metrics,
+                radius=ci_desc.radius,
+                k=self.k,
+            )
 
-        self.index_descs = list(dict.fromkeys(self.index_descs))
+        return codec_desc, index_desc, knn_desc
+
+    def create_execution_operator(
+        self,
+        train,
+        build,
+        knn,
+        reconstruct,
+        range,
+    ) -> ExecutionOperator:
+        """Build an ExecutionOperator populated from the classic index descriptors."""
+        # All operators are created, as ground truths are always created in benchmarking.
+        train_op = TrainOperator(
+            num_threads=self.num_threads, distance_metric=self.distance_metric
+        )
+        build_op = BuildOperator(
+            num_threads=self.num_threads, distance_metric=self.distance_metric
+        )
+        search_op = SearchOperator(
+            num_threads=self.num_threads, distance_metric=self.distance_metric
+        )
+        search_op.range = range
+        exec_op = ExecutionOperator(
+            # set_io() copies this metric onto the io object, so it must not default to "L2".
+            distance_metric=self.distance_metric,
+            num_threads=self.num_threads,
+            train_op=train_op,
+            build_op=build_op,
+            search_op=search_op,
+        )
+        assert hasattr(self, "io")
+        exec_op.set_io(self.io)
+        # Translate each classic descriptor into codec/index/knn descriptors.
+        for ci_desc in self.index_descs:
+            codec_desc, index_desc, knn_desc = self.create_descriptors(
+                ci_desc, train, build, knn, reconstruct, range
+            )
+            exec_op.add_index_descs(codec_desc, index_desc, knn_desc)
+        return exec_op
+
+    def clone_one(self, index_desc):
+        benchmark = Benchmark(
+            num_threads=self.num_threads,
+            training_vectors=self.training_vectors,
+            database_vectors=self.database_vectors,
+            query_vectors=self.query_vectors,
+            # index_descs=[self.get_flat_desc("Flat"), index_desc],
+            index_descs=[index_desc],  # Should automatically find flat descriptors
+            range_ref_index_desc=self.range_ref_index_desc,
+            k=self.k,
+            distance_metric=self.distance_metric,
+        )
+        benchmark.set_io(self.io.clone())
+        return benchmark
+
+    def benchmark(
+        self,
+        result_file=None,
+        local=False,
+        train=False,
+        reconstruct=False,
+        knn=False,
+        range=False,
+    ):
+        logger.info("begin evaluate")
+        results = {"indices": {}, "experiments": {}}
+        faiss.omp_set_num_threads(self.num_threads)
+        exec_op = self.create_execution_operator(
+            train=train,
+            build=knn or range,
+            knn=knn,
+            reconstruct=reconstruct,
+            range=range,
+        )
+        exec_op.create_ground_truths(results)
 
         todo = self.index_descs
         for index_desc in self.index_descs:
@@ -681,15 +1123,7 @@ def benchmark(
             current_todo = []
             next_todo = []
             for index_desc in todo:
-                results, requires = self.benchmark_one(
-                    dry_run=True,
-                    results=results,
-                    index_desc=index_desc,
-                    train=train,
-                    reconstruct=reconstruct,
-                    knn=knn,
-                    range=range,
-                )
+                results, requires = exec_op.execute(results, dry_run=False)
                 if requires is None:
                     continue
                 if requires in queued:
@@ -731,15 +1165,14 @@ def benchmark(
 def run_benchmark_one(params):
     logger.info(params)
     index_desc, benchmark, results, train, reconstruct, knn, range = params
-    results, requires = benchmark.benchmark_one(
-        dry_run=False,
-        results=results,
-        index_desc=index_desc,
+    exec_op = benchmark.create_execution_operator(
         train=train,
-        reconstruct=reconstruct,
+        build=knn,
         knn=knn,
+        reconstruct=reconstruct,
         range=range,
     )
+    results, requires = exec_op.execute(results=results, dry_run=False)
     assert requires is None
     assert results is not None
     return results
diff --git a/benchs/bench_fw/benchmark_io.py b/benchs/bench_fw/benchmark_io.py
index b39bb60290..5ee3eb3a6a 100644
--- a/benchs/bench_fw/benchmark_io.py
+++ b/benchs/bench_fw/benchmark_io.py
@@ -53,6 +53,7 @@ def clone(self):
     def __post_init__(self):
         self.cached_ds = {}
 
+    # TODO(kuarora): rename it as get_local_file
     def get_local_filename(self, filename):
         if len(filename) > 184:
             fn, ext = os.path.splitext(filename)
@@ -61,6 +62,9 @@ def get_local_filename(self, filename):
             )
         return os.path.join(self.path, filename)
 
+    def get_remote_filepath(self, filename) -> Optional[str]:
+        return None
+
     def download_file_from_blobstore(
         self,
         filename: str,
@@ -219,7 +223,7 @@ def read_index(
         fn = self.download_file_from_blobstore(filename, bucket, path)
         logger.info(f"Loading index {fn}")
         ext = os.path.splitext(fn)[1]
-        if ext in [".faiss", ".codec"]:
+        if ext in [".faiss", ".codec", ".index"]:
             index = faiss.read_index(fn)
         elif ext == ".pkl":
             with open(fn, "rb") as model_file:
diff --git a/benchs/bench_fw/descriptors.py b/benchs/bench_fw/descriptors.py
index 173b07ce16..e76278cedc 100644
--- a/benchs/bench_fw/descriptors.py
+++ b/benchs/bench_fw/descriptors.py
@@ -3,18 +3,21 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-from dataclasses import dataclass
 import logging
+import os
+from dataclasses import dataclass
 from typing import Any, Dict, List, Optional
 
 import faiss  # @manual=//faiss/python:pyfaiss_gpu
+
+from .benchmark_io import BenchmarkIO
 from .utils import timer
 
 logger = logging.getLogger(__name__)
 
 
 @dataclass
-class IndexDescriptor:
+class IndexDescriptorClassic:
     bucket: Optional[str] = None
     # either path or factory should be set,
     # but not both at the same time.
@@ -45,7 +48,6 @@ class IndexDescriptor:
     def __hash__(self):
         return hash(str(self))
 
-
 @dataclass
 class DatasetDescriptor:
     # namespace possible values:
@@ -81,7 +83,7 @@ def __hash__(self):
 
     def get_filename(
         self,
-        prefix: str = None,
+        prefix: Optional[str] = None,
     ) -> str:
         filename = ""
         if prefix is not None:
@@ -116,3 +118,208 @@ def k_means(self, io, k, dry_run):
         else:
             t = io.read_json(meta_filename)["k_means_time"]
         return kmeans_vectors, t, None
+
+@dataclass
+class IndexBaseDescriptor:
+    """Fields and name helpers shared by the codec, index and knn descriptors."""
+
+    d: int
+    metric: str
+    desc_name: Optional[str] = None
+    flat_desc_name: Optional[str] = None
+    bucket: Optional[str] = None
+    path: Optional[str] = None
+    num_threads: int = 1
+
+    def get_name(self) -> str:
+        """Return the descriptor name; subclasses must override this."""
+        raise NotImplementedError()
+
+    def get_path(self, benchmark_io: BenchmarkIO) -> Optional[str]:
+        """Return `path`, asking `benchmark_io` for it (and caching the answer) when unset."""
+        if self.path is None:
+            self.path = benchmark_io.get_remote_filepath(self.desc_name)
+        return self.path
+
+    @staticmethod
+    def param_dict_list_to_name(param_dict_list):
+        """Concatenate the names of all dicts, rendered with prefixes cp0, cp1, ..."""
+        if not param_dict_list:
+            return ""
+        return "".join(
+            IndexBaseDescriptor.param_dict_to_name(params, f"cp{pos}")
+            for pos, params in enumerate(param_dict_list)
+        )
+
+    @staticmethod
+    def param_dict_to_name(param_dict, prefix="sp"):
+        """Render `<prefix>_<key>_<val>...` plus a dot; return "" when nothing is rendered."""
+        if not param_dict:
+            return ""
+        # "snap" is never part of the name; these two are dropped when set to 0.
+        zero_is_silent = ("lsq_gpu", "use_beam_LUT")
+        kept = [
+            f"_{key}_{value}"
+            for key, value in param_dict.items()
+            if key != "snap" and not (key in zero_is_silent and value == 0)
+        ]
+        if not kept:
+            return ""
+        return prefix + "".join(kept) + "."
+
+
+@dataclass
+class CodecDescriptor(IndexBaseDescriptor):
+    """Describes a codec that is defined by a factory string or by a stored file."""
+
+    # either path or factory should be set,
+    # but not both at the same time.
+    factory: Optional[str] = None
+    construction_params: Optional[List[Dict[str, int]]] = None
+    training_vectors: Optional[DatasetDescriptor] = None
+
+    # NOTE: no `path()` method here: the inherited `path` field is set on every instance
+    # and would shadow such a method. The inherited get_path() resolves the location.
+
+    def __post_init__(self):
+        # Resolve (and cache) the name eagerly; raises ValueError when that is impossible.
+        self.get_name()
+
+    def is_trained(self):
+        """True when the codec is given by a path and not by a factory."""
+        return self.factory is None and self.path is not None
+
+    def is_valid(self):
+        """True when at least one of factory or path is set."""
+        return self.factory is not None or self.path is not None
+
+    def get_name(self) -> str:
+        """Return the cached name, deriving it from factory or path on first use."""
+        if self.desc_name is None:
+            if self.factory is not None:
+                self.desc_name = self.name_from_factory()
+            elif self.path is not None:
+                self.desc_name = self.name_from_path()
+            else:
+                raise ValueError("name, factory or path must be set")
+        return self.desc_name
+
+    def flat_name(self) -> str:
+        """Return (and cache) the name of the Flat codec with the same d and metric."""
+        if self.flat_desc_name is None:
+            self.flat_desc_name = f"Flat.d_{self.d}.{self.metric.upper()}."
+        return self.flat_desc_name
+
+    def name_from_factory(self) -> str:
+        """Compose the name from factory, d, metric, training set and construction params."""
+        assert self.factory is not None
+        assert self.d is not None
+        assert self.metric is not None
+        name = f"{self.factory.replace(',', '_')}.d_{self.d}.{self.metric.upper()}."
+        if self.factory != "Flat":
+            assert self.training_vectors is not None
+            name += self.training_vectors.get_filename("xt")
+        return name + IndexBaseDescriptor.param_dict_list_to_name(self.construction_params)
+
+    def name_from_path(self) -> str:
+        """Return the file name of `path` with its extension replaced by a trailing dot."""
+        assert self.path is not None
+        # splitext also copes with file names that have no extension or end in a dot.
+        stem, _ = os.path.splitext(os.path.basename(self.path))
+        return stem + "."
+
+    def alias(self, benchmark_io: BenchmarkIO):
+        """Return a name-only descriptor; it also carries bucket and path if the io has a bucket."""
+        if hasattr(benchmark_io, "bucket"):
+            return CodecDescriptor(desc_name=self.get_name(), bucket=benchmark_io.bucket, path=self.get_path(benchmark_io), d=self.d, metric=self.metric)
+        return CodecDescriptor(desc_name=self.get_name(), d=self.d, metric=self.metric)
+
+
+@dataclass
+class IndexDescriptor(IndexBaseDescriptor):
+    codec_desc: Optional[CodecDescriptor] = None  # unset for a prebuilt index, see is_built()
+    database_desc: Optional[DatasetDescriptor] = None  # unset for a prebuilt index
+
+    def __hash__(self):
+        return hash(str(self))
+
+    def __post_init__(self):
+        self.get_name()  # resolve and cache desc_name eagerly
+
+    def is_built(self):  # True when neither a codec nor a database is attached
+        return self.codec_desc is None and self.database_desc is None
+
+    def get_name(self) -> str:  # a path-only (prebuilt) index is named after its file
+        if self.desc_name is None and self.is_built() and self.path is not None:
+            self.desc_name = os.path.splitext(os.path.basename(self.path))[0] + "."
+        elif self.desc_name is None:
+            self.desc_name = self.codec_desc.get_name() + self.database_desc.get_filename(prefix="xb")
+        return self.desc_name
+
+    def flat_name(self):  # cached; needs both codec_desc and database_desc
+        if self.flat_desc_name is None:
+            self.flat_desc_name = self.codec_desc.flat_name() + self.database_desc.get_filename(prefix="xb")
+        return self.flat_desc_name
+
+    def alias(self, benchmark_io: BenchmarkIO):
+        """Return a name-only descriptor used to refer to the index once it is in the blobstore."""
+        if hasattr(benchmark_io, "bucket"):
+            return IndexDescriptor(desc_name=self.get_name(), bucket=benchmark_io.bucket, path=self.get_path(benchmark_io), d=self.d, metric=self.metric)
+        return IndexDescriptor(desc_name=self.get_name(), d=self.d, metric=self.metric)
+
+@dataclass
+class KnnDescriptor(IndexBaseDescriptor):
+    """Describes one knn (or reconstruct) search run against an index."""
+
+    index_desc: Optional[IndexDescriptor] = None
+    gt_index_desc: Optional[IndexDescriptor] = None
+    query_dataset: Optional[DatasetDescriptor] = None
+    search_params: Optional[Dict[str, int]] = None
+    reconstruct: bool = False
+    # range metric definitions
+    # key: name
+    # value: one of the following:
+    #
+    # radius
+    #    [0..radius) -> 1
+    #    [radius..inf) -> 0
+    #
+    # [[radius1, score1], ...]
+    #    [0..radius1) -> score1
+    #    [radius1..radius2) -> score2
+    #
+    # [[radius1_from, radius1_to, score1], ...]
+    #    [radius1_from, radius1_to) -> score1,
+    #    [radius2_from, radius2_to) -> score2
+    range_metrics: Optional[Dict[str, Any]] = None
+    radius: Optional[float] = None
+    k: int = 1
+
+    range_ref_index_desc: Optional[str] = None
+
+    def __hash__(self):
+        return hash(str(self))
+
+    def _query_suffix(self) -> str:
+        """Name fragment shared by get_name and flat_name: query file, k, threads, mode."""
+        # "rec." marks reconstruct runs, "knn." everything else.
+        mode = "rec." if self.reconstruct else "knn."
+        return (
+            self.query_dataset.get_filename("q")
+            + f"k_{self.k}.t_{self.num_threads}."
+            + mode
+        )
+
+    def get_name(self):
+        """Return index name + search-parameter fragment + query/k/threads/mode suffix."""
+        # Not cached: the name is recomputed on every call.
+        prefix = self.index_desc.get_name()
+        prefix += IndexBaseDescriptor.param_dict_to_name(self.search_params)
+        return prefix + self._query_suffix()
+
+    def flat_name(self):
+        """Return (and cache) the flat name of the index followed by the query suffix."""
+        # Unlike get_name(), no search-parameter fragment is included.
+        if self.flat_desc_name is None:
+            self.flat_desc_name = self.index_desc.flat_name() + self._query_suffix()
+        return self.flat_desc_name
diff --git a/benchs/bench_fw/index.py b/benchs/bench_fw/index.py
index 3deaa4afcf..6b6c2d93af 100644
--- a/benchs/bench_fw/index.py
+++ b/benchs/bench_fw/index.py
@@ -13,6 +13,7 @@
 
 import faiss  # @manual=//faiss/python:pyfaiss_gpu
 import numpy as np
+from faiss.benchs.bench_fw.descriptors import IndexBaseDescriptor
 
 from faiss.contrib.evaluation import (  # @manual=//faiss/contrib:faiss_contrib_gpu
     knn_intersection_measure,
@@ -49,35 +50,6 @@ class IndexBase:
     def set_io(self, benchmark_io):
         self.io = benchmark_io
 
-    @staticmethod
-    def param_dict_list_to_name(param_dict_list):
-        if not param_dict_list:
-            return ""
-        l = 0
-        n = ""
-        for param_dict in param_dict_list:
-            n += IndexBase.param_dict_to_name(param_dict, f"cp{l}")
-            l += 1
-        return n
-
-    @staticmethod
-    def param_dict_to_name(param_dict, prefix="sp"):
-        if not param_dict:
-            return ""
-        n = prefix
-        for name, val in param_dict.items():
-            if name == "snap":
-                continue
-            if name == "lsq_gpu" and val == 0:
-                continue
-            if name == "use_beam_LUT" and val == 0:
-                continue
-            n += f"_{name}_{val}"
-        if n == prefix:
-            return ""
-        n += "."
-        return n
-
     @staticmethod
     def set_index_param_dict_list(index, param_dict_list, assert_same=False):
         if not param_dict_list:
@@ -282,7 +254,7 @@ def get_knn_search_name(
         reconstruct: bool = False,
     ):
         name = self.get_index_name()
-        name += Index.param_dict_to_name(search_parameters)
+        name += IndexBaseDescriptor.param_dict_to_name(search_parameters)
         name += query_vectors.get_filename("q")
         name += f"k_{k}."
         name += f"t_{self.num_threads}."
@@ -582,14 +554,21 @@ class Index(IndexBase):
     num_threads: int
     d: int
     metric: str
-    database_vectors: DatasetDescriptor
-    construction_params: List[Dict[str, int]]
-    search_params: Dict[str, int]
+    codec_name: Optional[str] = None
+    index_name: Optional[str] = None
+    database_vectors: Optional[DatasetDescriptor] = None
+    construction_params: Optional[List[Dict[str, int]]] = None
+    search_params: Optional[Dict[str, int]] = None
+    serialize_full_index: bool = False
+
+    bucket: Optional[str] = None
+    index_path: Optional[str] = None
 
     cached_codec: ClassVar[OrderedDict[str, faiss.Index]] = OrderedDict()
     cached_index: ClassVar[OrderedDict[str, faiss.Index]] = OrderedDict()
 
     def __post_init__(self):
+        logger.info(f"Initializing metric_type to {self.metric}")
         if isinstance(self.metric, str):
             if self.metric == "IP":
                 self.metric_type = faiss.METRIC_INNER_PRODUCT
@@ -628,13 +607,31 @@ def get_codec(self):
                 Index.cached_codec.popitem(last=False)
         return Index.cached_codec[codec_name]
 
-    def get_index_name(self):
-        name = self.get_codec_name()
-        assert self.database_vectors is not None
-        name += self.database_vectors.get_filename("xb")
-        return name
+    def get_codec_name(self) -> Optional[str]:
+        return self.codec_name
+
+    def get_index_name(self) -> Optional[str]:
+        return self.index_name
 
     def fetch_index(self):
+        # read index from file if it is already available
+        if self.index_path:
+            index_filename = os.path.basename(self.index_path)
+        else:
+            index_filename = self.index_name + "index"
+        if self.io.file_exist(index_filename):
+            if self.index_path:
+                index = self.io.read_index(
+                    index_filename,
+                    self.bucket,
+                    os.path.dirname(self.index_path),
+                )
+            else:
+                index = self.io.read_index(index_filename)
+            assert self.d == index.d
+            assert self.metric_type == index.metric_type
+            return index, 0
+
         index = self.get_codec()
         index.reset()
         assert index.ntotal == 0
@@ -664,10 +661,15 @@ def fetch_index(self):
             )
         assert index.ntotal == xb.shape[0] or index_ivf.ntotal == xb.shape[0]
         logger.info("Added vectors to index")
+        if self.serialize_full_index:
+            codec_size = self.io.write_index(index, index_filename)
+            assert codec_size is not None
+
         return index, t
 
     def get_index(self):
-        index_name = self.get_index_name()
+        index_name = self.index_name
+        # TODO(kuarora) : retrieve file from bucket and path.
         if index_name not in Index.cached_index:
             Index.cached_index[index_name], _ = self.fetch_index()
             if len(Index.cached_index) > 3:
@@ -784,8 +786,12 @@ def is_flat_index(self):
 # are used to wrap pre-trained Faiss indices (codecs)
 @dataclass
 class IndexFromCodec(Index):
-    path: str
-    bucket: Optional[str] = None
+    path: Optional[str] = None
+
+    def __post_init__(self):
+        super().__post_init__()
+        if self.path is None:
+            raise ValueError("path is not set")
 
     def get_quantizer(self):
         if not self.is_ivf():
@@ -804,12 +810,6 @@ def get_pretransform(self):
     def get_model_name(self):
         return os.path.basename(self.path)
 
-    def get_codec_name(self):
-        assert self.path is not None
-        name = os.path.basename(self.path)
-        name += Index.param_dict_list_to_name(self.construction_params)
-        return name
-
     def fetch_meta(self, dry_run=False):
         return None, None
 
@@ -871,20 +871,15 @@ def get_codec(self):
 # IndexFromFactory is for creating and training indices from scratch
 @dataclass
 class IndexFromFactory(Index):
-    factory: str
-    training_vectors: DatasetDescriptor
+    factory: Optional[str] = None
+    training_vectors: Optional[DatasetDescriptor] = None
 
-    def get_codec_name(self):
-        assert self.factory is not None
-        name = f"{self.factory.replace(',', '_')}."
-        assert self.d is not None
-        assert self.metric is not None
-        name += f"d_{self.d}.{self.metric.upper()}."
-        if self.factory != "Flat":
-            assert self.training_vectors is not None
-            name += self.training_vectors.get_filename("xt")
-        name += Index.param_dict_list_to_name(self.construction_params)
-        return name
+    def __post_init__(self):
+        super().__post_init__()
+        if self.factory is None:
+            raise ValueError("factory is not set")
+        if self.factory != "Flat" and self.training_vectors is None:
+            raise ValueError(f"training_vectors is not set for {self.factory}")
 
     def fetch_meta(self, dry_run=False):
         meta_filename = self.get_codec_name() + "json"
diff --git a/benchs/bench_fw/optimize.py b/benchs/bench_fw/optimize.py
index a2653b7144..b3d62980c3 100644
--- a/benchs/bench_fw/optimize.py
+++ b/benchs/bench_fw/optimize.py
@@ -14,7 +14,7 @@
 # )
 
 from .benchmark import Benchmark
-from .descriptors import DatasetDescriptor, IndexDescriptor
+from .descriptors import DatasetDescriptor, IndexDescriptorClassic
 from .utils import dict_merge, filter_results, ParetoMetric, ParetoMode
 
 logger = logging.getLogger(__name__)
@@ -78,7 +78,7 @@ def benchmark_and_filter_candidates(
         )
         assert filtered
         index_descs = [
-            IndexDescriptor(
+            IndexDescriptorClassic(
                 factory=v["factory"],
                 construction_params=v["construction_params"],
                 search_params=v["search_params"],
@@ -103,8 +103,8 @@ def optimize_quantizer(
                 dry_run=False,
             )
 
-            descs = [IndexDescriptor(factory="Flat"),] + [
-                IndexDescriptor(
+            descs = [IndexDescriptorClassic(factory="Flat"),] + [
+                IndexDescriptorClassic(
                     factory="HNSW32",
                     construction_params=[{"efConstruction": 2**i}],
                 )
@@ -131,7 +131,7 @@ def optimize_ivf(
         training_vectors: DatasetDescriptor,
         database_vectors: DatasetDescriptor,
         query_vectors: DatasetDescriptor,
-        quantizers: Dict[int, List[IndexDescriptor]],
+        quantizers: Dict[int, List[IndexDescriptorClassic]],
         codecs: List[Tuple[str, str]],
         min_accuracy: float,
     ):
@@ -159,7 +159,7 @@ def optimize_ivf(
                                 quantizer_desc.search_params,
                             )
                     ivf_descs.append(
-                        IndexDescriptor(
+                        IndexDescriptorClassic(
                             factory=f"{pretransform}IVF{nlist}({quantizer_desc.factory}),{fine_ivf}",
                             construction_params=construction_params,
                         )
@@ -188,7 +188,7 @@ def ivf_flat_nprobe_required_for_accuracy(
     ):
         _, results = self.benchmark_and_filter_candidates(
             index_descs=[
-                IndexDescriptor(factory=f"IVF{nlist}(Flat),Flat"),
+                IndexDescriptorClassic(factory=f"IVF{nlist}(Flat),Flat"),
             ],
             training_vectors=training_vectors,
             database_vectors=database_vectors,
@@ -255,7 +255,7 @@ def optimize_codec(
 
         _, filtered = self.benchmark_and_filter_candidates(
             index_descs=[
-                IndexDescriptor(
+                IndexDescriptorClassic(
                     factory=f"IVF{nlist},{pq}"
                     if opq is None
                     else f"{opq},IVF{nlist},{pq}",
diff --git a/benchs/bench_fw_codecs.py b/benchs/bench_fw_codecs.py
index 80741e23f7..d3efc2da0f 100644
--- a/benchs/bench_fw_codecs.py
+++ b/benchs/bench_fw_codecs.py
@@ -7,10 +7,10 @@
 import argparse
 import os
 
-from bench_fw.benchmark import Benchmark
-from bench_fw.benchmark_io import BenchmarkIO
-from bench_fw.descriptors import DatasetDescriptor, IndexDescriptor
-from bench_fw.index import IndexFromFactory
+from faiss.benchs.bench_fw.benchmark import Benchmark
+from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO
+from faiss.benchs.bench_fw.descriptors import DatasetDescriptor, IndexDescriptorClassic
+from faiss.benchs.bench_fw.index import IndexFromFactory
 
 logging.basicConfig(level=logging.INFO)
 
@@ -107,7 +107,7 @@ def run_local(rp):
         database_vectors=database_vectors,
         query_vectors=query_vectors,
         index_descs=[
-            IndexDescriptor(
+            IndexDescriptorClassic(
                 factory=factory,
                 construction_params=construction_params,
                 training_size=training_size,
diff --git a/benchs/bench_fw_ivf.py b/benchs/bench_fw_ivf.py
index e9e144c569..b0c108b7de 100644
--- a/benchs/bench_fw_ivf.py
+++ b/benchs/bench_fw_ivf.py
@@ -11,7 +11,7 @@
 from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO
 from faiss.benchs.bench_fw.descriptors import (
     DatasetDescriptor,
-    IndexDescriptor,
+    IndexDescriptorClassic,
 )
 
 logging.basicConfig(level=logging.INFO)
@@ -30,7 +30,7 @@ def sift1M(bio):
             namespace="std_q", tablename="sift1M"
         ),
         index_descs=[
-            IndexDescriptor(
+            IndexDescriptorClassic(
                 factory=f"IVF{2 ** nlist},Flat",
             )
             for nlist in range(8, 15)
@@ -38,8 +38,8 @@ def sift1M(bio):
         k=1,
         distance_metric="L2",
     )
-    benchmark.set_io(bio)
-    benchmark.benchmark(result_file="result.json", local=False, train=True, reconstruct=False, knn=True, range=False)
+    benchmark.io = bio
+    benchmark.benchmark(result_file="result.json", local=True, train=True, reconstruct=False, knn=True, range=False)
 
 
 def bigann(bio):
@@ -56,11 +56,11 @@ def bigann(bio):
                 namespace="std_q", tablename="bigann1M"
             ),
             index_descs=[
-                IndexDescriptor(
+                IndexDescriptorClassic(
                     factory=f"IVF{2 ** nlist},Flat",
                 ) for nlist in range(11, 19)
             ] + [
-                IndexDescriptor(
+                IndexDescriptorClassic(
                     factory=f"IVF{2 ** nlist}_HNSW32,Flat",
                     construction_params=[None, {"efConstruction": 200, "efSearch": 40}],
                 ) for nlist in range(11, 19)
@@ -84,18 +84,18 @@ def ssnpp(bio):
             tablename="ssnpp_queries_10K.npy"
         ),
         index_descs=[
-            IndexDescriptor(
+            IndexDescriptorClassic(
                 factory=f"IVF{2 ** nlist},PQ256x4fs,Refine(SQfp16)",
             ) for nlist in range(9, 16)
         ] + [
-            IndexDescriptor(
+            IndexDescriptorClassic(
                 factory=f"IVF{2 ** nlist},Flat",
             ) for nlist in range(9, 16)
         ] + [
-            IndexDescriptor(
+            IndexDescriptorClassic(
                 factory=f"PQ256x4fs,Refine(SQfp16)",
             ),
-            IndexDescriptor(
+            IndexDescriptorClassic(
                 factory=f"HNSW32",
             ),
         ],
diff --git a/benchs/bench_fw_optimize.py b/benchs/bench_fw_optimize.py
index 31b56f9f51..11e625e23c 100644
--- a/benchs/bench_fw_optimize.py
+++ b/benchs/bench_fw_optimize.py
@@ -7,9 +7,9 @@
 import logging
 import os
 
-from bench_fw.benchmark_io import BenchmarkIO
-from bench_fw.descriptors import DatasetDescriptor
-from bench_fw.optimize import Optimizer
+from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO
+from faiss.benchs.bench_fw.descriptors import DatasetDescriptor
+from faiss.benchs.bench_fw.optimize import Optimizer
 
 logging.basicConfig(level=logging.INFO)
 
diff --git a/benchs/bench_fw_range.py b/benchs/bench_fw_range.py
index f38de114f9..0d4b65afa6 100644
--- a/benchs/bench_fw_range.py
+++ b/benchs/bench_fw_range.py
@@ -3,28 +3,29 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-import logging
 import argparse
+import logging
 import os
 
-from bench_fw.benchmark import Benchmark
-from bench_fw.benchmark_io import BenchmarkIO
-from bench_fw.descriptors import DatasetDescriptor, IndexDescriptor
+from faiss.benchs.bench_fw.benchmark import Benchmark
+from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO
+from faiss.benchs.bench_fw.descriptors import DatasetDescriptor, IndexDescriptorClassic
 
 logging.basicConfig(level=logging.INFO)
 
+
 def ssnpp(bio):
     benchmark = Benchmark(
         num_threads=32,
         training_vectors=DatasetDescriptor(
-            tablename="ssnpp_training_5M.npy",
+            tablename="training.npy",
         ),
         database_vectors=DatasetDescriptor(
-            tablename="ssnpp_xb_range_filtered_119201.npy",
+            tablename="database.npy",
         ),
-        query_vectors=DatasetDescriptor(tablename="ssnpp_xq_range_filtered_33615.npy"),
+        query_vectors=DatasetDescriptor(tablename="query.npy"),
         index_descs=[
-            IndexDescriptor(
+            IndexDescriptorClassic(
                 factory="Flat",
                 range_metrics={
                     "weighted": [
@@ -56,7 +57,7 @@ def ssnpp(bio):
                     ]
                 },
             ),
-            IndexDescriptor(
+            IndexDescriptorClassic(
                 factory="IVF262144(PQ256x4fs),PQ32",
             ),
         ],
@@ -67,6 +68,7 @@ def ssnpp(bio):
     benchmark.set_io(bio)
     benchmark.benchmark("result.json", local=False, train=True, reconstruct=False, knn=False, range=True)
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument('experiment')

From 8d2440472c5a8416d1789915e9762a8db9c8d5c2 Mon Sep 17 00:00:00 2001
From: Naveen Tatikonda <navtat@amazon.com>
Date: Mon, 24 Jun 2024 05:11:53 -0700
Subject: [PATCH 065/148] Add SQ8bit signed quantization (#3501)

Summary:
### Description
Add a new signed 8-bit scalar quantizer, `QT_8bit_direct_signed`, to ingest signed 8-bit vectors (values in the range [-128, 127]).

### Issues Resolved
https://github.com/facebookresearch/faiss/issues/3488

Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3501

Reviewed By: mengdilin

Differential Revision: D58639363

Pulled By: mdouze

fbshipit-source-id: cf7f244fdbb7a34051d2b20c6f8086cd5628b4e0
---
 benchs/bench_fw/optimize.py    |   1 +
 c_api/IndexScalarQuantizer_c.h |   2 +
 faiss/IndexScalarQuantizer.cpp |   3 +-
 faiss/impl/ScalarQuantizer.cpp | 103 +++++++++++++++++++++++++++++++--
 faiss/impl/ScalarQuantizer.h   |   2 +
 faiss/index_factory.cpp        |   5 +-
 tests/test_index_accuracy.py   |  31 +++++++---
 7 files changed, 130 insertions(+), 17 deletions(-)

diff --git a/benchs/bench_fw/optimize.py b/benchs/bench_fw/optimize.py
index b3d62980c3..ac6c45ab0c 100644
--- a/benchs/bench_fw/optimize.py
+++ b/benchs/bench_fw/optimize.py
@@ -228,6 +228,7 @@ def optimize_codec(
                 (None, "SQfp16"),
                 (None, "SQbf16"),
                 (None, "SQ8"),
+                (None, "SQ8_direct_signed"),
             ] + [
                 (f"OPQ{M}_{M * dim}", f"PQ{M}x{b}")
                 for M in [8, 12, 16, 32, 48, 64, 96, 128, 192, 256]
diff --git a/c_api/IndexScalarQuantizer_c.h b/c_api/IndexScalarQuantizer_c.h
index 87fe6d3415..55a2676d22 100644
--- a/c_api/IndexScalarQuantizer_c.h
+++ b/c_api/IndexScalarQuantizer_c.h
@@ -27,6 +27,8 @@ typedef enum FaissQuantizerType {
     QT_8bit_direct, ///< fast indexing of uint8s
     QT_6bit,        ///< 6 bits per component
     QT_bf16,
+    QT_8bit_direct_signed, ///< fast indexing of signed int8s ranging from [-128
+                           ///< to 127]
 } FaissQuantizerType;
 
 // forward declaration
diff --git a/faiss/IndexScalarQuantizer.cpp b/faiss/IndexScalarQuantizer.cpp
index 7ce838db5e..44a628f000 100644
--- a/faiss/IndexScalarQuantizer.cpp
+++ b/faiss/IndexScalarQuantizer.cpp
@@ -33,7 +33,8 @@ IndexScalarQuantizer::IndexScalarQuantizer(
         : IndexFlatCodes(0, d, metric), sq(d, qtype) {
     is_trained = qtype == ScalarQuantizer::QT_fp16 ||
             qtype == ScalarQuantizer::QT_8bit_direct ||
-            qtype == ScalarQuantizer::QT_bf16;
+            qtype == ScalarQuantizer::QT_bf16 ||
+            qtype == ScalarQuantizer::QT_8bit_direct_signed;
     code_size = sq.code_size;
 }
 
diff --git a/faiss/impl/ScalarQuantizer.cpp b/faiss/impl/ScalarQuantizer.cpp
index 7ad50189e4..528843f606 100644
--- a/faiss/impl/ScalarQuantizer.cpp
+++ b/faiss/impl/ScalarQuantizer.cpp
@@ -621,13 +621,90 @@ struct Quantizer8bitDirect<8> : Quantizer8bitDirect<1> {
 
     FAISS_ALWAYS_INLINE float32x4x2_t
     reconstruct_8_components(const uint8_t* code, int i) const {
-        float32_t result[8] = {};
-        for (size_t j = 0; j < 8; j++) {
-            result[j] = code[i + j];
+        uint8x8_t x8 = vld1_u8((const uint8_t*)(code + i));
+        uint16x8_t y8 = vmovl_u8(x8);
+        uint16x4_t y8_0 = vget_low_u16(y8);
+        uint16x4_t y8_1 = vget_high_u16(y8);
+
+        // convert uint16 -> uint32 -> fp32
+        return {vcvtq_f32_u32(vmovl_u16(y8_0)), vcvtq_f32_u32(vmovl_u16(y8_1))};
+    }
+};
+
+#endif
+
+/*******************************************************************
+ * 8bit_direct_signed quantizer
+ *******************************************************************/
+
+template <int SIMDWIDTH>
+struct Quantizer8bitDirectSigned {};
+
+template <>
+struct Quantizer8bitDirectSigned<1> : ScalarQuantizer::SQuantizer {
+    const size_t d;
+
+    Quantizer8bitDirectSigned(size_t d, const std::vector<float>& /* unused */)
+            : d(d) {}
+
+    void encode_vector(const float* x, uint8_t* code) const final {
+        for (size_t i = 0; i < d; i++) {
+            code[i] = (uint8_t)(x[i] + 128);
         }
-        float32x4_t res1 = vld1q_f32(result);
-        float32x4_t res2 = vld1q_f32(result + 4);
-        return {res1, res2};
+    }
+
+    void decode_vector(const uint8_t* code, float* x) const final {
+        for (size_t i = 0; i < d; i++) {
+            x[i] = code[i] - 128;
+        }
+    }
+
+    FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i)
+            const {
+        return code[i] - 128;
+    }
+};
+
+#ifdef __AVX2__
+
+template <>
+struct Quantizer8bitDirectSigned<8> : Quantizer8bitDirectSigned<1> {
+    Quantizer8bitDirectSigned(size_t d, const std::vector<float>& trained)
+            : Quantizer8bitDirectSigned<1>(d, trained) {}
+
+    FAISS_ALWAYS_INLINE __m256
+    reconstruct_8_components(const uint8_t* code, int i) const {
+        __m128i x8 = _mm_loadl_epi64((__m128i*)(code + i)); // 8 * int8
+        __m256i y8 = _mm256_cvtepu8_epi32(x8);              // 8 * int32
+        __m256i c8 = _mm256_set1_epi32(128);
+        __m256i z8 = _mm256_sub_epi32(y8, c8); // subtract 128 from all lanes
+        return _mm256_cvtepi32_ps(z8);         // 8 * float32
+    }
+};
+
+#endif
+
+#ifdef __aarch64__
+
+template <>
+struct Quantizer8bitDirectSigned<8> : Quantizer8bitDirectSigned<1> {
+    Quantizer8bitDirectSigned(size_t d, const std::vector<float>& trained)
+            : Quantizer8bitDirectSigned<1>(d, trained) {}
+
+    FAISS_ALWAYS_INLINE float32x4x2_t
+    reconstruct_8_components(const uint8_t* code, int i) const {
+        uint8x8_t x8 = vld1_u8((const uint8_t*)(code + i));
+        uint16x8_t y8 = vmovl_u8(x8); // convert uint8 -> uint16
+        uint16x4_t y8_0 = vget_low_u16(y8);
+        uint16x4_t y8_1 = vget_high_u16(y8);
+
+        float32x4_t z8_0 = vcvtq_f32_u32(
+                vmovl_u16(y8_0)); // convert uint16 -> uint32 -> fp32
+        float32x4_t z8_1 = vcvtq_f32_u32(vmovl_u16(y8_1));
+
+        // subtract 128 to convert into signed numbers
+        return {vsubq_f32(z8_0, vmovq_n_f32(128.0)),
+                vsubq_f32(z8_1, vmovq_n_f32(128.0))};
     }
 };
 
@@ -660,6 +737,8 @@ ScalarQuantizer::SQuantizer* select_quantizer_1(
             return new QuantizerBF16<SIMDWIDTH>(d, trained);
         case ScalarQuantizer::QT_8bit_direct:
             return new Quantizer8bitDirect<SIMDWIDTH>(d, trained);
+        case ScalarQuantizer::QT_8bit_direct_signed:
+            return new Quantizer8bitDirectSigned<SIMDWIDTH>(d, trained);
     }
     FAISS_THROW_MSG("unknown qtype");
 }
@@ -1460,6 +1539,11 @@ SQDistanceComputer* select_distance_computer(
                         Sim,
                         SIMDWIDTH>(d, trained);
             }
+        case ScalarQuantizer::QT_8bit_direct_signed:
+            return new DCTemplate<
+                    Quantizer8bitDirectSigned<SIMDWIDTH>,
+                    Sim,
+                    SIMDWIDTH>(d, trained);
     }
     FAISS_THROW_MSG("unknown qtype");
     return nullptr;
@@ -1483,6 +1567,7 @@ void ScalarQuantizer::set_derived_sizes() {
         case QT_8bit:
         case QT_8bit_uniform:
         case QT_8bit_direct:
+        case QT_8bit_direct_signed:
             code_size = d;
             bits = 8;
             break;
@@ -1540,6 +1625,7 @@ void ScalarQuantizer::train(size_t n, const float* x) {
         case QT_fp16:
         case QT_8bit_direct:
         case QT_bf16:
+        case QT_8bit_direct_signed:
             // no training necessary
             break;
     }
@@ -1885,6 +1971,11 @@ InvertedListScanner* sel1_InvertedListScanner(
                         Similarity,
                         SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r);
             }
+        case ScalarQuantizer::QT_8bit_direct_signed:
+            return sel2_InvertedListScanner<DCTemplate<
+                    Quantizer8bitDirectSigned<SIMDWIDTH>,
+                    Similarity,
+                    SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r);
     }
 
     FAISS_THROW_MSG("unknown qtype");
diff --git a/faiss/impl/ScalarQuantizer.h b/faiss/impl/ScalarQuantizer.h
index 49fd42cc31..904e6f6b60 100644
--- a/faiss/impl/ScalarQuantizer.h
+++ b/faiss/impl/ScalarQuantizer.h
@@ -33,6 +33,8 @@ struct ScalarQuantizer : Quantizer {
         QT_8bit_direct, ///< fast indexing of uint8s
         QT_6bit,        ///< 6 bits per component
         QT_bf16,
+        QT_8bit_direct_signed, ///< fast indexing of signed int8s ranging from
+                               ///< [-128 to 127]
     };
 
     QuantizerType qtype = QT_8bit;
diff --git a/faiss/index_factory.cpp b/faiss/index_factory.cpp
index d88fe7b393..564a164e79 100644
--- a/faiss/index_factory.cpp
+++ b/faiss/index_factory.cpp
@@ -141,8 +141,11 @@ std::map<std::string, ScalarQuantizer::QuantizerType> sq_types = {
         {"SQ6", ScalarQuantizer::QT_6bit},
         {"SQfp16", ScalarQuantizer::QT_fp16},
         {"SQbf16", ScalarQuantizer::QT_bf16},
+        {"SQ8_direct_signed", ScalarQuantizer::QT_8bit_direct_signed},
+        {"SQ8_direct", ScalarQuantizer::QT_8bit_direct},
 };
-const std::string sq_pattern = "(SQ4|SQ8|SQ6|SQfp16|SQbf16)";
+const std::string sq_pattern =
+        "(SQ4|SQ8|SQ6|SQfp16|SQbf16|SQ8_direct_signed|SQ8_direct)";
 
 std::map<std::string, AdditiveQuantizer::Search_type_t> aq_search_type = {
         {"_Nfloat", AdditiveQuantizer::ST_norm_float},
diff --git a/tests/test_index_accuracy.py b/tests/test_index_accuracy.py
index 8d8b4a28f6..2c5cf7b901 100644
--- a/tests/test_index_accuracy.py
+++ b/tests/test_index_accuracy.py
@@ -312,7 +312,7 @@ def test_parallel_mode(self):
 
 
 class TestSQByte(unittest.TestCase):
-    def subtest_8bit_direct(self, metric_type, d):
+    def subtest_8bit_direct(self, metric_type, d, quantizer_type):
         xt, xb, xq = get_dataset_2(d, 500, 1000, 30)
 
         # rescale everything to get integer
@@ -324,16 +324,28 @@ def rescale(x):
             x[x > 255] = 255
             return x
 
-        xt = rescale(xt)
-        xb = rescale(xb)
-        xq = rescale(xq)
+        def rescale_signed(x):
+            x = np.floor((x - tmin) * 256 / (tmax - tmin))
+            x[x < 0] = 0
+            x[x > 255] = 255
+            x -= 128
+            return x
+
+        if quantizer_type == faiss.ScalarQuantizer.QT_8bit_direct_signed:
+            xt = rescale_signed(xt)
+            xb = rescale_signed(xb)
+            xq = rescale_signed(xq)
+        else:
+            xt = rescale(xt)
+            xb = rescale(xb)
+            xq = rescale(xq)
 
         gt_index = faiss.IndexFlat(d, metric_type)
         gt_index.add(xb)
         Dref, Iref = gt_index.search(xq, 10)
 
         index = faiss.IndexScalarQuantizer(
-            d, faiss.ScalarQuantizer.QT_8bit_direct, metric_type
+            d, quantizer_type, metric_type
         )
         index.add(xb)
         D, I = index.search(xq, 10)
@@ -353,7 +365,7 @@ def rescale(x):
         Dref, Iref = gt_index.search(xq, 10)
 
         index = faiss.IndexIVFScalarQuantizer(
-            quantizer, d, nlist, faiss.ScalarQuantizer.QT_8bit_direct,
+            quantizer, d, nlist, quantizer_type,
             metric_type
         )
         index.nprobe = 4
@@ -366,9 +378,10 @@ def rescale(x):
         assert np.all(D == Dref)
 
     def test_8bit_direct(self):
-        for d in 13, 16, 24:
-            for metric_type in faiss.METRIC_L2, faiss.METRIC_INNER_PRODUCT:
-                self.subtest_8bit_direct(metric_type, d)
+        for quantizer in faiss.ScalarQuantizer.QT_8bit_direct, faiss.ScalarQuantizer.QT_8bit_direct_signed:
+            for d in 13, 16, 24:
+                for metric_type in faiss.METRIC_L2, faiss.METRIC_INNER_PRODUCT:
+                    self.subtest_8bit_direct(metric_type, d, quantizer)
 
 
 class TestNNDescent(unittest.TestCase):

From 702c3d243b3073ae8a30c0d42caf53d54e3b61ef Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 25 Jun 2024 10:52:40 -0700
Subject: [PATCH 066/148] rm build.sh

---
 .circleci/config.yml                   |  2 +-
 .github/actions/build_cmake/action.yml |  2 +-
 build.sh                               | 58 --------------------------
 3 files changed, 2 insertions(+), 60 deletions(-)
 delete mode 100755 build.sh

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 0330939153..48015ffc37 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -48,7 +48,7 @@ jobs:
                   -DBUILD_TESTING=ON \
                   -DBUILD_SHARED_LIBS=ON \
                   -DFAISS_ENABLE_GPU=OFF \
-                  -DFAISS_ENABLE_RAFT=OFF \
+                  -DFAISS_ENABLE_CUVS=OFF \
                   -DFAISS_OPT_LEVEL=<< parameters.opt_level >> \
                   -DFAISS_ENABLE_C_API=ON \
                   -DPYTHON_EXECUTABLE=$(which python) \
diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index 2bc476add5..222bc79619 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -58,7 +58,7 @@ runs:
               -DBUILD_TESTING=ON \
               -DBUILD_SHARED_LIBS=ON \
               -DFAISS_ENABLE_GPU=${{ inputs.gpu }} \
-              -DFAISS_ENABLE_RAFT=${{ inputs.raft }} \
+              -DFAISS_ENABLE_CUVS=${{ inputs.raft }} \
               -DFAISS_OPT_LEVEL=${{ inputs.opt_level }} \
               -DFAISS_ENABLE_C_API=ON \
               -DPYTHON_EXECUTABLE=$CONDA/bin/python \
diff --git a/build.sh b/build.sh
deleted file mode 100755
index eaa767f2af..0000000000
--- a/build.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-
-# NOTE: This file is temporary for the proof-of-concept branch and will be removed before this PR is merged
-
-BUILD_TYPE=Release
-BUILD_DIR=build/
-
-RAFT_REPO_REL=""
-EXTRA_CMAKE_ARGS=""
-set -e
-
-if [[ ${RAFT_REPO_REL} != "" ]]; then
-  RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`"
-  EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCPM_raft_SOURCE=${RAFT_REPO_PATH}"
-fi
-
-if [ "$1" == "clean" ]; then
-  rm -rf build
-  rm -rf .cache
-  exit 0
-fi
-
-if [ "$1" == "test" ]; then
-  make -C build -j test
-  exit 0
-fi
-
-if [ "$1" == "test-raft" ]; then
-  ./build/faiss/gpu/test/TestRaftIndexIVFFlat
-  exit 0
-fi
-
-mkdir -p $BUILD_DIR
-cd $BUILD_DIR
-
-cmake \
- -DFAISS_ENABLE_GPU=ON \
- -DFAISS_ENABLE_CUVS=ON \
- -DFAISS_ENABLE_PYTHON=OFF \
- -DBUILD_TESTING=ON \
- -DBUILD_SHARED_LIBS=OFF \
- -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
- -DFAISS_OPT_LEVEL=avx2 \
- -DRAFT_NVTX=OFF \
- -DCMAKE_CUDA_ARCHITECTURES="NATIVE" \
- -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
- -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
- -DCMAKE_C_COMPILER_LAUNCHER=ccache \
- -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
- ${EXTRA_CMAKE_ARGS} \
- ../
-
-
-# make -C build -j12 faiss
-cmake  --build . -j12
-# make -C build -j12 swigfaiss
-# (cd build/faiss/python && python setup.py install)
-

From a91876895b0cfdd4b83f5abd6663bc3f194acd82 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Fri, 5 Jul 2024 08:09:50 -0700
Subject: [PATCH 067/148] remove compilation issues

---
 build.sh                                      |  58 +++++++
 .../build-lib.sh                              |   0
 .../build-pkg.sh                              |   0
 conda/faiss-gpu-cuvs/meta.yaml                | 125 +++++++++++++++
 .../test_cpu_dispatch.sh                      |   0
 conda/faiss-gpu-raft/meta.yaml                |   4 +-
 faiss/gpu/GpuIndexCagra.cu                    |  28 +++-
 faiss/gpu/GpuIndexCagra.h                     |   4 +-
 faiss/gpu/GpuIndexIVFFlat.cu                  |   2 +-
 faiss/gpu/GpuIndexIVFPQ.cu                    |   2 +-
 faiss/gpu/impl/CuvsCagra.cu                   | 144 ++++++++++--------
 faiss/gpu/impl/CuvsCagra.cuh                  |  23 ++-
 faiss/gpu/impl/CuvsIVFFlat.cu                 |  33 ++--
 faiss/gpu/impl/CuvsIVFFlat.cuh                |   2 +-
 faiss/gpu/impl/CuvsIVFPQ.cu                   |  50 +++---
 faiss/gpu/impl/CuvsIVFPQ.cuh                  |   2 +-
 faiss/gpu/test/CMakeLists.txt                 |   6 +-
 17 files changed, 356 insertions(+), 127 deletions(-)
 create mode 100755 build.sh
 rename conda/{faiss-gpu-raft => faiss-gpu-cuvs}/build-lib.sh (100%)
 rename conda/{faiss-gpu-raft => faiss-gpu-cuvs}/build-pkg.sh (100%)
 create mode 100644 conda/faiss-gpu-cuvs/meta.yaml
 rename conda/{faiss-gpu-raft => faiss-gpu-cuvs}/test_cpu_dispatch.sh (100%)

diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000000..eaa767f2af
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+# NOTE: This file is temporary for the proof-of-concept branch and will be removed before this PR is merged
+
+BUILD_TYPE=Release
+BUILD_DIR=build/
+
+RAFT_REPO_REL=""
+EXTRA_CMAKE_ARGS=""
+set -e
+
+if [[ ${RAFT_REPO_REL} != "" ]]; then
+  RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`"
+  EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCPM_raft_SOURCE=${RAFT_REPO_PATH}"
+fi
+
+if [ "$1" == "clean" ]; then
+  rm -rf build
+  rm -rf .cache
+  exit 0
+fi
+
+if [ "$1" == "test" ]; then
+  make -C build -j test
+  exit 0
+fi
+
+if [ "$1" == "test-raft" ]; then
+  ./build/faiss/gpu/test/TestRaftIndexIVFFlat
+  exit 0
+fi
+
+mkdir -p $BUILD_DIR
+cd $BUILD_DIR
+
+cmake \
+ -DFAISS_ENABLE_GPU=ON \
+ -DFAISS_ENABLE_CUVS=ON \
+ -DFAISS_ENABLE_PYTHON=OFF \
+ -DBUILD_TESTING=ON \
+ -DBUILD_SHARED_LIBS=OFF \
+ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+ -DFAISS_OPT_LEVEL=avx2 \
+ -DRAFT_NVTX=OFF \
+ -DCMAKE_CUDA_ARCHITECTURES="NATIVE" \
+ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+ -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+ ${EXTRA_CMAKE_ARGS} \
+ ../
+
+
+# make -C build -j12 faiss
+cmake  --build . -j12
+# make -C build -j12 swigfaiss
+# (cd build/faiss/python && python setup.py install)
+
diff --git a/conda/faiss-gpu-raft/build-lib.sh b/conda/faiss-gpu-cuvs/build-lib.sh
similarity index 100%
rename from conda/faiss-gpu-raft/build-lib.sh
rename to conda/faiss-gpu-cuvs/build-lib.sh
diff --git a/conda/faiss-gpu-raft/build-pkg.sh b/conda/faiss-gpu-cuvs/build-pkg.sh
similarity index 100%
rename from conda/faiss-gpu-raft/build-pkg.sh
rename to conda/faiss-gpu-cuvs/build-pkg.sh
diff --git a/conda/faiss-gpu-cuvs/meta.yaml b/conda/faiss-gpu-cuvs/meta.yaml
new file mode 100644
index 0000000000..5448b0cc4a
--- /dev/null
+++ b/conda/faiss-gpu-cuvs/meta.yaml
@@ -0,0 +1,125 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+{% set version = environ.get('GIT_DESCRIBE_TAG').lstrip('v') %}
+{% set suffix = "_nightly" if environ.get('PACKAGE_TYPE') == 'nightly' else "" %}
+{% set number = GIT_DESCRIBE_NUMBER %}
+{% if cudatoolkit == '11.8.0' %}
+{% set cuda_constraints=">=11.8,<12" %}
+{% set libcublas_constraints=">=11.11,<12" %}
+{% elif cudatoolkit == '12.1.1' %}
+{% set cuda_constraints=">=12.1,<13" %}
+{% set libcublas_constraints=">=12.1,<13" %}
+{% endif %}
+
+package:
+  name: faiss-pkg
+  version: {{ version }}
+
+build:
+  number: {{ number }}
+
+about:
+  home: https://github.com/facebookresearch/faiss
+  license: MIT
+  license_family: MIT
+  license_file: LICENSE
+  summary: A library for efficient similarity search and clustering of dense vectors.
+
+source:
+  git_url: ../../
+
+outputs:
+  - name: libfaiss
+    script: build-lib.sh  # [x86_64 and not win and not osx]
+    script: build-lib-osx.sh  # [x86_64 and osx]
+    script: build-lib-arm64.sh  # [not x86_64]
+    script: build-lib.bat  # [win]
+    build:
+      string: "h{{ PKG_HASH }}_{{ number }}_cuda{{ cudatoolkit }}_raft{{ suffix }}"
+      run_exports:
+        - {{ pin_compatible('libfaiss', exact=True) }}
+      script_env:
+        - CUDA_ARCHS
+    requirements:
+      build:
+        - {{ compiler('cxx') }}
+        - sysroot_linux-64  # [linux64]
+        - llvm-openmp  # [osx]
+        - cmake >=3.24.0
+        - make  # [not win]
+        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
+        - mkl =2023  # [x86_64]
+        - mkl-devel =2023  # [x86_64]
+        - cuda-toolkit {{ cudatoolkit }}
+      host:
+        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
+        - mkl =2023  # [x86_64]
+        - openblas  # [not x86_64]
+        - cuvs =24.08
+        - cuda-version {{ cuda_constraints }}
+      run:
+        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
+        - mkl =2023  # [x86_64]
+        - openblas  # [not x86_64]
+        - cuda-cudart {{ cuda_constraints }}
+        - libcublas {{ libcublas_constraints }}
+        - cuvs =24.08
+        - cuda-version {{ cuda_constraints }}
+    test:
+      requires:
+        - conda-build
+      commands:
+        - test -f $PREFIX/lib/libfaiss$SHLIB_EXT       # [not win]
+        - test -f $PREFIX/lib/libfaiss_avx2$SHLIB_EXT  # [x86_64 and not win]
+        - conda inspect linkages -p $PREFIX $PKG_NAME  # [not win]
+        - conda inspect objects -p $PREFIX $PKG_NAME   # [osx]
+
+  - name: faiss-gpu-raft
+    script: build-pkg.sh  # [x86_64 and not win and not osx]
+    script: build-pkg-osx.sh  # [x86_64 and osx]
+    script: build-pkg-arm64.sh # [not x86_64]
+    script: build-pkg.bat  # [win]
+    build:
+      string: "py{{ PY_VER }}_h{{ PKG_HASH }}_{{ number }}_cuda{{ cudatoolkit }}{{ suffix }}"
+    requirements:
+      build:
+        - {{ compiler('cxx') }}
+        - sysroot_linux-64 =2.17 # [linux64]
+        - swig
+        - cmake >=3.24.0
+        - make  # [not win]
+        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
+        - mkl =2023  # [x86_64]
+        - cuda-toolkit {{ cudatoolkit }}
+      host:
+        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
+        - python {{ python }}
+        - numpy >=1.19,<2
+        - {{ pin_subpackage('libfaiss', exact=True) }}
+      run:
+        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
+        - python {{ python }}
+        - numpy >=1.19,<2
+        - packaging
+        - {{ pin_subpackage('libfaiss', exact=True) }}
+    test:
+      requires:
+        - numpy
+        - scipy
+        - pytorch
+        - pytorch-cuda {{ cuda_constraints }}
+      commands:
+        - python -X faulthandler -m unittest discover -v -s tests/ -p "test_*"
+        - python -X faulthandler -m unittest discover -v -s tests/ -p "torch_*"
+        - cp tests/common_faiss_tests.py faiss/gpu/test
+        - python -X faulthandler -m unittest discover -v -s faiss/gpu/test/ -p "test_*"
+        - python -X faulthandler -m unittest discover -v -s faiss/gpu/test/ -p "torch_*"
+        - sh test_cpu_dispatch.sh  # [linux64]
+      files:
+        - test_cpu_dispatch.sh  # [linux64]
+      source_files:
+        - tests/
+        - faiss/gpu/test/
diff --git a/conda/faiss-gpu-raft/test_cpu_dispatch.sh b/conda/faiss-gpu-cuvs/test_cpu_dispatch.sh
similarity index 100%
rename from conda/faiss-gpu-raft/test_cpu_dispatch.sh
rename to conda/faiss-gpu-cuvs/test_cpu_dispatch.sh
diff --git a/conda/faiss-gpu-raft/meta.yaml b/conda/faiss-gpu-raft/meta.yaml
index 1dde8e9868..5448b0cc4a 100644
--- a/conda/faiss-gpu-raft/meta.yaml
+++ b/conda/faiss-gpu-raft/meta.yaml
@@ -58,7 +58,7 @@ outputs:
         - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
         - mkl =2023  # [x86_64]
         - openblas  # [not x86_64]
-        - libraft =24.06
+        - cuvs =24.08
         - cuda-version {{ cuda_constraints }}
       run:
         - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
@@ -66,7 +66,7 @@ outputs:
         - openblas  # [not x86_64]
         - cuda-cudart {{ cuda_constraints }}
         - libcublas {{ libcublas_constraints }}
-        - libraft =24.06
+        - cuvs =24.08
         - cuda-version {{ cuda_constraints }}
     test:
       requires:
diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index c26e77e5e1..1062270742 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -51,7 +51,7 @@ void GpuIndexCagra::train(idx_t n, const float* x) {
             std::nullopt;
     std::optional<cuvs::neighbors::ivf_pq::search_params> ivf_pq_search_params =
             std::nullopt;
-    if (cagraConfig_.graph_build_params != nullptr) {
+    if (cagraConfig_.ivf_pq_params != nullptr) {
         ivf_pq_params =
                 std::make_optional<cuvs::neighbors::ivf_pq::index_params>();
         ivf_pq_params->n_lists = cagraConfig_.ivf_pq_params->n_lists;
@@ -86,11 +86,13 @@ void GpuIndexCagra::train(idx_t n, const float* x) {
             cagraConfig_.graph_degree,
             static_cast<faiss::cagra_build_algo>(cagraConfig_.build_algo),
             cagraConfig_.nn_descent_niter,
+            cagraConfig_.store_dataset,
             this->metric_type,
             this->metric_arg,
             INDICES_64_BIT,
             ivf_pq_params,
-            ivf_pq_search_params);
+            ivf_pq_search_params,
+            cagraConfig_.refine_rate);
 
     index_->train(n, x);
 
@@ -225,17 +227,33 @@ void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const {
     index->hnsw.set_default_probas(M, 1.0 / log(M));
 
     auto n_train = this->ntotal;
-    auto train_dataset = index_->get_training_dataset();
+    float* train_dataset;
+    auto dataset = index_->get_training_dataset();
+    bool allocation = false;
+    if (getDeviceForAddress(dataset) >= 0) {
+        train_dataset = new float[n_train * index->d];
+        allocation = true;
+        raft::copy(
+                train_dataset,
+                dataset,
+                n_train * index->d,
+                this->resources_->getRaftHandleCurrentDevice().get_stream());
+    } else {
+        train_dataset = const_cast<float*>(dataset);
+    }
 
     // turn off as level 0 is copied from CAGRA graph
     index->init_level0 = false;
     if (!index->base_level_only) {
-        index->add(n_train, train_dataset.data());
+        index->add(n_train, train_dataset);
     } else {
         index->hnsw.prepare_level_tab(n_train, false);
-        index->storage->add(n_train, train_dataset.data());
+        index->storage->add(n_train, train_dataset);
         index->ntotal = n_train;
     }
+    if (allocation) {
+        delete[] train_dataset;
+    }
 
     auto graph = get_knngraph();
 
diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
index 5a73f16ba9..63a5203187 100644
--- a/faiss/gpu/GpuIndexCagra.h
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -174,6 +174,8 @@ struct GpuIndexCagraConfig : public GpuIndexConfig {
 
     IVFPQBuildCagraConfig* ivf_pq_params = nullptr;
     IVFPQSearchCagraConfig* ivf_pq_search_params = nullptr;
+    float refine_rate = 2.0f;
+    bool store_dataset = true;
 };
 
 enum class search_algo {
@@ -279,4 +281,4 @@ struct GpuIndexCagra : public GpuIndex {
 };
 
 } // namespace gpu
-} // namespace faiss
+} // namespace faiss
\ No newline at end of file
diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu
index 83c5f1dac3..596b7db95c 100644
--- a/faiss/gpu/GpuIndexIVFFlat.cu
+++ b/faiss/gpu/GpuIndexIVFFlat.cu
@@ -272,7 +272,7 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) {
         quantizer->add(nlist, cuvs_ivfflat_index.value().centers().data_handle());
         raft_handle.sync_stream();
 
-        cuvsIndex_->setCuvsIndex(std::move(cuvs_ivfflat_index.value()));
+        // cuvsIndex_->setCuvsIndex(std::make_shared<cuvs::neighbors::ivf_flat::index<float, idx_t>>(cuvs_ivfflat_index.value()));
 #else
         FAISS_THROW_MSG(
                 "RAFT has not been compiled into the current version so it cannot be used.");
diff --git a/faiss/gpu/GpuIndexIVFPQ.cu b/faiss/gpu/GpuIndexIVFPQ.cu
index 8d1fa504ce..a12db22c00 100644
--- a/faiss/gpu/GpuIndexIVFPQ.cu
+++ b/faiss/gpu/GpuIndexIVFPQ.cu
@@ -433,7 +433,7 @@ void GpuIndexIVFPQ::train(idx_t n, const float* x) {
                 cuvs_ivfpq_index.value().pq_centers().size(),
                 raft_handle.get_stream());
         raft_handle.sync_stream();
-        cuvsIndex_->setCuvsIndex(std::move(cuvs_ivfpq_index.value()));
+        // cuvsIndex_->setCuvsIndex(std::make_shared<cuvs::neighbors::ivf_pq::index<idx_t>>(cuvs_ivfpq_index.value()));
 #else
         FAISS_THROW_MSG(
                 "RAFT has not been compiled into the current version so it cannot be used.");
diff --git a/faiss/gpu/impl/CuvsCagra.cu b/faiss/gpu/impl/CuvsCagra.cu
index 72af0b9dd4..1bed835001 100644
--- a/faiss/gpu/impl/CuvsCagra.cu
+++ b/faiss/gpu/impl/CuvsCagra.cu
@@ -20,17 +20,14 @@
  * limitations under the License.
  */
 
+#include <faiss/gpu/utils/CuvsUtils.h>
 #include <faiss/gpu/utils/DeviceUtils.h>
-#include <cstddef>
-#include <cstdint>
 #include <faiss/gpu/impl/CuvsCagra.cuh>
 
 #include <cuvs/neighbors/cagra.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/device_resources.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
-#include <raft_runtime/neighbors/cagra.hpp>
-#include <optional>
 
 namespace faiss {
 namespace gpu {
@@ -42,32 +39,32 @@ CuvsCagra::CuvsCagra(
         idx_t graph_degree,
         faiss::cagra_build_algo graph_build_algo,
         size_t nn_descent_niter,
+        bool store_dataset,
         faiss::MetricType metric,
         float metricArg,
         IndicesOptions indicesOptions,
         std::optional<cuvs::neighbors::ivf_pq::index_params> ivf_pq_params,
         std::optional<cuvs::neighbors::ivf_pq::search_params>
-                ivf_pq_search_params)
+                ivf_pq_search_params,
+        float refine_rate)
         : resources_(resources),
           dim_(dim),
+          store_dataset_(store_dataset),
           metric_(metric),
           metricArg_(metricArg),
           index_params_(),
           ivf_pq_params_(ivf_pq_params),
-          ivf_pq_search_params_(ivf_pq_search_params) {
+          ivf_pq_search_params_(ivf_pq_search_params),
+          refine_rate_(refine_rate) {
     FAISS_THROW_IF_NOT_MSG(
             metric == faiss::METRIC_L2 || metric == faiss::METRIC_INNER_PRODUCT,
             "CAGRA currently only supports L2 or Inner Product metric.");
     FAISS_THROW_IF_NOT_MSG(
             indicesOptions == faiss::gpu::INDICES_64_BIT,
-            "only INDICES_64_BIT is supported for RAFT CAGRA index");
+            "only INDICES_64_BIT is supported for cuVS CAGRA index");
 
     index_params_.intermediate_graph_degree = intermediate_graph_degree;
     index_params_.graph_degree = graph_degree;
-    index_params_.build_algo =
-            static_cast<cuvs::neighbors::cagra::graph_build_algo>(
-                    graph_build_algo);
-    index_params_.nn_descent_niter = nn_descent_niter;
 
     if (!ivf_pq_params_) {
         ivf_pq_params_ =
@@ -77,12 +74,22 @@ CuvsCagra::CuvsCagra(
         ivf_pq_search_params_ =
                 std::make_optional<cuvs::neighbors::ivf_pq::search_params>();
     }
-    index_params_.metric = metric_ == faiss::METRIC_L2
-            ? cuvsDistanceType::L2Expanded
-            : cuvsDistanceType::InnerProduct;
-    ivf_pq_params_->metric = metric_ == faiss::METRIC_L2
-            ? cuvsDistanceType::L2Expanded
-            : cuvsDistanceType::InnerProduct;
+    index_params_.metric = metricFaissToCuvs(metric_, false);
+    ivf_pq_params_->metric = metricFaissToCuvs(metric_, false);
+
+    if (graph_build_algo == faiss::cagra_build_algo::IVF_PQ) {
+        cuvs::neighbors::cagra::graph_build_params::ivf_pq_params
+                graph_build_params;
+        graph_build_params.build_params = ivf_pq_params_.value();
+        graph_build_params.search_params = ivf_pq_search_params_.value();
+        graph_build_params.refinement_rate = refine_rate;
+        index_params_.graph_build_params = graph_build_params;
+    } else {
+        cuvs::neighbors::cagra::graph_build_params::nn_descent_params
+                graph_build_params;
+        graph_build_params.max_iterations = nn_descent_niter;
+        index_params_.graph_build_params = graph_build_params;
+    }
 
     reset();
 }
@@ -106,13 +113,16 @@ CuvsCagra::CuvsCagra(
             "CAGRA currently only supports L2 or Inner Product metric.");
     FAISS_THROW_IF_NOT_MSG(
             indicesOptions == faiss::gpu::INDICES_64_BIT,
-            "only INDICES_64_BIT is supported for RAFT CAGRA index");
+            "only INDICES_64_BIT is supported for cuVS CAGRA index");
 
     auto distances_on_gpu = getDeviceForAddress(distances) >= 0;
     auto knn_graph_on_gpu = getDeviceForAddress(knn_graph) >= 0;
 
     FAISS_ASSERT(distances_on_gpu == knn_graph_on_gpu);
 
+    storage_ = distances;
+    n_ = n;
+
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
 
@@ -131,10 +141,10 @@ CuvsCagra::CuvsCagra(
                 raft::make_device_matrix_view<const float, int64_t>(
                         distances, n, dim);
 
-        cuvs_index = cuvs::neighbors::cagra::index<float, uint32_t>(
+        cuvs_index = std::make_shared<
+                cuvs::neighbors::cagra::index<float, uint32_t>>(
                 raft_handle,
-                metric_ == faiss::METRIC_L2 ? cuvsDistanceType::L2Expanded
-                                            : cuvsDistanceType::InnerProduct,
+                metricFaissToCuvs(metric_, false),
                 distances_mds,
                 raft::make_const_mdspan(knn_graph_copy.view()));
     } else if (!distances_on_gpu && !knn_graph_on_gpu) {
@@ -149,10 +159,10 @@ CuvsCagra::CuvsCagra(
         auto distances_mds = raft::make_host_matrix_view<const float, int64_t>(
                 distances, n, dim);
 
-        cuvs_index = cuvs::neighbors::cagra::index<float, uint32_t>(
+        cuvs_index = std::make_shared<
+                cuvs::neighbors::cagra::index<float, uint32_t>>(
                 raft_handle,
-                metric_ == faiss::METRIC_L2 ? cuvsDistanceType::L2Expanded
-                                            : cuvsDistanceType::InnerProduct,
+                metricFaissToCuvs(metric_, false),
                 distances_mds,
                 raft::make_const_mdspan(knn_graph_copy.view()));
     } else {
@@ -162,19 +172,42 @@ CuvsCagra::CuvsCagra(
 }
 
 void CuvsCagra::train(idx_t n, const float* x) {
+    storage_ = x;
+    n_ = n;
+
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
+
+    //     auto nn_descent_params = std::make_optional<
+    //             cuvs::neighbors::nn_descent::index_params>();
+    //     nn_descent_params->graph_degree =
+    //     index_params_.intermediate_graph_degree;
+    //     nn_descent_params->intermediate_graph_degree =
+    //             1.5 * index_params_.intermediate_graph_degree;
+    //     nn_descent_params->max_iterations = index_params_.nn_descent_niter;
+
+    if (std::holds_alternative<
+                cuvs::neighbors::cagra::graph_build_params::ivf_pq_params>(
+                index_params_.graph_build_params) &&
+        index_params_.graph_degree == index_params_.intermediate_graph_degree) {
+        index_params_.intermediate_graph_degree =
+                1.5 * index_params_.graph_degree;
+    }
+
     if (getDeviceForAddress(x) >= 0) {
-        cuvs_index = raft::runtime::neighbors::cagra::build(
-                raft_handle,
-                index_params_,
-                raft::make_device_matrix_view<const float, int64_t>(
-                        x, n, dim_));
+        auto dataset =
+                raft::make_device_matrix_view<const float, int64_t>(x, n, dim_);
+        cuvs_index = std::make_shared<
+                cuvs::neighbors::cagra::index<float, uint32_t>>(
+                cuvs::neighbors::cagra::build(
+                        raft_handle, index_params_, dataset));
     } else {
-        cuvs_index = raft::runtime::neighbors::cagra::build(
-                raft_handle,
-                index_params_,
-                raft::make_host_matrix_view<const float, int64_t>(x, n, dim_));
+        auto dataset =
+                raft::make_host_matrix_view<const float, int64_t>(x, n, dim_);
+        cuvs_index = std::make_shared<
+                cuvs::neighbors::cagra::index<float, uint32_t>>(
+                cuvs::neighbors::cagra::build(
+                        raft_handle, index_params_, dataset));
     }
 }
 
@@ -202,10 +235,22 @@ void CuvsCagra::search(
     idx_t cols = queries.getSize(1);
     idx_t k_ = k;
 
-    FAISS_ASSERT(cuvs_index.has_value());
+    FAISS_ASSERT(cuvs_index);
     FAISS_ASSERT(numQueries > 0);
     FAISS_ASSERT(cols == dim_);
 
+    if (!store_dataset_) {
+        if (getDeviceForAddress(storage_) >= 0) {
+            auto dataset = raft::make_device_matrix_view<const float, int64_t>(
+                    storage_, n_, dim_);
+            cuvs_index->update_dataset(raft_handle, dataset);
+        } else {
+            auto dataset = raft::make_host_matrix_view<const float, int64_t>(
+                    storage_, n_, dim_);
+            cuvs_index->update_dataset(raft_handle, dataset);
+        }
+    }
+
     auto queries_view = raft::make_device_matrix_view<const float, int64_t>(
             queries.data(), numQueries, cols);
     auto distances_view = raft::make_device_matrix_view<float, int64_t>(
@@ -233,7 +278,7 @@ void CuvsCagra::search(
     auto indices_copy = raft::make_device_matrix<uint32_t, int64_t>(
             raft_handle, numQueries, k_);
 
-    raft::runtime::neighbors::cagra::search(
+    cuvs::neighbors::cagra::search(
             raft_handle,
             search_pams,
             *cuvs_index,
@@ -252,12 +297,12 @@ void CuvsCagra::reset() {
 }
 
 idx_t CuvsCagra::get_knngraph_degree() const {
-    FAISS_ASSERT(cuvs_index.has_value());
+    FAISS_ASSERT(cuvs_index);
     return static_cast<idx_t>(cuvs_index->graph_degree());
 }
 
 std::vector<idx_t> CuvsCagra::get_knngraph() const {
-    FAISS_ASSERT(cuvs_index.has_value());
+    FAISS_ASSERT(cuvs_index);
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
     auto stream = raft_handle.get_stream();
@@ -278,29 +323,8 @@ std::vector<idx_t> CuvsCagra::get_knngraph() const {
     return host_graph;
 }
 
-std::vector<float> CuvsCagra::get_training_dataset() const {
-    FAISS_ASSERT(cuvs_index.has_value());
-    const raft::device_resources& raft_handle =
-            resources_->getRaftHandleCurrentDevice();
-    auto stream = raft_handle.get_stream();
-
-    auto device_dataset = cuvs_index->dataset();
-
-    std::vector<float> host_dataset(
-            device_dataset.extent(0) * device_dataset.extent(1));
-
-    RAFT_CUDA_TRY(cudaMemcpy2DAsync(
-            host_dataset.data(),
-            sizeof(float) * dim_,
-            device_dataset.data_handle(),
-            sizeof(float) * device_dataset.stride(0),
-            sizeof(float) * dim_,
-            device_dataset.extent(0),
-            cudaMemcpyDefault,
-            raft_handle.get_stream()));
-    raft_handle.sync_stream();
-
-    return host_dataset;
+const float* CuvsCagra::get_training_dataset() const {
+    return storage_;
 }
 
 } // namespace gpu
diff --git a/faiss/gpu/impl/CuvsCagra.cuh b/faiss/gpu/impl/CuvsCagra.cuh
index d4d8266673..eb8964a7d9 100644
--- a/faiss/gpu/impl/CuvsCagra.cuh
+++ b/faiss/gpu/impl/CuvsCagra.cuh
@@ -53,13 +53,15 @@ class CuvsCagra {
             idx_t graph_degree,
             faiss::cagra_build_algo graph_build_algo,
             size_t nn_descent_niter,
+            bool store_dataset,
             faiss::MetricType metric,
             float metricArg,
             IndicesOptions indicesOptions,
             std::optional<cuvs::neighbors::ivf_pq::index_params> ivf_pq_params =
                     std::nullopt,
             std::optional<cuvs::neighbors::ivf_pq::search_params>
-                    ivf_pq_search_params = std::nullopt);
+                    ivf_pq_search_params = std::nullopt,
+            float refine_rate = 2.0f);
 
     CuvsCagra(
             GpuResources* resources,
@@ -101,31 +103,40 @@ class CuvsCagra {
 
     std::vector<idx_t> get_knngraph() const;
 
-    std::vector<float> get_training_dataset() const;
+    const float* get_training_dataset() const;
 
    private:
     /// Collection of GPU resources that we use
     GpuResources* resources_;
 
+    /// Training dataset
+    const float* storage_;
+    int n_;
+
     /// Expected dimensionality of the vectors
     const int dim_;
 
+    /// Controls the underlying cuVS index if it should store the dataset in
+    /// device memory
+    bool store_dataset_;
+
     /// Metric type of the index
     faiss::MetricType metric_;
 
     /// Metric arg
     float metricArg_;
 
-    /// Parameters to build RAFT CAGRA index
+    /// Parameters to build cuVS CAGRA index
     cuvs::neighbors::cagra::index_params index_params_;
 
     /// Parameters to build CAGRA graph using IVF PQ
     std::optional<cuvs::neighbors::ivf_pq::index_params> ivf_pq_params_;
     std::optional<cuvs::neighbors::ivf_pq::search_params> ivf_pq_search_params_;
+    std::optional<float> refine_rate_;
 
-    /// Instance of trained RAFT CAGRA index
-    std::optional<cuvs::neighbors::cagra::index<float, uint32_t>>
-            cuvs_index{std::nullopt};
+    /// Instance of trained cuVS CAGRA index
+    std::shared_ptr<cuvs::neighbors::cagra::index<float, uint32_t>> cuvs_index{
+            nullptr};
 };
 
 } // namespace gpu
diff --git a/faiss/gpu/impl/CuvsIVFFlat.cu b/faiss/gpu/impl/CuvsIVFFlat.cu
index 364c09e0bc..885ffffa8f 100644
--- a/faiss/gpu/impl/CuvsIVFFlat.cu
+++ b/faiss/gpu/impl/CuvsIVFFlat.cu
@@ -102,7 +102,7 @@ void CuvsIVFFlat::search(
     uint32_t k_ = k;
 
     // Device is already set in GpuIndex::search
-    FAISS_ASSERT(cuvs_index.has_value());
+    FAISS_ASSERT(cuvs_index != nullptr);
     FAISS_ASSERT(numQueries > 0);
     FAISS_ASSERT(cols == dim_);
     FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= numLists_);
@@ -122,7 +122,7 @@ void CuvsIVFFlat::search(
     cuvs::neighbors::ivf_flat::search(
             raft_handle,
             pams,
-            cuvs_index.value(),
+            *cuvs_index,
             queries_view,
             out_inds_view,
             out_dists_view);
@@ -167,7 +167,7 @@ idx_t CuvsIVFFlat::addVectors(
     /// called updateQuantizer() to update the RAFT index if the quantizer was
     /// modified externally
 
-    FAISS_ASSERT(cuvs_index.has_value());
+    FAISS_ASSERT(cuvs_index != nullptr);
 
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
@@ -191,7 +191,7 @@ idx_t CuvsIVFFlat::addVectors(
 }
 
 idx_t CuvsIVFFlat::getListLength(idx_t listId) const {
-    FAISS_ASSERT(cuvs_index.has_value());
+    FAISS_ASSERT(cuvs_index != nullptr);
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
 
@@ -208,7 +208,7 @@ idx_t CuvsIVFFlat::getListLength(idx_t listId) const {
 
 /// Return the list indices of a particular list back to the CPU
 std::vector<idx_t> CuvsIVFFlat::getListIndices(idx_t listId) const {
-    FAISS_ASSERT(cuvs_index.has_value());
+    FAISS_ASSERT(cuvs_index != nullptr);
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
     auto stream = raft_handle.get_stream();
@@ -240,7 +240,7 @@ std::vector<uint8_t> CuvsIVFFlat::getListVectorData(
     if (gpuFormat) {
         FAISS_THROW_MSG("gpuFormat should be false for RAFT indices");
     }
-    FAISS_ASSERT(cuvs_index.has_value());
+    FAISS_ASSERT(cuvs_index != nullptr);
 
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
@@ -310,10 +310,11 @@ void CuvsIVFFlat::updateQuantizer(Index* quantizer) {
     pams.add_data_on_build = false;
     pams.metric = metricFaissToCuvs(metric_, false);
     pams.n_lists = numLists_;
-    cuvs_index.emplace(raft_handle, pams, static_cast<uint32_t>(dim_));
-
+    auto new_index = cuvs::neighbors::ivf_flat::index<float, idx_t>(
+            raft_handle, pams, static_cast<uint32_t>(dim_));
     cuvs::neighbors::ivf_flat::helpers::reset_index(
-            raft_handle, cuvs_index.get());
+            raft_handle, &new_index);
+    cuvs_index.reset(&new_index);
 
     // If the index instance is a GpuIndexFlat, then we can use direct access to
     // the centroids within.
@@ -372,7 +373,7 @@ void CuvsIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) {
     std::vector<idx_t> indices_(ntotal);
 
     // the index must already exist
-    FAISS_ASSERT(cuvs_index.has_value());
+    FAISS_ASSERT(cuvs_index != nullptr);
 
     auto& raft_lists = cuvs_index->lists();
 
@@ -529,11 +530,7 @@ void CuvsIVFFlatCodePackerInterleaved::pack_1(
         size_t offset,
         uint8_t* block) const {
     cuvs::neighbors::ivf_flat::helpers::codepacker::pack_1(
-            reinterpret_cast<const uint32_t*>(flat_code),
-            reinterpret_cast<uint32_t*>(block),
-            dim,
-            chunk_size,
-            static_cast<uint32_t>(offset));
+            flat_code, block, dim, chunk_size, static_cast<uint32_t>(offset));
 }
 
 void CuvsIVFFlatCodePackerInterleaved::unpack_1(
@@ -541,11 +538,7 @@ void CuvsIVFFlatCodePackerInterleaved::unpack_1(
         size_t offset,
         uint8_t* flat_code) const {
     cuvs::neighbors::ivf_flat::helpers::codepacker::unpack_1(
-            reinterpret_cast<const uint32_t*>(block),
-            reinterpret_cast<uint32_t*>(flat_code),
-            dim,
-            chunk_size,
-            static_cast<uint32_t>(offset));
+            block, flat_code, dim, chunk_size, static_cast<uint32_t>(offset));
 }
 
 } // namespace gpu
diff --git a/faiss/gpu/impl/CuvsIVFFlat.cuh b/faiss/gpu/impl/CuvsIVFFlat.cuh
index e0e34a7dbb..0f3abc4ce0 100644
--- a/faiss/gpu/impl/CuvsIVFFlat.cuh
+++ b/faiss/gpu/impl/CuvsIVFFlat.cuh
@@ -107,7 +107,7 @@ class CuvsIVFFlat : public IVFFlat {
     void copyInvertedListsFrom(const InvertedLists* ivf) override;
 
     /// Replace the CUVS index
-    void setCuvsIndex(cuvs::neighbors::ivf_flat::index<float, idx_t>&& idx);
+    void setCuvsIndex(std::shared_ptr<cuvs::neighbors::ivf_flat::index<float, idx_t>> idx);
 
    private:
     /// Adds a set of codes and indices to a list, with the representation
diff --git a/faiss/gpu/impl/CuvsIVFPQ.cu b/faiss/gpu/impl/CuvsIVFPQ.cu
index 2f3c3e089f..fc6382c90f 100644
--- a/faiss/gpu/impl/CuvsIVFPQ.cu
+++ b/faiss/gpu/impl/CuvsIVFPQ.cu
@@ -22,10 +22,11 @@
 
 #include <faiss/gpu/GpuIndexFlat.h>
 #include <faiss/gpu/utils/CuvsUtils.h>
-#include <faiss/gpu/impl/FlatIndex.cuh>
 #include <faiss/gpu/impl/CuvsIVFPQ.cuh>
+#include <faiss/gpu/impl/FlatIndex.cuh>
 #include <faiss/gpu/utils/Transpose.cuh>
 
+#include <cuvs/neighbors/common.hpp>
 #include <cuvs/neighbors/ivf_pq.hpp>
 #include <raft/linalg/map.cuh>
 
@@ -121,7 +122,8 @@ void CuvsIVFPQ::updateQuantizer(Index* quantizer) {
     pams.n_lists = numLists_;
     pams.pq_bits = bitsPerSubQuantizer_;
     pams.pq_dim = numSubQuantizers_;
-    cuvs_index = std::make_shared<>(raft_handle, pams, static_cast<uint32_t>(dim_));
+    cuvs_index = std::make_shared<cuvs::neighbors::ivf_pq::index<idx_t>>(
+            raft_handle, pams, static_cast<uint32_t>(dim_));
 
     cuvs::neighbors::ivf_pq::helpers::reset_index(
             raft_handle, cuvs_index.get());
@@ -202,9 +204,7 @@ std::vector<idx_t> CuvsIVFPQ::getListIndices(idx_t listId) const {
 
     raft::update_host(
             &list_indices_ptr,
-            const_cast<idx_t**>(
-                    cuvs_index->inds_ptrs().data_handle()) +
-                    listId,
+            const_cast<idx_t**>(cuvs_index->inds_ptrs().data_handle()) + listId,
             1,
             stream);
     raft_handle.sync_stream();
@@ -230,8 +230,7 @@ void CuvsIVFPQ::searchPreassigned(
 }
 
 size_t CuvsIVFPQ::getGpuListEncodingSize_(idx_t listId) {
-    return static_cast<size_t>(
-            cuvs_index->get_list_size_in_bytes(listId));
+    return static_cast<size_t>(cuvs_index->get_list_size_in_bytes(listId));
 }
 
 /// Return the encoded vectors of a particular list back to the CPU
@@ -264,13 +263,14 @@ std::vector<uint8_t> CuvsIVFPQ::getListVectorData(idx_t listId, bool gpuFormat)
         auto codes_d = raft::make_device_vector<uint8_t>(
                 raft_handle, static_cast<uint32_t>(bufferSize));
 
-        cuvs::neighbors::ivf_pq::helpers::unpack_contiguous_list_data(
-                raft_handle,
-                cuvs_index.value(),
-                codes_d.data_handle(),
-                batchSize,
-                listId,
-                offset_b);
+        cuvs::neighbors::ivf_pq::helpers::codepacker::
+                unpack_contiguous_list_data(
+                        raft_handle,
+                        *cuvs_index,
+                        codes_d.data_handle(),
+                        batchSize,
+                        listId,
+                        offset_b);
 
         // Copy the flat PQ codes to host
         raft::update_host(
@@ -319,7 +319,7 @@ void CuvsIVFPQ::search(
     cuvs::neighbors::ivf_pq::search(
             raft_handle,
             pams,
-            cuvs_index.value(),
+            *cuvs_index,
             queries_view,
             out_inds_view,
             out_dists_view);
@@ -373,14 +373,13 @@ idx_t CuvsIVFPQ::addVectors(
     /// Remove rows containing NaNs
     idx_t n_rows_valid = inplaceGatherFilteredRows(resources_, vecs, indices);
 
-    cuvs_index.emplace(cuvs::neighbors::ivf_pq::extend(
+    cuvs::neighbors::ivf_pq::extend(
             raft_handle,
             raft::make_device_matrix_view<const float, idx_t>(
                     vecs.data(), n_rows_valid, dim_),
-            std::make_optional<raft::device_vector_view<const idx_t, idx_t>>(
-                    raft::make_device_vector_view<const idx_t, idx_t>(
-                            indices.data(), n_rows_valid)),
-            cuvs_index.value()));
+            raft::make_device_vector_view<const idx_t, idx_t>(
+                    indices.data(), n_rows_valid),
+            cuvs_index.get());
 
     return n_rows_valid;
 }
@@ -401,7 +400,7 @@ void CuvsIVFPQ::copyInvertedListsFrom(const InvertedLists* ivf) {
     auto& raft_lists = cuvs_index->lists();
 
     // conservative memory alloc for cloning cpu inverted lists
-    cuvs::neighbors::ivf_pq::list_spec<uint32_t, idx_t> raft_list_spec{
+    cuvs::neighbors::ivf_pq::list_spec<uint32_t, idx_t> ivf_list_spec{
             static_cast<uint32_t>(bitsPerSubQuantizer_),
             static_cast<uint32_t>(numSubQuantizers_),
             true};
@@ -426,7 +425,7 @@ void CuvsIVFPQ::copyInvertedListsFrom(const InvertedLists* ivf) {
         cuvs::neighbors::ivf::resize_list(
                 raft_handle,
                 raft_lists[i],
-                raft_list_spec,
+                ivf_list_spec,
                 static_cast<uint32_t>(listSize),
                 static_cast<uint32_t>(0));
     }
@@ -448,9 +447,8 @@ void CuvsIVFPQ::copyInvertedListsFrom(const InvertedLists* ivf) {
     }
 }
 
-void CuvsIVFPQ::setCuvsIndex(cuvs::neighbors::ivf_pq::index<idx_t>&& idx) {
-    cuvs_index.emplace(std::move(idx));
-    setBasePQCentroids_();
+void CuvsIVFPQ::setCuvsIndex(std::shared_ptr<cuvs::neighbors::ivf_pq::index<idx_t>> idx) {
+    cuvs_index = idx;
 }
 
 void CuvsIVFPQ::addEncodedVectorsToList_(
@@ -489,7 +487,7 @@ void CuvsIVFPQ::addEncodedVectorsToList_(
                 bufferSize,
                 stream);
 
-        cuvs::neighbors::ivf_pq::helpers::pack_contiguous_list_data(
+        cuvs::neighbors::ivf_pq::helpers::codepacker::pack_contiguous_list_data(
                 raft_handle,
                 cuvs_index.get(),
                 codes_d.data_handle(),
diff --git a/faiss/gpu/impl/CuvsIVFPQ.cuh b/faiss/gpu/impl/CuvsIVFPQ.cuh
index c5c4cf64e5..b75a531b67 100644
--- a/faiss/gpu/impl/CuvsIVFPQ.cuh
+++ b/faiss/gpu/impl/CuvsIVFPQ.cuh
@@ -101,7 +101,7 @@ class CuvsIVFPQ : public IVFPQ {
     void copyInvertedListsFrom(const InvertedLists* ivf) override;
 
     /// Replace the Raft index
-    void setCuvsIndex(cuvs::neighbors::ivf_pq::index<idx_t>&& idx);
+    void setCuvsIndex(std::shared_ptr<cuvs::neighbors::ivf_pq::index<idx_t>> idx);
 
     /// Classify and encode/add vectors to our IVF lists.
     /// The input data must be on our current device.
diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt
index b2b4c22831..3371f2e3eb 100644
--- a/faiss/gpu/test/CMakeLists.txt
+++ b/faiss/gpu/test/CMakeLists.txt
@@ -41,9 +41,9 @@ faiss_gpu_test(TestGpuIndexIVFPQ.cpp)
 faiss_gpu_test(TestGpuIndexIVFScalarQuantizer.cpp)
 faiss_gpu_test(TestGpuDistance.cu)
 faiss_gpu_test(TestGpuSelect.cu)
-if(FAISS_ENABLE_CUVS)
-  faiss_gpu_test(TestGpuIndexCagra.cu)
-endif()
+#if(FAISS_ENABLE_CUVS)
+#  faiss_gpu_test(TestGpuIndexCagra.cu)
+#endif()
 
 add_executable(demo_ivfpq_indexing_gpu EXCLUDE_FROM_ALL
   demo_ivfpq_indexing_gpu.cpp)

From 499cae402977453c1c639037e46bb74725b9df15 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Fri, 5 Jul 2024 11:14:00 -0700
Subject: [PATCH 068/148] include common header; change dependency to libcuvs

---
 conda/faiss-gpu-cuvs/meta.yaml |  4 ++--
 faiss/gpu/impl/CuvsIVFFlat.cu  | 11 +++++------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/conda/faiss-gpu-cuvs/meta.yaml b/conda/faiss-gpu-cuvs/meta.yaml
index 5448b0cc4a..2ba4d42328 100644
--- a/conda/faiss-gpu-cuvs/meta.yaml
+++ b/conda/faiss-gpu-cuvs/meta.yaml
@@ -58,7 +58,7 @@ outputs:
         - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
         - mkl =2023  # [x86_64]
         - openblas  # [not x86_64]
-        - cuvs =24.08
+        - libcuvs =24.08
         - cuda-version {{ cuda_constraints }}
       run:
         - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
@@ -66,7 +66,7 @@ outputs:
         - openblas  # [not x86_64]
         - cuda-cudart {{ cuda_constraints }}
         - libcublas {{ libcublas_constraints }}
-        - cuvs =24.08
+        - libcuvs =24.08
         - cuda-version {{ cuda_constraints }}
     test:
       requires:
diff --git a/faiss/gpu/impl/CuvsIVFFlat.cu b/faiss/gpu/impl/CuvsIVFFlat.cu
index 885ffffa8f..4ff7489fb7 100644
--- a/faiss/gpu/impl/CuvsIVFFlat.cu
+++ b/faiss/gpu/impl/CuvsIVFFlat.cu
@@ -30,9 +30,8 @@
 #include <faiss/gpu/impl/IVFFlat.cuh>
 #include <faiss/gpu/utils/Transpose.cuh>
 
-// #include <cuvs/neighbors/ivf_flat_codepacker.hpp>
 #include <cuvs/neighbors/ivf_flat.hpp>
-// #include <cuvs/neighbors/ivf_flat_helpers.cuh>
+#include <cuvs/neighbors/common.hpp>
 #include <raft/linalg/map.cuh>
 #include <raft/linalg/norm.cuh>
 
@@ -310,11 +309,11 @@ void CuvsIVFFlat::updateQuantizer(Index* quantizer) {
     pams.add_data_on_build = false;
     pams.metric = metricFaissToCuvs(metric_, false);
     pams.n_lists = numLists_;
-    auto new_index = cuvs::neighbors::ivf_flat::index<float, idx_t>(
-            raft_handle, pams, static_cast<uint32_t>(dim_));
+    cuvs_index =
+            std::make_shared<cuvs::neighbors::ivf_flat::index<float, idx_t>>(
+                    raft_handle, pams, static_cast<uint32_t>(dim_));
     cuvs::neighbors::ivf_flat::helpers::reset_index(
-            raft_handle, &new_index);
-    cuvs_index.reset(&new_index);
+            raft_handle, cuvs_index.get());
 
     // If the index instance is a GpuIndexFlat, then we can use direct access to
     // the centroids within.

From ca45475a419969c711105d124f385d588066f84d Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 8 Jul 2024 17:58:41 -0700
Subject: [PATCH 069/148] rename raft_* variables to cuvs_*; pass raw pointer to setCuvsIndex

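---
Review note (not part of the commit message): setCuvsIndex() now takes a raw
pointer and hands it to cuvs_index.reset(), while the callers in
GpuIndexIVFFlat::train and GpuIndexIVFPQ::train pass the address of the object
held by cuvs_ivfflat_index / cuvs_ivfpq_index (obtained via .value()). If
cuvs_index is a std::shared_ptr, as the std::make_shared assignments in
updateQuantizer suggest, it would take ownership of, and later delete, an
object it did not allocate. Please confirm the intended ownership before
merging.
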
 benchs/bench_ivfpq_cuvs.py             | 18 +++++------
 faiss/gpu/GpuDistance.cu               |  1 -
 faiss/gpu/GpuIndexCagra.cu             |  1 +
 faiss/gpu/GpuIndexIVFFlat.cu           | 18 +++++------
 faiss/gpu/GpuIndexIVFPQ.cu             |  2 +-
 faiss/gpu/impl/CuvsCagra.cu            |  1 +
 faiss/gpu/impl/CuvsFlatIndex.cu        |  2 --
 faiss/gpu/impl/CuvsIVFFlat.cu          | 13 ++++----
 faiss/gpu/impl/CuvsIVFFlat.cuh         |  2 +-
 faiss/gpu/impl/CuvsIVFPQ.cu            |  9 +++---
 faiss/gpu/impl/CuvsIVFPQ.cuh           |  2 +-
 faiss/gpu/test/CMakeLists.txt          |  2 +-
 faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 44 +++++++++++++-------------
 faiss/gpu/test/TestGpuIndexIVFPQ.cpp   | 40 +++++++++++------------
 faiss/python/CMakeLists.txt            |  6 ++--
 15 files changed, 81 insertions(+), 80 deletions(-)

diff --git a/benchs/bench_ivfpq_cuvs.py b/benchs/bench_ivfpq_cuvs.py
index 02f1a487ae..294d7d5dbe 100644
--- a/benchs/bench_ivfpq_cuvs.py
+++ b/benchs/bench_ivfpq_cuvs.py
@@ -92,15 +92,15 @@ def bench_train_milliseconds(index, trainVecs, use_cuvs):
 print("=" * 40)
 print("GPU Train Benchmarks")
 print("=" * 40)
-raft_gpu_train_time = bench_train_milliseconds(index, xt, True)
+cuvs_gpu_train_time = bench_train_milliseconds(index, xt, True)
 if args.cuvs_only:
     print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, CUVS enabled GPU train time: %.3f milliseconds" % (
-        n_cols, nlist, M, args.bits_per_code, n_train, raft_gpu_train_time))
+        n_cols, nlist, M, args.bits_per_code, n_train, cuvs_gpu_train_time))
 else:
     classical_gpu_train_time = bench_train_milliseconds(
         index, xt, False)
     print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, classical GPU train time: %.3f milliseconds, CUVS enabled GPU train time: %.3f milliseconds" % (
-        n_cols, nlist, M, args.bits_per_code, n_train, classical_gpu_train_time, raft_gpu_train_time))
+        n_cols, nlist, M, args.bits_per_code, n_train, classical_gpu_train_time, cuvs_gpu_train_time))
 
 
 def bench_add_milliseconds(index, addVecs, use_cuvs):
@@ -118,15 +118,15 @@ def bench_add_milliseconds(index, addVecs, use_cuvs):
 print("GPU Add Benchmarks")
 print("=" * 40)
 index.train(xt)
-raft_gpu_add_time = bench_add_milliseconds(index, xb, True)
+cuvs_gpu_add_time = bench_add_milliseconds(index, xb, True)
 if args.cuvs_only:
     print("Method: IVFPQ, Operation: ADD, dim: %d, n_centroids %d numSubQuantizers %d, bitsPerCode %d, numAdd %d, CUVS enabled GPU add time: %.3f milliseconds" % (
-        n_cols, nlist, M, args.bits_per_code, n_rows, raft_gpu_add_time))
+        n_cols, nlist, M, args.bits_per_code, n_rows, cuvs_gpu_add_time))
 else:
     classical_gpu_add_time = bench_add_milliseconds(
         index, xb, False)
     print("Method: IVFFPQ, Operation: ADD, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numAdd %d, classical GPU add time: %.3f milliseconds, CUVS enabled GPU add time: %.3f milliseconds" % (
-        n_cols, nlist, M, args.bits_per_code, n_rows, classical_gpu_add_time, raft_gpu_add_time))
+        n_cols, nlist, M, args.bits_per_code, n_rows, classical_gpu_add_time, cuvs_gpu_add_time))
 
 
 def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_cuvs):
@@ -156,13 +156,13 @@ def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_cuvs):
     index.train(xt)
     for n_rows in queryset_sizes:
         queryVecs = xq[np.random.choice(xq.shape[0], n_rows, replace=False)]
-        raft_gpu_search_time = bench_search_milliseconds(
+        cuvs_gpu_search_time = bench_search_milliseconds(
             index, xb, queryVecs, args.nprobe, args.k, True)
         if args.cuvs_only:
             print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, CUVS enabled GPU search time: %.3f milliseconds" % (
-                n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
+                n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, cuvs_gpu_search_time))
         else:
             classical_gpu_search_time = bench_search_milliseconds(
                 index, xb, queryVecs, args.nprobe, args.k, False)
             print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, CUVS enabled GPU search time: %.3f milliseconds" % (
-                n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, classical_gpu_search_time, raft_gpu_search_time))
\ No newline at end of file
+                n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, classical_gpu_search_time, cuvs_gpu_search_time))
\ No newline at end of file
diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu
index 1a51cf53e3..6839fbd46f 100644
--- a/faiss/gpu/GpuDistance.cu
+++ b/faiss/gpu/GpuDistance.cu
@@ -42,7 +42,6 @@
 #include <raft/core/operators.hpp>
 #include <raft/core/temporary_device_buffer.hpp>
 #include <raft/linalg/unary_op.cuh>
-// #define RAFT_NAME "raft"
 #endif
 
 namespace faiss {
diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index 1062270742..059e0c510b 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -24,6 +24,7 @@
 #include <faiss/gpu/GpuIndexCagra.h>
 #include <cstddef>
 #include <faiss/gpu/impl/CuvsCagra.cuh>
+#include <faiss/gpu/StandardGpuResources.h>
 #include <optional>
 
 namespace faiss {
diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu
index 596b7db95c..6b196b6155 100644
--- a/faiss/gpu/GpuIndexIVFFlat.cu
+++ b/faiss/gpu/GpuIndexIVFFlat.cu
@@ -242,14 +242,14 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) {
         const raft::device_resources& raft_handle =
                 resources_->getRaftHandleCurrentDevice();
 
-        cuvs::neighbors::ivf_flat::index_params raft_idx_params;
-        raft_idx_params.n_lists = nlist;
-        raft_idx_params.metric = metricFaissToCuvs(metric_type, false);
-        raft_idx_params.add_data_on_build = false;
-        raft_idx_params.kmeans_trainset_fraction =
+        cuvs::neighbors::ivf_flat::index_params cuvs_index_params;
+        cuvs_index_params.n_lists = nlist;
+        cuvs_index_params.metric = metricFaissToCuvs(metric_type, false);
+        cuvs_index_params.add_data_on_build = false;
+        cuvs_index_params.kmeans_trainset_fraction =
                 static_cast<double>(cp.max_points_per_centroid * nlist) /
                 static_cast<double>(n);
-        raft_idx_params.kmeans_n_iters = cp.niter;
+        cuvs_index_params.kmeans_n_iters = cp.niter;
 
         auto cuvsIndex_ =
                 std::static_pointer_cast<CuvsIVFFlat, IVFFlat>(index_);
@@ -260,19 +260,19 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) {
             auto dataset_d =
                     raft::make_device_matrix_view<const float, idx_t>(x, n, d);
             cuvs_ivfflat_index = cuvs::neighbors::ivf_flat::build(
-                    raft_handle, raft_idx_params, dataset_d);
+                    raft_handle, cuvs_index_params, dataset_d);
         } else {
             auto x_view =
                     raft::make_host_matrix_view<const float, idx_t>(x, n, d);
             cuvs_ivfflat_index = cuvs::neighbors::ivf_flat::build(
-                    raft_handle, raft_idx_params, x_view);
+                    raft_handle, cuvs_index_params, x_view);
         }
 
         quantizer->train(nlist, cuvs_ivfflat_index.value().centers().data_handle());
         quantizer->add(nlist, cuvs_ivfflat_index.value().centers().data_handle());
         raft_handle.sync_stream();
 
-        // cuvsIndex_->setCuvsIndex(std::make_shared<cuvs::neighbors::ivf_flat::index<float, idx_t>>(cuvs_ivfflat_index.value()));
+        cuvsIndex_->setCuvsIndex(&cuvs_ivfflat_index.value());
 #else
         FAISS_THROW_MSG(
                 "RAFT has not been compiled into the current version so it cannot be used.");
diff --git a/faiss/gpu/GpuIndexIVFPQ.cu b/faiss/gpu/GpuIndexIVFPQ.cu
index a12db22c00..35e21bb84a 100644
--- a/faiss/gpu/GpuIndexIVFPQ.cu
+++ b/faiss/gpu/GpuIndexIVFPQ.cu
@@ -433,7 +433,7 @@ void GpuIndexIVFPQ::train(idx_t n, const float* x) {
                 cuvs_ivfpq_index.value().pq_centers().size(),
                 raft_handle.get_stream());
         raft_handle.sync_stream();
-        // cuvsIndex_->setCuvsIndex(std::make_shared<cuvs::neighbors::ivf_pq::index<idx_t>>(cuvs_ivfpq_index.value()));
+        cuvsIndex_->setCuvsIndex(&cuvs_ivfpq_index.value());
 #else
         FAISS_THROW_MSG(
                 "RAFT has not been compiled into the current version so it cannot be used.");
diff --git a/faiss/gpu/impl/CuvsCagra.cu b/faiss/gpu/impl/CuvsCagra.cu
index 1bed835001..82457e2f66 100644
--- a/faiss/gpu/impl/CuvsCagra.cu
+++ b/faiss/gpu/impl/CuvsCagra.cu
@@ -23,6 +23,7 @@
 #include <faiss/gpu/utils/CuvsUtils.h>
 #include <faiss/gpu/utils/DeviceUtils.h>
 #include <faiss/gpu/impl/CuvsCagra.cuh>
+#include <faiss/gpu/StandardGpuResources.h>
 
 #include <cuvs/neighbors/cagra.hpp>
 #include <raft/core/device_mdspan.hpp>
diff --git a/faiss/gpu/impl/CuvsFlatIndex.cu b/faiss/gpu/impl/CuvsFlatIndex.cu
index da990f848f..9930e9d640 100644
--- a/faiss/gpu/impl/CuvsFlatIndex.cu
+++ b/faiss/gpu/impl/CuvsFlatIndex.cu
@@ -32,8 +32,6 @@
 #include <raft/distance/distance_types.hpp>
 #include <raft/linalg/unary_op.cuh>
 
-// #define RAFT_NAME "raft"
-
 namespace faiss {
 namespace gpu {
 
diff --git a/faiss/gpu/impl/CuvsIVFFlat.cu b/faiss/gpu/impl/CuvsIVFFlat.cu
index 4ff7489fb7..be1d984b2a 100644
--- a/faiss/gpu/impl/CuvsIVFFlat.cu
+++ b/faiss/gpu/impl/CuvsIVFFlat.cu
@@ -29,6 +29,7 @@
 #include <faiss/gpu/impl/FlatIndex.cuh>
 #include <faiss/gpu/impl/IVFFlat.cuh>
 #include <faiss/gpu/utils/Transpose.cuh>
+#include <faiss/gpu/StandardGpuResources.h>
 
 #include <cuvs/neighbors/ivf_flat.hpp>
 #include <cuvs/neighbors/common.hpp>
@@ -81,8 +82,8 @@ void CuvsIVFFlat::reset() {
 }
 
 void CuvsIVFFlat::setCuvsIndex(
-        std::shared_ptr<cuvs::neighbors::ivf_flat::index<float, idx_t>> idx) {
-    cuvs_index = idx;
+        cuvs::neighbors::ivf_flat::index<float, idx_t>* idx) {
+    cuvs_index.reset(idx);
 }
 
 void CuvsIVFFlat::search(
@@ -374,10 +375,10 @@ void CuvsIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) {
     // the index must already exist
     FAISS_ASSERT(cuvs_index != nullptr);
 
-    auto& raft_lists = cuvs_index->lists();
+    auto& cuvs_index_lists = cuvs_index->lists();
 
     // conservative memory alloc for cloning cpu inverted lists
-    cuvs::neighbors::ivf_flat::list_spec<uint32_t, float, idx_t> raft_list_spec{
+    cuvs::neighbors::ivf_flat::list_spec<uint32_t, float, idx_t> ivf_list_spec{
             static_cast<uint32_t>(dim_), true};
 
     for (size_t i = 0; i < nlist; ++i) {
@@ -399,8 +400,8 @@ void CuvsIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) {
 
         cuvs::neighbors::ivf::resize_list(
                 raft_handle,
-                raft_lists[i],
-                raft_list_spec,
+                cuvs_index_lists[i],
+                ivf_list_spec,
                 (uint32_t)listSize,
                 (uint32_t)0);
     }
diff --git a/faiss/gpu/impl/CuvsIVFFlat.cuh b/faiss/gpu/impl/CuvsIVFFlat.cuh
index 0f3abc4ce0..9296eda993 100644
--- a/faiss/gpu/impl/CuvsIVFFlat.cuh
+++ b/faiss/gpu/impl/CuvsIVFFlat.cuh
@@ -107,7 +107,7 @@ class CuvsIVFFlat : public IVFFlat {
     void copyInvertedListsFrom(const InvertedLists* ivf) override;
 
     /// Replace the CUVS index
-    void setCuvsIndex(std::shared_ptr<cuvs::neighbors::ivf_flat::index<float, idx_t>> idx);
+    void setCuvsIndex(cuvs::neighbors::ivf_flat::index<float, idx_t>* idx);
 
    private:
     /// Adds a set of codes and indices to a list, with the representation
diff --git a/faiss/gpu/impl/CuvsIVFPQ.cu b/faiss/gpu/impl/CuvsIVFPQ.cu
index fc6382c90f..5affcb5d7e 100644
--- a/faiss/gpu/impl/CuvsIVFPQ.cu
+++ b/faiss/gpu/impl/CuvsIVFPQ.cu
@@ -25,6 +25,7 @@
 #include <faiss/gpu/impl/CuvsIVFPQ.cuh>
 #include <faiss/gpu/impl/FlatIndex.cuh>
 #include <faiss/gpu/utils/Transpose.cuh>
+#include <faiss/gpu/StandardGpuResources.h>
 
 #include <cuvs/neighbors/common.hpp>
 #include <cuvs/neighbors/ivf_pq.hpp>
@@ -397,7 +398,7 @@ void CuvsIVFPQ::copyInvertedListsFrom(const InvertedLists* ivf) {
     // the index must already exist
     FAISS_ASSERT(cuvs_index);
 
-    auto& raft_lists = cuvs_index->lists();
+    auto& cuvs_index_lists = cuvs_index->lists();
 
     // conservative memory alloc for cloning cpu inverted lists
     cuvs::neighbors::ivf_pq::list_spec<uint32_t, idx_t> ivf_list_spec{
@@ -424,7 +425,7 @@ void CuvsIVFPQ::copyInvertedListsFrom(const InvertedLists* ivf) {
 
         cuvs::neighbors::ivf::resize_list(
                 raft_handle,
-                raft_lists[i],
+                cuvs_index_lists[i],
                 ivf_list_spec,
                 static_cast<uint32_t>(listSize),
                 static_cast<uint32_t>(0));
@@ -447,8 +448,8 @@ void CuvsIVFPQ::copyInvertedListsFrom(const InvertedLists* ivf) {
     }
 }
 
-void CuvsIVFPQ::setCuvsIndex(std::shared_ptr<cuvs::neighbors::ivf_pq::index<idx_t>> idx) {
-    cuvs_index = idx;
+void CuvsIVFPQ::setCuvsIndex(cuvs::neighbors::ivf_pq::index<idx_t>* idx) {
+    cuvs_index.reset(idx);
 }
 
 void CuvsIVFPQ::addEncodedVectorsToList_(
diff --git a/faiss/gpu/impl/CuvsIVFPQ.cuh b/faiss/gpu/impl/CuvsIVFPQ.cuh
index b75a531b67..e4d0435769 100644
--- a/faiss/gpu/impl/CuvsIVFPQ.cuh
+++ b/faiss/gpu/impl/CuvsIVFPQ.cuh
@@ -101,7 +101,7 @@ class CuvsIVFPQ : public IVFPQ {
     void copyInvertedListsFrom(const InvertedLists* ivf) override;
 
     /// Replace the Raft index
-    void setCuvsIndex(std::shared_ptr<cuvs::neighbors::ivf_pq::index<idx_t>> idx);
+    void setCuvsIndex(cuvs::neighbors::ivf_pq::index<idx_t>* idx);
 
     /// Classify and encode/add vectors to our IVF lists.
     /// The input data must be on our current device.
diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt
index 3371f2e3eb..34f6217970 100644
--- a/faiss/gpu/test/CMakeLists.txt
+++ b/faiss/gpu/test/CMakeLists.txt
@@ -22,7 +22,7 @@ find_package(CUDAToolkit REQUIRED)
 # Defines `gtest_discover_tests()`.
 include(GoogleTest)
 add_library(faiss_gpu_test_helper TestUtils.cpp)
-target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft>)
+target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
 
 macro(faiss_gpu_test file)
   get_filename_component(test_name ${file} NAME_WE)
diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp
index ecd7004547..c5d8b7926c 100644
--- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp
+++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp
@@ -57,7 +57,7 @@ struct Options {
 
         device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
 
-        useRaft = false;
+        useCuvs = false;
     }
 
     std::string toString() const {
@@ -65,7 +65,7 @@ struct Options {
         str << "IVFFlat device " << device << " numVecs " << numAdd << " dim "
             << dim << " numCentroids " << numCentroids << " nprobe " << nprobe
             << " numQuery " << numQuery << " k " << k << " indicesOpt "
-            << indicesOpt << " useRaft " << useRaft;
+            << indicesOpt << " useCuvs " << useCuvs;
 
         return str.str();
     }
@@ -79,7 +79,7 @@ struct Options {
     int k;
     int device;
     faiss::gpu::IndicesOptions indicesOpt;
-    bool useRaft;
+    bool useCuvs;
 };
 
 void queryTest(
@@ -110,7 +110,7 @@ void queryTest(
         config.device = opt.device;
         config.indicesOptions = opt.indicesOpt;
         config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
-        config.use_cuvs = opt.useRaft;
+        config.use_cuvs = opt.useCuvs;
 
         faiss::gpu::GpuIndexIVFFlat gpuIndex(
                 &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config);
@@ -137,7 +137,7 @@ void queryTest(
 void addTest(
         faiss::MetricType metricType,
         bool useFloat16CoarseQuantizer,
-        bool useRaft) {
+        bool useCuvs) {
     for (int tries = 0; tries < 2; ++tries) {
         Options opt;
 
@@ -162,9 +162,9 @@ void addTest(
         faiss::gpu::GpuIndexIVFFlatConfig config;
         config.device = opt.device;
         config.indicesOptions =
-                useRaft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt;
+                useCuvs ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt;
         config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
-        config.use_cuvs = useRaft;
+        config.use_cuvs = useCuvs;
 
         faiss::gpu::GpuIndexIVFFlat gpuIndex(
                 &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config);
@@ -188,7 +188,7 @@ void addTest(
     }
 }
 
-void copyToTest(bool useFloat16CoarseQuantizer, bool useRaft) {
+void copyToTest(bool useFloat16CoarseQuantizer, bool useCuvs) {
     Options opt;
     std::vector<float> trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
     std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
@@ -199,9 +199,9 @@ void copyToTest(bool useFloat16CoarseQuantizer, bool useRaft) {
     faiss::gpu::GpuIndexIVFFlatConfig config;
     config.device = opt.device;
     config.indicesOptions =
-            useRaft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt;
+            useCuvs ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt;
     config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
-    config.use_cuvs = useRaft;
+    config.use_cuvs = useCuvs;
 
     faiss::gpu::GpuIndexIVFFlat gpuIndex(
             &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
@@ -241,7 +241,7 @@ void copyToTest(bool useFloat16CoarseQuantizer, bool useRaft) {
             compFloat16 ? 0.30f : 0.015f);
 }
 
-void copyFromTest(bool useFloat16CoarseQuantizer, bool useRaft) {
+void copyFromTest(bool useFloat16CoarseQuantizer, bool useCuvs) {
     Options opt;
     std::vector<float> trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim);
     std::vector<float> addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim);
@@ -260,9 +260,9 @@ void copyFromTest(bool useFloat16CoarseQuantizer, bool useRaft) {
     faiss::gpu::GpuIndexIVFFlatConfig config;
     config.device = opt.device;
     config.indicesOptions =
-            useRaft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt;
+            useCuvs ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt;
     config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
-    config.use_cuvs = useRaft;
+    config.use_cuvs = useCuvs;
 
     faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, 1, 1, faiss::METRIC_L2, config);
     gpuIndex.nprobe = 1;
@@ -334,7 +334,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2) {
     queryTest(opt, faiss::METRIC_L2, false);
 
 #if defined USE_NVIDIA_RAPIDS
-    opt.useRaft = true;
+    opt.useCuvs = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, false);
 #endif
@@ -345,7 +345,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP) {
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
 
 #if defined USE_NVIDIA_RAPIDS
-    opt.useRaft = true;
+    opt.useCuvs = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
 #endif
@@ -358,7 +358,7 @@ TEST(TestGpuIndexIVFFlat, LargeBatch) {
     queryTest(opt, faiss::METRIC_L2, false);
 
 #if defined USE_NVIDIA_RAPIDS
-    opt.useRaft = true;
+    opt.useCuvs = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, false);
 #endif
@@ -371,7 +371,7 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Query_L2) {
     queryTest(opt, faiss::METRIC_L2, true);
 
 #if defined USE_NVIDIA_RAPIDS
-    opt.useRaft = true;
+    opt.useCuvs = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, true);
 #endif
@@ -382,7 +382,7 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Query_IP) {
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, true);
 
 #if defined USE_NVIDIA_RAPIDS
-    opt.useRaft = true;
+    opt.useCuvs = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, true);
 #endif
@@ -399,7 +399,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2_64) {
     queryTest(opt, faiss::METRIC_L2, false);
 
 #if defined USE_NVIDIA_RAPIDS
-    opt.useRaft = true;
+    opt.useCuvs = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, false);
 #endif
@@ -411,7 +411,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_64) {
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
 
 #if defined USE_NVIDIA_RAPIDS
-    opt.useRaft = true;
+    opt.useCuvs = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
 #endif
@@ -423,7 +423,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2_128) {
     queryTest(opt, faiss::METRIC_L2, false);
 
 #if defined USE_NVIDIA_RAPIDS
-    opt.useRaft = true;
+    opt.useCuvs = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, false);
 #endif
@@ -435,7 +435,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_128) {
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
 
 #if defined USE_NVIDIA_RAPIDS
-    opt.useRaft = true;
+    opt.useCuvs = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
 #endif
diff --git a/faiss/gpu/test/TestGpuIndexIVFPQ.cpp b/faiss/gpu/test/TestGpuIndexIVFPQ.cpp
index e0f246ed11..41857b073b 100644
--- a/faiss/gpu/test/TestGpuIndexIVFPQ.cpp
+++ b/faiss/gpu/test/TestGpuIndexIVFPQ.cpp
@@ -85,7 +85,7 @@ struct Options {
         device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1);
 
         interleavedLayout = false;
-        useRaft = false;
+        useCuvs = false;
     }
 
     std::string toString() const {
@@ -126,7 +126,7 @@ struct Options {
     bool useFloat16;
     int device;
     bool interleavedLayout;
-    bool useRaft;
+    bool useCuvs;
 };
 
 void queryTest(Options opt, faiss::MetricType metricType) {
@@ -156,7 +156,7 @@ void queryTest(Options opt, faiss::MetricType metricType) {
     config.indicesOptions = opt.indicesOpt;
     config.useFloat16LookupTables = opt.useFloat16;
     config.interleavedLayout = opt.interleavedLayout;
-    config.use_cuvs = opt.useRaft;
+    config.use_cuvs = opt.useCuvs;
 
     faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config);
     gpuIndex.nprobe = opt.nprobe;
@@ -386,7 +386,7 @@ void addTest(Options opt, faiss::MetricType metricType) {
     config.indicesOptions = opt.indicesOpt;
     config.useFloat16LookupTables = opt.useFloat16;
     config.interleavedLayout = opt.interleavedLayout;
-    config.use_cuvs = opt.useRaft;
+    config.use_cuvs = opt.useCuvs;
 
     faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config);
     gpuIndex.nprobe = opt.nprobe;
@@ -436,7 +436,7 @@ void copyToTest(Options opt) {
         config.indicesOptions = opt.indicesOpt;
         config.useFloat16LookupTables = opt.useFloat16;
         config.interleavedLayout = opt.interleavedLayout;
-        config.use_cuvs = opt.useRaft;
+        config.use_cuvs = opt.useCuvs;
 
         faiss::gpu::GpuIndexIVFPQ gpuIndex(
                 &res,
@@ -513,7 +513,7 @@ void copyFromTest(Options opt) {
     config.indicesOptions = opt.indicesOpt;
     config.useFloat16LookupTables = opt.useFloat16;
     config.interleavedLayout = opt.interleavedLayout;
-    config.use_cuvs = opt.useRaft;
+    config.use_cuvs = opt.useCuvs;
 
     // Use garbage values to see if we overwrite them
     faiss::gpu::GpuIndexIVFPQ gpuIndex(
@@ -567,8 +567,8 @@ void queryNaNTest(Options opt) {
     config.usePrecomputedTables = opt.usePrecomputed;
     config.indicesOptions = opt.indicesOpt;
     config.useFloat16LookupTables = opt.useFloat16;
-    config.use_cuvs = opt.useRaft;
-    config.interleavedLayout = opt.useRaft ? true : opt.interleavedLayout;
+    config.use_cuvs = opt.useCuvs;
+    config.interleavedLayout = opt.useCuvs ? true : opt.interleavedLayout;
 
     faiss::gpu::GpuIndexIVFPQ gpuIndex(
             &res,
@@ -606,7 +606,7 @@ void queryNaNTest(Options opt) {
 
 TEST(TestGpuIndexIVFPQ, QueryNaN) {
     Options opt;
-    opt.useRaft = false;
+    opt.useCuvs = false;
     queryNaNTest(opt);
 }
 
@@ -620,7 +620,7 @@ void addNaNTest(Options opt) {
     config.indicesOptions = opt.indicesOpt;
     config.useFloat16LookupTables = opt.useFloat16;
     config.interleavedLayout = opt.interleavedLayout;
-    config.use_cuvs = opt.useRaft;
+    config.use_cuvs = opt.useCuvs;
 
     faiss::gpu::GpuIndexIVFPQ gpuIndex(
             &res,
@@ -664,7 +664,7 @@ void addNaNTest(Options opt) {
 
 TEST(TestGpuIndexIVFPQ, AddNaN) {
     Options opt;
-    opt.useRaft = false;
+    opt.useCuvs = false;
     addNaNTest(opt);
 }
 
@@ -673,7 +673,7 @@ TEST(TestGpuIndexIVFPQ, Query_L2_Raft) {
     for (int tries = 0; tries < 2; ++tries) {
         Options opt;
         opt.bitsPerCode = faiss::gpu::randVal(4, 8);
-        opt.useRaft = true;
+        opt.useCuvs = true;
         opt.interleavedLayout = true;
         opt.usePrecomputed = false;
         opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
@@ -686,7 +686,7 @@ TEST(TestGpuIndexIVFPQ, Query_IP_Raft) {
     for (int tries = 0; tries < 2; ++tries) {
         Options opt;
         opt.bitsPerCode = faiss::gpu::randVal(4, 8);
-        opt.useRaft = true;
+        opt.useCuvs = true;
         opt.interleavedLayout = true;
         opt.usePrecomputed = false;
         opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
@@ -703,7 +703,7 @@ TEST(TestGpuIndexIVFPQ, LargeBatch_Raft) {
     opt.dim = 4;
     opt.numQuery = 100000;
     opt.codes = 2;
-    opt.useRaft = true;
+    opt.useCuvs = true;
     opt.interleavedLayout = true;
     opt.usePrecomputed = false;
     opt.useFloat16 = false;
@@ -715,7 +715,7 @@ TEST(TestGpuIndexIVFPQ, LargeBatch_Raft) {
 
 TEST(TestGpuIndexIVFPQ, CopyFrom_Raft) {
     Options opt;
-    opt.useRaft = true;
+    opt.useCuvs = true;
     opt.interleavedLayout = true;
     opt.bitsPerCode = faiss::gpu::randVal(4, 8);
     opt.usePrecomputed = false;
@@ -727,7 +727,7 @@ TEST(TestGpuIndexIVFPQ, CopyFrom_Raft) {
 TEST(TestGpuIndexIVFPQ, Add_L2_Raft) {
     for (int tries = 0; tries < 2; ++tries) {
         Options opt;
-        opt.useRaft = true;
+        opt.useCuvs = true;
         opt.interleavedLayout = true;
         opt.bitsPerCode = faiss::gpu::randVal(4, 8);
         opt.usePrecomputed = false;
@@ -740,7 +740,7 @@ TEST(TestGpuIndexIVFPQ, Add_L2_Raft) {
 TEST(TestGpuIndexIVFPQ, Add_IP_Raft) {
     for (int tries = 0; tries < 2; ++tries) {
         Options opt;
-        opt.useRaft = true;
+        opt.useCuvs = true;
         opt.interleavedLayout = true;
         opt.bitsPerCode = faiss::gpu::randVal(4, 8);
         opt.usePrecomputed = false;
@@ -752,7 +752,7 @@ TEST(TestGpuIndexIVFPQ, Add_IP_Raft) {
 
 TEST(TestGpuIndexIVFPQ, QueryNaN_Raft) {
     Options opt;
-    opt.useRaft = true;
+    opt.useCuvs = true;
     opt.interleavedLayout = true;
     opt.bitsPerCode = faiss::gpu::randVal(4, 8);
     opt.usePrecomputed = false;
@@ -763,7 +763,7 @@ TEST(TestGpuIndexIVFPQ, QueryNaN_Raft) {
 
 TEST(TestGpuIndexIVFPQ, AddNaN_Raft) {
     Options opt;
-    opt.useRaft = true;
+    opt.useCuvs = true;
     opt.interleavedLayout = true;
     opt.bitsPerCode = faiss::gpu::randVal(4, 8);
     opt.usePrecomputed = false;
@@ -774,7 +774,7 @@ TEST(TestGpuIndexIVFPQ, AddNaN_Raft) {
 
 TEST(TestGpuIndexIVFPQ, CopyTo_Raft) {
     Options opt;
-    opt.useRaft = true;
+    opt.useCuvs = true;
     opt.interleavedLayout = true;
     opt.bitsPerCode = faiss::gpu::randVal(4, 8);
     opt.usePrecomputed = false;
diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt
index 574c9a9f32..5703d73886 100644
--- a/faiss/python/CMakeLists.txt
+++ b/faiss/python/CMakeLists.txt
@@ -129,9 +129,9 @@ if(FAISS_ENABLE_GPU)
   if(FAISS_ENABLE_CUVS)
     find_package(raft COMPONENTS compiled distributed)
   endif()
-  target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
-  target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
-  target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
+  target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
+  target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
+  target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
 endif()
 
 find_package(OpenMP REQUIRED)

From 45c42bb9271cd25f0d35422394bfaa99a33f35f1 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Wed, 17 Jul 2024 16:33:19 -0700
Subject: [PATCH 070/148] ivf-pq tests pass

---
 CMakeLists.txt                       |  2 +-
 faiss/gpu/GpuCloner.cpp              |  4 ++--
 faiss/gpu/GpuDistance.cu             | 36 ++++++++++++++++++----------
 faiss/gpu/GpuIndexIVFFlat.cu         | 14 +++++------
 faiss/gpu/GpuIndexIVFPQ.cu           |  6 ++---
 faiss/gpu/impl/CuvsIVFFlat.cu        | 12 ++++++----
 faiss/gpu/impl/CuvsIVFFlat.cuh       |  8 +++----
 faiss/gpu/impl/CuvsIVFPQ.cu          |  9 +++----
 faiss/gpu/impl/CuvsIVFPQ.cuh         | 11 +++++----
 faiss/gpu/test/TestGpuDistance.cu    | 24 +++++++++----------
 faiss/gpu/test/TestGpuIndexFlat.cpp  | 14 +++++------
 faiss/gpu/test/TestGpuIndexIVFPQ.cpp | 36 ++++++++++++++--------------
 12 files changed, 96 insertions(+), 80 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3064f22c53..dbcafb6c46 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -53,7 +53,7 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
 # Valid values are "generic", "avx2", "avx512".
 option(FAISS_OPT_LEVEL "" "generic")
 option(FAISS_ENABLE_GPU "Enable support for GPU indexes." ON)
-option(FAISS_ENABLE_CUVS "Enable CUVS for GPU indexes." OFF)
+option(FAISS_ENABLE_CUVS "Enable cuVS for GPU indexes." OFF)
 option(FAISS_ENABLE_PYTHON "Build Python extension." ON)
 option(FAISS_ENABLE_C_API "Build C API." OFF)
 
diff --git a/faiss/gpu/GpuCloner.cpp b/faiss/gpu/GpuCloner.cpp
index f005bfd364..b8084002be 100644
--- a/faiss/gpu/GpuCloner.cpp
+++ b/faiss/gpu/GpuCloner.cpp
@@ -148,7 +148,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         config.device = device;
         config.useFloat16 = true;
         FAISS_THROW_IF_NOT_MSG(
-                !use_cuvs, "this type of index is not implemented for CUVS");
+                !use_cuvs, "this type of index is not implemented for cuVS");
         GpuIndexFlat* gif = new GpuIndexFlat(
                 provider, index->d, index->metric_type, config);
         // transfer data by blocks
@@ -185,7 +185,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
         config.indicesOptions = indicesOptions;
         config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
         FAISS_THROW_IF_NOT_MSG(
-                !use_cuvs, "this type of index is not implemented for CUVS");
+                !use_cuvs, "this type of index is not implemented for cuVS");
 
         GpuIndexIVFScalarQuantizer* res = new GpuIndexIVFScalarQuantizer(
                 provider,
diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu
index 6839fbd46f..f84b07aee7 100644
--- a/faiss/gpu/GpuDistance.cu
+++ b/faiss/gpu/GpuDistance.cu
@@ -32,7 +32,6 @@
 #include <optional>
 
 #if defined USE_NVIDIA_RAPIDS
-#include <raft/neighbors/brute_force.cuh>
 #include <cuvs/neighbors/brute_force.hpp>
 #include <faiss/gpu/utils/CuvsUtils.h>
 #include <raft/core/device_mdspan.hpp>
@@ -42,6 +41,7 @@
 #include <raft/core/operators.hpp>
 #include <raft/core/temporary_device_buffer.hpp>
 #include <raft/linalg/unary_op.cuh>
+#include <raft/neighbors/brute_force.cuh>
 #endif
 
 namespace faiss {
@@ -238,7 +238,7 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
             "be the same (F32 or F16");
 
 #if defined USE_NVIDIA_RAPIDS
-    // Note: For now, RAFT bfknn requires queries and vectors to be same layout
+    // Note: For now, cuVS bfknn requires queries and vectors to be same layout
     if (should_use_cuvs(args) && args.queriesRowMajor == args.vectorsRowMajor) {
         cuvsDistanceType distance = metricFaissToCuvs(args.metric, false);
 
@@ -299,7 +299,8 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
                         raft::vector_extent<int64_t>(num_queries));
                 norms_view = norms->view();
             }
-            cuvs::neighbors::brute_force::index idx(
+
+            cuvs::neighbors::brute_force::index<float> idx(
                     handle, index.view(), norms_view, distance, metric_arg);
             cuvs::neighbors::brute_force::search(
                     handle,
@@ -326,21 +327,32 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
                     const_cast<float*>(
                             reinterpret_cast<const float*>(args.queries)),
                     raft::matrix_extent<int64_t>(num_queries, dims));
-
-            std::vector<raft::device_matrix_view<
+            
+            std::optional<raft::temporary_device_buffer<
                     const float,
-                    int64_t,
-                    raft::col_major>>
-                    index_vec = {index.view()};
+                    raft::vector_extent<int64_t>>>
+                    norms;
+            std::optional<raft::device_vector_view<const float, int64_t>>
+                    norms_view;
+            if (args.vectorNorms) {
+                norms = raft::make_readonly_temporary_device_buffer<
+                        const float,
+                        int64_t>(
+                        handle,
+                        args.vectorNorms,
+                        raft::vector_extent<int64_t>(num_queries));
+                norms_view = norms->view();
+            }
 
-            raft::neighbors::brute_force::knn(
+            cuvs::neighbors::brute_force::index<float> idx(
+                    handle, index.view(), norms_view, distance, metric_arg);
+            cuvs::neighbors::brute_force::search(
                     handle,
-                    index_vec,
+                    idx,
                     search.view(),
                     inds.view(),
                     dists.view(),
-                    raft::distance::DistanceType::L2Expanded,
-                    metric_arg);
+                    std::nullopt);
         }
 
         if (args.metric == MetricType::METRIC_Lp) {
diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu
index 6b196b6155..6e6a6de7b0 100644
--- a/faiss/gpu/GpuIndexIVFFlat.cu
+++ b/faiss/gpu/GpuIndexIVFFlat.cu
@@ -102,7 +102,7 @@ void GpuIndexIVFFlat::reserveMemory(size_t numVecs) {
 
     if (should_use_cuvs(config_)) {
         FAISS_THROW_MSG(
-                "Pre-allocation of IVF lists is not supported with CUVS enabled.");
+                "Pre-allocation of IVF lists is not supported with cuVS enabled.");
     }
 
     reserveMemoryVecs_ = numVecs;
@@ -214,11 +214,11 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) {
     if (this->is_trained) {
         FAISS_ASSERT(index_);
         if (should_use_cuvs(config_)) {
-            // copy the IVF centroids to the CUVS index
+            // copy the IVF centroids to the cuVS index
             // in case it has been reset. This is because `reset` clears the
-            // CUVS index and its centroids.
+            // cuVS index and its centroids.
             // TODO: change this once the coarse quantizer is separated from
-            // CUVS index
+            // cuVS index
             updateQuantizer();
         };
         return;
@@ -272,7 +272,7 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) {
         quantizer->add(nlist, cuvs_ivfflat_index.value().centers().data_handle());
         raft_handle.sync_stream();
 
-        cuvsIndex_->setCuvsIndex(&cuvs_ivfflat_index.value());
+        cuvsIndex_->setCuvsIndex(std::move(*cuvs_ivfflat_index));
 #else
         FAISS_THROW_MSG(
                 "RAFT has not been compiled into the current version so it cannot be used.");
@@ -307,7 +307,7 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) {
     if (reserveMemoryVecs_) {
         if (should_use_cuvs(config_)) {
             FAISS_THROW_MSG(
-                    "Pre-allocation of IVF lists is not supported with CUVS enabled.");
+                    "Pre-allocation of IVF lists is not supported with cuVS enabled.");
         } else
             index_->reserveMemory(reserveMemoryVecs_);
     }
@@ -334,7 +334,7 @@ void GpuIndexIVFFlat::setIndex_(
                 "RAFT only supports INDICES_64_BIT");
         if (!ivfFlatConfig_.interleavedLayout) {
             fprintf(stderr,
-                    "WARN: interleavedLayout is set to False with CUVS enabled. This will be ignored.\n");
+                    "WARN: interleavedLayout is set to False with cuVS enabled. This will be ignored.\n");
         }
         index_.reset(new CuvsIVFFlat(
                 resources,
diff --git a/faiss/gpu/GpuIndexIVFPQ.cu b/faiss/gpu/GpuIndexIVFPQ.cu
index 35e21bb84a..0666e5035e 100644
--- a/faiss/gpu/GpuIndexIVFPQ.cu
+++ b/faiss/gpu/GpuIndexIVFPQ.cu
@@ -367,7 +367,7 @@ void GpuIndexIVFPQ::train(idx_t n, const float* x) {
 #if defined USE_NVIDIA_RAPIDS
         if (pq.assign_index) {
             fprintf(stderr,
-                    "WARN: The Product Quantizer's assign_index will be ignored with CUVS enabled.\n");
+                    "WARN: The Product Quantizer's assign_index will be ignored with cuVS enabled.\n");
         }
         // first initialize the index. The PQ centroids will be updated
         // retroactively.
@@ -433,7 +433,7 @@ void GpuIndexIVFPQ::train(idx_t n, const float* x) {
                 cuvs_ivfpq_index.value().pq_centers().size(),
                 raft_handle.get_stream());
         raft_handle.sync_stream();
-        cuvsIndex_->setCuvsIndex(&cuvs_ivfpq_index.value());
+        cuvsIndex_->setCuvsIndex(std::move(*cuvs_ivfpq_index));
 #else
         FAISS_THROW_MSG(
                 "RAFT has not been compiled into the current version so it cannot be used.");
@@ -541,7 +541,7 @@ void GpuIndexIVFPQ::verifyPQSettings_() const {
     if (should_use_cuvs(config_)) {
         if (!ivfpqConfig_.interleavedLayout) {
             fprintf(stderr,
-                    "WARN: interleavedLayout is set to False with CUVS enabled. This will be ignored.\n");
+                    "WARN: interleavedLayout is set to False with cuVS enabled. This will be ignored.\n");
         }
         FAISS_THROW_IF_NOT_FMT(
                 bitsPerCode_ >= 4 && bitsPerCode_ <= 8,
diff --git a/faiss/gpu/impl/CuvsIVFFlat.cu b/faiss/gpu/impl/CuvsIVFFlat.cu
index be1d984b2a..f74f3b97ef 100644
--- a/faiss/gpu/impl/CuvsIVFFlat.cu
+++ b/faiss/gpu/impl/CuvsIVFFlat.cu
@@ -24,15 +24,15 @@
 #include <cstdint>
 
 #include <faiss/gpu/GpuIndexFlat.h>
+#include <faiss/gpu/StandardGpuResources.h>
 #include <faiss/gpu/utils/CuvsUtils.h>
 #include <faiss/gpu/impl/CuvsIVFFlat.cuh>
 #include <faiss/gpu/impl/FlatIndex.cuh>
 #include <faiss/gpu/impl/IVFFlat.cuh>
 #include <faiss/gpu/utils/Transpose.cuh>
-#include <faiss/gpu/StandardGpuResources.h>
 
-#include <cuvs/neighbors/ivf_flat.hpp>
 #include <cuvs/neighbors/common.hpp>
+#include <cuvs/neighbors/ivf_flat.hpp>
 #include <raft/linalg/map.cuh>
 #include <raft/linalg/norm.cuh>
 
@@ -74,7 +74,7 @@ CuvsIVFFlat::~CuvsIVFFlat() {}
 
 void CuvsIVFFlat::reserveMemory(idx_t numVecs) {
     fprintf(stderr,
-            "WARN: reserveMemory is NOP. Pre-allocation of IVF lists is not supported with CUVS enabled.\n");
+            "WARN: reserveMemory is NOP. Pre-allocation of IVF lists is not supported with cuVS enabled.\n");
 }
 
 void CuvsIVFFlat::reset() {
@@ -82,8 +82,10 @@ void CuvsIVFFlat::reset() {
 }
 
 void CuvsIVFFlat::setCuvsIndex(
-        cuvs::neighbors::ivf_flat::index<float, idx_t>* idx) {
-    cuvs_index.reset(idx);
+        cuvs::neighbors::ivf_flat::index<float, idx_t>&& idx) {
+    cuvs_index =
+            std::make_shared<cuvs::neighbors::ivf_flat::index<float, idx_t>>(
+                    std::move(idx));
 }
 
 void CuvsIVFFlat::search(
diff --git a/faiss/gpu/impl/CuvsIVFFlat.cuh b/faiss/gpu/impl/CuvsIVFFlat.cuh
index 9296eda993..36127e853b 100644
--- a/faiss/gpu/impl/CuvsIVFFlat.cuh
+++ b/faiss/gpu/impl/CuvsIVFFlat.cuh
@@ -85,7 +85,7 @@ class CuvsIVFFlat : public IVFFlat {
             Tensor<float, 2, true>& vecs,
             Tensor<idx_t, 1, true>& indices) override;
 
-    /// Clear out the Raft index
+    /// Clear out the cuVS index
     void reset() override;
 
     /// For debugging purposes, return the list length of a particular
@@ -99,15 +99,15 @@ class CuvsIVFFlat : public IVFFlat {
     std::vector<uint8_t> getListVectorData(idx_t listId, bool gpuFormat)
             const override;
 
-    /// Update our Raft index with this quantizer instance; may be a CPU
+    /// Update our cuVS index with this quantizer instance; may be a CPU
     /// or GPU quantizer
     void updateQuantizer(Index* quantizer) override;
 
     /// Copy all inverted lists from a CPU representation to ourselves
     void copyInvertedListsFrom(const InvertedLists* ivf) override;
 
-    /// Replace the CUVS index
-    void setCuvsIndex(cuvs::neighbors::ivf_flat::index<float, idx_t>* idx);
+    /// Replace the cuVS index
+    void setCuvsIndex(cuvs::neighbors::ivf_flat::index<float, idx_t>&& idx);
 
    private:
     /// Adds a set of codes and indices to a list, with the representation
diff --git a/faiss/gpu/impl/CuvsIVFPQ.cu b/faiss/gpu/impl/CuvsIVFPQ.cu
index 5affcb5d7e..d2a883faa1 100644
--- a/faiss/gpu/impl/CuvsIVFPQ.cu
+++ b/faiss/gpu/impl/CuvsIVFPQ.cu
@@ -75,7 +75,7 @@ CuvsIVFPQ::~CuvsIVFPQ() {}
 
 void CuvsIVFPQ::reserveMemory(idx_t numVecs) {
     fprintf(stderr,
-            "WARN: reserveMemory is NOP. Pre-allocation of IVF lists is not supported with CUVS enabled.\n");
+            "WARN: reserveMemory is NOP. Pre-allocation of IVF lists is not supported with cuVS enabled.\n");
 }
 
 void CuvsIVFPQ::reset() {
@@ -84,7 +84,7 @@ void CuvsIVFPQ::reset() {
 
 size_t CuvsIVFPQ::reclaimMemory() {
     fprintf(stderr,
-            "WARN: reclaimMemory is NOP. reclaimMemory is not supported with CUVS enabled.\n");
+            "WARN: reclaimMemory is NOP. reclaimMemory is not supported with cuVS enabled.\n");
     return 0;
 }
 
@@ -448,8 +448,9 @@ void CuvsIVFPQ::copyInvertedListsFrom(const InvertedLists* ivf) {
     }
 }
 
-void CuvsIVFPQ::setCuvsIndex(cuvs::neighbors::ivf_pq::index<idx_t>* idx) {
-    cuvs_index.reset(idx);
+void CuvsIVFPQ::setCuvsIndex(cuvs::neighbors::ivf_pq::index<idx_t>&& idx) {
+    cuvs_index = std::make_shared<cuvs::neighbors::ivf_pq::index<idx_t>>(std::move(idx));
+    setBasePQCentroids_();
 }
 
 void CuvsIVFPQ::addEncodedVectorsToList_(
diff --git a/faiss/gpu/impl/CuvsIVFPQ.cuh b/faiss/gpu/impl/CuvsIVFPQ.cuh
index e4d0435769..41a43175f5 100644
--- a/faiss/gpu/impl/CuvsIVFPQ.cuh
+++ b/faiss/gpu/impl/CuvsIVFPQ.cuh
@@ -93,15 +93,16 @@ class CuvsIVFPQ : public IVFPQ {
     std::vector<uint8_t> getListVectorData(idx_t listId, bool gpuFormat)
             const override;
 
-    /// Update our Raft index with this quantizer instance; may be a CPU
+    /// Update our cuVS index with this quantizer instance; may be a CPU
     /// or GPU quantizer
     void updateQuantizer(Index* quantizer) override;
 
     /// Copy all inverted lists from a CPU representation to ourselves
     void copyInvertedListsFrom(const InvertedLists* ivf) override;
 
-    /// Replace the Raft index
-    void setCuvsIndex(cuvs::neighbors::ivf_pq::index<idx_t>* idx);
+    /// Replace the cuVS index
+//     void setCuvsIndex(cuvs::neighbors::ivf_pq::index<idx_t>* idx);
+    void setCuvsIndex(cuvs::neighbors::ivf_pq::index<idx_t>&& idx);
 
     /// Classify and encode/add vectors to our IVF lists.
     /// The input data must be on our current device.
@@ -133,7 +134,7 @@ class CuvsIVFPQ : public IVFPQ {
     /// Returns the encoding size for a PQ-encoded IVF list
     size_t getGpuListEncodingSize_(idx_t listId);
 
-    /// Copy the PQ centroids to the Raft index. The data is already in the
+    /// Copy the PQ centroids to the cuVS index. The data is already in the
     /// preferred format with the transpose performed by the IVFPQ class helper.
     void setPQCentroids_();
 
@@ -141,7 +142,7 @@ class CuvsIVFPQ : public IVFPQ {
     /// Used when the RAFT index was updated externally.
     void setBasePQCentroids_();
 
-    /// CUVS IVF-PQ index
+    /// cuVS IVF-PQ index
     std::shared_ptr<cuvs::neighbors::ivf_pq::index<idx_t>> cuvs_index{nullptr};
 };
 
diff --git a/faiss/gpu/test/TestGpuDistance.cu b/faiss/gpu/test/TestGpuDistance.cu
index b30dc6a4ee..bb5dd47356 100644
--- a/faiss/gpu/test/TestGpuDistance.cu
+++ b/faiss/gpu/test/TestGpuDistance.cu
@@ -197,7 +197,7 @@ TEST(TestGpuDistance, Transposition_RR) {
 }
 
 #if defined USE_NVIDIA_RAPIDS
-TEST(TestRaftGpuDistance, Transposition_RR) {
+TEST(TestCuvsGpuDistance, Transposition_RR) {
     testTransposition(false, false, faiss::MetricType::METRIC_L2, true);
     testTransposition(
             false, false, faiss::MetricType::METRIC_INNER_PRODUCT, true);
@@ -209,7 +209,7 @@ TEST(TestGpuDistance, Transposition_RC) {
 }
 
 #if defined USE_NVIDIA_RAPIDS
-TEST(TestRaftGpuDistance, Transposition_RC) {
+TEST(TestCuvsGpuDistance, Transposition_RC) {
     testTransposition(false, true, faiss::MetricType::METRIC_L2, true);
 }
 #endif
@@ -219,7 +219,7 @@ TEST(TestGpuDistance, Transposition_CR) {
 }
 
 #if defined USE_NVIDIA_RAPIDS
-TEST(TestRaftGpuDistance, Transposition_CR) {
+TEST(TestCuvsGpuDistance, Transposition_CR) {
     testTransposition(true, false, faiss::MetricType::METRIC_L2, true);
 }
 #endif
@@ -229,7 +229,7 @@ TEST(TestGpuDistance, Transposition_CC) {
 }
 
 #if defined USE_NVIDIA_RAPIDS
-TEST(TestRaftGpuDistance, Transposition_CC) {
+TEST(TestCuvsGpuDistance, Transposition_CC) {
     testTransposition(true, true, faiss::MetricType::METRIC_L2, true);
 }
 #endif
@@ -239,7 +239,7 @@ TEST(TestGpuDistance, L1) {
 }
 
 #if defined USE_NVIDIA_RAPIDS
-TEST(TestRaftGpuDistance, L1) {
+TEST(TestCuvsGpuDistance, L1) {
     testTransposition(false, false, faiss::MetricType::METRIC_L1, true);
 }
 #endif
@@ -251,7 +251,7 @@ TEST(TestGpuDistance, L1_RC) {
 
 #if defined USE_NVIDIA_RAPIDS
 // Test other transpositions with the general distance kernel
-TEST(TestRaftGpuDistance, L1_RC) {
+TEST(TestCuvsGpuDistance, L1_RC) {
     testTransposition(false, true, faiss::MetricType::METRIC_L1, true);
 }
 #endif
@@ -261,7 +261,7 @@ TEST(TestGpuDistance, L1_CR) {
 }
 
 #if defined USE_NVIDIA_RAPIDS
-TEST(TestRaftGpuDistance, L1_CR) {
+TEST(TestCuvsGpuDistance, L1_CR) {
     testTransposition(true, false, faiss::MetricType::METRIC_L1, true);
 }
 #endif
@@ -271,7 +271,7 @@ TEST(TestGpuDistance, L1_CC) {
 }
 
 #if defined USE_NVIDIA_RAPIDS
-TEST(TestRaftGpuDistance, L1_CC) {
+TEST(TestCuvsGpuDistance, L1_CC) {
     testTransposition(true, true, faiss::MetricType::METRIC_L1, true);
 }
 #endif
@@ -283,7 +283,7 @@ TEST(TestGpuDistance, Linf) {
 
 #if defined USE_NVIDIA_RAPIDS
 // Test remainder of metric types
-TEST(TestRaftGpuDistance, Linf) {
+TEST(TestCuvsGpuDistance, Linf) {
     testTransposition(false, false, faiss::MetricType::METRIC_Linf, true);
 }
 #endif
@@ -293,7 +293,7 @@ TEST(TestGpuDistance, Lp) {
 }
 
 #if defined USE_NVIDIA_RAPIDS
-TEST(TestRaftGpuDistance, Lp) {
+TEST(TestCuvsGpuDistance, Lp) {
     testTransposition(false, false, faiss::MetricType::METRIC_Lp, true, 3);
 }
 #endif
@@ -303,7 +303,7 @@ TEST(TestGpuDistance, Canberra) {
 }
 
 #if defined USE_NVIDIA_RAPIDS
-TEST(TestRaftGpuDistance, Canberra) {
+TEST(TestCuvsGpuDistance, Canberra) {
     testTransposition(false, false, faiss::MetricType::METRIC_Canberra, true);
 }
 #endif
@@ -317,7 +317,7 @@ TEST(TestGpuDistance, JensenShannon) {
 }
 
 #if defined USE_NVIDIA_RAPIDS
-TEST(TestRaftGpuDistance, JensenShannon) {
+TEST(TestCuvsGpuDistance, JensenShannon) {
     testTransposition(
             false, false, faiss::MetricType::METRIC_JensenShannon, true);
 }
diff --git a/faiss/gpu/test/TestGpuIndexFlat.cpp b/faiss/gpu/test/TestGpuIndexFlat.cpp
index c4ae04282c..7af7a1a9e0 100644
--- a/faiss/gpu/test/TestGpuIndexFlat.cpp
+++ b/faiss/gpu/test/TestGpuIndexFlat.cpp
@@ -344,7 +344,7 @@ TEST(TestGpuIndexFlat, CopyFrom) {
 }
 
 #if defined USE_NVIDIA_RAPIDS
-TEST(TestRaftGpuIndexFlat, CopyFrom) {
+TEST(TestCuvsGpuIndexFlat, CopyFrom) {
     testCopyFrom(true);
 }
 #endif
@@ -395,7 +395,7 @@ TEST(TestGpuIndexFlat, CopyTo) {
 }
 
 #if defined USE_NVIDIA_RAPIDS
-TEST(TestRaftGpuIndexFlat, CopyTo) {
+TEST(TestCuvsGpuIndexFlat, CopyTo) {
     testCopyTo(true);
 }
 #endif
@@ -453,7 +453,7 @@ TEST(TestGpuIndexFlat, UnifiedMemory) {
 }
 
 #if defined USE_NVIDIA_RAPIDS
-TEST(TestRaftGpuIndexFlat, UnifiedMemory) {
+TEST(TestCuvsGpuIndexFlat, UnifiedMemory) {
     testUnifiedMemory(true);
 }
 #endif
@@ -514,7 +514,7 @@ TEST(TestGpuIndexFlat, LargeIndex) {
 }
 
 #if defined USE_NVIDIA_RAPIDS
-TEST(TestRaftGpuIndexFlat, LargeIndex) {
+TEST(TestCuvsGpuIndexFlat, LargeIndex) {
     testLargeIndex(true);
 }
 #endif
@@ -567,7 +567,7 @@ TEST(TestGpuIndexFlat, Residual) {
 }
 
 #if defined USE_NVIDIA_RAPIDS
-TEST(TestRaftGpuIndexFlat, Residual) {
+TEST(TestCuvsGpuIndexFlat, Residual) {
     testResidual(true);
 }
 #endif
@@ -658,7 +658,7 @@ TEST(TestGpuIndexFlat, Reconstruct) {
     testReconstruct(false);
 }
 #if defined USE_NVIDIA_RAPIDS
-TEST(TestRaftGpuIndexFlat, Reconstruct) {
+TEST(TestCuvsGpuIndexFlat, Reconstruct) {
     testReconstruct(true);
 }
 #endif
@@ -755,7 +755,7 @@ TEST(TestGpuIndexFlat, SearchAndReconstruct) {
 }
 
 #if defined USE_NVIDIA_RAPIDS
-TEST(TestRaftGpuIndexFlat, SearchAndReconstruct) {
+TEST(TestCuvsGpuIndexFlat, SearchAndReconstruct) {
     testSearchAndReconstruct(true);
 }
 #endif
diff --git a/faiss/gpu/test/TestGpuIndexIVFPQ.cpp b/faiss/gpu/test/TestGpuIndexIVFPQ.cpp
index 41857b073b..93230099b0 100644
--- a/faiss/gpu/test/TestGpuIndexIVFPQ.cpp
+++ b/faiss/gpu/test/TestGpuIndexIVFPQ.cpp
@@ -35,7 +35,7 @@ void pickEncoding(int& codes, int& dim) {
     }
 }
 
-void pickRaftEncoding(int& codes, int& dim, int bitsPerCode) {
+void pickCuvsEncoding(int& codes, int& dim, int bitsPerCode) {
     // Above 32 doesn't work with no precomputed codes
     std::vector<int> dimSizes{4, 8, 10, 12, 16, 20, 24, 28, 32};
 
@@ -669,7 +669,7 @@ TEST(TestGpuIndexIVFPQ, AddNaN) {
 }
 
 #if defined USE_NVIDIA_RAPIDS
-TEST(TestGpuIndexIVFPQ, Query_L2_Raft) {
+TEST(TestGpuIndexIVFPQ, Query_L2_Cuvs) {
     for (int tries = 0; tries < 2; ++tries) {
         Options opt;
         opt.bitsPerCode = faiss::gpu::randVal(4, 8);
@@ -677,12 +677,12 @@ TEST(TestGpuIndexIVFPQ, Query_L2_Raft) {
         opt.interleavedLayout = true;
         opt.usePrecomputed = false;
         opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
-        pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode);
+        pickCuvsEncoding(opt.codes, opt.dim, opt.bitsPerCode);
         queryTest(opt, faiss::MetricType::METRIC_L2);
     }
 }
 
-TEST(TestGpuIndexIVFPQ, Query_IP_Raft) {
+TEST(TestGpuIndexIVFPQ, Query_IP_Cuvs) {
     for (int tries = 0; tries < 2; ++tries) {
         Options opt;
         opt.bitsPerCode = faiss::gpu::randVal(4, 8);
@@ -690,13 +690,13 @@ TEST(TestGpuIndexIVFPQ, Query_IP_Raft) {
         opt.interleavedLayout = true;
         opt.usePrecomputed = false;
         opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
-        pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode);
+        pickCuvsEncoding(opt.codes, opt.dim, opt.bitsPerCode);
         queryTest(opt, faiss::MetricType::METRIC_INNER_PRODUCT);
     }
 }
 
 // Large batch sizes (>= 65536) should also work
-TEST(TestGpuIndexIVFPQ, LargeBatch_Raft) {
+TEST(TestGpuIndexIVFPQ, LargeBatch_Cuvs) {
     Options opt;
 
     // override for large sizes
@@ -713,18 +713,18 @@ TEST(TestGpuIndexIVFPQ, LargeBatch_Raft) {
     queryTest(opt, faiss::MetricType::METRIC_L2);
 }
 
-TEST(TestGpuIndexIVFPQ, CopyFrom_Raft) {
+TEST(TestGpuIndexIVFPQ, CopyFrom_Cuvs) {
     Options opt;
     opt.useCuvs = true;
     opt.interleavedLayout = true;
     opt.bitsPerCode = faiss::gpu::randVal(4, 8);
     opt.usePrecomputed = false;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
-    pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode);
+    pickCuvsEncoding(opt.codes, opt.dim, opt.bitsPerCode);
     copyFromTest(opt);
 }
 
-TEST(TestGpuIndexIVFPQ, Add_L2_Raft) {
+TEST(TestGpuIndexIVFPQ, Add_L2_Cuvs) {
     for (int tries = 0; tries < 2; ++tries) {
         Options opt;
         opt.useCuvs = true;
@@ -732,12 +732,12 @@ TEST(TestGpuIndexIVFPQ, Add_L2_Raft) {
         opt.bitsPerCode = faiss::gpu::randVal(4, 8);
         opt.usePrecomputed = false;
         opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
-        pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode);
+        pickCuvsEncoding(opt.codes, opt.dim, opt.bitsPerCode);
         addTest(opt, faiss::METRIC_L2);
     }
 }
 
-TEST(TestGpuIndexIVFPQ, Add_IP_Raft) {
+TEST(TestGpuIndexIVFPQ, Add_IP_Cuvs) {
     for (int tries = 0; tries < 2; ++tries) {
         Options opt;
         opt.useCuvs = true;
@@ -745,41 +745,41 @@ TEST(TestGpuIndexIVFPQ, Add_IP_Raft) {
         opt.bitsPerCode = faiss::gpu::randVal(4, 8);
         opt.usePrecomputed = false;
         opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
-        pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode);
+        pickCuvsEncoding(opt.codes, opt.dim, opt.bitsPerCode);
         addTest(opt, faiss::METRIC_INNER_PRODUCT);
     }
 }
 
-TEST(TestGpuIndexIVFPQ, QueryNaN_Raft) {
+TEST(TestGpuIndexIVFPQ, QueryNaN_Cuvs) {
     Options opt;
     opt.useCuvs = true;
     opt.interleavedLayout = true;
     opt.bitsPerCode = faiss::gpu::randVal(4, 8);
     opt.usePrecomputed = false;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
-    pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode);
+    pickCuvsEncoding(opt.codes, opt.dim, opt.bitsPerCode);
     queryNaNTest(opt);
 }
 
-TEST(TestGpuIndexIVFPQ, AddNaN_Raft) {
+TEST(TestGpuIndexIVFPQ, AddNaN_Cuvs) {
     Options opt;
     opt.useCuvs = true;
     opt.interleavedLayout = true;
     opt.bitsPerCode = faiss::gpu::randVal(4, 8);
     opt.usePrecomputed = false;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
-    pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode);
+    pickCuvsEncoding(opt.codes, opt.dim, opt.bitsPerCode);
     addNaNTest(opt);
 }
 
-TEST(TestGpuIndexIVFPQ, CopyTo_Raft) {
+TEST(TestGpuIndexIVFPQ, CopyTo_Cuvs) {
     Options opt;
     opt.useCuvs = true;
     opt.interleavedLayout = true;
     opt.bitsPerCode = faiss::gpu::randVal(4, 8);
     opt.usePrecomputed = false;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
-    pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode);
+    pickCuvsEncoding(opt.codes, opt.dim, opt.bitsPerCode);
     copyToTest(opt);
 }
 #endif

From a0a9b4080baabdfbe92b873ff1bf135b4bdfef35 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Wed, 17 Jul 2024 16:47:01 -0700
Subject: [PATCH 071/148] update caps

---
 benchs/bench_ivfflat_cuvs.py             | 18 +++++++++---------
 benchs/bench_ivfpq_cuvs.py               | 18 +++++++++---------
 faiss/gpu/test/torch_test_contrib_gpu.py |  6 +++---
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/benchs/bench_ivfflat_cuvs.py b/benchs/bench_ivfflat_cuvs.py
index 269bd9cf21..c1c97b3336 100644
--- a/benchs/bench_ivfflat_cuvs.py
+++ b/benchs/bench_ivfflat_cuvs.py
@@ -44,7 +44,7 @@ def aa(*args, **kwargs):
 aa('--bm_search', default=True,
    help='whether to benchmark search operation on GPU index')
 aa('--cuvs_only', default=False, action='store_true',
-   help='whether to only produce CUVS enabled benchmarks')
+   help='whether to only produce cuVS enabled benchmarks')
 
 
 group = parser.add_argument_group('IVF options')
@@ -91,12 +91,12 @@ def bench_train_milliseconds(index, trainVecs, use_cuvs):
             cuvs_gpu_train_time = bench_train_milliseconds(
                 index, trainVecs, True)
             if args.cuvs_only:
-                print("Method: IVFFlat, Operation: TRAIN, dim: %d, n_centroids %d, numTrain: %d, CUVS enabled GPU train time: %.3f milliseconds" % (
+                print("Method: IVFFlat, Operation: TRAIN, dim: %d, n_centroids %d, numTrain: %d, cuVS enabled GPU train time: %.3f milliseconds" % (
                     n_cols, args.n_centroids, n_rows, cuvs_gpu_train_time))
             else:
                 classical_gpu_train_time = bench_train_milliseconds(
                     index, trainVecs, False)
-                print("Method: IVFFlat, Operation: TRAIN, dim: %d, n_centroids %d, numTrain: %d, classical GPU train time: %.3f milliseconds, CUVS enabled GPU train time: %.3f milliseconds" % (
+                print("Method: IVFFlat, Operation: TRAIN, dim: %d, n_centroids %d, numTrain: %d, classical GPU train time: %.3f milliseconds, cuVS enabled GPU train time: %.3f milliseconds" % (
                     n_cols, args.n_centroids, n_rows, classical_gpu_train_time, cuvs_gpu_train_time))
 
 
@@ -126,12 +126,12 @@ def bench_add_milliseconds(index, addVecs, use_cuvs):
             addVecs = rs.rand(n_rows, n_cols).astype('float32')
             cuvs_gpu_add_time = bench_add_milliseconds(index, addVecs, True)
             if args.cuvs_only:
-                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, CUVS enabled GPU add time: %.3f milliseconds" % (
+                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, cuVS enabled GPU add time: %.3f milliseconds" % (
                     n_train, n_rows, n_cols, args.n_centroids, cuvs_gpu_add_time))
             else:
                 classical_gpu_add_time = bench_add_milliseconds(
                     index, addVecs, False)
-                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, classical GPU add time: %.3f milliseconds, CUVS enabled GPU add time: %.3f milliseconds" % (
+                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, classical GPU add time: %.3f milliseconds, cuVS enabled GPU add time: %.3f milliseconds" % (
                     n_train, n_rows, n_cols, args.n_centroids, classical_gpu_add_time, cuvs_gpu_add_time))
 
 
@@ -165,16 +165,16 @@ def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_cuvs):
             cuvs_gpu_search_time = bench_search_milliseconds(
                 index, addVecs, queryVecs, args.nprobe, args.k, True)
             if args.cuvs_only:
-                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, CUVS enabled GPU search time: %.3f milliseconds" % (
+                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, cuVS enabled GPU search time: %.3f milliseconds" % (
                     n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, cuvs_gpu_search_time))
             else:
                 classical_gpu_search_time = bench_search_milliseconds(
                     index, addVecs, queryVecs, args.nprobe, args.k, False)
-                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, CUVS enabled GPU search time: %.3f milliseconds" % (
+                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, cuVS enabled GPU search time: %.3f milliseconds" % (
                     n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, classical_gpu_search_time, cuvs_gpu_search_time))
 
     print("=" * 40)
-    print("Large CUVS Enabled Benchmarks")
+    print("Large cuVS Enabled Benchmarks")
     print("=" * 40)
     # Avoid classical GPU Benchmarks for large datasets because of OOM for more than 500000 queries and/or large dims as well as for large k
     queryset_sizes = [100000, 500000, 1000000]
@@ -189,5 +189,5 @@ def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_cuvs):
             queryVecs = rs.rand(n_rows, n_cols).astype('float32')
             cuvs_gpu_search_time = bench_search_milliseconds(
                 index, addVecs, queryVecs, args.nprobe, args.k, True)
-            print("Method: IVFFlat, Operation: SEARCH, numTrain: %d, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, CUVS enabled GPU search time: %.3f milliseconds" % (
+            print("Method: IVFFlat, Operation: SEARCH, numTrain: %d, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, cuVS enabled GPU search time: %.3f milliseconds" % (
                 n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, cuvs_gpu_search_time))
diff --git a/benchs/bench_ivfpq_cuvs.py b/benchs/bench_ivfpq_cuvs.py
index 294d7d5dbe..3ee6da2f52 100644
--- a/benchs/bench_ivfpq_cuvs.py
+++ b/benchs/bench_ivfpq_cuvs.py
@@ -42,12 +42,12 @@ def aa(*args, **kwargs):
 
 group = parser.add_argument_group('benchmarking options')
 aa('--cuvs_only', default=False, action='store_true',
-   help='whether to only produce CUVS enabled benchmarks')
+   help='whether to only produce cuVS enabled benchmarks')
 
 group = parser.add_argument_group('IVF options')
-aa('--bits_per_code', default=8, type=int, help='bits per code. Note that < 8 is only supported when RAFT is enabled')
+aa('--bits_per_code', default=8, type=int, help='bits per code. Note that < 8 is only supported when cuVS is enabled')
 aa('--pq_len', default=2, type=int, help='number of vector elements represented by one PQ code')
-aa('--use_precomputed', default=True, type=bool, help='use precomputed codes (not with CUVS enabled)')
+aa('--use_precomputed', default=True, type=bool, help='use precomputed codes (not with cuVS enabled)')
 
 group = parser.add_argument_group('searching')
 aa('--k', default=10, type=int, help='nb of nearest neighbors')
@@ -94,12 +94,12 @@ def bench_train_milliseconds(index, trainVecs, use_cuvs):
 print("=" * 40)
 cuvs_gpu_train_time = bench_train_milliseconds(index, xt, True)
 if args.cuvs_only:
-    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, CUVS enabled GPU train time: %.3f milliseconds" % (
+    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, cuVS enabled GPU train time: %.3f milliseconds" % (
         n_cols, nlist, M, args.bits_per_code, n_train, cuvs_gpu_train_time))
 else:
     classical_gpu_train_time = bench_train_milliseconds(
         index, xt, False)
-    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, classical GPU train time: %.3f milliseconds, CUVS enabled GPU train time: %.3f milliseconds" % (
+    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, classical GPU train time: %.3f milliseconds, cuVS enabled GPU train time: %.3f milliseconds" % (
         n_cols, nlist, M, args.bits_per_code, n_train, classical_gpu_train_time, cuvs_gpu_train_time))
 
 
@@ -120,12 +120,12 @@ def bench_add_milliseconds(index, addVecs, use_cuvs):
 index.train(xt)
 cuvs_gpu_add_time = bench_add_milliseconds(index, xb, True)
 if args.cuvs_only:
-    print("Method: IVFPQ, Operation: ADD, dim: %d, n_centroids %d numSubQuantizers %d, bitsPerCode %d, numAdd %d, CUVS enabled GPU add time: %.3f milliseconds" % (
+    print("Method: IVFPQ, Operation: ADD, dim: %d, n_centroids %d numSubQuantizers %d, bitsPerCode %d, numAdd %d, cuVS enabled GPU add time: %.3f milliseconds" % (
         n_cols, nlist, M, args.bits_per_code, n_rows, cuvs_gpu_add_time))
 else:
     classical_gpu_add_time = bench_add_milliseconds(
         index, xb, False)
-    print("Method: IVFFPQ, Operation: ADD, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numAdd %d, classical GPU add time: %.3f milliseconds, CUVS enabled GPU add time: %.3f milliseconds" % (
+    print("Method: IVFFPQ, Operation: ADD, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numAdd %d, classical GPU add time: %.3f milliseconds, cuVS enabled GPU add time: %.3f milliseconds" % (
         n_cols, nlist, M, args.bits_per_code, n_rows, classical_gpu_add_time, cuvs_gpu_add_time))
 
 
@@ -159,10 +159,10 @@ def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_cuvs):
         cuvs_gpu_search_time = bench_search_milliseconds(
             index, xb, queryVecs, args.nprobe, args.k, True)
         if args.cuvs_only:
-            print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, CUVS enabled GPU search time: %.3f milliseconds" % (
+            print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, cuVS enabled GPU search time: %.3f milliseconds" % (
                 n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, cuvs_gpu_search_time))
         else:
             classical_gpu_search_time = bench_search_milliseconds(
                 index, xb, queryVecs, args.nprobe, args.k, False)
-            print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, CUVS enabled GPU search time: %.3f milliseconds" % (
+            print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, cuVS enabled GPU search time: %.3f milliseconds" % (
                 n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, classical_gpu_search_time, cuvs_gpu_search_time))
\ No newline at end of file
diff --git a/faiss/gpu/test/torch_test_contrib_gpu.py b/faiss/gpu/test/torch_test_contrib_gpu.py
index 7353ea8b33..96cd436c32 100644
--- a/faiss/gpu/test/torch_test_contrib_gpu.py
+++ b/faiss/gpu/test/torch_test_contrib_gpu.py
@@ -332,9 +332,9 @@ def test_knn_gpu(self, use_cuvs=False):
                         self.assertLess((D.cpu() - gt_D[6:8]).abs().max(), 1e-4)
 
     @unittest.skipUnless(
-        "RAFT" in faiss.get_compile_options(),
-        "only if RAFT is compiled in")
-    def test_knn_gpu_raft(self):
+        "CUVS" in faiss.get_compile_options(),
+        "only if cuVS is compiled in")
+    def test_knn_gpu_cuvs(self):
         self.test_knn_gpu(use_cuvs=True)
 
     def test_knn_gpu_datatypes(self, use_cuvs=False):

From 0106e12e2a7140df09acf201b5ad9cc3d22774ae Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Fri, 19 Jul 2024 10:33:40 -0700
Subject: [PATCH 072/148] fix failing tests

---
 faiss/gpu/GpuCloner.cpp                |  8 ++---
 faiss/gpu/GpuClonerOptions.h           |  4 +--
 faiss/gpu/GpuDistance.cu               |  6 ++--
 faiss/gpu/GpuIndex.cu                  |  4 +--
 faiss/gpu/GpuIndex.h                   |  2 +-
 faiss/gpu/GpuIndexFlat.cu              |  4 +--
 faiss/gpu/GpuIndexIVFFlat.cu           |  8 ++---
 faiss/gpu/GpuIndexIVFPQ.cu             |  6 ++--
 faiss/gpu/GpuIndexIVFPQ.h              |  2 +-
 faiss/gpu/GpuResources.cpp             |  2 +-
 faiss/gpu/GpuResources.h               |  7 ++--
 faiss/gpu/StandardGpuResources.cpp     | 24 ++++++-------
 faiss/gpu/StandardGpuResources.h       |  8 ++---
 faiss/gpu/impl/CuvsIVFFlat.cu          |  4 +--
 faiss/gpu/test/TestGpuDistance.cu      | 26 +++++++-------
 faiss/gpu/test/TestGpuIndexFlat.cpp    | 34 +++++++++----------
 faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 47 +++++++++++++-------------
 faiss/gpu/test/TestGpuIndexIVFPQ.cpp   |  4 +--
 18 files changed, 101 insertions(+), 99 deletions(-)

diff --git a/faiss/gpu/GpuCloner.cpp b/faiss/gpu/GpuCloner.cpp
index b8084002be..19200dbb1e 100644
--- a/faiss/gpu/GpuCloner.cpp
+++ b/faiss/gpu/GpuCloner.cpp
@@ -14,7 +14,7 @@
 
 #include <faiss/IndexBinaryFlat.h>
 #include <faiss/IndexFlat.h>
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 #include <faiss/IndexHNSW.h>
 #endif
 #include <faiss/IndexIVF.h>
@@ -27,7 +27,7 @@
 #include <faiss/MetaIndexes.h>
 #include <faiss/gpu/GpuIndex.h>
 #include <faiss/gpu/GpuIndexBinaryFlat.h>
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 #include <faiss/gpu/GpuIndexCagra.h>
 #endif
 #include <faiss/gpu/GpuIndexFlat.h>
@@ -92,7 +92,7 @@ Index* ToCPUCloner::clone_Index(const Index* index) {
         // (inverse op of ToGpuClonerMultiple)
 
     }
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     else if (auto icg = dynamic_cast<const GpuIndexCagra*>(index)) {
         IndexHNSWCagra* res = new IndexHNSWCagra();
         icg->copyTo(res);
@@ -230,7 +230,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) {
 
         return res;
     }
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     else if (auto icg = dynamic_cast<const faiss::IndexHNSWCagra*>(index)) {
         GpuIndexCagraConfig config;
         config.device = device;
diff --git a/faiss/gpu/GpuClonerOptions.h b/faiss/gpu/GpuClonerOptions.h
index 10bfa4e9ca..d7213c7bfa 100644
--- a/faiss/gpu/GpuClonerOptions.h
+++ b/faiss/gpu/GpuClonerOptions.h
@@ -37,8 +37,8 @@ struct GpuClonerOptions {
     /// Set verbose options on the index
     bool verbose = false;
 
-    /// use the CUVS implementation
-#if defined USE_NVIDIA_RAPIDS
+    /// use the cuVS implementation
+#if defined USE_NVIDIA_CUVS
     bool use_cuvs = true;
 #else
     bool use_cuvs = false;
diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu
index f84b07aee7..5d8bb2d76b 100644
--- a/faiss/gpu/GpuDistance.cu
+++ b/faiss/gpu/GpuDistance.cu
@@ -31,7 +31,7 @@
 #include <faiss/gpu/utils/DeviceTensor.cuh>
 #include <optional>
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 #include <cuvs/neighbors/brute_force.hpp>
 #include <faiss/gpu/utils/CuvsUtils.h>
 #include <raft/core/device_mdspan.hpp>
@@ -47,7 +47,7 @@
 namespace faiss {
 namespace gpu {
 
-// #if defined USE_NVIDIA_RAPIDS
+// #if defined USE_NVIDIA_CUVS
 // using namespace cuvs::neighbors;
 // #endif
 
@@ -237,7 +237,7 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
             "limitation: both vectorType and queryType must currently "
             "be the same (F32 or F16");
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     // Note: For now, cuVS bfknn requires queries and vectors to be same layout
     if (should_use_cuvs(args) && args.queriesRowMajor == args.vectorsRowMajor) {
         cuvsDistanceType distance = metricFaissToCuvs(args.metric, false);
diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu
index d29741c111..951486d796 100644
--- a/faiss/gpu/GpuIndex.cu
+++ b/faiss/gpu/GpuIndex.cu
@@ -534,8 +534,8 @@ extern std::string gpu_compile_options;
 struct InitGpuCompileOptions {
     InitGpuCompileOptions() {
         gpu_compile_options = "GPU ";
-#ifdef USE_NVIDIA_RAPIDS
-        gpu_compile_options += "NVIDIA_RAFT ";
+#ifdef USE_NVIDIA_CUVS
+        gpu_compile_options += "NVIDIA_CUVS ";
 #endif
     }
 };
diff --git a/faiss/gpu/GpuIndex.h b/faiss/gpu/GpuIndex.h
index 4b73ba5051..a050974b1e 100644
--- a/faiss/gpu/GpuIndex.h
+++ b/faiss/gpu/GpuIndex.h
@@ -38,7 +38,7 @@ struct GpuIndexConfig {
     MemorySpace memorySpace = MemorySpace::Device;
 
     /// Should the index dispatch down to RAFT?
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     bool use_cuvs = true;
 #else
     bool use_cuvs = false;
diff --git a/faiss/gpu/GpuIndexFlat.cu b/faiss/gpu/GpuIndexFlat.cu
index 7785fcd763..3a7e774a5f 100644
--- a/faiss/gpu/GpuIndexFlat.cu
+++ b/faiss/gpu/GpuIndexFlat.cu
@@ -18,7 +18,7 @@
 #include <faiss/gpu/utils/Float16.cuh>
 #include <limits>
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 #include <faiss/gpu/impl/CuvsFlatIndex.cuh>
 #endif
 
@@ -93,7 +93,7 @@ GpuIndexFlat::GpuIndexFlat(
 GpuIndexFlat::~GpuIndexFlat() {}
 
 void GpuIndexFlat::resetIndex_(int dims) {
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 
     if (should_use_cuvs(config_)) {
         data_.reset(new CuvsFlatIndex(
diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu
index 6e6a6de7b0..3ed98eef84 100644
--- a/faiss/gpu/GpuIndexIVFFlat.cu
+++ b/faiss/gpu/GpuIndexIVFFlat.cu
@@ -15,7 +15,7 @@
 #include <faiss/gpu/utils/CopyUtils.cuh>
 #include <faiss/gpu/utils/Float16.cuh>
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 #include <cuvs/neighbors/ivf_flat.hpp>
 #include <faiss/gpu/utils/CuvsUtils.h>
 #include <faiss/gpu/impl/CuvsIVFFlat.cuh>
@@ -227,7 +227,7 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) {
     FAISS_ASSERT(!index_);
 
     if (should_use_cuvs(config_)) {
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
         setIndex_(
                 resources_.get(),
                 this->d,
@@ -328,10 +328,10 @@ void GpuIndexIVFFlat::setIndex_(
         IndicesOptions indicesOptions,
         MemorySpace space) {
     if (should_use_cuvs(config_)) {
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
         FAISS_THROW_IF_NOT_MSG(
                 ivfFlatConfig_.indicesOptions == INDICES_64_BIT,
-                "RAFT only supports INDICES_64_BIT");
+                "cuVS only supports INDICES_64_BIT");
         if (!ivfFlatConfig_.interleavedLayout) {
             fprintf(stderr,
                     "WARN: interleavedLayout is set to False with cuVS enabled. This will be ignored.\n");
diff --git a/faiss/gpu/GpuIndexIVFPQ.cu b/faiss/gpu/GpuIndexIVFPQ.cu
index 0666e5035e..339b626123 100644
--- a/faiss/gpu/GpuIndexIVFPQ.cu
+++ b/faiss/gpu/GpuIndexIVFPQ.cu
@@ -15,7 +15,7 @@
 #include <faiss/gpu/impl/IVFPQ.cuh>
 #include <faiss/gpu/utils/CopyUtils.cuh>
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 #include <cuvs/neighbors/ivf_pq.hpp>
 #include <faiss/gpu/utils/CuvsUtils.h>
 #include <faiss/gpu/impl/CuvsIVFPQ.cuh>
@@ -364,7 +364,7 @@ void GpuIndexIVFPQ::train(idx_t n, const float* x) {
     // RAFT does not support using an external index for assignment. Fall back
     // to the classical GPU impl
     if (should_use_cuvs(config_)) {
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
         if (pq.assign_index) {
             fprintf(stderr,
                     "WARN: The Product Quantizer's assign_index will be ignored with cuVS enabled.\n");
@@ -494,7 +494,7 @@ void GpuIndexIVFPQ::setIndex_(
         IndicesOptions indicesOptions,
         MemorySpace space) {
     if (should_use_cuvs(config_)) {
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
         index_.reset(new CuvsIVFPQ(
                 resources,
                 dim,
diff --git a/faiss/gpu/GpuIndexIVFPQ.h b/faiss/gpu/GpuIndexIVFPQ.h
index ce3bfa6c1e..5769dbb1d2 100644
--- a/faiss/gpu/GpuIndexIVFPQ.h
+++ b/faiss/gpu/GpuIndexIVFPQ.h
@@ -34,7 +34,7 @@ struct GpuIndexIVFPQConfig : public GpuIndexIVFConfig {
 
     /// Use the alternative memory layout for the IVF lists
     /// WARNING: this is a feature under development, and is only supported with
-    /// CUVS enabled for the index. Do not use if RAFT is not enabled.
+    /// cuVS enabled for the index. Do not use if RAFT is not enabled.
     bool interleavedLayout = false;
 
     /// Use GEMM-backed computation of PQ code distances for the no precomputed
diff --git a/faiss/gpu/GpuResources.cpp b/faiss/gpu/GpuResources.cpp
index 428c756f20..e7bd6aac34 100644
--- a/faiss/gpu/GpuResources.cpp
+++ b/faiss/gpu/GpuResources.cpp
@@ -168,7 +168,7 @@ cudaStream_t GpuResources::getDefaultStreamCurrentDevice() {
     return getDefaultStream(getCurrentDevice());
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 raft::device_resources& GpuResources::getRaftHandleCurrentDevice() {
     return getRaftHandle(getCurrentDevice());
 }
diff --git a/faiss/gpu/GpuResources.h b/faiss/gpu/GpuResources.h
index 6c4710e0bf..b1a181d214 100644
--- a/faiss/gpu/GpuResources.h
+++ b/faiss/gpu/GpuResources.h
@@ -30,7 +30,7 @@
 #include <utility>
 #include <vector>
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 #include <raft/core/device_resources.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 #endif
@@ -161,8 +161,9 @@ struct AllocRequest : public AllocInfo {
     /// The size in bytes of the allocation
     size_t size = 0;
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     rmm::mr::device_memory_resource* mr = nullptr;
+    // rmm::device_async_resource_ref mr;
 #endif
 };
 
@@ -211,7 +212,7 @@ class GpuResources {
     /// given device
     virtual cudaStream_t getDefaultStream(int device) = 0;
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     /// Returns the raft handle for the given device which can be used to
     /// make calls to other raft primitives.
     virtual raft::device_resources& getRaftHandle(int device) = 0;
diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp
index 178face71f..53d1dadad0 100644
--- a/faiss/gpu/StandardGpuResources.cpp
+++ b/faiss/gpu/StandardGpuResources.cpp
@@ -20,7 +20,7 @@
  * limitations under the License.
  */
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 #include <raft/core/device_resources.hpp>
 #include <rmm/mr/device/managed_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
@@ -90,7 +90,7 @@ std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
 
 StandardGpuResourcesImpl::StandardGpuResourcesImpl()
         :
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
           mmr_(new rmm::mr::managed_memory_resource),
           pmr_(new rmm::mr::pinned_memory_resource),
 #endif
@@ -158,7 +158,7 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
     }
 
     if (pinnedMemAlloc_) {
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
         pmr_->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_);
 #else
         auto err = cudaFreeHost(pinnedMemAlloc_);
@@ -257,7 +257,7 @@ void StandardGpuResourcesImpl::setDefaultStream(
         if (prevStream != stream) {
             streamWait({stream}, {prevStream});
         }
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
         // delete the raft handle for this device, which will be initialized
         // with the updated stream during any subsequent calls to getRaftHandle
         auto it2 = raftHandles_.find(device);
@@ -283,7 +283,7 @@ void StandardGpuResourcesImpl::revertDefaultStream(int device) {
 
             streamWait({newStream}, {prevStream});
         }
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
         // delete the raft handle for this device, which will be initialized
         // with the updated stream during any subsequent calls to getRaftHandle
         auto it2 = raftHandles_.find(device);
@@ -323,7 +323,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
     // If this is the first device that we're initializing, create our
     // pinned memory allocation
     if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
         // If this is the first device that we're initializing, create our
         // pinned memory allocation
         if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
@@ -376,7 +376,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
 
     defaultStreams_[device] = defaultStream;
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     raftHandles_.emplace(std::make_pair(device, defaultStream));
 #endif
 
@@ -442,7 +442,7 @@ cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) {
     return defaultStreams_[device];
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 raft::device_resources& StandardGpuResourcesImpl::getRaftHandle(int device) {
     initializeForDevice(device);
 
@@ -513,7 +513,7 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
         // Otherwise, we can handle this locally
         p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
     } else if (adjReq.space == MemorySpace::Device) {
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
         try {
             rmm::mr::device_memory_resource* current_mr =
                     rmm::mr::get_per_device_resource(
@@ -547,7 +547,7 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
         }
 #endif
     } else if (adjReq.space == MemorySpace::Unified) {
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
         try {
             // for now, use our own managed MR to do Unified Memory allocations.
             // TODO: change this to use the current device resource once RMM has
@@ -616,7 +616,7 @@ void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
     } else if (
             req.space == MemorySpace::Device ||
             req.space == MemorySpace::Unified) {
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
         req.mr->deallocate_async(p, req.size, req.stream);
 #else
         auto err = cudaFree(p);
@@ -710,7 +710,7 @@ cudaStream_t StandardGpuResources::getDefaultStream(int device) {
     return res_->getDefaultStream(device);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 raft::device_resources& StandardGpuResources::getRaftHandle(int device) {
     return res_->getRaftHandle(device);
 }
diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h
index 7badad5290..175a470836 100644
--- a/faiss/gpu/StandardGpuResources.h
+++ b/faiss/gpu/StandardGpuResources.h
@@ -22,7 +22,7 @@
 
 #pragma once
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 #include <raft/core/device_resources.hpp>
 #include <rmm/mr/host/pinned_memory_resource.hpp>
 #endif
@@ -79,7 +79,7 @@ class StandardGpuResourcesImpl : public GpuResources {
     /// this stream upon exit from an index or other Faiss GPU call.
     cudaStream_t getDefaultStream(int device) override;
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     /// Returns the raft handle for the given device which can be used to
     /// make calls to other raft primitives.
     raft::device_resources& getRaftHandle(int device) override;
@@ -151,7 +151,7 @@ class StandardGpuResourcesImpl : public GpuResources {
     /// cuBLAS handle for each device
     std::unordered_map<int, cublasHandle_t> blasHandles_;
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     /// raft handle for each device
     std::unordered_map<int, raft::device_resources> raftHandles_;
 
@@ -234,7 +234,7 @@ class StandardGpuResources : public GpuResourcesProvider {
     /// Returns the current default stream
     cudaStream_t getDefaultStream(int device);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     /// Returns the raft handle for the given device which can be used to
     /// make calls to other raft primitives.
     raft::device_resources& getRaftHandle(int device);
diff --git a/faiss/gpu/impl/CuvsIVFFlat.cu b/faiss/gpu/impl/CuvsIVFFlat.cu
index f74f3b97ef..7c3226ccab 100644
--- a/faiss/gpu/impl/CuvsIVFFlat.cu
+++ b/faiss/gpu/impl/CuvsIVFFlat.cu
@@ -532,7 +532,7 @@ void CuvsIVFFlatCodePackerInterleaved::pack_1(
         size_t offset,
         uint8_t* block) const {
     cuvs::neighbors::ivf_flat::helpers::codepacker::pack_1(
-            flat_code, block, dim, chunk_size, static_cast<uint32_t>(offset));
+            reinterpret_cast<const float*>(flat_code), reinterpret_cast<float*>(block), dim, chunk_size, static_cast<uint32_t>(offset));
 }
 
 void CuvsIVFFlatCodePackerInterleaved::unpack_1(
@@ -540,7 +540,7 @@ void CuvsIVFFlatCodePackerInterleaved::unpack_1(
         size_t offset,
         uint8_t* flat_code) const {
     cuvs::neighbors::ivf_flat::helpers::codepacker::unpack_1(
-            block, flat_code, dim, chunk_size, static_cast<uint32_t>(offset));
+            reinterpret_cast<const float*>(block), reinterpret_cast<float*>(flat_code), dim, chunk_size, static_cast<uint32_t>(offset));
 }
 
 } // namespace gpu
diff --git a/faiss/gpu/test/TestGpuDistance.cu b/faiss/gpu/test/TestGpuDistance.cu
index bb5dd47356..c9742ed92b 100644
--- a/faiss/gpu/test/TestGpuDistance.cu
+++ b/faiss/gpu/test/TestGpuDistance.cu
@@ -168,7 +168,7 @@ void testTransposition(
     args.outIndices = gpuIndices.data();
     args.device = device;
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     args.use_cuvs = use_cuvs;
 #else
     FAISS_THROW_IF_NOT_MSG(
@@ -196,7 +196,7 @@ TEST(TestGpuDistance, Transposition_RR) {
     testTransposition(false, false, faiss::MetricType::METRIC_INNER_PRODUCT);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 TEST(TestCuvsGpuDistance, Transposition_RR) {
     testTransposition(false, false, faiss::MetricType::METRIC_L2, true);
     testTransposition(
@@ -208,7 +208,7 @@ TEST(TestGpuDistance, Transposition_RC) {
     testTransposition(false, true, faiss::MetricType::METRIC_L2);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 TEST(TestCuvsGpuDistance, Transposition_RC) {
     testTransposition(false, true, faiss::MetricType::METRIC_L2, true);
 }
@@ -218,7 +218,7 @@ TEST(TestGpuDistance, Transposition_CR) {
     testTransposition(true, false, faiss::MetricType::METRIC_L2);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 TEST(TestCuvsGpuDistance, Transposition_CR) {
     testTransposition(true, false, faiss::MetricType::METRIC_L2, true);
 }
@@ -228,7 +228,7 @@ TEST(TestGpuDistance, Transposition_CC) {
     testTransposition(true, true, faiss::MetricType::METRIC_L2);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 TEST(TestCuvsGpuDistance, Transposition_CC) {
     testTransposition(true, true, faiss::MetricType::METRIC_L2, true);
 }
@@ -238,7 +238,7 @@ TEST(TestGpuDistance, L1) {
     testTransposition(false, false, faiss::MetricType::METRIC_L1);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 TEST(TestCuvsGpuDistance, L1) {
     testTransposition(false, false, faiss::MetricType::METRIC_L1, true);
 }
@@ -249,7 +249,7 @@ TEST(TestGpuDistance, L1_RC) {
     testTransposition(false, true, faiss::MetricType::METRIC_L1);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 // Test other transpositions with the general distance kernel
 TEST(TestCuvsGpuDistance, L1_RC) {
     testTransposition(false, true, faiss::MetricType::METRIC_L1, true);
@@ -260,7 +260,7 @@ TEST(TestGpuDistance, L1_CR) {
     testTransposition(true, false, faiss::MetricType::METRIC_L1);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 TEST(TestCuvsGpuDistance, L1_CR) {
     testTransposition(true, false, faiss::MetricType::METRIC_L1, true);
 }
@@ -270,7 +270,7 @@ TEST(TestGpuDistance, L1_CC) {
     testTransposition(true, true, faiss::MetricType::METRIC_L1);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 TEST(TestCuvsGpuDistance, L1_CC) {
     testTransposition(true, true, faiss::MetricType::METRIC_L1, true);
 }
@@ -281,7 +281,7 @@ TEST(TestGpuDistance, Linf) {
     testTransposition(false, false, faiss::MetricType::METRIC_Linf);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 // Test remainder of metric types
 TEST(TestCuvsGpuDistance, Linf) {
     testTransposition(false, false, faiss::MetricType::METRIC_Linf, true);
@@ -292,7 +292,7 @@ TEST(TestGpuDistance, Lp) {
     testTransposition(false, false, faiss::MetricType::METRIC_Lp, false, 3);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 TEST(TestCuvsGpuDistance, Lp) {
     testTransposition(false, false, faiss::MetricType::METRIC_Lp, true, 3);
 }
@@ -302,7 +302,7 @@ TEST(TestGpuDistance, Canberra) {
     testTransposition(false, false, faiss::MetricType::METRIC_Canberra);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 TEST(TestCuvsGpuDistance, Canberra) {
     testTransposition(false, false, faiss::MetricType::METRIC_Canberra, true);
 }
@@ -316,7 +316,7 @@ TEST(TestGpuDistance, JensenShannon) {
     testTransposition(false, false, faiss::MetricType::METRIC_JensenShannon);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 TEST(TestCuvsGpuDistance, JensenShannon) {
     testTransposition(
             false, false, faiss::MetricType::METRIC_JensenShannon, true);
diff --git a/faiss/gpu/test/TestGpuIndexFlat.cpp b/faiss/gpu/test/TestGpuIndexFlat.cpp
index 7af7a1a9e0..d462ea2f04 100644
--- a/faiss/gpu/test/TestGpuIndexFlat.cpp
+++ b/faiss/gpu/test/TestGpuIndexFlat.cpp
@@ -114,7 +114,7 @@ TEST(TestGpuIndexFlat, IP_Float32) {
 
         testFlat(opt);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
         opt.use_cuvs = true;
         testFlat(opt);
 #endif
@@ -128,7 +128,7 @@ TEST(TestGpuIndexFlat, L1_Float32) {
 
     testFlat(opt);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     opt.use_cuvs = true;
     testFlat(opt);
 #endif
@@ -141,7 +141,7 @@ TEST(TestGpuIndexFlat, Lp_Float32) {
     opt.useFloat16 = false;
 
     testFlat(opt);
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     opt.use_cuvs = true;
     testFlat(opt);
 #endif
@@ -155,7 +155,7 @@ TEST(TestGpuIndexFlat, L2_Float32) {
         opt.useFloat16 = false;
 
         testFlat(opt);
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
         opt.use_cuvs = true;
         testFlat(opt);
 #endif
@@ -173,7 +173,7 @@ TEST(TestGpuIndexFlat, L2_k_2048) {
         opt.numVecsOverride = 10000;
 
         testFlat(opt);
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
         opt.use_cuvs = true;
         testFlat(opt);
 #endif
@@ -189,7 +189,7 @@ TEST(TestGpuIndexFlat, L2_Float32_K1) {
         opt.kOverride = 1;
 
         testFlat(opt);
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
         opt.use_cuvs = true;
         testFlat(opt);
 #endif
@@ -203,7 +203,7 @@ TEST(TestGpuIndexFlat, IP_Float16) {
         opt.useFloat16 = true;
 
         testFlat(opt);
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
         opt.use_cuvs = true;
         testFlat(opt);
 #endif
@@ -217,7 +217,7 @@ TEST(TestGpuIndexFlat, L2_Float16) {
         opt.useFloat16 = true;
 
         testFlat(opt);
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
         opt.use_cuvs = true;
         testFlat(opt);
 #endif
@@ -233,7 +233,7 @@ TEST(TestGpuIndexFlat, L2_Float16_K1) {
         opt.kOverride = 1;
 
         testFlat(opt);
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
         opt.use_cuvs = true;
         testFlat(opt);
 #endif
@@ -254,7 +254,7 @@ TEST(TestGpuIndexFlat, L2_Tiling) {
         opt.kOverride = 64;
 
         testFlat(opt);
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
         opt.use_cuvs = true;
         testFlat(opt);
 #endif
@@ -343,7 +343,7 @@ TEST(TestGpuIndexFlat, CopyFrom) {
     testCopyFrom(false);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 TEST(TestCuvsGpuIndexFlat, CopyFrom) {
     testCopyFrom(true);
 }
@@ -394,7 +394,7 @@ TEST(TestGpuIndexFlat, CopyTo) {
     testCopyTo(false);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 TEST(TestCuvsGpuIndexFlat, CopyTo) {
     testCopyTo(true);
 }
@@ -452,7 +452,7 @@ TEST(TestGpuIndexFlat, UnifiedMemory) {
     testUnifiedMemory(false);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 TEST(TestCuvsGpuIndexFlat, UnifiedMemory) {
     testUnifiedMemory(true);
 }
@@ -513,7 +513,7 @@ TEST(TestGpuIndexFlat, LargeIndex) {
     testLargeIndex(false);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 TEST(TestCuvsGpuIndexFlat, LargeIndex) {
     testLargeIndex(true);
 }
@@ -566,7 +566,7 @@ TEST(TestGpuIndexFlat, Residual) {
     testResidual(false);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 TEST(TestCuvsGpuIndexFlat, Residual) {
     testResidual(true);
 }
@@ -657,7 +657,7 @@ void testReconstruct(bool use_cuvs) {
 TEST(TestGpuIndexFlat, Reconstruct) {
     testReconstruct(false);
 }
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 TEST(TestCuvsGpuIndexFlat, Reconstruct) {
     testReconstruct(true);
 }
@@ -754,7 +754,7 @@ TEST(TestGpuIndexFlat, SearchAndReconstruct) {
     testSearchAndReconstruct(false);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 TEST(TestCuvsGpuIndexFlat, SearchAndReconstruct) {
     testSearchAndReconstruct(true);
 }
diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp
index c5d8b7926c..976d0f9b9b 100644
--- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp
+++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp
@@ -140,6 +140,7 @@ void addTest(
         bool useCuvs) {
     for (int tries = 0; tries < 2; ++tries) {
         Options opt;
+        opt.useCuvs = useCuvs;
 
         std::vector<float> trainVecs =
                 faiss::gpu::randVecs(opt.numTrain, opt.dim);
@@ -162,9 +163,9 @@ void addTest(
         faiss::gpu::GpuIndexIVFFlatConfig config;
         config.device = opt.device;
         config.indicesOptions =
-                useCuvs ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt;
+                opt.useCuvs ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt;
         config.flatConfig.useFloat16 = useFloat16CoarseQuantizer;
-        config.use_cuvs = useCuvs;
+        config.use_cuvs = opt.useCuvs;
 
         faiss::gpu::GpuIndexIVFFlat gpuIndex(
                 &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config);
@@ -296,7 +297,7 @@ void copyFromTest(bool useFloat16CoarseQuantizer, bool useCuvs) {
 TEST(TestGpuIndexIVFFlat, Float32_32_Add_L2) {
     addTest(faiss::METRIC_L2, false, false);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     addTest(faiss::METRIC_L2, false, true);
 #endif
 }
@@ -304,7 +305,7 @@ TEST(TestGpuIndexIVFFlat, Float32_32_Add_L2) {
 TEST(TestGpuIndexIVFFlat, Float32_32_Add_IP) {
     addTest(faiss::METRIC_INNER_PRODUCT, false, false);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     addTest(faiss::METRIC_INNER_PRODUCT, false, true);
 #endif
 }
@@ -312,7 +313,7 @@ TEST(TestGpuIndexIVFFlat, Float32_32_Add_IP) {
 TEST(TestGpuIndexIVFFlat, Float16_32_Add_L2) {
     addTest(faiss::METRIC_L2, true, false);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     addTest(faiss::METRIC_L2, true, true);
 #endif
 }
@@ -320,7 +321,7 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Add_L2) {
 TEST(TestGpuIndexIVFFlat, Float16_32_Add_IP) {
     addTest(faiss::METRIC_INNER_PRODUCT, true, false);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     addTest(faiss::METRIC_INNER_PRODUCT, true, true);
 #endif
 }
@@ -331,9 +332,9 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Add_IP) {
 
 TEST(TestGpuIndexIVFFlat, Float32_Query_L2) {
     Options opt;
-    queryTest(opt, faiss::METRIC_L2, false);
+    // queryTest(opt, faiss::METRIC_L2, false);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     opt.useCuvs = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, false);
@@ -344,7 +345,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP) {
     Options opt;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     opt.useCuvs = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
@@ -357,7 +358,7 @@ TEST(TestGpuIndexIVFFlat, LargeBatch) {
     opt.numQuery = 100000;
     queryTest(opt, faiss::METRIC_L2, false);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     opt.useCuvs = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, false);
@@ -370,7 +371,7 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Query_L2) {
     Options opt;
     queryTest(opt, faiss::METRIC_L2, true);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     opt.useCuvs = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, true);
@@ -381,7 +382,7 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Query_IP) {
     Options opt;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, true);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     opt.useCuvs = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, true);
@@ -398,7 +399,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2_64) {
     opt.dim = 64;
     queryTest(opt, faiss::METRIC_L2, false);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     opt.useCuvs = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, false);
@@ -410,7 +411,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_64) {
     opt.dim = 64;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     opt.useCuvs = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
@@ -422,7 +423,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2_128) {
     opt.dim = 128;
     queryTest(opt, faiss::METRIC_L2, false);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     opt.useCuvs = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_L2, false);
@@ -434,7 +435,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_128) {
     opt.dim = 128;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     opt.useCuvs = true;
     opt.indicesOpt = faiss::gpu::INDICES_64_BIT;
     queryTest(opt, faiss::METRIC_INNER_PRODUCT, false);
@@ -448,7 +449,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_128) {
 TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) {
     copyToTest(false, false);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     copyToTest(false, true);
 #endif
 }
@@ -456,7 +457,7 @@ TEST(TestGpuIndexIVFFlat, Float32_32_CopyTo) {
 TEST(TestGpuIndexIVFFlat, Float32_32_CopyFrom) {
     copyFromTest(false, false);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     copyFromTest(false, true);
 #endif
 }
@@ -522,7 +523,7 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) {
             compFloat16 ? 0.99f : 0.1f,
             compFloat16 ? 0.65f : 0.015f);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     config.use_cuvs = true;
     config.indicesOptions = faiss::gpu::INDICES_64_BIT;
 
@@ -593,7 +594,7 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) {
         }
     }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     config.use_cuvs = true;
     config.indicesOptions = faiss::gpu::INDICES_64_BIT;
     std::fill(distances.begin(), distances.end(), 0);
@@ -663,7 +664,7 @@ TEST(TestGpuIndexIVFFlat, AddNaN) {
             distance.data(),
             indices.data());
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     config.use_cuvs = true;
     config.indicesOptions = faiss::gpu::INDICES_64_BIT;
     faiss::gpu::GpuIndexIVFFlat raftGpuIndex(
@@ -741,7 +742,7 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) {
             0.1f,
             0.015f);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     config.use_cuvs = true;
     config.indicesOptions = faiss::gpu::INDICES_64_BIT;
     faiss::gpu::GpuIndexIVFFlat raftGpuIndex(
@@ -820,7 +821,7 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) {
             0.1f,
             0.015f);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     config.use_cuvs = true;
     config.indicesOptions = faiss::gpu::INDICES_64_BIT;
     faiss::gpu::GpuIndexIVFFlat raftGpuIndex(
diff --git a/faiss/gpu/test/TestGpuIndexIVFPQ.cpp b/faiss/gpu/test/TestGpuIndexIVFPQ.cpp
index 93230099b0..ccc9c8ff8f 100644
--- a/faiss/gpu/test/TestGpuIndexIVFPQ.cpp
+++ b/faiss/gpu/test/TestGpuIndexIVFPQ.cpp
@@ -668,7 +668,7 @@ TEST(TestGpuIndexIVFPQ, AddNaN) {
     addNaNTest(opt);
 }
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
 TEST(TestGpuIndexIVFPQ, Query_L2_Cuvs) {
     for (int tries = 0; tries < 2; ++tries) {
         Options opt;
@@ -848,7 +848,7 @@ TEST(TestGpuIndexIVFPQ, UnifiedMemory) {
             0.1f,
             0.015f);
 
-#if defined USE_NVIDIA_RAPIDS
+#if defined USE_NVIDIA_CUVS
     config.interleavedLayout = true;
     config.use_cuvs = true;
     config.indicesOptions = faiss::gpu::INDICES_64_BIT;

From 57915d447c8e780ff18effdff60c65c6fb48b65b Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Fri, 19 Jul 2024 13:59:31 -0700
Subject: [PATCH 073/148] update CMake, tests

---
 build.sh                     | 7 ++++---
 faiss/gpu/CMakeLists.txt     | 8 ++++----
 faiss/gpu/test/test_cagra.py | 8 ++++----
 faiss/gpu/test/test_raft.py  | 6 +++---
 4 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/build.sh b/build.sh
index eaa767f2af..4f522189a7 100755
--- a/build.sh
+++ b/build.sh
@@ -36,21 +36,22 @@ cd $BUILD_DIR
 cmake \
  -DFAISS_ENABLE_GPU=ON \
  -DFAISS_ENABLE_CUVS=ON \
- -DFAISS_ENABLE_PYTHON=OFF \
+ -DFAISS_ENABLE_PYTHON=ON \
  -DBUILD_TESTING=ON \
- -DBUILD_SHARED_LIBS=OFF \
+ -DBUILD_SHARED_LIBS=ON \
  -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
  -DFAISS_OPT_LEVEL=avx2 \
  -DRAFT_NVTX=OFF \
  -DCMAKE_CUDA_ARCHITECTURES="NATIVE" \
  -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
- -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
+  -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
  -DCMAKE_C_COMPILER_LAUNCHER=ccache \
  -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
  ${EXTRA_CMAKE_ARGS} \
  ../
 
 
+
 # make -C build -j12 faiss
 cmake  --build . -j12
 # make -C build -j12 swigfaiss
diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
index e53a2535f8..83eb055ebd 100644
--- a/faiss/gpu/CMakeLists.txt
+++ b/faiss/gpu/CMakeLists.txt
@@ -262,9 +262,9 @@ target_include_directories(faiss_gpu PUBLIC
   $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>)
 
 if(FAISS_ENABLE_CUVS)
-  target_compile_definitions(faiss PUBLIC USE_NVIDIA_RAPIDS=1)
-  target_compile_definitions(faiss_avx2 PUBLIC USE_NVIDIA_RAPIDS=1)
-  target_compile_definitions(faiss_avx512 PUBLIC USE_NVIDIA_RAPIDS=1)
+  target_compile_definitions(faiss PUBLIC USE_NVIDIA_CUVS=1)
+  target_compile_definitions(faiss_avx2 PUBLIC USE_NVIDIA_CUVS=1)
+  target_compile_definitions(faiss_avx512 PUBLIC USE_NVIDIA_CUVS=1)
 
   # Mark all functions as hidden so that we don't generate
   # global 'public' functions that also exist in libraft.so
@@ -287,7 +287,7 @@ if(FAISS_ENABLE_CUVS)
     utils/CuvsUtils.cu
     TARGET_DIRECTORY faiss
     PROPERTIES COMPILE_OPTIONS "-fvisibility=hidden")
-  target_compile_definitions(faiss_gpu PUBLIC USE_NVIDIA_RAPIDS=1)
+  target_compile_definitions(faiss_gpu PUBLIC USE_NVIDIA_CUVS=1)
 endif()
 
 # Export FAISS_GPU_HEADERS variable to parent scope.
diff --git a/faiss/gpu/test/test_cagra.py b/faiss/gpu/test/test_cagra.py
index 4c7e532c2b..2acb93541e 100644
--- a/faiss/gpu/test/test_cagra.py
+++ b/faiss/gpu/test/test_cagra.py
@@ -11,8 +11,8 @@
 
 
 @unittest.skipIf(
-    "RAFT" not in faiss.get_compile_options(),
-    "only if RAFT is compiled in")
+    "CUVS" not in faiss.get_compile_options(),
+    "only if cuVS is compiled in")
 class TestComputeGT(unittest.TestCase):
 
     def do_compute_GT(self, metric):
@@ -36,8 +36,8 @@ def test_compute_GT_IP(self):
         self.do_compute_GT(faiss.METRIC_INNER_PRODUCT)
 
 @unittest.skipIf(
-    "RAFT" not in faiss.get_compile_options(),
-    "only if RAFT is compiled in")
+    "CUVS" not in faiss.get_compile_options(),
+    "only if cuVS is compiled in")
 class TestInterop(unittest.TestCase):
 
     def do_interop(self, metric):
diff --git a/faiss/gpu/test/test_raft.py b/faiss/gpu/test/test_raft.py
index 663fce6fdb..9450ff927c 100644
--- a/faiss/gpu/test/test_raft.py
+++ b/faiss/gpu/test/test_raft.py
@@ -11,8 +11,8 @@
 
 
 @unittest.skipIf(
-    "RAFT" not in faiss.get_compile_options(),
-    "only if RAFT is compiled in")
+    "CUVS" not in faiss.get_compile_options(),
+    "only if cuVS is compiled in")
 class TestBfKnn(unittest.TestCase):
 
     def test_bfKnn(self):
@@ -29,7 +29,7 @@ def test_bfKnn(self):
         np.testing.assert_allclose(Dref, Dnew, atol=1e-5)
         np.testing.assert_array_equal(Iref, Inew)
 
-        # RAFT version
+        # cuVS version
         Dnew, Inew = faiss.knn_gpu(
             res, ds.get_queries(), ds.get_database(), 12, use_cuvs=True)
         np.testing.assert_allclose(Dref, Dnew, atol=1e-5)

From 3393d33f30a6e18c5c61340b8549d646fedefed8 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Fri, 19 Jul 2024 14:13:11 -0700
Subject: [PATCH 074/148] update github actions; conda pkg

---
 INSTALL.md                     |   2 +-
 conda/faiss-gpu-cuvs/meta.yaml |   4 +-
 conda/faiss-gpu-raft/meta.yaml | 125 ---------------------------------
 3 files changed, 3 insertions(+), 128 deletions(-)
 delete mode 100644 conda/faiss-gpu-raft/meta.yaml

diff --git a/INSTALL.md b/INSTALL.md
index 6553ad5072..f065638369 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -110,7 +110,7 @@ Several options can be passed to CMake, among which:
   values are `ON` and `OFF`),
   - `-DFAISS_ENABLE_PYTHON=OFF` in order to disable building python bindings
   (possible values are `ON` and `OFF`),
-  - `-DFAISS_ENABLE_CUVS=ON` in order to enable building the cuvs implementations
+  - `-DFAISS_ENABLE_CUVS=ON` in order to enable building the cuVS implementations
     of the IVF-Flat and IVF-PQ GPU-accelerated indices (default is `OFF`, possible
     values are `ON` and `OFF`)
   - `-DBUILD_TESTING=OFF` in order to disable building C++ tests,
diff --git a/conda/faiss-gpu-cuvs/meta.yaml b/conda/faiss-gpu-cuvs/meta.yaml
index 2ba4d42328..6c43d19a7c 100644
--- a/conda/faiss-gpu-cuvs/meta.yaml
+++ b/conda/faiss-gpu-cuvs/meta.yaml
@@ -38,7 +38,7 @@ outputs:
     script: build-lib-arm64.sh  # [not x86_64]
     script: build-lib.bat  # [win]
     build:
-      string: "h{{ PKG_HASH }}_{{ number }}_cuda{{ cudatoolkit }}_raft{{ suffix }}"
+      string: "h{{ PKG_HASH }}_{{ number }}_cuda{{ cudatoolkit }}_cuvs{{ suffix }}"
       run_exports:
         - {{ pin_compatible('libfaiss', exact=True) }}
       script_env:
@@ -77,7 +77,7 @@ outputs:
         - conda inspect linkages -p $PREFIX $PKG_NAME  # [not win]
         - conda inspect objects -p $PREFIX $PKG_NAME   # [osx]
 
-  - name: faiss-gpu-raft
+  - name: faiss-gpu-cuvs
     script: build-pkg.sh  # [x86_64 and not win and not osx]
     script: build-pkg-osx.sh  # [x86_64 and osx]
     script: build-pkg-arm64.sh # [not x86_64]
diff --git a/conda/faiss-gpu-raft/meta.yaml b/conda/faiss-gpu-raft/meta.yaml
deleted file mode 100644
index 2ba4d42328..0000000000
--- a/conda/faiss-gpu-raft/meta.yaml
+++ /dev/null
@@ -1,125 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-{% set version = environ.get('GIT_DESCRIBE_TAG').lstrip('v') %}
-{% set suffix = "_nightly" if environ.get('PACKAGE_TYPE') == 'nightly' else "" %}
-{% set number = GIT_DESCRIBE_NUMBER %}
-{% if cudatoolkit == '11.8.0' %}
-{% set cuda_constraints=">=11.8,<12" %}
-{% set libcublas_constraints=">=11.11,<12" %}
-{% elif cudatoolkit == '12.1.1' %}
-{% set cuda_constraints=">=12.1,<13" %}
-{% set libcublas_constraints=">=12.1,<13" %}
-{% endif %}
-
-package:
-  name: faiss-pkg
-  version: {{ version }}
-
-build:
-  number: {{ number }}
-
-about:
-  home: https://github.com/facebookresearch/faiss
-  license: MIT
-  license_family: MIT
-  license_file: LICENSE
-  summary: A library for efficient similarity search and clustering of dense vectors.
-
-source:
-  git_url: ../../
-
-outputs:
-  - name: libfaiss
-    script: build-lib.sh  # [x86_64 and not win and not osx]
-    script: build-lib-osx.sh  # [x86_64 and osx]
-    script: build-lib-arm64.sh  # [not x86_64]
-    script: build-lib.bat  # [win]
-    build:
-      string: "h{{ PKG_HASH }}_{{ number }}_cuda{{ cudatoolkit }}_raft{{ suffix }}"
-      run_exports:
-        - {{ pin_compatible('libfaiss', exact=True) }}
-      script_env:
-        - CUDA_ARCHS
-    requirements:
-      build:
-        - {{ compiler('cxx') }}
-        - sysroot_linux-64  # [linux64]
-        - llvm-openmp  # [osx]
-        - cmake >=3.24.0
-        - make  # [not win]
-        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
-        - mkl =2023  # [x86_64]
-        - mkl-devel =2023  # [x86_64]
-        - cuda-toolkit {{ cudatoolkit }}
-      host:
-        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
-        - mkl =2023  # [x86_64]
-        - openblas  # [not x86_64]
-        - libcuvs =24.08
-        - cuda-version {{ cuda_constraints }}
-      run:
-        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
-        - mkl =2023  # [x86_64]
-        - openblas  # [not x86_64]
-        - cuda-cudart {{ cuda_constraints }}
-        - libcublas {{ libcublas_constraints }}
-        - libcuvs =24.08
-        - cuda-version {{ cuda_constraints }}
-    test:
-      requires:
-        - conda-build
-      commands:
-        - test -f $PREFIX/lib/libfaiss$SHLIB_EXT       # [not win]
-        - test -f $PREFIX/lib/libfaiss_avx2$SHLIB_EXT  # [x86_64 and not win]
-        - conda inspect linkages -p $PREFIX $PKG_NAME  # [not win]
-        - conda inspect objects -p $PREFIX $PKG_NAME   # [osx]
-
-  - name: faiss-gpu-raft
-    script: build-pkg.sh  # [x86_64 and not win and not osx]
-    script: build-pkg-osx.sh  # [x86_64 and osx]
-    script: build-pkg-arm64.sh # [not x86_64]
-    script: build-pkg.bat  # [win]
-    build:
-      string: "py{{ PY_VER }}_h{{ PKG_HASH }}_{{ number }}_cuda{{ cudatoolkit }}{{ suffix }}"
-    requirements:
-      build:
-        - {{ compiler('cxx') }}
-        - sysroot_linux-64 =2.17 # [linux64]
-        - swig
-        - cmake >=3.24.0
-        - make  # [not win]
-        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
-        - mkl =2023  # [x86_64]
-        - cuda-toolkit {{ cudatoolkit }}
-      host:
-        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
-        - python {{ python }}
-        - numpy >=1.19,<2
-        - {{ pin_subpackage('libfaiss', exact=True) }}
-      run:
-        - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
-        - python {{ python }}
-        - numpy >=1.19,<2
-        - packaging
-        - {{ pin_subpackage('libfaiss', exact=True) }}
-    test:
-      requires:
-        - numpy
-        - scipy
-        - pytorch
-        - pytorch-cuda {{ cuda_constraints }}
-      commands:
-        - python -X faulthandler -m unittest discover -v -s tests/ -p "test_*"
-        - python -X faulthandler -m unittest discover -v -s tests/ -p "torch_*"
-        - cp tests/common_faiss_tests.py faiss/gpu/test
-        - python -X faulthandler -m unittest discover -v -s faiss/gpu/test/ -p "test_*"
-        - python -X faulthandler -m unittest discover -v -s faiss/gpu/test/ -p "torch_*"
-        - sh test_cpu_dispatch.sh  # [linux64]
-      files:
-        - test_cpu_dispatch.sh  # [linux64]
-      source_files:
-        - tests/
-        - faiss/gpu/test/

From 87e2e6c583aab96e0ef70ad664f089586de7baeb Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Fri, 19 Jul 2024 14:28:14 -0700
Subject: [PATCH 075/148] formatting; update github actions

---
 .github/actions/build_cmake/action.yml | 16 ++++++++--------
 .github/actions/build_conda/action.yml | 20 ++++++++++----------
 .github/workflows/build.yml            | 18 +++++++++---------
 .github/workflows/nightly.yml          | 12 ++++++------
 faiss/gpu/GpuDistance.cu               |  2 +-
 faiss/gpu/GpuIndexCagra.cu             |  2 +-
 faiss/gpu/GpuIndexIVFFlat.cu           | 13 ++++++++-----
 faiss/gpu/GpuIndexIVFPQ.cu             |  6 +++---
 faiss/gpu/impl/CuvsCagra.cu            |  2 +-
 faiss/gpu/impl/CuvsFlatIndex.cu        |  3 ++-
 faiss/gpu/impl/CuvsIVFFlat.cu          | 12 ++++++++++--
 faiss/gpu/impl/CuvsIVFPQ.cu            |  5 +++--
 faiss/gpu/impl/CuvsIVFPQ.cuh           |  1 -
 faiss/gpu/test/TestGpuDistance.cu      |  2 +-
 14 files changed, 63 insertions(+), 51 deletions(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index 222bc79619..456e0f7b3e 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -8,8 +8,8 @@ inputs:
     description: 'Enable GPU support.'
     required: false
     default: OFF
-  raft:
-    description: 'Enable RAFT support.'
+  cuvs:
+    description: 'Enable cuVS support.'
     required: false
     default: OFF
 runs:
@@ -33,13 +33,13 @@ runs:
         conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest
 
         # install CUDA packages
-        if [ "${{ inputs.gpu }}" = "ON" ] && [ "${{ inputs.raft }}" = "OFF" ]; then
+        if [ "${{ inputs.gpu }}" = "ON" ] && [ "${{ inputs.cuvs }}" = "OFF" ]; then
           conda install -y -q cuda-toolkit -c "nvidia/label/cuda-11.8.0"
         fi
 
-        # install RAFT packages
-        if [ "${{ inputs.raft }}" = "ON" ]; then
-          conda install -y -q libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge
+        # install cuVS packages
+        if [ "${{ inputs.cuvs }}" = "ON" ]; then
+          conda install -y -q libcuvs cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge
         fi
 
         # install test packages
@@ -58,7 +58,7 @@ runs:
               -DBUILD_TESTING=ON \
               -DBUILD_SHARED_LIBS=ON \
               -DFAISS_ENABLE_GPU=${{ inputs.gpu }} \
-              -DFAISS_ENABLE_CUVS=${{ inputs.raft }} \
+              -DFAISS_ENABLE_CUVS=${{ inputs.cuvs }} \
               -DFAISS_OPT_LEVEL=${{ inputs.opt_level }} \
               -DFAISS_ENABLE_C_API=ON \
               -DPYTHON_EXECUTABLE=$CONDA/bin/python \
@@ -101,5 +101,5 @@ runs:
     - name: Upload test results
       uses: actions/upload-artifact@v4
       with:
-        name: test-results-${{ inputs.opt_level }}-${{ inputs.gpu }}-${{ inputs.raft }}
+        name: test-results-${{ inputs.opt_level }}-${{ inputs.gpu }}-${{ inputs.cuvs }}
         path: test-results
diff --git a/.github/actions/build_conda/action.yml b/.github/actions/build_conda/action.yml
index 982430c351..eac5efd61c 100644
--- a/.github/actions/build_conda/action.yml
+++ b/.github/actions/build_conda/action.yml
@@ -9,8 +9,8 @@ inputs:
     description: "CUDA toolkit version to use."
     default: ""
     required: false
-  raft:
-    description: "Enable RAFT support."
+  cuvs:
+    description: "Enable cuVS support."
     default: ""
     required: false
   compiler_version:
@@ -63,14 +63,14 @@ runs:
       run: |
         conda build faiss --user pytorch --label ${{ inputs.label }} -c pytorch
     - name: Conda build (GPU)
-      if: inputs.label == '' && inputs.cuda != '' && inputs.raft == ''
+      if: inputs.label == '' && inputs.cuda != '' && inputs.cuvs == ''
       shell: ${{ steps.choose_shell.outputs.shell }}
       working-directory: conda
       run: |
         conda build faiss-gpu --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \
             -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia
     - name: Conda build (GPU) w/ anaconda upload
-      if: inputs.label != '' && inputs.cuda != '' && inputs.raft == ''
+      if: inputs.label != '' && inputs.cuda != '' && inputs.cuvs == ''
       shell: ${{ steps.choose_shell.outputs.shell }}
       working-directory: conda
       env:
@@ -78,19 +78,19 @@ runs:
       run: |
         conda build faiss-gpu --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \
             --user pytorch --label ${{ inputs.label }} -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia
-    - name: Conda build (GPU w/ RAFT)
-      if: inputs.label == '' && inputs.cuda != '' && inputs.raft != ''
+    - name: Conda build (GPU w/ cuVS)
+      if: inputs.label == '' && inputs.cuda != '' && inputs.cuvs != ''
       shell: ${{ steps.choose_shell.outputs.shell }}
       working-directory: conda
       run: |
-        conda build faiss-gpu-raft --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \
+        conda build faiss-gpu-cuvs --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \
             -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge
-    - name: Conda build (GPU w/ RAFT) w/ anaconda upload
-      if: inputs.label != '' && inputs.cuda != '' && inputs.raft != ''
+    - name: Conda build (GPU w/ cuVS) w/ anaconda upload
+      if: inputs.label != '' && inputs.cuda != '' && inputs.cuvs != ''
       shell: ${{ steps.choose_shell.outputs.shell }}
       working-directory: conda
       env:
         PACKAGE_TYPE: ${{ inputs.label }}
       run: |
-        conda build faiss-gpu-raft --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \
+        conda build faiss-gpu-cuvs --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \
             --user pytorch --label ${{ inputs.label }} -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index ed3f371bb0..f4aef4690d 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -77,8 +77,8 @@ jobs:
       - uses: ./.github/actions/build_cmake
         with:
           gpu: ON
-  linux-x86_64-GPU-w-RAFT-cmake:
-    name: Linux x86_64 GPU w/ RAFT (cmake)
+  linux-x86_64-GPU-w-CUVS-cmake:
+    name: Linux x86_64 GPU w/ cuVS (cmake)
     needs: linux-x86_64-cmake
     runs-on: 4-core-ubuntu-gpu-t4
     steps:
@@ -87,7 +87,7 @@ jobs:
       - uses: ./.github/actions/build_cmake
         with:
           gpu: ON
-          raft: ON
+          cuvs: ON
   linux-x86_64-conda:
     name: Linux x86_64 (conda)
     needs: linux-x86_64-cmake
@@ -152,8 +152,8 @@ jobs:
           label: main
           cuda: "11.4.4"
           compiler_version: "11.2"
-  linux-x86_64-GPU-RAFT-packages-CUDA11-8-0:
-    name: Linux x86_64 GPU w/ RAFT packages (CUDA 11.8.0)
+  linux-x86_64-GPU-CUVS-packages-CUDA11-8-0:
+    name: Linux x86_64 GPU w/ cuVS packages (CUDA 11.8.0)
     if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
     runs-on: 4-core-ubuntu-gpu-t4
     env:
@@ -167,7 +167,7 @@ jobs:
       - uses: ./.github/actions/build_conda
         with:
           label: main
-          raft: "ON"
+          cuvs: "ON"
           cuda: "11.8.0"
           compiler_version: "11.2"
   linux-x86_64-GPU-packages-CUDA-12-1-1:
@@ -187,8 +187,8 @@ jobs:
           label: main
           cuda: "12.1.1"
           compiler_version: "11.2"
-  linux-x86_64-GPU-RAFT-packages-CUDA12-1-1:
-    name: Linux x86_64 GPU w/ RAFT packages (CUDA 12.1.1)
+  linux-x86_64-GPU-CUVS-packages-CUDA12-1-1:
+    name: Linux x86_64 GPU w/ cuVS packages (CUDA 12.1.1)
     if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
     runs-on: 4-core-ubuntu-gpu-t4
     env:
@@ -202,7 +202,7 @@ jobs:
       - uses: ./.github/actions/build_conda
         with:
           label: main
-          raft: "ON"
+          cuvs: "ON"
           cuda: "12.1.1"
           compiler_version: "11.2"
   windows-x86_64-packages:
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index eabee07744..0d4434e2fa 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -39,8 +39,8 @@ jobs:
           label: nightly
           cuda: "11.4.4"
           compiler_version: "11.2"
-  linux-x86_64-GPU-RAFT-CUDA11-8-0-nightly:
-    name: Linux x86_64 GPU w/ RAFT nightlies (CUDA 11.8.0)
+  linux-x86_64-GPU-CUVS-CUDA11-8-0-nightly:
+    name: Linux x86_64 GPU w/ cuVS nightlies (CUDA 11.8.0)
     runs-on: 4-core-ubuntu-gpu-t4
     env:
       CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"
@@ -55,7 +55,7 @@ jobs:
           ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
         with:
           label: nightly
-          raft: "ON"
+          cuvs: "ON"
           cuda: "11.8.0"
           compiler_version: "11.2"
   linux-x86_64-GPU-CUDA-12-1-1-nightly:
@@ -76,8 +76,8 @@ jobs:
           label: nightly
           cuda: "12.1.1"
           compiler_version: "11.2"
-  linux-x86_64-GPU-RAFT-CUDA12-1-1-nightly:
-    name: Linux x86_64 GPU w/ RAFT nightlies (CUDA 12.1.1)
+  linux-x86_64-GPU-CUVS-CUDA12-1-1-nightly:
+    name: Linux x86_64 GPU w/ cuVS nightlies (CUDA 12.1.1)
     runs-on: 4-core-ubuntu-gpu-t4
     env:
       CUDA_ARCHS: "70-real;72-real;75-real;80;86-real"
@@ -92,7 +92,7 @@ jobs:
           ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
         with:
           label: nightly
-          raft: "ON"
+          cuvs: "ON"
           cuda: "12.1.1"
           compiler_version: "11.2"
   windows-x86_64-nightly:
diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu
index 5d8bb2d76b..d5c9098c97 100644
--- a/faiss/gpu/GpuDistance.cu
+++ b/faiss/gpu/GpuDistance.cu
@@ -377,7 +377,7 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
     if (should_use_cuvs(args)) {
         FAISS_THROW_IF_NOT_MSG(
                 !should_use_cuvs(args),
-                "RAFT has not been compiled into the current version so it cannot be used.");
+                "cuVS has not been compiled into the current version so it cannot be used.");
     } else
 #endif
             if (args.vectorType == DistanceDataType::F32) {
diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index 059e0c510b..5c4430633b 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -22,9 +22,9 @@
 
 #include <faiss/IndexHNSW.h>
 #include <faiss/gpu/GpuIndexCagra.h>
+#include <faiss/gpu/StandardGpuResources.h>
 #include <cstddef>
 #include <faiss/gpu/impl/CuvsCagra.cuh>
-#include <faiss/gpu/StandardGpuResources.h>
 #include <optional>
 
 namespace faiss {
diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu
index 3ed98eef84..eaf3a3ae7a 100644
--- a/faiss/gpu/GpuIndexIVFFlat.cu
+++ b/faiss/gpu/GpuIndexIVFFlat.cu
@@ -254,7 +254,8 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) {
         auto cuvsIndex_ =
                 std::static_pointer_cast<CuvsIVFFlat, IVFFlat>(index_);
 
-        std::optional<cuvs::neighbors::ivf_flat::index<float, idx_t>> cuvs_ivfflat_index;
+        std::optional<cuvs::neighbors::ivf_flat::index<float, idx_t>>
+                cuvs_ivfflat_index;
 
         if (getDeviceForAddress(x) >= 0) {
             auto dataset_d =
@@ -268,14 +269,16 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) {
                     raft_handle, cuvs_index_params, x_view);
         }
 
-        quantizer->train(nlist, cuvs_ivfflat_index.value().centers().data_handle());
-        quantizer->add(nlist, cuvs_ivfflat_index.value().centers().data_handle());
+        quantizer->train(
+                nlist, cuvs_ivfflat_index.value().centers().data_handle());
+        quantizer->add(
+                nlist, cuvs_ivfflat_index.value().centers().data_handle());
         raft_handle.sync_stream();
 
         cuvsIndex_->setCuvsIndex(std::move(*cuvs_ivfflat_index));
 #else
         FAISS_THROW_MSG(
-                "RAFT has not been compiled into the current version so it cannot be used.");
+                "cuVS has not been compiled into the current version so it cannot be used.");
 #endif
     } else {
         // FIXME: GPUize more of this
@@ -349,7 +352,7 @@ void GpuIndexIVFFlat::setIndex_(
                 space));
 #else
         FAISS_THROW_MSG(
-                "RAFT has not been compiled into the current version so it cannot be used.");
+                "cuVS has not been compiled into the current version so it cannot be used.");
 #endif
     } else {
         index_.reset(new IVFFlat(
diff --git a/faiss/gpu/GpuIndexIVFPQ.cu b/faiss/gpu/GpuIndexIVFPQ.cu
index 339b626123..39f14b6af0 100644
--- a/faiss/gpu/GpuIndexIVFPQ.cu
+++ b/faiss/gpu/GpuIndexIVFPQ.cu
@@ -402,7 +402,7 @@ void GpuIndexIVFPQ::train(idx_t n, const float* x) {
         cuvs_index_params.add_data_on_build = false;
 
         auto cuvsIndex_ = std::static_pointer_cast<CuvsIVFPQ, IVFPQ>(index_);
-        
+
         std::optional<cuvs::neighbors::ivf_pq::index<idx_t>> cuvs_ivfpq_index;
 
         if (getDeviceForAddress(x) >= 0) {
@@ -436,7 +436,7 @@ void GpuIndexIVFPQ::train(idx_t n, const float* x) {
         cuvsIndex_->setCuvsIndex(std::move(*cuvs_ivfpq_index));
 #else
         FAISS_THROW_MSG(
-                "RAFT has not been compiled into the current version so it cannot be used.");
+                "cuVS has not been compiled into the current version so it cannot be used.");
 #endif
     } else {
         // FIXME: GPUize more of this
@@ -511,7 +511,7 @@ void GpuIndexIVFPQ::setIndex_(
                 space));
 #else
         FAISS_THROW_MSG(
-                "RAFT has not been compiled into the current version so it cannot be used.");
+                "cuVS has not been compiled into the current version so it cannot be used.");
 #endif
     } else {
         index_.reset(new IVFPQ(
diff --git a/faiss/gpu/impl/CuvsCagra.cu b/faiss/gpu/impl/CuvsCagra.cu
index 82457e2f66..c4e9590240 100644
--- a/faiss/gpu/impl/CuvsCagra.cu
+++ b/faiss/gpu/impl/CuvsCagra.cu
@@ -20,10 +20,10 @@
  * limitations under the License.
  */
 
+#include <faiss/gpu/StandardGpuResources.h>
 #include <faiss/gpu/utils/CuvsUtils.h>
 #include <faiss/gpu/utils/DeviceUtils.h>
 #include <faiss/gpu/impl/CuvsCagra.cuh>
-#include <faiss/gpu/StandardGpuResources.h>
 
 #include <cuvs/neighbors/cagra.hpp>
 #include <raft/core/device_mdspan.hpp>
diff --git a/faiss/gpu/impl/CuvsFlatIndex.cu b/faiss/gpu/impl/CuvsFlatIndex.cu
index 9930e9d640..682795d102 100644
--- a/faiss/gpu/impl/CuvsFlatIndex.cu
+++ b/faiss/gpu/impl/CuvsFlatIndex.cu
@@ -98,7 +98,8 @@ void CuvsFlatIndex::query(
 
         cuvs::neighbors::brute_force::index idx(
                 handle, index, norms_view, distance, metricArg);
-        cuvs::neighbors::brute_force::search(handle, idx, search, inds, dists, std::nullopt);
+        cuvs::neighbors::brute_force::search(
+                handle, idx, search, inds, dists, std::nullopt);
 
         if (metric == MetricType::METRIC_Lp) {
             raft::linalg::unary_op(
diff --git a/faiss/gpu/impl/CuvsIVFFlat.cu b/faiss/gpu/impl/CuvsIVFFlat.cu
index 7c3226ccab..baa31e8470 100644
--- a/faiss/gpu/impl/CuvsIVFFlat.cu
+++ b/faiss/gpu/impl/CuvsIVFFlat.cu
@@ -532,7 +532,11 @@ void CuvsIVFFlatCodePackerInterleaved::pack_1(
         size_t offset,
         uint8_t* block) const {
     cuvs::neighbors::ivf_flat::helpers::codepacker::pack_1(
-            reinterpret_cast<const float*>(flat_code), reinterpret_cast<float*>(block), dim, chunk_size, static_cast<uint32_t>(offset));
+            reinterpret_cast<const float*>(flat_code),
+            reinterpret_cast<float*>(block),
+            dim,
+            chunk_size,
+            static_cast<uint32_t>(offset));
 }
 
 void CuvsIVFFlatCodePackerInterleaved::unpack_1(
@@ -540,7 +544,11 @@ void CuvsIVFFlatCodePackerInterleaved::unpack_1(
         size_t offset,
         uint8_t* flat_code) const {
     cuvs::neighbors::ivf_flat::helpers::codepacker::unpack_1(
-            reinterpret_cast<const float*>(block), reinterpret_cast<float*>(flat_code), dim, chunk_size, static_cast<uint32_t>(offset));
+            reinterpret_cast<const float*>(block),
+            reinterpret_cast<float*>(flat_code),
+            dim,
+            chunk_size,
+            static_cast<uint32_t>(offset));
 }
 
 } // namespace gpu
diff --git a/faiss/gpu/impl/CuvsIVFPQ.cu b/faiss/gpu/impl/CuvsIVFPQ.cu
index d2a883faa1..e7d91beeed 100644
--- a/faiss/gpu/impl/CuvsIVFPQ.cu
+++ b/faiss/gpu/impl/CuvsIVFPQ.cu
@@ -21,11 +21,11 @@
  */
 
 #include <faiss/gpu/GpuIndexFlat.h>
+#include <faiss/gpu/StandardGpuResources.h>
 #include <faiss/gpu/utils/CuvsUtils.h>
 #include <faiss/gpu/impl/CuvsIVFPQ.cuh>
 #include <faiss/gpu/impl/FlatIndex.cuh>
 #include <faiss/gpu/utils/Transpose.cuh>
-#include <faiss/gpu/StandardGpuResources.h>
 
 #include <cuvs/neighbors/common.hpp>
 #include <cuvs/neighbors/ivf_pq.hpp>
@@ -449,7 +449,8 @@ void CuvsIVFPQ::copyInvertedListsFrom(const InvertedLists* ivf) {
 }
 
 void CuvsIVFPQ::setCuvsIndex(cuvs::neighbors::ivf_pq::index<idx_t>&& idx) {
-    cuvs_index = std::make_shared<cuvs::neighbors::ivf_pq::index<idx_t>>(std::move(idx));
+    cuvs_index = std::make_shared<cuvs::neighbors::ivf_pq::index<idx_t>>(
+            std::move(idx));
     setBasePQCentroids_();
 }
 
diff --git a/faiss/gpu/impl/CuvsIVFPQ.cuh b/faiss/gpu/impl/CuvsIVFPQ.cuh
index 41a43175f5..67984ce158 100644
--- a/faiss/gpu/impl/CuvsIVFPQ.cuh
+++ b/faiss/gpu/impl/CuvsIVFPQ.cuh
@@ -101,7 +101,6 @@ class CuvsIVFPQ : public IVFPQ {
     void copyInvertedListsFrom(const InvertedLists* ivf) override;
 
     /// Replace the cuVS index
-//     void setCuvsIndex(cuvs::neighbors::ivf_pq::index<idx_t>* idx);
     void setCuvsIndex(cuvs::neighbors::ivf_pq::index<idx_t>&& idx);
 
     /// Classify and encode/add vectors to our IVF lists.
diff --git a/faiss/gpu/test/TestGpuDistance.cu b/faiss/gpu/test/TestGpuDistance.cu
index c9742ed92b..6176779d77 100644
--- a/faiss/gpu/test/TestGpuDistance.cu
+++ b/faiss/gpu/test/TestGpuDistance.cu
@@ -173,7 +173,7 @@ void testTransposition(
 #else
     FAISS_THROW_IF_NOT_MSG(
             !use_cuvs,
-            "RAFT has not been compiled into the current version so it cannot be used.");
+            "cuVS has not been compiled into the current version so it cannot be used.");
 #endif
 
     evaluate_bfknn(

From 5dad218378d53b55bfaa33c6e3a55a7ce7f799d3 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Fri, 19 Jul 2024 14:30:15 -0700
Subject: [PATCH 076/148] format

---
 faiss/gpu/GpuDistance.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu
index d5c9098c97..c80b898636 100644
--- a/faiss/gpu/GpuDistance.cu
+++ b/faiss/gpu/GpuDistance.cu
@@ -327,7 +327,7 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
                     const_cast<float*>(
                             reinterpret_cast<const float*>(args.queries)),
                     raft::matrix_extent<int64_t>(num_queries, dims));
-            
+
             std::optional<raft::temporary_device_buffer<
                     const float,
                     raft::vector_extent<int64_t>>>

From 245fe2004e16de6b227ddb3426b3fa651d27e55c Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Fri, 19 Jul 2024 14:33:53 -0700
Subject: [PATCH 077/148] small change to error message

---
 faiss/gpu/GpuIndexFlat.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/faiss/gpu/GpuIndexFlat.cu b/faiss/gpu/GpuIndexFlat.cu
index 3a7e774a5f..8e37dbf127 100644
--- a/faiss/gpu/GpuIndexFlat.cu
+++ b/faiss/gpu/GpuIndexFlat.cu
@@ -105,7 +105,7 @@ void GpuIndexFlat::resetIndex_(int dims) {
 #else
     if (should_use_cuvs(config_)) {
         FAISS_THROW_MSG(
-                "RAFT has not been compiled into the current version so it cannot be used.");
+                "cuVS has not been compiled into the current version so it cannot be used.");
     } else
 #endif
     {

From 1fe434a779123a484995ad34120b5136369026bb Mon Sep 17 00:00:00 2001
From: tarang-jain <jaintarang2015.com>
Date: Wed, 24 Jul 2024 10:11:56 -0700
Subject: [PATCH 078/148] don't need to link against raft::raft

---
 c_api/gpu/CMakeLists.txt      | 2 +-
 faiss/gpu/CMakeLists.txt      | 2 +-
 faiss/gpu/test/CMakeLists.txt | 2 +-
 faiss/python/CMakeLists.txt   | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/c_api/gpu/CMakeLists.txt b/c_api/gpu/CMakeLists.txt
index 5dcea36ae9..0f6d2f3186 100644
--- a/c_api/gpu/CMakeLists.txt
+++ b/c_api/gpu/CMakeLists.txt
@@ -16,7 +16,7 @@ file(GLOB FAISS_C_API_GPU_HEADERS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.h")
 faiss_install_headers("${FAISS_C_API_GPU_HEADERS}" c_api/gpu)
 
 find_package(CUDAToolkit REQUIRED)
-target_link_libraries(faiss_c PUBLIC CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
+target_link_libraries(faiss_c PUBLIC CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
 
 add_executable(example_gpu_c EXCLUDE_FROM_ALL example_gpu_c.c)
 target_link_libraries(example_gpu_c PRIVATE faiss_c)
diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
index 83eb055ebd..537ac164ef 100644
--- a/faiss/gpu/CMakeLists.txt
+++ b/faiss/gpu/CMakeLists.txt
@@ -320,5 +320,5 @@ __nv_relfatbin : { *(__nv_relfatbin) }
 target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
 
 find_package(CUDAToolkit REQUIRED)
-target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass> $<$<BOOL:${FAISS_ENABLE_CUVS}>:OpenMP::OpenMP_CXX>)
+target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass> $<$<BOOL:${FAISS_ENABLE_CUVS}>:OpenMP::OpenMP_CXX>)
 target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$<BOOL:${FAISS_ENABLE_CUVS}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt
index 34f6217970..5b71072df3 100644
--- a/faiss/gpu/test/CMakeLists.txt
+++ b/faiss/gpu/test/CMakeLists.txt
@@ -22,7 +22,7 @@ find_package(CUDAToolkit REQUIRED)
 # Defines `gtest_discover_tests()`.
 include(GoogleTest)
 add_library(faiss_gpu_test_helper TestUtils.cpp)
-target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
+target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
 
 macro(faiss_gpu_test file)
   get_filename_component(test_name ${file} NAME_WE)
diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt
index 5703d73886..040157497c 100644
--- a/faiss/python/CMakeLists.txt
+++ b/faiss/python/CMakeLists.txt
@@ -129,9 +129,9 @@ if(FAISS_ENABLE_GPU)
   if(FAISS_ENABLE_CUVS)
     find_package(raft COMPONENTS compiled distributed)
   endif()
-  target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
-  target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
-  target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
+  target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
+  target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
+  target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
 endif()
 
 find_package(OpenMP REQUIRED)

From b63d54308e29f27169f152081419d338c835efa1 Mon Sep 17 00:00:00 2001
From: tarang-jain <jaintarang2015.com>
Date: Wed, 24 Jul 2024 10:12:26 -0700
Subject: [PATCH 079/148] rm build.sh

---
 build.sh | 59 --------------------------------------------------------
 1 file changed, 59 deletions(-)
 delete mode 100755 build.sh

diff --git a/build.sh b/build.sh
deleted file mode 100755
index 4f522189a7..0000000000
--- a/build.sh
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/bin/bash
-
-# NOTE: This file is temporary for the proof-of-concept branch and will be removed before this PR is merged
-
-BUILD_TYPE=Release
-BUILD_DIR=build/
-
-RAFT_REPO_REL=""
-EXTRA_CMAKE_ARGS=""
-set -e
-
-if [[ ${RAFT_REPO_REL} != "" ]]; then
-  RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`"
-  EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCPM_raft_SOURCE=${RAFT_REPO_PATH}"
-fi
-
-if [ "$1" == "clean" ]; then
-  rm -rf build
-  rm -rf .cache
-  exit 0
-fi
-
-if [ "$1" == "test" ]; then
-  make -C build -j test
-  exit 0
-fi
-
-if [ "$1" == "test-raft" ]; then
-  ./build/faiss/gpu/test/TestRaftIndexIVFFlat
-  exit 0
-fi
-
-mkdir -p $BUILD_DIR
-cd $BUILD_DIR
-
-cmake \
- -DFAISS_ENABLE_GPU=ON \
- -DFAISS_ENABLE_CUVS=ON \
- -DFAISS_ENABLE_PYTHON=ON \
- -DBUILD_TESTING=ON \
- -DBUILD_SHARED_LIBS=ON \
- -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
- -DFAISS_OPT_LEVEL=avx2 \
- -DRAFT_NVTX=OFF \
- -DCMAKE_CUDA_ARCHITECTURES="NATIVE" \
- -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-  -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
- -DCMAKE_C_COMPILER_LAUNCHER=ccache \
- -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
- ${EXTRA_CMAKE_ARGS} \
- ../
-
-
-
-# make -C build -j12 faiss
-cmake  --build . -j12
-# make -C build -j12 swigfaiss
-# (cd build/faiss/python && python setup.py install)
-

From a598ecb32eb2a672bfb200942fffe3f30d83e683 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 30 Jul 2024 23:46:00 -0700
Subject: [PATCH 080/148] cleanup docs; merge latest main

---
 .github/actions/build_cmake/action.yml | 2 +-
 faiss/gpu/GpuDistance.cu               | 4 ----
 faiss/gpu/GpuIndexCagra.h              | 2 +-
 faiss/gpu/GpuIndexIVFFlat.cu           | 6 +++---
 faiss/gpu/GpuIndexIVFPQ.cu             | 4 ++--
 faiss/gpu/GpuResources.h               | 1 -
 6 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index 9bc4d356d6..62d1d1bc6b 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -112,5 +112,5 @@ runs:
     - name: Upload test results
       uses: actions/upload-artifact@v4
       with:
-        name: test-results-${{ inputs.opt_level }}-${{ inputs.gpu }}-${{ inputs.cuvs }}
+        name: test-results-${{ runner.arch }}-${{ inputs.opt_level }}-${{ inputs.gpu }}-${{ inputs.cuvs }}
         path: test-results
diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu
index c80b898636..97b5c6c136 100644
--- a/faiss/gpu/GpuDistance.cu
+++ b/faiss/gpu/GpuDistance.cu
@@ -47,10 +47,6 @@
 namespace faiss {
 namespace gpu {
 
-// #if defined USE_NVIDIA_CUVS
-// using namespace cuvs::neighbors;
-// #endif
-
 bool should_use_cuvs(GpuDistanceParams args) {
     cudaDeviceProp prop;
     int dev = args.device >= 0 ? args.device : getCurrentDevice();
diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
index 63a5203187..99009c11aa 100644
--- a/faiss/gpu/GpuIndexCagra.h
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -281,4 +281,4 @@ struct GpuIndexCagra : public GpuIndex {
 };
 
 } // namespace gpu
-} // namespace faiss
\ No newline at end of file
+} // namespace faiss
diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu
index eaf3a3ae7a..0ceaa7aa56 100644
--- a/faiss/gpu/GpuIndexIVFFlat.cu
+++ b/faiss/gpu/GpuIndexIVFFlat.cu
@@ -74,7 +74,7 @@ GpuIndexIVFFlat::GpuIndexIVFFlat(
           reserveMemoryVecs_(0) {
     FAISS_THROW_IF_NOT_MSG(
             !should_use_cuvs(config),
-            "GpuIndexIVFFlat: RAFT does not support separate coarseQuantizer");
+            "GpuIndexIVFFlat: cuVS does not support separate coarseQuantizer");
     // We could have been passed an already trained coarse quantizer. There is
     // no other quantizer that we need to train, so this is sufficient
     if (this->is_trained) {
@@ -263,10 +263,10 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) {
             cuvs_ivfflat_index = cuvs::neighbors::ivf_flat::build(
                     raft_handle, cuvs_index_params, dataset_d);
         } else {
-            auto x_view =
+            auto dataset_h =
                     raft::make_host_matrix_view<const float, idx_t>(x, n, d);
             cuvs_ivfflat_index = cuvs::neighbors::ivf_flat::build(
-                    raft_handle, cuvs_index_params, x_view);
+                    raft_handle, cuvs_index_params, dataset_h);
         }
 
         quantizer->train(
diff --git a/faiss/gpu/GpuIndexIVFPQ.cu b/faiss/gpu/GpuIndexIVFPQ.cu
index 39f14b6af0..656a090264 100644
--- a/faiss/gpu/GpuIndexIVFPQ.cu
+++ b/faiss/gpu/GpuIndexIVFPQ.cu
@@ -95,7 +95,7 @@ GpuIndexIVFPQ::GpuIndexIVFPQ(
 
     FAISS_THROW_IF_NOT_MSG(
             !config.use_cuvs,
-            "GpuIndexIVFPQ: RAFT does not support separate coarseQuantizer");
+            "GpuIndexIVFPQ: cuVS does not support separate coarseQuantizer");
 
     verifyPQSettings_();
 }
@@ -361,7 +361,7 @@ void GpuIndexIVFPQ::train(idx_t n, const float* x) {
 
     FAISS_ASSERT(!index_);
 
-    // RAFT does not support using an external index for assignment. Fall back
+    // cuVS does not support using an external index for assignment. Fall back
     // to the classical GPU impl
     if (should_use_cuvs(config_)) {
 #if defined USE_NVIDIA_CUVS
diff --git a/faiss/gpu/GpuResources.h b/faiss/gpu/GpuResources.h
index b1a181d214..e200ce2dea 100644
--- a/faiss/gpu/GpuResources.h
+++ b/faiss/gpu/GpuResources.h
@@ -163,7 +163,6 @@ struct AllocRequest : public AllocInfo {
 
 #if defined USE_NVIDIA_CUVS
     rmm::mr::device_memory_resource* mr = nullptr;
-    // rmm::device_async_resource_ref mr;
 #endif
 };
 

From f62ed131c7580c8d94d8f6bcfb4bec379567cdc1 Mon Sep 17 00:00:00 2001
From: tarang-jain <jaintarang2015.com>
Date: Tue, 20 Aug 2024 12:10:03 -0700
Subject: [PATCH 081/148] rm RAFT mentions

---
 faiss/gpu/GpuDistance.h                 |  9 +++--
 faiss/gpu/GpuIndex.h                    |  4 +--
 faiss/gpu/GpuIndexIVFFlat.cu            |  2 +-
 faiss/gpu/GpuIndexIVFPQ.cu              |  8 ++---
 faiss/gpu/GpuIndexIVFPQ.h               |  2 +-
 faiss/gpu/impl/CuvsFlatIndex.cu         |  2 +-
 faiss/gpu/impl/CuvsIVFFlat.cu           | 12 +++----
 faiss/gpu/impl/CuvsIVFPQ.cu             | 10 +++---
 faiss/gpu/impl/CuvsIVFPQ.cuh            |  4 +--
 faiss/gpu/test/TestGpuIndexIVFFlat.cpp  | 48 ++++++++++++-------------
 faiss/gpu/test/TestGpuIndexIVFPQ.cpp    |  8 ++---
 faiss/gpu/test/test_index_cpu_to_gpu.py |  2 +-
 12 files changed, 57 insertions(+), 54 deletions(-)

diff --git a/faiss/gpu/GpuDistance.h b/faiss/gpu/GpuDistance.h
index f55e813392..251b7d011e 100644
--- a/faiss/gpu/GpuDistance.h
+++ b/faiss/gpu/GpuDistance.h
@@ -106,12 +106,15 @@ struct GpuDistanceParams {
     /// execution
     int device = -1;
 
-    /// Should the index dispatch down to RAFT?
-    /// TODO: change default to true if RAFT is enabled
+    /// Should the index dispatch down to cuVS?
+#if defined USE_NVIDIA_CUVS
+    bool use_cuvs = true;
+#else
     bool use_cuvs = false;
+#endif
 };
 
-/// A function that determines whether RAFT should be used based on various
+/// A function that determines whether cuVS should be used based on various
 /// conditions (such as unsupported architecture)
 bool should_use_cuvs(GpuDistanceParams args);
 
diff --git a/faiss/gpu/GpuIndex.h b/faiss/gpu/GpuIndex.h
index a050974b1e..4b1cb752fe 100644
--- a/faiss/gpu/GpuIndex.h
+++ b/faiss/gpu/GpuIndex.h
@@ -37,7 +37,7 @@ struct GpuIndexConfig {
     /// more memory than is available on the GPU.
     MemorySpace memorySpace = MemorySpace::Device;
 
-    /// Should the index dispatch down to RAFT?
+    /// Should the index dispatch down to cuVS?
 #if defined USE_NVIDIA_CUVS
     bool use_cuvs = true;
 #else
@@ -45,7 +45,7 @@ struct GpuIndexConfig {
 #endif
 };
 
-/// A centralized function that determines whether RAFT should
+/// A centralized function that determines whether cuVS should
 /// be used based on various conditions (such as unsupported architecture)
 bool should_use_cuvs(GpuIndexConfig config_);
 
diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu
index 0ceaa7aa56..47cbac1352 100644
--- a/faiss/gpu/GpuIndexIVFFlat.cu
+++ b/faiss/gpu/GpuIndexIVFFlat.cu
@@ -120,7 +120,7 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) {
     // Clear out our old data
     index_.reset();
 
-    // skip base class allocations if RAFT is enabled
+    // skip base class allocations if cuVS is enabled
     if (!should_use_cuvs(config_)) {
         baseIndex_.reset();
     }
diff --git a/faiss/gpu/GpuIndexIVFPQ.cu b/faiss/gpu/GpuIndexIVFPQ.cu
index 656a090264..c78c6970f8 100644
--- a/faiss/gpu/GpuIndexIVFPQ.cu
+++ b/faiss/gpu/GpuIndexIVFPQ.cu
@@ -111,7 +111,7 @@ void GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) {
     // Clear out our old data
     index_.reset();
 
-    // skip base class allocations if RAFT is enabled
+    // skip base class allocations if cuVS is enabled
     if (!should_use_cuvs(config_)) {
         baseIndex_.reset();
     }
@@ -349,11 +349,11 @@ void GpuIndexIVFPQ::train(idx_t n, const float* x) {
     if (this->is_trained) {
         FAISS_ASSERT(index_);
         if (should_use_cuvs(config_)) {
-            // if RAFT is enabled, copy the IVF centroids to the RAFT index in
-            // case it has been reset. This is because reset clears the RAFT
+            // if cuVS is enabled, copy the IVF centroids to the cuVS index in
+            // case it has been reset. This is because reset clears the cuVS
             // index and its centroids.
             // TODO: change this once the coarse quantizer is separated from
-            // RAFT index
+            // cuVS index
             updateQuantizer();
         };
         return;
diff --git a/faiss/gpu/GpuIndexIVFPQ.h b/faiss/gpu/GpuIndexIVFPQ.h
index 5769dbb1d2..e51cb511e7 100644
--- a/faiss/gpu/GpuIndexIVFPQ.h
+++ b/faiss/gpu/GpuIndexIVFPQ.h
@@ -34,7 +34,7 @@ struct GpuIndexIVFPQConfig : public GpuIndexIVFConfig {
 
     /// Use the alternative memory layout for the IVF lists
     /// WARNING: this is a feature under development, and is only supported with
-    /// cuVS enabled for the index. Do not use if RAFT is not enabled.
+    /// cuVS enabled for the index. Do not use if cuVS is not enabled.
     bool interleavedLayout = false;
 
     /// Use GEMM-backed computation of PQ code distances for the no precomputed
diff --git a/faiss/gpu/impl/CuvsFlatIndex.cu b/faiss/gpu/impl/CuvsFlatIndex.cu
index 682795d102..736203d04f 100644
--- a/faiss/gpu/impl/CuvsFlatIndex.cu
+++ b/faiss/gpu/impl/CuvsFlatIndex.cu
@@ -54,7 +54,7 @@ void CuvsFlatIndex::query(
         Tensor<idx_t, 2, true>& outIndices,
         bool exactDistance) {
     /**
-     * RAFT doesn't yet support half-precision in bfknn.
+     * cuVS doesn't yet support half-precision in bfknn.
      * Use FlatIndex for float16 for now
      */
     if (useFloat16_) {
diff --git a/faiss/gpu/impl/CuvsIVFFlat.cu b/faiss/gpu/impl/CuvsIVFFlat.cu
index baa31e8470..1b73a8cfad 100644
--- a/faiss/gpu/impl/CuvsIVFFlat.cu
+++ b/faiss/gpu/impl/CuvsIVFFlat.cu
@@ -61,13 +61,13 @@ CuvsIVFFlat::CuvsIVFFlat(
                   useResidual,
                   scalarQ,
                   interleavedLayout,
-                  // skip ptr allocations in base class (handled by RAFT
+                  // skip ptr allocations in base class (handled by cuVS
                   // internally)
                   indicesOptions,
                   space) {
     FAISS_THROW_IF_NOT_MSG(
             indicesOptions == INDICES_64_BIT,
-            "only INDICES_64_BIT is supported for RAFT index");
+            "only INDICES_64_BIT is supported for cuVS index");
 }
 
 CuvsIVFFlat::~CuvsIVFFlat() {}
@@ -96,7 +96,7 @@ void CuvsIVFFlat::search(
         Tensor<float, 2, true>& outDistances,
         Tensor<idx_t, 2, true>& outIndices) {
     /// NB: The coarse quantizer is ignored here. The user is assumed to have
-    /// called updateQuantizer() to modify the RAFT index if the quantizer was
+    /// called updateQuantizer() to modify the cuVS index if the quantizer was
     /// modified externally
 
     uint32_t numQueries = queries.getSize(0);
@@ -166,7 +166,7 @@ idx_t CuvsIVFFlat::addVectors(
         Tensor<float, 2, true>& vecs,
         Tensor<idx_t, 1, true>& indices) {
     /// NB: The coarse quantizer is ignored here. The user is assumed to have
-    /// called updateQuantizer() to update the RAFT index if the quantizer was
+    /// called updateQuantizer() to update the cuVS index if the quantizer was
     /// modified externally
 
     FAISS_ASSERT(cuvs_index != nullptr);
@@ -240,7 +240,7 @@ std::vector<uint8_t> CuvsIVFFlat::getListVectorData(
         idx_t listId,
         bool gpuFormat) const {
     if (gpuFormat) {
-        FAISS_THROW_MSG("gpuFormat should be false for RAFT indices");
+        FAISS_THROW_MSG("gpuFormat should be false for cuVS indices");
     }
     FAISS_ASSERT(cuvs_index != nullptr);
 
@@ -397,7 +397,7 @@ void CuvsIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) {
         // store the list size
         list_sizes_[i] = static_cast<uint32_t>(listSize);
 
-        // This RAFT list must currently be empty
+        // This cuVS list must currently be empty
         FAISS_ASSERT(getListLength(i) == 0);
 
         cuvs::neighbors::ivf::resize_list(
diff --git a/faiss/gpu/impl/CuvsIVFPQ.cu b/faiss/gpu/impl/CuvsIVFPQ.cu
index e7d91beeed..43b48d7e04 100644
--- a/faiss/gpu/impl/CuvsIVFPQ.cu
+++ b/faiss/gpu/impl/CuvsIVFPQ.cu
@@ -61,14 +61,14 @@ CuvsIVFPQ::CuvsIVFPQ(
                 useFloat16LookupTables,
                 useMMCodeDistance,
                 interleavedLayout,
-                // skip ptr allocations in base class (handled by RAFT
+                // skip ptr allocations in base class (handled by cuVS
                 // internally) false,
                 pqCentroidData,
                 indicesOptions,
                 space) {
     FAISS_THROW_IF_NOT_MSG(
             indicesOptions == INDICES_64_BIT,
-            "only INDICES_64_BIT is supported for RAFT index");
+            "only INDICES_64_BIT is supported for cuVS index");
 }
 
 CuvsIVFPQ::~CuvsIVFPQ() {}
@@ -239,7 +239,7 @@ std::vector<uint8_t> CuvsIVFPQ::getListVectorData(idx_t listId, bool gpuFormat)
         const {
     if (gpuFormat) {
         FAISS_THROW_MSG(
-                "gpuFormat should be false for RAFT indices. Unpacked codes are flat.");
+                "gpuFormat should be false for cuVS indices. Unpacked codes are flat.");
     }
     FAISS_ASSERT(cuvs_index);
 
@@ -363,7 +363,7 @@ idx_t CuvsIVFPQ::addVectors(
         Tensor<float, 2, true>& vecs,
         Tensor<idx_t, 1, true>& indices) {
     /// NB: The coarse quantizer is ignored here. The user is assumed to have
-    /// called updateQuantizer() to update the RAFT index if the quantizer was
+    /// called updateQuantizer() to update the cuVS index if the quantizer was
     /// modified externally
 
     FAISS_ASSERT(cuvs_index);
@@ -420,7 +420,7 @@ void CuvsIVFPQ::copyInvertedListsFrom(const InvertedLists* ivf) {
         // store the list size
         list_sizes_[i] = static_cast<uint32_t>(listSize);
 
-        // This RAFT list must currently be empty
+        // This cuVS list must currently be empty
         FAISS_ASSERT(getListLength(i) == 0);
 
         cuvs::neighbors::ivf::resize_list(
diff --git a/faiss/gpu/impl/CuvsIVFPQ.cuh b/faiss/gpu/impl/CuvsIVFPQ.cuh
index 67984ce158..d8eb5039ef 100644
--- a/faiss/gpu/impl/CuvsIVFPQ.cuh
+++ b/faiss/gpu/impl/CuvsIVFPQ.cuh
@@ -56,7 +56,7 @@ class CuvsIVFPQ : public IVFPQ {
     /// Reserve GPU memory in our inverted lists for this number of vectors
     void reserveMemory(idx_t numVecs) override;
 
-    /// Clear out the RAFT index
+    /// Clear out the cuVS index
     void reset() override;
 
     /// After adding vectors, one can call this to reclaim device memory
@@ -138,7 +138,7 @@ class CuvsIVFPQ : public IVFPQ {
     void setPQCentroids_();
 
     /// Update the product quantizer centroids buffer held in the IVFPQ class.
-    /// Used when the RAFT index was updated externally.
+    /// Used when the cuVS index was updated externally.
     void setBasePQCentroids_();
 
     /// cuVS IVF-PQ index
diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp
index 976d0f9b9b..7298ab4b88 100644
--- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp
+++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp
@@ -527,15 +527,15 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) {
     config.use_cuvs = true;
     config.indicesOptions = faiss::gpu::INDICES_64_BIT;
 
-    faiss::gpu::GpuIndexIVFFlat raftGpuIndex(
+    faiss::gpu::GpuIndexIVFFlat cuvsGpuIndex(
             &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config);
-    raftGpuIndex.copyFrom(&cpuIndex);
-    raftGpuIndex.nprobe = opt.nprobe;
+    cuvsGpuIndex.copyFrom(&cpuIndex);
+    cuvsGpuIndex.nprobe = opt.nprobe;
 
     faiss::gpu::compareIndices(
             queryVecs,
             cpuIndex,
-            raftGpuIndex,
+            cuvsGpuIndex,
             opt.numQuery,
             opt.dim,
             opt.k,
@@ -599,14 +599,14 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) {
     config.indicesOptions = faiss::gpu::INDICES_64_BIT;
     std::fill(distances.begin(), distances.end(), 0);
     std::fill(indices.begin(), indices.end(), 0);
-    faiss::gpu::GpuIndexIVFFlat raftGpuIndex(
+    faiss::gpu::GpuIndexIVFFlat cuvsGpuIndex(
             &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
-    raftGpuIndex.nprobe = opt.nprobe;
+    cuvsGpuIndex.nprobe = opt.nprobe;
 
-    raftGpuIndex.train(opt.numTrain, trainVecs.data());
-    raftGpuIndex.add(opt.numAdd, addVecs.data());
+    cuvsGpuIndex.train(opt.numTrain, trainVecs.data());
+    cuvsGpuIndex.add(opt.numAdd, addVecs.data());
 
-    raftGpuIndex.search(
+    cuvsGpuIndex.search(
             numQuery, nans.data(), opt.k, distances.data(), indices.data());
 
     for (int q = 0; q < numQuery; ++q) {
@@ -667,17 +667,17 @@ TEST(TestGpuIndexIVFFlat, AddNaN) {
 #if defined USE_NVIDIA_CUVS
     config.use_cuvs = true;
     config.indicesOptions = faiss::gpu::INDICES_64_BIT;
-    faiss::gpu::GpuIndexIVFFlat raftGpuIndex(
+    faiss::gpu::GpuIndexIVFFlat cuvsGpuIndex(
             &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config);
-    raftGpuIndex.nprobe = opt.nprobe;
-    raftGpuIndex.train(opt.numTrain, trainVecs.data());
+    cuvsGpuIndex.nprobe = opt.nprobe;
+    cuvsGpuIndex.train(opt.numTrain, trainVecs.data());
 
     // should not crash
-    EXPECT_EQ(raftGpuIndex.ntotal, 0);
-    raftGpuIndex.add(numNans, nans.data());
+    EXPECT_EQ(cuvsGpuIndex.ntotal, 0);
+    cuvsGpuIndex.add(numNans, nans.data());
 
     // should not crash
-    raftGpuIndex.search(
+    cuvsGpuIndex.search(
             opt.numQuery,
             queryVecs.data(),
             opt.k,
@@ -745,14 +745,14 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) {
 #if defined USE_NVIDIA_CUVS
     config.use_cuvs = true;
     config.indicesOptions = faiss::gpu::INDICES_64_BIT;
-    faiss::gpu::GpuIndexIVFFlat raftGpuIndex(
+    faiss::gpu::GpuIndexIVFFlat cuvsGpuIndex(
             &res, dim, numCentroids, faiss::METRIC_L2, config);
-    raftGpuIndex.copyFrom(&cpuIndex);
-    raftGpuIndex.nprobe = nprobe;
+    cuvsGpuIndex.copyFrom(&cpuIndex);
+    cuvsGpuIndex.nprobe = nprobe;
 
     faiss::gpu::compareIndices(
             cpuIndex,
-            raftGpuIndex,
+            cuvsGpuIndex,
             numQuery,
             dim,
             k,
@@ -824,15 +824,15 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) {
 #if defined USE_NVIDIA_CUVS
     config.use_cuvs = true;
     config.indicesOptions = faiss::gpu::INDICES_64_BIT;
-    faiss::gpu::GpuIndexIVFFlat raftGpuIndex(
+    faiss::gpu::GpuIndexIVFFlat cuvsGpuIndex(
             &res, dim, numCentroids, faiss::METRIC_L2, config);
-    raftGpuIndex.train(numTrain, trainVecs.data());
-    raftGpuIndex.add(numAdd, addVecs.data());
-    raftGpuIndex.nprobe = 1;
+    cuvsGpuIndex.train(numTrain, trainVecs.data());
+    cuvsGpuIndex.add(numAdd, addVecs.data());
+    cuvsGpuIndex.nprobe = 1;
 
     faiss::gpu::compareIndices(
             cpuIndex,
-            raftGpuIndex,
+            cuvsGpuIndex,
             numQuery,
             dim,
             k,
diff --git a/faiss/gpu/test/TestGpuIndexIVFPQ.cpp b/faiss/gpu/test/TestGpuIndexIVFPQ.cpp
index ccc9c8ff8f..c36348d03c 100644
--- a/faiss/gpu/test/TestGpuIndexIVFPQ.cpp
+++ b/faiss/gpu/test/TestGpuIndexIVFPQ.cpp
@@ -853,7 +853,7 @@ TEST(TestGpuIndexIVFPQ, UnifiedMemory) {
     config.use_cuvs = true;
     config.indicesOptions = faiss::gpu::INDICES_64_BIT;
 
-    faiss::gpu::GpuIndexIVFPQ raftGpuIndex(
+    faiss::gpu::GpuIndexIVFPQ cuvsGpuIndex(
             &res,
             dim,
             numCentroids,
@@ -861,12 +861,12 @@ TEST(TestGpuIndexIVFPQ, UnifiedMemory) {
             bitsPerCode,
             faiss::METRIC_L2,
             config);
-    raftGpuIndex.copyFrom(&cpuIndex);
-    raftGpuIndex.nprobe = nprobe;
+    cuvsGpuIndex.copyFrom(&cpuIndex);
+    cuvsGpuIndex.nprobe = nprobe;
 
     faiss::gpu::compareIndices(
             cpuIndex,
-            raftGpuIndex,
+            cuvsGpuIndex,
             numQuery,
             dim,
             k,
diff --git a/faiss/gpu/test/test_index_cpu_to_gpu.py b/faiss/gpu/test/test_index_cpu_to_gpu.py
index 79b58cb636..743b46705a 100644
--- a/faiss/gpu/test/test_index_cpu_to_gpu.py
+++ b/faiss/gpu/test/test_index_cpu_to_gpu.py
@@ -69,7 +69,7 @@ def test_implemented_indices(self):
         self.verify_clones_successfully("PCA32,IVF32,PQ8")
         self.verify_clones_successfully("PCA32,IVF32,PQ8np")
 
-        # set use_cuvs to false, these index types are not supported on RAFT
+        # set use_cuvs to false, these index types are not supported on cuVS
         self.verify_clones_successfully("IVF32,SQ8", use_cuvs=False)
         self.verify_clones_successfully(
             "PCA32,IVF32,SQ8", use_cuvs=False)

From 171979e6eba99ff2307d9e80a4cc08f61398290e Mon Sep 17 00:00:00 2001
From: tarang-jain <jaintarang2015.com>
Date: Tue, 20 Aug 2024 12:11:18 -0700
Subject: [PATCH 082/148] update action.yml

---
 .github/actions/build_cmake/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index 8785cb6a45..255be44037 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -103,7 +103,7 @@ runs:
       shell: bash
       run: ./faiss/gpu/hipify.sh
     - name: Symblink system dependencies
-      if: inputs.raft == 'ON' || inputs.rocm == 'ON'
+      if: inputs.cuvs == 'ON' || inputs.rocm == 'ON'
       shell: bash
       run: |
         # symblink system libraries for HIP compiler

From 68b3d2db9b97700b65683b5b942920c9d4374352 Mon Sep 17 00:00:00 2001
From: tarang-jain <jaintarang2015.com>
Date: Tue, 20 Aug 2024 12:16:45 -0700
Subject: [PATCH 083/148] undo some merge changes

---
 faiss/gpu/CMakeLists.txt | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
index 56594174d4..d0d60b5c7f 100644
--- a/faiss/gpu/CMakeLists.txt
+++ b/faiss/gpu/CMakeLists.txt
@@ -298,10 +298,6 @@ if (FAISS_ENABLE_ROCM)
   list(TRANSFORM FAISS_GPU_SRC REPLACE cu$ hip)
 endif()
 
-if (FAISS_ENABLE_ROCM)
-  list(TRANSFORM FAISS_GPU_SRC REPLACE cu$ hip)
-endif()
-
 # Export FAISS_GPU_HEADERS variable to parent scope.
 set(FAISS_GPU_HEADERS ${FAISS_GPU_HEADERS} PARENT_SCOPE)
 

From bd80b31f348232792626d7e8ac13c5aba735ffdd Mon Sep 17 00:00:00 2001
From: tarang-jain <jaintarang2015.com>
Date: Tue, 20 Aug 2024 12:18:27 -0700
Subject: [PATCH 084/148] undo some merge changes

---
 .github/workflows/build.yml | 23 -----------------------
 1 file changed, 23 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 896162f1f6..8de43c3f8a 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -90,29 +90,6 @@ jobs:
         with:
           gpu: ON
           cuvs: ON
-  linux-x86_64-GPU-w-ROCm-cmake:
-    name: Linux x86_64 GPU w/ ROCm (cmake)
-    needs: linux-x86_64-cmake
-    runs-on: faiss-amd-MI200
-    container:
-      image: ubuntu:22.04
-      options: --device=/dev/kfd --device=/dev/dri --ipc=host --shm-size 16G --group-add video --cap-add=SYS_PTRACE --cap-add=SYS_ADMIN
-    steps:
-      - name: Container setup
-        run: |
-            if [ -f /.dockerenv ]; then
-              apt-get update && apt-get install -y sudo && apt-get install -y git
-              git config --global --add safe.directory '*'
-            else
-              echo 'Skipping. Current job is not running inside a container.'
-            fi
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Build and Test (cmake)
-        uses: ./.github/actions/build_cmake
-        with:
-          gpu: ON
-          cuvs: ON
   linux-x86_64-GPU-w-ROCm-cmake:
     name: Linux x86_64 GPU w/ ROCm (cmake)
     needs: linux-x86_64-cmake

From a9efc3fb80983eba2b5a3cb3ee7e958346db6249 Mon Sep 17 00:00:00 2001
From: tarang-jain <jaintarang2015.com>
Date: Tue, 27 Aug 2024 07:51:41 -0700
Subject: [PATCH 085/148] updates after PR reviews

---
 .circleci/config.yml                   | 86 --------------------------
 faiss/gpu/impl/CuvsCagra.cu            | 10 +--
 faiss/gpu/test/TestGpuDistance.cu      |  2 +-
 faiss/gpu/test/TestGpuIndexIVFFlat.cpp |  2 +-
 4 files changed, 3 insertions(+), 97 deletions(-)
 delete mode 100644 .circleci/config.yml

diff --git a/.circleci/config.yml b/.circleci/config.yml
deleted file mode 100644
index 0b55aa06d4..0000000000
--- a/.circleci/config.yml
+++ /dev/null
@@ -1,86 +0,0 @@
-version: 2.1
-
-executors:
-  linux-x86_64-cpu:
-    docker:
-      - image: continuumio/miniconda3
-    resource_class: large
-
-jobs:
-  build_cmake:
-    parameters:
-      exec:
-        type: executor
-      opt_level:
-        type: string
-        default: generic
-    executor: << parameters.exec >>
-    environment:
-      OMP_NUM_THREADS: 10
-      MKL_THREADING_LAYER: GNU
-    steps:
-      - checkout
-      - run:
-          name: Install conda
-          command: |
-            if [ -n "${CONDA_ARCH}" ]
-            then
-              curl https://repo.anaconda.com/miniconda/Miniconda3-latest-${CONDA_ARCH}.sh --output miniconda.sh
-              bash miniconda.sh -b -p $HOME/miniconda
-              ~/miniconda/bin/conda init
-            fi
-      - run:
-          name: Set up environment
-          command: |
-            conda config --set solver libmamba
-            conda update -y -q conda
-      - run:
-          name: Install env using main channel
-          command: |
-            conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64 sysroot_linux-64
-      - run:
-          name: Build all targets
-          no_output_timeout: 30m
-          command: |
-            eval "$(conda shell.bash hook)"
-            conda activate
-            cmake -B build \
-                  -DBUILD_TESTING=ON \
-                  -DBUILD_SHARED_LIBS=ON \
-                  -DFAISS_ENABLE_GPU=OFF \
-                  -DFAISS_ENABLE_CUVS=OFF \
-                  -DFAISS_OPT_LEVEL=<< parameters.opt_level >> \
-                  -DFAISS_ENABLE_C_API=ON \
-                  -DPYTHON_EXECUTABLE=$(which python) \
-                  -DCMAKE_BUILD_TYPE=Release \
-                  -DBLA_VENDOR=Intel10_64_dyn \
-                  -DCMAKE_CUDA_FLAGS="-gencode arch=compute_75,code=sm_75" \
-                  .
-            make -k -C build -j$(nproc)
-      - run:
-          name: C++ tests
-          command: |
-            export GTEST_OUTPUT="xml:$(realpath .)/test-results/googletest/"
-            make -C build test
-      - run:
-          name: Install Python extension
-          command: |
-            cd build/faiss/python
-            python setup.py install
-      - run:
-          name: Python tests (CPU only)
-          command: |
-            conda install -y -q pytorch -c pytorch
-            pytest --junitxml=test-results/pytest/results.xml tests/test_*.py
-            pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py
-      - store_test_results:
-          path: test-results
-
-workflows:
-  version: 2
-  build:
-    jobs:
-      - build_cmake:
-          name: Linux x86_64 AVX512 (cmake)
-          exec: linux-x86_64-cpu
-          opt_level: "avx512"
diff --git a/faiss/gpu/impl/CuvsCagra.cu b/faiss/gpu/impl/CuvsCagra.cu
index c4e9590240..de9e9f3aab 100644
--- a/faiss/gpu/impl/CuvsCagra.cu
+++ b/faiss/gpu/impl/CuvsCagra.cu
@@ -87,7 +87,7 @@ CuvsCagra::CuvsCagra(
         index_params_.graph_build_params = graph_build_params;
     } else {
         cuvs::neighbors::cagra::graph_build_params::nn_descent_params
-                graph_build_params;
+                graph_build_params(intermediate_graph_degree);
         graph_build_params.max_iterations = nn_descent_niter;
         index_params_.graph_build_params = graph_build_params;
     }
@@ -179,14 +179,6 @@ void CuvsCagra::train(idx_t n, const float* x) {
     const raft::device_resources& raft_handle =
             resources_->getRaftHandleCurrentDevice();
 
-    //     auto nn_descent_params = std::make_optional<
-    //             cuvs::neighbors::nn_descent::index_params>();
-    //     nn_descent_params->graph_degree =
-    //     index_params_.intermediate_graph_degree;
-    //     nn_descent_params->intermediate_graph_degree =
-    //             1.5 * index_params_.intermediate_graph_degree;
-    //     nn_descent_params->max_iterations = index_params_.nn_descent_niter;
-
     if (std::holds_alternative<
                 cuvs::neighbors::cagra::graph_build_params::ivf_pq_params>(
                 index_params_.graph_build_params) &&
diff --git a/faiss/gpu/test/TestGpuDistance.cu b/faiss/gpu/test/TestGpuDistance.cu
index 6176779d77..b36da5ff5f 100644
--- a/faiss/gpu/test/TestGpuDistance.cu
+++ b/faiss/gpu/test/TestGpuDistance.cu
@@ -48,7 +48,7 @@ void evaluate_bfknn(
     bfKnn(res, args);
 
     std::stringstream str;
-    str << "using raft " << args.use_cuvs << "metric " << metric
+    str << "using cuVS " << args.use_cuvs << "metric " << metric
         << " colMajorVecs " << colMajorVecs << " colMajorQueries "
         << colMajorQueries;
 
diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp
index 7298ab4b88..d1a6e1e09d 100644
--- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp
+++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp
@@ -332,7 +332,7 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Add_IP) {
 
 TEST(TestGpuIndexIVFFlat, Float32_Query_L2) {
     Options opt;
-    // queryTest(opt, faiss::METRIC_L2, false);
+    queryTest(opt, faiss::METRIC_L2, false);
 
 #if defined USE_NVIDIA_CUVS
     opt.useCuvs = true;

From 8d4496abba77b2d727ef4ab15df38236f3254df5 Mon Sep 17 00:00:00 2001
From: tarang-jain <jaintarang2015.com>
Date: Tue, 27 Aug 2024 10:00:42 -0700
Subject: [PATCH 086/148] rename test_raft.py

---
 faiss/gpu/test/{test_raft.py => test_cuvs.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename faiss/gpu/test/{test_raft.py => test_cuvs.py} (100%)

diff --git a/faiss/gpu/test/test_raft.py b/faiss/gpu/test/test_cuvs.py
similarity index 100%
rename from faiss/gpu/test/test_raft.py
rename to faiss/gpu/test/test_cuvs.py

From 3deb1e1473339172478709fe9be3e49fe78abdca Mon Sep 17 00:00:00 2001
From: tarang-jain <jaintarang2015.com>
Date: Tue, 27 Aug 2024 11:52:34 -0700
Subject: [PATCH 087/148] resolve failing Cagra test

---
 faiss/gpu/GpuIndexCagra.h    |   2 +-
 faiss/gpu/impl/CuvsCagra.cu  | 490 ++++++++++++++++-------------------
 faiss/gpu/impl/CuvsCagra.cuh |  91 +++----
 3 files changed, 258 insertions(+), 325 deletions(-)

diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
index 99009c11aa..174fab0293 100644
--- a/faiss/gpu/GpuIndexCagra.h
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -38,7 +38,7 @@ class CuvsCagra;
 enum class graph_build_algo {
     /// Use IVF-PQ to build all-neighbors knn graph
     IVF_PQ,
-    /// Experimental, use NN-Descent to build all-neighbors knn graph
+    /// Use NN-Descent to build all-neighbors knn graph
     NN_DESCENT
 };
 
diff --git a/faiss/gpu/impl/CuvsCagra.cu b/faiss/gpu/impl/CuvsCagra.cu
index de9e9f3aab..037c4ae150 100644
--- a/faiss/gpu/impl/CuvsCagra.cu
+++ b/faiss/gpu/impl/CuvsCagra.cu
@@ -21,9 +21,9 @@
  */
 
 #include <faiss/gpu/StandardGpuResources.h>
+#include <faiss/gpu/impl/CuvsCagra.cuh>
 #include <faiss/gpu/utils/CuvsUtils.h>
 #include <faiss/gpu/utils/DeviceUtils.h>
-#include <faiss/gpu/impl/CuvsCagra.cuh>
 
 #include <cuvs/neighbors/cagra.hpp>
 #include <raft/core/device_mdspan.hpp>
@@ -33,292 +33,242 @@
 namespace faiss {
 namespace gpu {
 
-CuvsCagra::CuvsCagra(
-        GpuResources* resources,
-        int dim,
-        idx_t intermediate_graph_degree,
-        idx_t graph_degree,
-        faiss::cagra_build_algo graph_build_algo,
-        size_t nn_descent_niter,
-        bool store_dataset,
-        faiss::MetricType metric,
-        float metricArg,
-        IndicesOptions indicesOptions,
-        std::optional<cuvs::neighbors::ivf_pq::index_params> ivf_pq_params,
-        std::optional<cuvs::neighbors::ivf_pq::search_params>
-                ivf_pq_search_params,
-        float refine_rate)
-        : resources_(resources),
-          dim_(dim),
-          store_dataset_(store_dataset),
-          metric_(metric),
-          metricArg_(metricArg),
-          index_params_(),
-          ivf_pq_params_(ivf_pq_params),
-          ivf_pq_search_params_(ivf_pq_search_params),
-          refine_rate_(refine_rate) {
-    FAISS_THROW_IF_NOT_MSG(
-            metric == faiss::METRIC_L2 || metric == faiss::METRIC_INNER_PRODUCT,
-            "CAGRA currently only supports L2 or Inner Product metric.");
-    FAISS_THROW_IF_NOT_MSG(
-            indicesOptions == faiss::gpu::INDICES_64_BIT,
-            "only INDICES_64_BIT is supported for cuVS CAGRA index");
-
-    index_params_.intermediate_graph_degree = intermediate_graph_degree;
-    index_params_.graph_degree = graph_degree;
-
-    if (!ivf_pq_params_) {
-        ivf_pq_params_ =
-                std::make_optional<cuvs::neighbors::ivf_pq::index_params>();
-    }
-    if (!ivf_pq_search_params_) {
-        ivf_pq_search_params_ =
-                std::make_optional<cuvs::neighbors::ivf_pq::search_params>();
-    }
-    index_params_.metric = metricFaissToCuvs(metric_, false);
-    ivf_pq_params_->metric = metricFaissToCuvs(metric_, false);
-
-    if (graph_build_algo == faiss::cagra_build_algo::IVF_PQ) {
-        cuvs::neighbors::cagra::graph_build_params::ivf_pq_params
-                graph_build_params;
-        graph_build_params.build_params = ivf_pq_params_.value();
-        graph_build_params.search_params = ivf_pq_search_params_.value();
-        graph_build_params.refinement_rate = refine_rate;
-        index_params_.graph_build_params = graph_build_params;
-    } else {
-        cuvs::neighbors::cagra::graph_build_params::nn_descent_params
-                graph_build_params(intermediate_graph_degree);
-        graph_build_params.max_iterations = nn_descent_niter;
-        index_params_.graph_build_params = graph_build_params;
-    }
-
-    reset();
+CuvsCagra::CuvsCagra(GpuResources* resources,
+                     int dim,
+                     idx_t intermediate_graph_degree,
+                     idx_t graph_degree,
+                     faiss::cagra_build_algo graph_build_algo,
+                     size_t nn_descent_niter,
+                     bool store_dataset,
+                     faiss::MetricType metric,
+                     float metricArg,
+                     IndicesOptions indicesOptions,
+                     std::optional<cuvs::neighbors::ivf_pq::index_params> ivf_pq_params,
+                     std::optional<cuvs::neighbors::ivf_pq::search_params> ivf_pq_search_params,
+                     float refine_rate)
+  : resources_(resources),
+    dim_(dim),
+    store_dataset_(store_dataset),
+    metric_(metric),
+    metricArg_(metricArg),
+    index_params_(),
+    ivf_pq_params_(ivf_pq_params),
+    ivf_pq_search_params_(ivf_pq_search_params),
+    refine_rate_(refine_rate)
+{
+  FAISS_THROW_IF_NOT_MSG(metric == faiss::METRIC_L2 || metric == faiss::METRIC_INNER_PRODUCT,
+                         "CAGRA currently only supports L2 or Inner Product metric.");
+  FAISS_THROW_IF_NOT_MSG(indicesOptions == faiss::gpu::INDICES_64_BIT,
+                         "only INDICES_64_BIT is supported for cuVS CAGRA index");
+
+  index_params_.intermediate_graph_degree = intermediate_graph_degree;
+  index_params_.graph_degree              = graph_degree;
+
+  if (!ivf_pq_params_) {
+    ivf_pq_params_ = std::make_optional<cuvs::neighbors::ivf_pq::index_params>();
+  }
+  if (!ivf_pq_search_params_) {
+    ivf_pq_search_params_ = std::make_optional<cuvs::neighbors::ivf_pq::search_params>();
+  }
+  index_params_.metric   = metricFaissToCuvs(metric_, false);
+  ivf_pq_params_->metric = metricFaissToCuvs(metric_, false);
+
+  if (graph_build_algo == faiss::cagra_build_algo::IVF_PQ) {
+    cuvs::neighbors::cagra::graph_build_params::ivf_pq_params graph_build_params;
+    graph_build_params.build_params    = ivf_pq_params_.value();
+    graph_build_params.search_params   = ivf_pq_search_params_.value();
+    graph_build_params.refinement_rate = refine_rate;
+    index_params_.graph_build_params   = graph_build_params;
+  } else {
+    cuvs::neighbors::cagra::graph_build_params::nn_descent_params graph_build_params(
+      intermediate_graph_degree);
+    graph_build_params.max_iterations = nn_descent_niter;
+    index_params_.graph_build_params  = graph_build_params;
+  }
+
+  reset();
 }
 
-CuvsCagra::CuvsCagra(
-        GpuResources* resources,
-        int dim,
-        idx_t n,
-        int graph_degree,
-        const float* distances,
-        const idx_t* knn_graph,
-        faiss::MetricType metric,
-        float metricArg,
-        IndicesOptions indicesOptions)
-        : resources_(resources),
-          dim_(dim),
-          metric_(metric),
-          metricArg_(metricArg) {
-    FAISS_THROW_IF_NOT_MSG(
-            metric == faiss::METRIC_L2 || metric == faiss::METRIC_INNER_PRODUCT,
-            "CAGRA currently only supports L2 or Inner Product metric.");
-    FAISS_THROW_IF_NOT_MSG(
-            indicesOptions == faiss::gpu::INDICES_64_BIT,
-            "only INDICES_64_BIT is supported for cuVS CAGRA index");
-
-    auto distances_on_gpu = getDeviceForAddress(distances) >= 0;
-    auto knn_graph_on_gpu = getDeviceForAddress(knn_graph) >= 0;
-
-    FAISS_ASSERT(distances_on_gpu == knn_graph_on_gpu);
-
-    storage_ = distances;
-    n_ = n;
-
-    const raft::device_resources& raft_handle =
-            resources_->getRaftHandleCurrentDevice();
-
-    if (distances_on_gpu && knn_graph_on_gpu) {
-        raft_handle.sync_stream();
-        // Copying to host so that cuvs::neighbors::cagra::index
-        // creates an owning copy of the knn graph on device
-        auto knn_graph_copy =
-                raft::make_host_matrix<uint32_t, int64_t>(n, graph_degree);
-        thrust::copy(
-                thrust::device_ptr<const idx_t>(knn_graph),
-                thrust::device_ptr<const idx_t>(knn_graph + (n * graph_degree)),
-                knn_graph_copy.data_handle());
-
-        auto distances_mds =
-                raft::make_device_matrix_view<const float, int64_t>(
-                        distances, n, dim);
-
-        cuvs_index = std::make_shared<
-                cuvs::neighbors::cagra::index<float, uint32_t>>(
-                raft_handle,
-                metricFaissToCuvs(metric_, false),
-                distances_mds,
-                raft::make_const_mdspan(knn_graph_copy.view()));
-    } else if (!distances_on_gpu && !knn_graph_on_gpu) {
-        // copy idx_t (int64_t) host knn_graph to uint32_t host knn_graph
-        auto knn_graph_copy =
-                raft::make_host_matrix<uint32_t, int64_t>(n, graph_degree);
-        std::copy(
-                knn_graph,
-                knn_graph + (n * graph_degree),
-                knn_graph_copy.data_handle());
-
-        auto distances_mds = raft::make_host_matrix_view<const float, int64_t>(
-                distances, n, dim);
-
-        cuvs_index = std::make_shared<
-                cuvs::neighbors::cagra::index<float, uint32_t>>(
-                raft_handle,
-                metricFaissToCuvs(metric_, false),
-                distances_mds,
-                raft::make_const_mdspan(knn_graph_copy.view()));
-    } else {
-        FAISS_THROW_MSG(
-                "distances and knn_graph must both be in device or host memory");
-    }
+CuvsCagra::CuvsCagra(GpuResources* resources,
+                     int dim,
+                     idx_t n,
+                     int graph_degree,
+                     const float* distances,
+                     const idx_t* knn_graph,
+                     faiss::MetricType metric,
+                     float metricArg,
+                     IndicesOptions indicesOptions)
+  : resources_(resources), dim_(dim), metric_(metric), metricArg_(metricArg)
+{
+  FAISS_THROW_IF_NOT_MSG(metric == faiss::METRIC_L2 || metric == faiss::METRIC_INNER_PRODUCT,
+                         "CAGRA currently only supports L2 or Inner Product metric.");
+  FAISS_THROW_IF_NOT_MSG(indicesOptions == faiss::gpu::INDICES_64_BIT,
+                         "only INDICES_64_BIT is supported for cuVS CAGRA index");
+
+  auto distances_on_gpu = getDeviceForAddress(distances) >= 0;
+  auto knn_graph_on_gpu = getDeviceForAddress(knn_graph) >= 0;
+
+  FAISS_ASSERT(distances_on_gpu == knn_graph_on_gpu);
+
+  storage_ = distances;
+  n_       = n;
+
+  const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice();
+
+  if (distances_on_gpu && knn_graph_on_gpu) {
+    raft_handle.sync_stream();
+    // Copying to host so that cuvs::neighbors::cagra::index
+    // creates an owning copy of the knn graph on device
+    auto knn_graph_copy = raft::make_host_matrix<uint32_t, int64_t>(n, graph_degree);
+    thrust::copy(thrust::device_ptr<const idx_t>(knn_graph),
+                 thrust::device_ptr<const idx_t>(knn_graph + (n * graph_degree)),
+                 knn_graph_copy.data_handle());
+
+    auto distances_mds = raft::make_device_matrix_view<const float, int64_t>(distances, n, dim);
+
+    cuvs_index = std::make_shared<cuvs::neighbors::cagra::index<float, uint32_t>>(
+      raft_handle,
+      metricFaissToCuvs(metric_, false),
+      distances_mds,
+      raft::make_const_mdspan(knn_graph_copy.view()));
+  } else if (!distances_on_gpu && !knn_graph_on_gpu) {
+    // copy idx_t (int64_t) host knn_graph to uint32_t host knn_graph
+    auto knn_graph_copy = raft::make_host_matrix<uint32_t, int64_t>(n, graph_degree);
+    std::copy(knn_graph, knn_graph + (n * graph_degree), knn_graph_copy.data_handle());
+
+    auto distances_mds = raft::make_host_matrix_view<const float, int64_t>(distances, n, dim);
+
+    cuvs_index = std::make_shared<cuvs::neighbors::cagra::index<float, uint32_t>>(
+      raft_handle,
+      metricFaissToCuvs(metric_, false),
+      distances_mds,
+      raft::make_const_mdspan(knn_graph_copy.view()));
+  } else {
+    FAISS_THROW_MSG("distances and knn_graph must both be in device or host memory");
+  }
 }
 
-void CuvsCagra::train(idx_t n, const float* x) {
-    storage_ = x;
-    n_ = n;
-
-    const raft::device_resources& raft_handle =
-            resources_->getRaftHandleCurrentDevice();
-
-    if (std::holds_alternative<
-                cuvs::neighbors::cagra::graph_build_params::ivf_pq_params>(
-                index_params_.graph_build_params) &&
-        index_params_.graph_degree == index_params_.intermediate_graph_degree) {
-        index_params_.intermediate_graph_degree =
-                1.5 * index_params_.graph_degree;
-    }
-
-    if (getDeviceForAddress(x) >= 0) {
-        auto dataset =
-                raft::make_device_matrix_view<const float, int64_t>(x, n, dim_);
-        cuvs_index = std::make_shared<
-                cuvs::neighbors::cagra::index<float, uint32_t>>(
-                cuvs::neighbors::cagra::build(
-                        raft_handle, index_params_, dataset));
-    } else {
-        auto dataset =
-                raft::make_host_matrix_view<const float, int64_t>(x, n, dim_);
-        cuvs_index = std::make_shared<
-                cuvs::neighbors::cagra::index<float, uint32_t>>(
-                cuvs::neighbors::cagra::build(
-                        raft_handle, index_params_, dataset));
-    }
+void CuvsCagra::train(idx_t n, const float* x)
+{
+  storage_ = x;
+  n_       = n;
+
+  const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice();
+
+  if (std::holds_alternative<cuvs::neighbors::cagra::graph_build_params::ivf_pq_params>(
+        index_params_.graph_build_params) &&
+      index_params_.graph_degree == index_params_.intermediate_graph_degree) {
+    index_params_.intermediate_graph_degree = 1.5 * index_params_.graph_degree;
+  }
+
+  if (getDeviceForAddress(x) >= 0) {
+    auto dataset = raft::make_device_matrix_view<const float, int64_t>(x, n, dim_);
+    cuvs_index   = std::make_shared<cuvs::neighbors::cagra::index<float, uint32_t>>(
+      cuvs::neighbors::cagra::build(raft_handle, index_params_, dataset));
+  } else {
+    auto dataset = raft::make_host_matrix_view<const float, int64_t>(x, n, dim_);
+    cuvs_index   = std::make_shared<cuvs::neighbors::cagra::index<float, uint32_t>>(
+      cuvs::neighbors::cagra::build(raft_handle, index_params_, dataset));
+  }
 }
 
-void CuvsCagra::search(
-        Tensor<float, 2, true>& queries,
-        int k,
-        Tensor<float, 2, true>& outDistances,
-        Tensor<idx_t, 2, true>& outIndices,
-        idx_t max_queries,
-        idx_t itopk_size,
-        idx_t max_iterations,
-        faiss::cagra_search_algo graph_search_algo,
-        idx_t team_size,
-        idx_t search_width,
-        idx_t min_iterations,
-        idx_t thread_block_size,
-        faiss::cagra_hash_mode hash_mode,
-        idx_t hashmap_min_bitlen,
-        float hashmap_max_fill_rate,
-        idx_t num_random_samplings,
-        idx_t rand_xor_mask) {
-    const raft::device_resources& raft_handle =
-            resources_->getRaftHandleCurrentDevice();
-    idx_t numQueries = queries.getSize(0);
-    idx_t cols = queries.getSize(1);
-    idx_t k_ = k;
-
-    FAISS_ASSERT(cuvs_index);
-    FAISS_ASSERT(numQueries > 0);
-    FAISS_ASSERT(cols == dim_);
-
-    if (!store_dataset_) {
-        if (getDeviceForAddress(storage_) >= 0) {
-            auto dataset = raft::make_device_matrix_view<const float, int64_t>(
-                    storage_, n_, dim_);
-            cuvs_index->update_dataset(raft_handle, dataset);
-        } else {
-            auto dataset = raft::make_host_matrix_view<const float, int64_t>(
-                    storage_, n_, dim_);
-            cuvs_index->update_dataset(raft_handle, dataset);
-        }
+void CuvsCagra::search(Tensor<float, 2, true>& queries,
+                       int k,
+                       Tensor<float, 2, true>& outDistances,
+                       Tensor<idx_t, 2, true>& outIndices,
+                       idx_t max_queries,
+                       idx_t itopk_size,
+                       idx_t max_iterations,
+                       faiss::cagra_search_algo graph_search_algo,
+                       idx_t team_size,
+                       idx_t search_width,
+                       idx_t min_iterations,
+                       idx_t thread_block_size,
+                       faiss::cagra_hash_mode hash_mode,
+                       idx_t hashmap_min_bitlen,
+                       float hashmap_max_fill_rate,
+                       idx_t num_random_samplings,
+                       idx_t rand_xor_mask)
+{
+  const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice();
+  idx_t numQueries                          = queries.getSize(0);
+  idx_t cols                                = queries.getSize(1);
+  idx_t k_                                  = k;
+
+  FAISS_ASSERT(cuvs_index);
+  FAISS_ASSERT(numQueries > 0);
+  FAISS_ASSERT(cols == dim_);
+
+  if (!store_dataset_) {
+    if (getDeviceForAddress(storage_) >= 0) {
+      auto dataset = raft::make_device_matrix_view<const float, int64_t>(storage_, n_, dim_);
+      cuvs_index->update_dataset(raft_handle, dataset);
+    } else {
+      auto dataset = raft::make_host_matrix_view<const float, int64_t>(storage_, n_, dim_);
+      cuvs_index->update_dataset(raft_handle, dataset);
     }
-
-    auto queries_view = raft::make_device_matrix_view<const float, int64_t>(
-            queries.data(), numQueries, cols);
-    auto distances_view = raft::make_device_matrix_view<float, int64_t>(
-            outDistances.data(), numQueries, k_);
-    auto indices_view = raft::make_device_matrix_view<idx_t, int64_t>(
-            outIndices.data(), numQueries, k_);
-
-    cuvs::neighbors::cagra::search_params search_pams;
-    search_pams.max_queries = max_queries;
-    search_pams.itopk_size = itopk_size;
-    search_pams.max_iterations = max_iterations;
-    search_pams.algo =
-            static_cast<cuvs::neighbors::cagra::search_algo>(graph_search_algo);
-    search_pams.team_size = team_size;
-    search_pams.search_width = search_width;
-    search_pams.min_iterations = min_iterations;
-    search_pams.thread_block_size = thread_block_size;
-    search_pams.hashmap_mode =
-            static_cast<cuvs::neighbors::cagra::hash_mode>(hash_mode);
-    search_pams.hashmap_min_bitlen = hashmap_min_bitlen;
-    search_pams.hashmap_max_fill_rate = hashmap_max_fill_rate;
-    search_pams.num_random_samplings = num_random_samplings;
-    search_pams.rand_xor_mask = rand_xor_mask;
-
-    auto indices_copy = raft::make_device_matrix<uint32_t, int64_t>(
-            raft_handle, numQueries, k_);
-
-    cuvs::neighbors::cagra::search(
-            raft_handle,
-            search_pams,
-            *cuvs_index,
-            queries_view,
-            indices_copy.view(),
-            distances_view);
-    thrust::copy(
-            raft::resource::get_thrust_policy(raft_handle),
-            indices_copy.data_handle(),
-            indices_copy.data_handle() + indices_copy.size(),
-            indices_view.data_handle());
+  }
+
+  auto queries_view =
+    raft::make_device_matrix_view<const float, int64_t>(queries.data(), numQueries, cols);
+  auto distances_view =
+    raft::make_device_matrix_view<float, int64_t>(outDistances.data(), numQueries, k_);
+  auto indices_view =
+    raft::make_device_matrix_view<idx_t, int64_t>(outIndices.data(), numQueries, k_);
+
+  cuvs::neighbors::cagra::search_params search_pams;
+  search_pams.max_queries    = max_queries;
+  search_pams.itopk_size     = itopk_size;
+  search_pams.max_iterations = max_iterations;
+  search_pams.algo           = static_cast<cuvs::neighbors::cagra::search_algo>(graph_search_algo);
+  search_pams.team_size      = team_size;
+  search_pams.search_width   = search_width;
+  search_pams.min_iterations = min_iterations;
+  search_pams.thread_block_size     = thread_block_size;
+  search_pams.hashmap_mode          = static_cast<cuvs::neighbors::cagra::hash_mode>(hash_mode);
+  search_pams.hashmap_min_bitlen    = hashmap_min_bitlen;
+  search_pams.hashmap_max_fill_rate = hashmap_max_fill_rate;
+  search_pams.num_random_samplings  = num_random_samplings;
+  search_pams.rand_xor_mask         = rand_xor_mask;
+
+  auto indices_copy = raft::make_device_matrix<uint32_t, int64_t>(raft_handle, numQueries, k_);
+
+  cuvs::neighbors::cagra::search(
+    raft_handle, search_pams, *cuvs_index, queries_view, indices_copy.view(), distances_view);
+  thrust::copy(raft::resource::get_thrust_policy(raft_handle),
+               indices_copy.data_handle(),
+               indices_copy.data_handle() + indices_copy.size(),
+               indices_view.data_handle());
 }
 
-void CuvsCagra::reset() {
-    cuvs_index.reset();
-}
+void CuvsCagra::reset() { cuvs_index.reset(); }
 
-idx_t CuvsCagra::get_knngraph_degree() const {
-    FAISS_ASSERT(cuvs_index);
-    return static_cast<idx_t>(cuvs_index->graph_degree());
+idx_t CuvsCagra::get_knngraph_degree() const
+{
+  FAISS_ASSERT(cuvs_index);
+  return static_cast<idx_t>(cuvs_index->graph_degree());
 }
 
-std::vector<idx_t> CuvsCagra::get_knngraph() const {
-    FAISS_ASSERT(cuvs_index);
-    const raft::device_resources& raft_handle =
-            resources_->getRaftHandleCurrentDevice();
-    auto stream = raft_handle.get_stream();
+std::vector<idx_t> CuvsCagra::get_knngraph() const
+{
+  FAISS_ASSERT(cuvs_index);
+  const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice();
+  auto stream                               = raft_handle.get_stream();
 
-    auto device_graph = cuvs_index->graph();
+  auto device_graph = cuvs_index->graph();
 
-    std::vector<idx_t> host_graph(
-            device_graph.extent(0) * device_graph.extent(1));
+  std::vector<idx_t> host_graph(device_graph.extent(0) * device_graph.extent(1));
 
-    raft_handle.sync_stream();
+  raft_handle.sync_stream();
 
-    thrust::copy(
-            thrust::device_ptr<const uint32_t>(device_graph.data_handle()),
-            thrust::device_ptr<const uint32_t>(
-                    device_graph.data_handle() + device_graph.size()),
-            host_graph.data());
+  thrust::copy(thrust::device_ptr<const uint32_t>(device_graph.data_handle()),
+               thrust::device_ptr<const uint32_t>(device_graph.data_handle() + device_graph.size()),
+               host_graph.data());
 
-    return host_graph;
+  return host_graph;
 }
 
-const float* CuvsCagra::get_training_dataset() const {
-    return storage_;
-}
+const float* CuvsCagra::get_training_dataset() const { return storage_; }
 
-} // namespace gpu
-} // namespace faiss
+}  // namespace gpu
+}  // namespace faiss
diff --git a/faiss/gpu/impl/CuvsCagra.cuh b/faiss/gpu/impl/CuvsCagra.cuh
index eb8964a7d9..810a224277 100644
--- a/faiss/gpu/impl/CuvsCagra.cuh
+++ b/faiss/gpu/impl/CuvsCagra.cuh
@@ -22,9 +22,9 @@
 
 #pragma once
 
+#include <cstddef>
 #include <faiss/gpu/GpuIndicesOptions.h>
 #include <faiss/gpu/GpuResources.h>
-#include <cstddef>
 #include <faiss/gpu/utils/Tensor.cuh>
 #include <optional>
 
@@ -45,57 +45,36 @@ enum class cagra_hash_mode { HASH, SMALL, AUTO };
 namespace gpu {
 
 class CuvsCagra {
-   public:
-    CuvsCagra(
-            GpuResources* resources,
-            int dim,
-            idx_t intermediate_graph_degree,
-            idx_t graph_degree,
-            faiss::cagra_build_algo graph_build_algo,
-            size_t nn_descent_niter,
-            bool store_dataset,
-            faiss::MetricType metric,
-            float metricArg,
-            IndicesOptions indicesOptions,
-            std::optional<cuvs::neighbors::ivf_pq::index_params> ivf_pq_params =
-                    std::nullopt,
-            std::optional<cuvs::neighbors::ivf_pq::search_params>
-                    ivf_pq_search_params = std::nullopt,
-            float refine_rate = 2.0f);
-
-    CuvsCagra(
-            GpuResources* resources,
-            int dim,
-            idx_t n,
-            int graph_degree,
-            const float* distances,
-            const idx_t* knn_graph,
-            faiss::MetricType metric,
-            float metricArg,
-            IndicesOptions indicesOptions);
+  public:
+    CuvsCagra(GpuResources *resources, int dim, idx_t intermediate_graph_degree,
+              idx_t graph_degree, faiss::cagra_build_algo graph_build_algo,
+              size_t nn_descent_niter, bool store_dataset,
+              faiss::MetricType metric, float metricArg,
+              IndicesOptions indicesOptions,
+              std::optional<cuvs::neighbors::ivf_pq::index_params>
+                  ivf_pq_params = std::nullopt,
+              std::optional<cuvs::neighbors::ivf_pq::search_params>
+                  ivf_pq_search_params = std::nullopt,
+              float refine_rate = 2.0f);
+
+    CuvsCagra(GpuResources *resources, int dim, idx_t n, int graph_degree,
+              const float *distances, const idx_t *knn_graph,
+              faiss::MetricType metric, float metricArg,
+              IndicesOptions indicesOptions);
 
     ~CuvsCagra() = default;
 
-    void train(idx_t n, const float* x);
-
-    void search(
-            Tensor<float, 2, true>& queries,
-            int k,
-            Tensor<float, 2, true>& outDistances,
-            Tensor<idx_t, 2, true>& outIndices,
-            idx_t max_queries,
-            idx_t itopk_size,
-            idx_t max_iterations,
-            faiss::cagra_search_algo graph_search_algo,
-            idx_t team_size,
-            idx_t search_width,
-            idx_t min_iterations,
-            idx_t thread_block_size,
-            faiss::cagra_hash_mode hash_mode,
-            idx_t hashmap_min_bitlen,
-            float hashmap_max_fill_rate,
-            idx_t num_random_samplings,
-            idx_t rand_xor_mask);
+    void train(idx_t n, const float *x);
+
+    void search(Tensor<float, 2, true> &queries, int k,
+                Tensor<float, 2, true> &outDistances,
+                Tensor<idx_t, 2, true> &outIndices, idx_t max_queries,
+                idx_t itopk_size, idx_t max_iterations,
+                faiss::cagra_search_algo graph_search_algo, idx_t team_size,
+                idx_t search_width, idx_t min_iterations,
+                idx_t thread_block_size, faiss::cagra_hash_mode hash_mode,
+                idx_t hashmap_min_bitlen, float hashmap_max_fill_rate,
+                idx_t num_random_samplings, idx_t rand_xor_mask);
 
     void reset();
 
@@ -103,14 +82,14 @@ class CuvsCagra {
 
     std::vector<idx_t> get_knngraph() const;
 
-    const float* get_training_dataset() const;
+    const float *get_training_dataset() const;
 
-   private:
+  private:
     /// Collection of GPU resources that we use
-    GpuResources* resources_;
+    GpuResources *resources_;
 
     /// Training dataset
-    const float* storage_;
+    const float *storage_;
     int n_;
 
     /// Expected dimensionality of the vectors
@@ -127,6 +106,7 @@ class CuvsCagra {
     float metricArg_;
 
     /// Parameters to build cuVS CAGRA index
+    faiss::cagra_build_algo graph_build_algo_;
     cuvs::neighbors::cagra::index_params index_params_;
 
     /// Parameters to build CAGRA graph using IVF PQ
@@ -134,9 +114,12 @@ class CuvsCagra {
     std::optional<cuvs::neighbors::ivf_pq::search_params> ivf_pq_search_params_;
     std::optional<float> refine_rate_;
 
+    /// Parameters to build CAGRA graph using NN Descent
+    size_t nn_descent_niter_ = 20;
+
     /// Instance of trained cuVS CAGRA index
     std::shared_ptr<cuvs::neighbors::cagra::index<float, uint32_t>> cuvs_index{
-            nullptr};
+        nullptr};
 };
 
 } // namespace gpu

From 4fdea22fb41f922505be932e80627301e6545b1d Mon Sep 17 00:00:00 2001
From: tarang-jain <jaintarang2015.com>
Date: Tue, 27 Aug 2024 14:41:48 -0700
Subject: [PATCH 088/148] clang-format

---
 faiss/gpu/impl/CuvsCagra.cu | 490 ++++++++++++++++++++----------------
 1 file changed, 270 insertions(+), 220 deletions(-)

diff --git a/faiss/gpu/impl/CuvsCagra.cu b/faiss/gpu/impl/CuvsCagra.cu
index 037c4ae150..5502005134 100644
--- a/faiss/gpu/impl/CuvsCagra.cu
+++ b/faiss/gpu/impl/CuvsCagra.cu
@@ -21,9 +21,9 @@
  */
 
 #include <faiss/gpu/StandardGpuResources.h>
-#include <faiss/gpu/impl/CuvsCagra.cuh>
 #include <faiss/gpu/utils/CuvsUtils.h>
 #include <faiss/gpu/utils/DeviceUtils.h>
+#include <faiss/gpu/impl/CuvsCagra.cuh>
 
 #include <cuvs/neighbors/cagra.hpp>
 #include <raft/core/device_mdspan.hpp>
@@ -33,242 +33,292 @@
 namespace faiss {
 namespace gpu {
 
-CuvsCagra::CuvsCagra(GpuResources* resources,
-                     int dim,
-                     idx_t intermediate_graph_degree,
-                     idx_t graph_degree,
-                     faiss::cagra_build_algo graph_build_algo,
-                     size_t nn_descent_niter,
-                     bool store_dataset,
-                     faiss::MetricType metric,
-                     float metricArg,
-                     IndicesOptions indicesOptions,
-                     std::optional<cuvs::neighbors::ivf_pq::index_params> ivf_pq_params,
-                     std::optional<cuvs::neighbors::ivf_pq::search_params> ivf_pq_search_params,
-                     float refine_rate)
-  : resources_(resources),
-    dim_(dim),
-    store_dataset_(store_dataset),
-    metric_(metric),
-    metricArg_(metricArg),
-    index_params_(),
-    ivf_pq_params_(ivf_pq_params),
-    ivf_pq_search_params_(ivf_pq_search_params),
-    refine_rate_(refine_rate)
-{
-  FAISS_THROW_IF_NOT_MSG(metric == faiss::METRIC_L2 || metric == faiss::METRIC_INNER_PRODUCT,
-                         "CAGRA currently only supports L2 or Inner Product metric.");
-  FAISS_THROW_IF_NOT_MSG(indicesOptions == faiss::gpu::INDICES_64_BIT,
-                         "only INDICES_64_BIT is supported for cuVS CAGRA index");
-
-  index_params_.intermediate_graph_degree = intermediate_graph_degree;
-  index_params_.graph_degree              = graph_degree;
-
-  if (!ivf_pq_params_) {
-    ivf_pq_params_ = std::make_optional<cuvs::neighbors::ivf_pq::index_params>();
-  }
-  if (!ivf_pq_search_params_) {
-    ivf_pq_search_params_ = std::make_optional<cuvs::neighbors::ivf_pq::search_params>();
-  }
-  index_params_.metric   = metricFaissToCuvs(metric_, false);
-  ivf_pq_params_->metric = metricFaissToCuvs(metric_, false);
-
-  if (graph_build_algo == faiss::cagra_build_algo::IVF_PQ) {
-    cuvs::neighbors::cagra::graph_build_params::ivf_pq_params graph_build_params;
-    graph_build_params.build_params    = ivf_pq_params_.value();
-    graph_build_params.search_params   = ivf_pq_search_params_.value();
-    graph_build_params.refinement_rate = refine_rate;
-    index_params_.graph_build_params   = graph_build_params;
-  } else {
-    cuvs::neighbors::cagra::graph_build_params::nn_descent_params graph_build_params(
-      intermediate_graph_degree);
-    graph_build_params.max_iterations = nn_descent_niter;
-    index_params_.graph_build_params  = graph_build_params;
-  }
-
-  reset();
-}
+CuvsCagra::CuvsCagra(
+        GpuResources* resources,
+        int dim,
+        idx_t intermediate_graph_degree,
+        idx_t graph_degree,
+        faiss::cagra_build_algo graph_build_algo,
+        size_t nn_descent_niter,
+        bool store_dataset,
+        faiss::MetricType metric,
+        float metricArg,
+        IndicesOptions indicesOptions,
+        std::optional<cuvs::neighbors::ivf_pq::index_params> ivf_pq_params,
+        std::optional<cuvs::neighbors::ivf_pq::search_params>
+                ivf_pq_search_params,
+        float refine_rate)
+        : resources_(resources),
+          dim_(dim),
+          graph_build_algo_(graph_build_algo),
+          nn_descent_niter_(nn_descent_niter),
+          store_dataset_(store_dataset),
+          metric_(metric),
+          metricArg_(metricArg),
+          index_params_(),
+          ivf_pq_params_(ivf_pq_params),
+          ivf_pq_search_params_(ivf_pq_search_params),
+          refine_rate_(refine_rate) {
+    FAISS_THROW_IF_NOT_MSG(
+            metric == faiss::METRIC_L2 || metric == faiss::METRIC_INNER_PRODUCT,
+            "CAGRA currently only supports L2 or Inner Product metric.");
+    FAISS_THROW_IF_NOT_MSG(
+            indicesOptions == faiss::gpu::INDICES_64_BIT,
+            "only INDICES_64_BIT is supported for cuVS CAGRA index");
+
+    index_params_.intermediate_graph_degree = intermediate_graph_degree;
+    index_params_.graph_degree = graph_degree;
+
+    if (!ivf_pq_search_params_) {
+        ivf_pq_search_params_ =
+                std::make_optional<cuvs::neighbors::ivf_pq::search_params>();
+    }
+    index_params_.metric = metricFaissToCuvs(metric_, false);
 
-CuvsCagra::CuvsCagra(GpuResources* resources,
-                     int dim,
-                     idx_t n,
-                     int graph_degree,
-                     const float* distances,
-                     const idx_t* knn_graph,
-                     faiss::MetricType metric,
-                     float metricArg,
-                     IndicesOptions indicesOptions)
-  : resources_(resources), dim_(dim), metric_(metric), metricArg_(metricArg)
-{
-  FAISS_THROW_IF_NOT_MSG(metric == faiss::METRIC_L2 || metric == faiss::METRIC_INNER_PRODUCT,
-                         "CAGRA currently only supports L2 or Inner Product metric.");
-  FAISS_THROW_IF_NOT_MSG(indicesOptions == faiss::gpu::INDICES_64_BIT,
-                         "only INDICES_64_BIT is supported for cuVS CAGRA index");
-
-  auto distances_on_gpu = getDeviceForAddress(distances) >= 0;
-  auto knn_graph_on_gpu = getDeviceForAddress(knn_graph) >= 0;
-
-  FAISS_ASSERT(distances_on_gpu == knn_graph_on_gpu);
-
-  storage_ = distances;
-  n_       = n;
-
-  const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice();
-
-  if (distances_on_gpu && knn_graph_on_gpu) {
-    raft_handle.sync_stream();
-    // Copying to host so that cuvs::neighbors::cagra::index
-    // creates an owning copy of the knn graph on device
-    auto knn_graph_copy = raft::make_host_matrix<uint32_t, int64_t>(n, graph_degree);
-    thrust::copy(thrust::device_ptr<const idx_t>(knn_graph),
-                 thrust::device_ptr<const idx_t>(knn_graph + (n * graph_degree)),
-                 knn_graph_copy.data_handle());
-
-    auto distances_mds = raft::make_device_matrix_view<const float, int64_t>(distances, n, dim);
-
-    cuvs_index = std::make_shared<cuvs::neighbors::cagra::index<float, uint32_t>>(
-      raft_handle,
-      metricFaissToCuvs(metric_, false),
-      distances_mds,
-      raft::make_const_mdspan(knn_graph_copy.view()));
-  } else if (!distances_on_gpu && !knn_graph_on_gpu) {
-    // copy idx_t (int64_t) host knn_graph to uint32_t host knn_graph
-    auto knn_graph_copy = raft::make_host_matrix<uint32_t, int64_t>(n, graph_degree);
-    std::copy(knn_graph, knn_graph + (n * graph_degree), knn_graph_copy.data_handle());
-
-    auto distances_mds = raft::make_host_matrix_view<const float, int64_t>(distances, n, dim);
-
-    cuvs_index = std::make_shared<cuvs::neighbors::cagra::index<float, uint32_t>>(
-      raft_handle,
-      metricFaissToCuvs(metric_, false),
-      distances_mds,
-      raft::make_const_mdspan(knn_graph_copy.view()));
-  } else {
-    FAISS_THROW_MSG("distances and knn_graph must both be in device or host memory");
-  }
+    reset();
 }
 
-void CuvsCagra::train(idx_t n, const float* x)
-{
-  storage_ = x;
-  n_       = n;
-
-  const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice();
-
-  if (std::holds_alternative<cuvs::neighbors::cagra::graph_build_params::ivf_pq_params>(
-        index_params_.graph_build_params) &&
-      index_params_.graph_degree == index_params_.intermediate_graph_degree) {
-    index_params_.intermediate_graph_degree = 1.5 * index_params_.graph_degree;
-  }
-
-  if (getDeviceForAddress(x) >= 0) {
-    auto dataset = raft::make_device_matrix_view<const float, int64_t>(x, n, dim_);
-    cuvs_index   = std::make_shared<cuvs::neighbors::cagra::index<float, uint32_t>>(
-      cuvs::neighbors::cagra::build(raft_handle, index_params_, dataset));
-  } else {
-    auto dataset = raft::make_host_matrix_view<const float, int64_t>(x, n, dim_);
-    cuvs_index   = std::make_shared<cuvs::neighbors::cagra::index<float, uint32_t>>(
-      cuvs::neighbors::cagra::build(raft_handle, index_params_, dataset));
-  }
+CuvsCagra::CuvsCagra(
+        GpuResources* resources,
+        int dim,
+        idx_t n,
+        int graph_degree,
+        const float* distances,
+        const idx_t* knn_graph,
+        faiss::MetricType metric,
+        float metricArg,
+        IndicesOptions indicesOptions)
+        : resources_(resources),
+          dim_(dim),
+          metric_(metric),
+          metricArg_(metricArg) {
+    FAISS_THROW_IF_NOT_MSG(
+            metric == faiss::METRIC_L2 || metric == faiss::METRIC_INNER_PRODUCT,
+            "CAGRA currently only supports L2 or Inner Product metric.");
+    FAISS_THROW_IF_NOT_MSG(
+            indicesOptions == faiss::gpu::INDICES_64_BIT,
+            "only INDICES_64_BIT is supported for cuVS CAGRA index");
+
+    auto distances_on_gpu = getDeviceForAddress(distances) >= 0;
+    auto knn_graph_on_gpu = getDeviceForAddress(knn_graph) >= 0;
+
+    FAISS_ASSERT(distances_on_gpu == knn_graph_on_gpu);
+
+    storage_ = distances;
+    n_ = n;
+
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+
+    if (distances_on_gpu && knn_graph_on_gpu) {
+        raft_handle.sync_stream();
+        // Copying to host so that cuvs::neighbors::cagra::index
+        // creates an owning copy of the knn graph on device
+        auto knn_graph_copy =
+                raft::make_host_matrix<uint32_t, int64_t>(n, graph_degree);
+        thrust::copy(
+                thrust::device_ptr<const idx_t>(knn_graph),
+                thrust::device_ptr<const idx_t>(knn_graph + (n * graph_degree)),
+                knn_graph_copy.data_handle());
+
+        auto distances_mds =
+                raft::make_device_matrix_view<const float, int64_t>(
+                        distances, n, dim);
+
+        cuvs_index = std::make_shared<
+                cuvs::neighbors::cagra::index<float, uint32_t>>(
+                raft_handle,
+                metricFaissToCuvs(metric_, false),
+                distances_mds,
+                raft::make_const_mdspan(knn_graph_copy.view()));
+    } else if (!distances_on_gpu && !knn_graph_on_gpu) {
+        // copy idx_t (int64_t) host knn_graph to uint32_t host knn_graph
+        auto knn_graph_copy =
+                raft::make_host_matrix<uint32_t, int64_t>(n, graph_degree);
+        std::copy(
+                knn_graph,
+                knn_graph + (n * graph_degree),
+                knn_graph_copy.data_handle());
+
+        auto distances_mds = raft::make_host_matrix_view<const float, int64_t>(
+                distances, n, dim);
+
+        cuvs_index = std::make_shared<
+                cuvs::neighbors::cagra::index<float, uint32_t>>(
+                raft_handle,
+                metricFaissToCuvs(metric_, false),
+                distances_mds,
+                raft::make_const_mdspan(knn_graph_copy.view()));
+    } else {
+        FAISS_THROW_MSG(
+                "distances and knn_graph must both be in device or host memory");
+    }
 }
 
-void CuvsCagra::search(Tensor<float, 2, true>& queries,
-                       int k,
-                       Tensor<float, 2, true>& outDistances,
-                       Tensor<idx_t, 2, true>& outIndices,
-                       idx_t max_queries,
-                       idx_t itopk_size,
-                       idx_t max_iterations,
-                       faiss::cagra_search_algo graph_search_algo,
-                       idx_t team_size,
-                       idx_t search_width,
-                       idx_t min_iterations,
-                       idx_t thread_block_size,
-                       faiss::cagra_hash_mode hash_mode,
-                       idx_t hashmap_min_bitlen,
-                       float hashmap_max_fill_rate,
-                       idx_t num_random_samplings,
-                       idx_t rand_xor_mask)
-{
-  const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice();
-  idx_t numQueries                          = queries.getSize(0);
-  idx_t cols                                = queries.getSize(1);
-  idx_t k_                                  = k;
-
-  FAISS_ASSERT(cuvs_index);
-  FAISS_ASSERT(numQueries > 0);
-  FAISS_ASSERT(cols == dim_);
-
-  if (!store_dataset_) {
-    if (getDeviceForAddress(storage_) >= 0) {
-      auto dataset = raft::make_device_matrix_view<const float, int64_t>(storage_, n_, dim_);
-      cuvs_index->update_dataset(raft_handle, dataset);
+void CuvsCagra::train(idx_t n, const float* x) {
+    storage_ = x;
+    n_ = n;
+
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+
+    if (!ivf_pq_params_) {
+        ivf_pq_params_ = cuvs::neighbors::ivf_pq::index_params::from_dataset(
+                raft::make_extents(
+                        static_cast<uint32_t>(n_), static_cast<uint32_t>(dim_)),
+                metricFaissToCuvs(metric_));
+    }
+    if (graph_build_algo_ == faiss::cagra_build_algo::IVF_PQ) {
+        cuvs::neighbors::cagra::graph_build_params::ivf_pq_params
+                graph_build_params;
+        graph_build_params.build_params = ivf_pq_params_.value();
+        graph_build_params.search_params = ivf_pq_search_params_.value();
+        graph_build_params.refinement_rate = refine_rate_.value();
+        index_params_.graph_build_params = graph_build_params;
+        if (index_params_.graph_degree ==
+            index_params_.intermediate_graph_degree) {
+            index_params_.intermediate_graph_degree =
+                    1.5 * index_params_.graph_degree;
+        }
+    } else {
+        cuvs::neighbors::cagra::graph_build_params::nn_descent_params
+                graph_build_params(intermediate_graph_degree);
+        graph_build_params.max_iterations = nn_descent_niter_;
+        index_params_.graph_build_params = graph_build_params;
+    }
+
+    if (getDeviceForAddress(x) >= 0) {
+        auto dataset =
+                raft::make_device_matrix_view<const float, int64_t>(x, n, dim_);
+        cuvs_index = std::make_shared<
+                cuvs::neighbors::cagra::index<float, uint32_t>>(
+                cuvs::neighbors::cagra::build(
+                        raft_handle, index_params_, dataset));
     } else {
-      auto dataset = raft::make_host_matrix_view<const float, int64_t>(storage_, n_, dim_);
-      cuvs_index->update_dataset(raft_handle, dataset);
+        auto dataset =
+                raft::make_host_matrix_view<const float, int64_t>(x, n, dim_);
+        cuvs_index = std::make_shared<
+                cuvs::neighbors::cagra::index<float, uint32_t>>(
+                cuvs::neighbors::cagra::build(
+                        raft_handle, index_params_, dataset));
+    }
+}
+
+void CuvsCagra::search(
+        Tensor<float, 2, true>& queries,
+        int k,
+        Tensor<float, 2, true>& outDistances,
+        Tensor<idx_t, 2, true>& outIndices,
+        idx_t max_queries,
+        idx_t itopk_size,
+        idx_t max_iterations,
+        faiss::cagra_search_algo graph_search_algo,
+        idx_t team_size,
+        idx_t search_width,
+        idx_t min_iterations,
+        idx_t thread_block_size,
+        faiss::cagra_hash_mode hash_mode,
+        idx_t hashmap_min_bitlen,
+        float hashmap_max_fill_rate,
+        idx_t num_random_samplings,
+        idx_t rand_xor_mask) {
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+    idx_t numQueries = queries.getSize(0);
+    idx_t cols = queries.getSize(1);
+    idx_t k_ = k;
+
+    FAISS_ASSERT(cuvs_index);
+    FAISS_ASSERT(numQueries > 0);
+    FAISS_ASSERT(cols == dim_);
+
+    if (!store_dataset_) {
+        if (getDeviceForAddress(storage_) >= 0) {
+            auto dataset = raft::make_device_matrix_view<const float, int64_t>(
+                    storage_, n_, dim_);
+            cuvs_index->update_dataset(raft_handle, dataset);
+        } else {
+            auto dataset = raft::make_host_matrix_view<const float, int64_t>(
+                    storage_, n_, dim_);
+            cuvs_index->update_dataset(raft_handle, dataset);
+        }
     }
-  }
-
-  auto queries_view =
-    raft::make_device_matrix_view<const float, int64_t>(queries.data(), numQueries, cols);
-  auto distances_view =
-    raft::make_device_matrix_view<float, int64_t>(outDistances.data(), numQueries, k_);
-  auto indices_view =
-    raft::make_device_matrix_view<idx_t, int64_t>(outIndices.data(), numQueries, k_);
-
-  cuvs::neighbors::cagra::search_params search_pams;
-  search_pams.max_queries    = max_queries;
-  search_pams.itopk_size     = itopk_size;
-  search_pams.max_iterations = max_iterations;
-  search_pams.algo           = static_cast<cuvs::neighbors::cagra::search_algo>(graph_search_algo);
-  search_pams.team_size      = team_size;
-  search_pams.search_width   = search_width;
-  search_pams.min_iterations = min_iterations;
-  search_pams.thread_block_size     = thread_block_size;
-  search_pams.hashmap_mode          = static_cast<cuvs::neighbors::cagra::hash_mode>(hash_mode);
-  search_pams.hashmap_min_bitlen    = hashmap_min_bitlen;
-  search_pams.hashmap_max_fill_rate = hashmap_max_fill_rate;
-  search_pams.num_random_samplings  = num_random_samplings;
-  search_pams.rand_xor_mask         = rand_xor_mask;
-
-  auto indices_copy = raft::make_device_matrix<uint32_t, int64_t>(raft_handle, numQueries, k_);
-
-  cuvs::neighbors::cagra::search(
-    raft_handle, search_pams, *cuvs_index, queries_view, indices_copy.view(), distances_view);
-  thrust::copy(raft::resource::get_thrust_policy(raft_handle),
-               indices_copy.data_handle(),
-               indices_copy.data_handle() + indices_copy.size(),
-               indices_view.data_handle());
+
+    auto queries_view = raft::make_device_matrix_view<const float, int64_t>(
+            queries.data(), numQueries, cols);
+    auto distances_view = raft::make_device_matrix_view<float, int64_t>(
+            outDistances.data(), numQueries, k_);
+    auto indices_view = raft::make_device_matrix_view<idx_t, int64_t>(
+            outIndices.data(), numQueries, k_);
+
+    cuvs::neighbors::cagra::search_params search_pams;
+    search_pams.max_queries = max_queries;
+    search_pams.itopk_size = itopk_size;
+    search_pams.max_iterations = max_iterations;
+    search_pams.algo =
+            static_cast<cuvs::neighbors::cagra::search_algo>(graph_search_algo);
+    search_pams.team_size = team_size;
+    search_pams.search_width = search_width;
+    search_pams.min_iterations = min_iterations;
+    search_pams.thread_block_size = thread_block_size;
+    search_pams.hashmap_mode =
+            static_cast<cuvs::neighbors::cagra::hash_mode>(hash_mode);
+    search_pams.hashmap_min_bitlen = hashmap_min_bitlen;
+    search_pams.hashmap_max_fill_rate = hashmap_max_fill_rate;
+    search_pams.num_random_samplings = num_random_samplings;
+    search_pams.rand_xor_mask = rand_xor_mask;
+
+    auto indices_copy = raft::make_device_matrix<uint32_t, int64_t>(
+            raft_handle, numQueries, k_);
+
+    cuvs::neighbors::cagra::search(
+            raft_handle,
+            search_pams,
+            *cuvs_index,
+            queries_view,
+            indices_copy.view(),
+            distances_view);
+    thrust::copy(
+            raft::resource::get_thrust_policy(raft_handle),
+            indices_copy.data_handle(),
+            indices_copy.data_handle() + indices_copy.size(),
+            indices_view.data_handle());
 }
 
-void CuvsCagra::reset() { cuvs_index.reset(); }
+void CuvsCagra::reset() {
+    cuvs_index.reset();
+}
 
-idx_t CuvsCagra::get_knngraph_degree() const
-{
-  FAISS_ASSERT(cuvs_index);
-  return static_cast<idx_t>(cuvs_index->graph_degree());
+idx_t CuvsCagra::get_knngraph_degree() const {
+    FAISS_ASSERT(cuvs_index);
+    return static_cast<idx_t>(cuvs_index->graph_degree());
 }
 
-std::vector<idx_t> CuvsCagra::get_knngraph() const
-{
-  FAISS_ASSERT(cuvs_index);
-  const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice();
-  auto stream                               = raft_handle.get_stream();
+std::vector<idx_t> CuvsCagra::get_knngraph() const {
+    FAISS_ASSERT(cuvs_index);
+    const raft::device_resources& raft_handle =
+            resources_->getRaftHandleCurrentDevice();
+    auto stream = raft_handle.get_stream();
 
-  auto device_graph = cuvs_index->graph();
+    auto device_graph = cuvs_index->graph();
 
-  std::vector<idx_t> host_graph(device_graph.extent(0) * device_graph.extent(1));
+    std::vector<idx_t> host_graph(
+            device_graph.extent(0) * device_graph.extent(1));
 
-  raft_handle.sync_stream();
+    raft_handle.sync_stream();
 
-  thrust::copy(thrust::device_ptr<const uint32_t>(device_graph.data_handle()),
-               thrust::device_ptr<const uint32_t>(device_graph.data_handle() + device_graph.size()),
-               host_graph.data());
+    thrust::copy(
+            thrust::device_ptr<const uint32_t>(device_graph.data_handle()),
+            thrust::device_ptr<const uint32_t>(
+                    device_graph.data_handle() + device_graph.size()),
+            host_graph.data());
 
-  return host_graph;
+    return host_graph;
 }
 
-const float* CuvsCagra::get_training_dataset() const { return storage_; }
+const float* CuvsCagra::get_training_dataset() const {
+    return storage_;
+}
 
-}  // namespace gpu
-}  // namespace faiss
+} // namespace gpu
+} // namespace faiss

From f723d8c5df695b7438b3271ad7e1cec232025b75 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 27 Aug 2024 15:48:40 -0700
Subject: [PATCH 089/148] fix compilation issues

---
 faiss/gpu/impl/CuvsCagra.cu  |  6 +--
 faiss/gpu/impl/CuvsCagra.cuh | 87 ++++++++++++++++++++++--------------
 2 files changed, 57 insertions(+), 36 deletions(-)

diff --git a/faiss/gpu/impl/CuvsCagra.cu b/faiss/gpu/impl/CuvsCagra.cu
index 5502005134..15b0a2b4cd 100644
--- a/faiss/gpu/impl/CuvsCagra.cu
+++ b/faiss/gpu/impl/CuvsCagra.cu
@@ -164,9 +164,9 @@ void CuvsCagra::train(idx_t n, const float* x) {
 
     if (!ivf_pq_params_) {
         ivf_pq_params_ = cuvs::neighbors::ivf_pq::index_params::from_dataset(
-                raft::make_extents(
+                raft::make_extents<uint32_t>(
                         static_cast<uint32_t>(n_), static_cast<uint32_t>(dim_)),
-                metricFaissToCuvs(metric_));
+                metricFaissToCuvs(metric_, false));
     }
     if (graph_build_algo_ == faiss::cagra_build_algo::IVF_PQ) {
         cuvs::neighbors::cagra::graph_build_params::ivf_pq_params
@@ -182,7 +182,7 @@ void CuvsCagra::train(idx_t n, const float* x) {
         }
     } else {
         cuvs::neighbors::cagra::graph_build_params::nn_descent_params
-                graph_build_params(intermediate_graph_degree);
+                graph_build_params(index_params_.intermediate_graph_degree);
         graph_build_params.max_iterations = nn_descent_niter_;
         index_params_.graph_build_params = graph_build_params;
     }
diff --git a/faiss/gpu/impl/CuvsCagra.cuh b/faiss/gpu/impl/CuvsCagra.cuh
index 810a224277..74a791c80b 100644
--- a/faiss/gpu/impl/CuvsCagra.cuh
+++ b/faiss/gpu/impl/CuvsCagra.cuh
@@ -22,9 +22,9 @@
 
 #pragma once
 
-#include <cstddef>
 #include <faiss/gpu/GpuIndicesOptions.h>
 #include <faiss/gpu/GpuResources.h>
+#include <cstddef>
 #include <faiss/gpu/utils/Tensor.cuh>
 #include <optional>
 
@@ -45,36 +45,57 @@ enum class cagra_hash_mode { HASH, SMALL, AUTO };
 namespace gpu {
 
 class CuvsCagra {
-  public:
-    CuvsCagra(GpuResources *resources, int dim, idx_t intermediate_graph_degree,
-              idx_t graph_degree, faiss::cagra_build_algo graph_build_algo,
-              size_t nn_descent_niter, bool store_dataset,
-              faiss::MetricType metric, float metricArg,
-              IndicesOptions indicesOptions,
-              std::optional<cuvs::neighbors::ivf_pq::index_params>
-                  ivf_pq_params = std::nullopt,
-              std::optional<cuvs::neighbors::ivf_pq::search_params>
-                  ivf_pq_search_params = std::nullopt,
-              float refine_rate = 2.0f);
-
-    CuvsCagra(GpuResources *resources, int dim, idx_t n, int graph_degree,
-              const float *distances, const idx_t *knn_graph,
-              faiss::MetricType metric, float metricArg,
-              IndicesOptions indicesOptions);
+   public:
+    CuvsCagra(
+            GpuResources* resources,
+            int dim,
+            idx_t intermediate_graph_degree,
+            idx_t graph_degree,
+            faiss::cagra_build_algo graph_build_algo,
+            size_t nn_descent_niter,
+            bool store_dataset,
+            faiss::MetricType metric,
+            float metricArg,
+            IndicesOptions indicesOptions,
+            std::optional<cuvs::neighbors::ivf_pq::index_params> ivf_pq_params =
+                    std::nullopt,
+            std::optional<cuvs::neighbors::ivf_pq::search_params>
+                    ivf_pq_search_params = std::nullopt,
+            float refine_rate = 2.0f);
+
+    CuvsCagra(
+            GpuResources* resources,
+            int dim,
+            idx_t n,
+            int graph_degree,
+            const float* distances,
+            const idx_t* knn_graph,
+            faiss::MetricType metric,
+            float metricArg,
+            IndicesOptions indicesOptions);
 
     ~CuvsCagra() = default;
 
-    void train(idx_t n, const float *x);
-
-    void search(Tensor<float, 2, true> &queries, int k,
-                Tensor<float, 2, true> &outDistances,
-                Tensor<idx_t, 2, true> &outIndices, idx_t max_queries,
-                idx_t itopk_size, idx_t max_iterations,
-                faiss::cagra_search_algo graph_search_algo, idx_t team_size,
-                idx_t search_width, idx_t min_iterations,
-                idx_t thread_block_size, faiss::cagra_hash_mode hash_mode,
-                idx_t hashmap_min_bitlen, float hashmap_max_fill_rate,
-                idx_t num_random_samplings, idx_t rand_xor_mask);
+    void train(idx_t n, const float* x);
+
+    void search(
+            Tensor<float, 2, true>& queries,
+            int k,
+            Tensor<float, 2, true>& outDistances,
+            Tensor<idx_t, 2, true>& outIndices,
+            idx_t max_queries,
+            idx_t itopk_size,
+            idx_t max_iterations,
+            faiss::cagra_search_algo graph_search_algo,
+            idx_t team_size,
+            idx_t search_width,
+            idx_t min_iterations,
+            idx_t thread_block_size,
+            faiss::cagra_hash_mode hash_mode,
+            idx_t hashmap_min_bitlen,
+            float hashmap_max_fill_rate,
+            idx_t num_random_samplings,
+            idx_t rand_xor_mask);
 
     void reset();
 
@@ -82,14 +103,14 @@ class CuvsCagra {
 
     std::vector<idx_t> get_knngraph() const;
 
-    const float *get_training_dataset() const;
+    const float* get_training_dataset() const;
 
-  private:
+   private:
     /// Collection of GPU resources that we use
-    GpuResources *resources_;
+    GpuResources* resources_;
 
     /// Training dataset
-    const float *storage_;
+    const float* storage_;
     int n_;
 
     /// Expected dimensionality of the vectors
@@ -119,7 +140,7 @@ class CuvsCagra {
 
     /// Instance of trained cuVS CAGRA index
     std::shared_ptr<cuvs::neighbors::cagra::index<float, uint32_t>> cuvs_index{
-        nullptr};
+            nullptr};
 };
 
 } // namespace gpu

From 74e21ab97ee672381f02ccfb639b49c7c0d2fd4c Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Wed, 4 Sep 2024 19:02:16 -0700
Subject: [PATCH 090/148] resolve bfknn test failures

---
 build.sh                    | 58 +++++++++++++++++++++++++++++++++++++
 faiss/gpu/GpuDistance.cu    | 22 +++++++++++++-
 faiss/gpu/test/test_cuvs.py |  4 +--
 3 files changed, 81 insertions(+), 3 deletions(-)
 create mode 100755 build.sh

diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000000..dd974e4a76
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+# NOTE: This file is temporary for the proof-of-concept branch and will be removed before this PR is merged
+
+BUILD_TYPE=Debug
+BUILD_DIR=build/
+
+RAFT_REPO_REL=""
+EXTRA_CMAKE_ARGS=""
+set -e
+
+if [[ ${RAFT_REPO_REL} != "" ]]; then
+  RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`"
+  EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCPM_raft_SOURCE=${RAFT_REPO_PATH}"
+fi
+
+if [ "$1" == "clean" ]; then
+  rm -rf build
+  rm -rf .cache
+  exit 0
+fi
+
+if [ "$1" == "test" ]; then
+  make -C build -j test
+  exit 0
+fi
+
+if [ "$1" == "test-raft" ]; then
+  ./build/faiss/gpu/test/TestRaftIndexIVFFlat
+  exit 0
+fi
+
+mkdir -p $BUILD_DIR
+cd $BUILD_DIR
+
+cmake \
+ -DFAISS_ENABLE_GPU=ON \
+ -DFAISS_ENABLE_CUVS=ON \
+ -DFAISS_ENABLE_PYTHON=ON \
+ -DBUILD_TESTING=ON \
+ -DBUILD_SHARED_LIBS=ON \
+ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+ -DFAISS_OPT_LEVEL=avx2 \
+ -DRAFT_NVTX=OFF \
+ -DCMAKE_CUDA_ARCHITECTURES="NATIVE" \
+ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+ ${EXTRA_CMAKE_ARGS} \
+ ../
+
+#   -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
+#  -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+#  -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+
+# make -C build -j12 faiss
+cmake  --build . -j48
+# make -C build -j12 swigfaiss
+# (cd build/faiss/python && python setup.py install)
+
diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu
index 97b5c6c136..c66ed7e778 100644
--- a/faiss/gpu/GpuDistance.cu
+++ b/faiss/gpu/GpuDistance.cu
@@ -235,11 +235,31 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
 
 #if defined USE_NVIDIA_CUVS
     // Note: For now, cuVS bfknn requires queries and vectors to be same layout
-    if (should_use_cuvs(args) && args.queriesRowMajor == args.vectorsRowMajor) {
+    if (should_use_cuvs(args) && args.queriesRowMajor == args.vectorsRowMajor &&
+        args.outIndicesType == IndicesDataType::I64 &&
+        args.vectorType == DistanceDataType::F32 && args.k > 0) {
         cuvsDistanceType distance = metricFaissToCuvs(args.metric, false);
 
         auto resImpl = prov->getResources();
         auto res = resImpl.get();
+        // If the user specified a device, then ensure that it is currently set
+        int device = -1;
+        if (args.device == -1) {
+            // Original behavior if no device is specified, use the current CUDA
+            // thread local device
+            device = getCurrentDevice();
+        } else {
+            // Otherwise, use the device specified in `args`
+            device = args.device;
+
+            FAISS_THROW_IF_NOT_FMT(
+                    device >= 0 && device < getNumDevices(),
+                    "bfKnn: device specified must be -1 (current CUDA thread local device) "
+                    "or within the range [0, %d)",
+                    getNumDevices());
+        }
+
+        DeviceScope scope(device);
         raft::device_resources& handle = res->getRaftHandleCurrentDevice();
         auto stream = res->getDefaultStreamCurrentDevice();
 
diff --git a/faiss/gpu/test/test_cuvs.py b/faiss/gpu/test/test_cuvs.py
index 9450ff927c..ebbe9a82c5 100644
--- a/faiss/gpu/test/test_cuvs.py
+++ b/faiss/gpu/test/test_cuvs.py
@@ -26,13 +26,13 @@ def test_bfKnn(self):
         # Faiss internal implementation
         Dnew, Inew = faiss.knn_gpu(
             res, ds.get_queries(), ds.get_database(), 12, use_cuvs=False)
-        np.testing.assert_allclose(Dref, Dnew, atol=1e-5)
+        np.testing.assert_allclose(Dref, Dnew, atol=1e-4)
         np.testing.assert_array_equal(Iref, Inew)
 
         # cuVS version
         Dnew, Inew = faiss.knn_gpu(
             res, ds.get_queries(), ds.get_database(), 12, use_cuvs=True)
-        np.testing.assert_allclose(Dref, Dnew, atol=1e-5)
+        np.testing.assert_allclose(Dref, Dnew, atol=1e-4)
         np.testing.assert_array_equal(Iref, Inew)
 
     def test_IndexFlat(self):

From 3e097cedb6b871e29dd665420f9c7c0d5ade4749 Mon Sep 17 00:00:00 2001
From: Tarang Jain <40517122+tarang-jain@users.noreply.github.com>
Date: Wed, 4 Sep 2024 19:07:11 -0700
Subject: [PATCH 091/148] Delete build.sh

---
 build.sh | 58 --------------------------------------------------------
 1 file changed, 58 deletions(-)
 delete mode 100755 build.sh

diff --git a/build.sh b/build.sh
deleted file mode 100755
index dd974e4a76..0000000000
--- a/build.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-
-# NOTE: This file is temporary for the proof-of-concept branch and will be removed before this PR is merged
-
-BUILD_TYPE=Debug
-BUILD_DIR=build/
-
-RAFT_REPO_REL=""
-EXTRA_CMAKE_ARGS=""
-set -e
-
-if [[ ${RAFT_REPO_REL} != "" ]]; then
-  RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`"
-  EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCPM_raft_SOURCE=${RAFT_REPO_PATH}"
-fi
-
-if [ "$1" == "clean" ]; then
-  rm -rf build
-  rm -rf .cache
-  exit 0
-fi
-
-if [ "$1" == "test" ]; then
-  make -C build -j test
-  exit 0
-fi
-
-if [ "$1" == "test-raft" ]; then
-  ./build/faiss/gpu/test/TestRaftIndexIVFFlat
-  exit 0
-fi
-
-mkdir -p $BUILD_DIR
-cd $BUILD_DIR
-
-cmake \
- -DFAISS_ENABLE_GPU=ON \
- -DFAISS_ENABLE_CUVS=ON \
- -DFAISS_ENABLE_PYTHON=ON \
- -DBUILD_TESTING=ON \
- -DBUILD_SHARED_LIBS=ON \
- -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
- -DFAISS_OPT_LEVEL=avx2 \
- -DRAFT_NVTX=OFF \
- -DCMAKE_CUDA_ARCHITECTURES="NATIVE" \
- -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
- ${EXTRA_CMAKE_ARGS} \
- ../
-
-#   -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-#  -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-#  -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-
-# make -C build -j12 faiss
-cmake  --build . -j48
-# make -C build -j12 swigfaiss
-# (cd build/faiss/python && python setup.py install)
-

From 337a74a09fae515554aff60b614ef0ca0e16ab31 Mon Sep 17 00:00:00 2001
From: Tarang Jain <40517122+tarang-jain@users.noreply.github.com>
Date: Wed, 11 Sep 2024 08:45:24 -0700
Subject: [PATCH 092/148] Apply suggestions from code review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

use cached device properties

Co-authored-by: Malte Förster <97973773+mfoerste4@users.noreply.github.com>
---
 faiss/gpu/GpuDistance.cu | 3 +--
 faiss/gpu/GpuIndex.cu    | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu
index c66ed7e778..4835586ad4 100644
--- a/faiss/gpu/GpuDistance.cu
+++ b/faiss/gpu/GpuDistance.cu
@@ -48,9 +48,8 @@ namespace faiss {
 namespace gpu {
 
 bool should_use_cuvs(GpuDistanceParams args) {
-    cudaDeviceProp prop;
     int dev = args.device >= 0 ? args.device : getCurrentDevice();
-    cudaGetDeviceProperties(&prop, dev);
+    auto prop = getDeviceProperties(dev);
 
     if (prop.major < 7)
         return false;
diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu
index 06893dd1a7..3aeef876d4 100644
--- a/faiss/gpu/GpuIndex.cu
+++ b/faiss/gpu/GpuIndex.cu
@@ -43,8 +43,7 @@ constexpr idx_t kAddVecSize = (idx_t)512 * 1024;
 constexpr idx_t kSearchVecSize = (idx_t)32 * 1024;
 
 bool should_use_cuvs(GpuIndexConfig config_) {
-    cudaDeviceProp prop;
-    cudaGetDeviceProperties(&prop, config_.device);
+    auto prop = getDeviceProperties(config_.device);
 
     if (prop.major < 7)
         return false;

From 62bf7f3400a4f5a0a057ade04e5c89fd4c8754f5 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Wed, 11 Sep 2024 08:52:23 -0700
Subject: [PATCH 093/148] update stream; don't reset handle

---
 faiss/gpu/StandardGpuResources.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp
index 4caf264399..bb931e3b98 100644
--- a/faiss/gpu/StandardGpuResources.cpp
+++ b/faiss/gpu/StandardGpuResources.cpp
@@ -262,7 +262,7 @@ void StandardGpuResourcesImpl::setDefaultStream(
         // with the updated stream during any subsequent calls to getRaftHandle
         auto it2 = raftHandles_.find(device);
         if (it2 != raftHandles_.end()) {
-            raftHandles_.erase(it2);
+            raft::resource::set_cuda_stream(it2->second, stream);
         }
 #endif
     }
@@ -288,7 +288,7 @@ void StandardGpuResourcesImpl::revertDefaultStream(int device) {
         // with the updated stream during any subsequent calls to getRaftHandle
         auto it2 = raftHandles_.find(device);
         if (it2 != raftHandles_.end()) {
-            raftHandles_.erase(it2);
+            raft::resource::set_cuda_stream(it2->second, newStream);
         }
 #endif
     }

From aae6cf262cee3a79bd7745d9e27a7a2261155663 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Wed, 11 Sep 2024 09:38:30 -0700
Subject: [PATCH 094/148] resolve compilation error

---
 faiss/gpu/StandardGpuResources.cpp | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp
index bb931e3b98..5afb3f652d 100644
--- a/faiss/gpu/StandardGpuResources.cpp
+++ b/faiss/gpu/StandardGpuResources.cpp
@@ -282,15 +282,25 @@ void StandardGpuResourcesImpl::revertDefaultStream(int device) {
             cudaStream_t newStream = defaultStreams_[device];
 
             streamWait({newStream}, {prevStream});
-        }
+
 #if defined USE_NVIDIA_CUVS
-        // delete the raft handle for this device, which will be initialized
-        // with the updated stream during any subsequent calls to getRaftHandle
-        auto it2 = raftHandles_.find(device);
-        if (it2 != raftHandles_.end()) {
-            raft::resource::set_cuda_stream(it2->second, newStream);
-        }
+            // update the stream on the raft handle for this device
+            auto it2 = raftHandles_.find(device);
+            if (it2 != raftHandles_.end()) {
+                raft::resource::set_cuda_stream(it2->second, newStream);
+            }
+#endif
+        } else {
+#if defined USE_NVIDIA_CUVS
+            // delete the raft handle for this device, which will be initialized
+            // with the updated stream during any subsequent calls to
+            // getRaftHandle
+            auto it2 = raftHandles_.find(device);
+            if (it2 != raftHandles_.end()) {
+                raftHandles_.erase(it2);
+            }
 #endif
+        }
     }
 
     userDefaultStreams_.erase(device);

From 4b84f46e06a66f5c3a135b3415bab28fd6b967f9 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 16 Sep 2024 11:37:04 -0700
Subject: [PATCH 095/148] do not link cutlass

---
 c_api/gpu/CMakeLists.txt    | 3 +--
 faiss/gpu/CMakeLists.txt    | 2 +-
 faiss/python/CMakeLists.txt | 8 ++++----
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/c_api/gpu/CMakeLists.txt b/c_api/gpu/CMakeLists.txt
index c8ce1b6172..39fc274bfb 100644
--- a/c_api/gpu/CMakeLists.txt
+++ b/c_api/gpu/CMakeLists.txt
@@ -20,8 +20,7 @@ if (FAISS_ENABLE_ROCM)
 else()
   find_package(CUDAToolkit REQUIRED)
   target_link_libraries(faiss_c PUBLIC CUDA::cudart CUDA::cublas
-    $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>
-    $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
+    $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
 endif()
 
 add_executable(example_gpu_c EXCLUDE_FROM_ALL example_gpu_c.c)
diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
index d0d60b5c7f..03ec63298e 100644
--- a/faiss/gpu/CMakeLists.txt
+++ b/faiss/gpu/CMakeLists.txt
@@ -334,7 +334,7 @@ else()
 
 
   find_package(CUDAToolkit REQUIRED)
-  target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass> $<$<BOOL:${FAISS_ENABLE_CUVS}>:OpenMP::OpenMP_CXX>)
+  target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:OpenMP::OpenMP_CXX>)
   target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$<BOOL:${FAISS_ENABLE_CUVS}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
   target_compile_options(faiss_gpu PRIVATE
     $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all
diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt
index 063d0a446c..0d0f0574ea 100644
--- a/faiss/python/CMakeLists.txt
+++ b/faiss/python/CMakeLists.txt
@@ -176,10 +176,10 @@ if(FAISS_ENABLE_GPU)
     if(FAISS_ENABLE_CUVS)
       find_package(cuvs)
     endif()
-    target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
-    target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
-    target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
-    target_link_libraries(swigfaiss_sve PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:nvidia::cutlass::cutlass>)
+    target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
+    target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
+    target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
+    target_link_libraries(swigfaiss_sve PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
   endif()
 endif()
 

From a6f1775d71559aede96c476bd811d7dfa92f2ff9 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 17 Sep 2024 10:14:31 -0700
Subject: [PATCH 096/148] link raft::raft

---
 CMakeLists.txt                | 1 +
 c_api/gpu/CMakeLists.txt      | 2 +-
 faiss/gpu/CMakeLists.txt      | 2 +-
 faiss/gpu/test/CMakeLists.txt | 2 +-
 faiss/python/CMakeLists.txt   | 9 +++++----
 tests/CMakeLists.txt          | 1 +
 6 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ea412b0023..8e286f13f6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -81,6 +81,7 @@ endif()
 
 if(FAISS_ENABLE_CUVS AND NOT TARGET cuvs::cuvs)
    find_package(cuvs)
+   find_package(raft)
  endif()
 
 add_subdirectory(faiss)
diff --git a/c_api/gpu/CMakeLists.txt b/c_api/gpu/CMakeLists.txt
index 39fc274bfb..e7b8110e4a 100644
--- a/c_api/gpu/CMakeLists.txt
+++ b/c_api/gpu/CMakeLists.txt
@@ -20,7 +20,7 @@ if (FAISS_ENABLE_ROCM)
 else()
   find_package(CUDAToolkit REQUIRED)
   target_link_libraries(faiss_c PUBLIC CUDA::cudart CUDA::cublas
-    $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
+    $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft>)
 endif()
 
 add_executable(example_gpu_c EXCLUDE_FROM_ALL example_gpu_c.c)
diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
index 03ec63298e..53a615456c 100644
--- a/faiss/gpu/CMakeLists.txt
+++ b/faiss/gpu/CMakeLists.txt
@@ -334,7 +334,7 @@ else()
 
 
   find_package(CUDAToolkit REQUIRED)
-  target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:OpenMP::OpenMP_CXX>)
+  target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:OpenMP::OpenMP_CXX>)
   target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$<BOOL:${FAISS_ENABLE_CUVS}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
   target_compile_options(faiss_gpu PRIVATE
     $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all
diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt
index b76d082bec..8be8706b71 100644
--- a/faiss/gpu/test/CMakeLists.txt
+++ b/faiss/gpu/test/CMakeLists.txt
@@ -24,7 +24,7 @@ if(FAISS_ENABLE_ROCM)
   target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest hip::host)
 else()
   find_package(CUDAToolkit REQUIRED)
-  target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
+  target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft>)
 endif()
 
 macro(faiss_gpu_test file)
diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt
index 0d0f0574ea..e108ab907a 100644
--- a/faiss/python/CMakeLists.txt
+++ b/faiss/python/CMakeLists.txt
@@ -175,11 +175,12 @@ if(FAISS_ENABLE_GPU)
     find_package(CUDAToolkit REQUIRED)
     if(FAISS_ENABLE_CUVS)
       find_package(cuvs)
+      find_package(raft)
     endif()
-    target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
-    target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
-    target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
-    target_link_libraries(swigfaiss_sve PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
+    target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft>)
+    target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft>)
+    target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft>)
+    target_link_libraries(swigfaiss_sve PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft>)
   endif()
 endif()
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index d611e3d00c..0ffd2191de 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -123,6 +123,7 @@ target_link_libraries(faiss_test PRIVATE
   OpenMP::OpenMP_CXX
   GTest::gtest_main
   $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>
+  $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft>
   $<$<BOOL:${FAISS_ENABLE_ROCM}>:hip::host>
 )
 

From c1b959cbb4ddde11cb47a9bf989d87cfda6b29bd Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 17 Sep 2024 10:20:36 -0700
Subject: [PATCH 097/148] conditionally find raft

---
 faiss/python/CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt
index e108ab907a..b9cc0b4e63 100644
--- a/faiss/python/CMakeLists.txt
+++ b/faiss/python/CMakeLists.txt
@@ -175,7 +175,8 @@ if(FAISS_ENABLE_GPU)
     find_package(CUDAToolkit REQUIRED)
     if(FAISS_ENABLE_CUVS)
       find_package(cuvs)
-      find_package(raft)
+      if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)
+        find_package(raft)
     endif()
     target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft>)
     target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft>)

From 7024c0d803b80bfb68827a83927ee00c1fa9d5b1 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 17 Sep 2024 10:54:42 -0700
Subject: [PATCH 098/148] Trigger Build


From f207def99c8f1c654972653b5cd20b02d443b133 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 17 Sep 2024 11:15:38 -0700
Subject: [PATCH 099/148] change link order

---
 c_api/gpu/CMakeLists.txt      | 3 +--
 faiss/gpu/CMakeLists.txt      | 2 +-
 faiss/gpu/test/CMakeLists.txt | 2 +-
 faiss/python/CMakeLists.txt   | 8 ++++----
 tests/CMakeLists.txt          | 2 +-
 5 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/c_api/gpu/CMakeLists.txt b/c_api/gpu/CMakeLists.txt
index e7b8110e4a..a8b08316e8 100644
--- a/c_api/gpu/CMakeLists.txt
+++ b/c_api/gpu/CMakeLists.txt
@@ -19,8 +19,7 @@ if (FAISS_ENABLE_ROCM)
   target_link_libraries(faiss_c PUBLIC hip::host roc::hipblas)
 else()
   find_package(CUDAToolkit REQUIRED)
-  target_link_libraries(faiss_c PUBLIC CUDA::cudart CUDA::cublas
-    $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft>)
+  target_link_libraries(faiss_c PUBLIC CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
 endif()
 
 add_executable(example_gpu_c EXCLUDE_FROM_ALL example_gpu_c.c)
diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
index 53a615456c..de8f97f972 100644
--- a/faiss/gpu/CMakeLists.txt
+++ b/faiss/gpu/CMakeLists.txt
@@ -334,7 +334,7 @@ else()
 
 
   find_package(CUDAToolkit REQUIRED)
-  target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:OpenMP::OpenMP_CXX>)
+  target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:OpenMP::OpenMP_CXX>)
   target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$<BOOL:${FAISS_ENABLE_CUVS}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
   target_compile_options(faiss_gpu PRIVATE
     $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all
diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt
index 8be8706b71..18a1549249 100644
--- a/faiss/gpu/test/CMakeLists.txt
+++ b/faiss/gpu/test/CMakeLists.txt
@@ -24,7 +24,7 @@ if(FAISS_ENABLE_ROCM)
   target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest hip::host)
 else()
   find_package(CUDAToolkit REQUIRED)
-  target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft>)
+  target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
 endif()
 
 macro(faiss_gpu_test file)
diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt
index b9cc0b4e63..80bdce3b8f 100644
--- a/faiss/python/CMakeLists.txt
+++ b/faiss/python/CMakeLists.txt
@@ -178,10 +178,10 @@ if(FAISS_ENABLE_GPU)
       if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)
         find_package(raft)
     endif()
-    target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft>)
-    target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft>)
-    target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft>)
-    target_link_libraries(swigfaiss_sve PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft>)
+    target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
+    target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
+    target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
+    target_link_libraries(swigfaiss_sve PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
   endif()
 endif()
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 0ffd2191de..f098de7a9a 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -122,8 +122,8 @@ find_package(GTest CONFIG REQUIRED)
 target_link_libraries(faiss_test PRIVATE
   OpenMP::OpenMP_CXX
   GTest::gtest_main
-  $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>
   $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft>
+  $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>
   $<$<BOOL:${FAISS_ENABLE_ROCM}>:hip::host>
 )
 

From 7ac798d30d841608b7d14919007748ae762f5d5f Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 17 Sep 2024 13:29:21 -0700
Subject: [PATCH 100/148] endif()

---
 faiss/python/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt
index 80bdce3b8f..060bd5300b 100644
--- a/faiss/python/CMakeLists.txt
+++ b/faiss/python/CMakeLists.txt
@@ -177,6 +177,7 @@ if(FAISS_ENABLE_GPU)
       find_package(cuvs)
       if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)
         find_package(raft)
+      endif()
     endif()
     target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
     target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)

From 93172cf04b0eb879a88bb91bd1212b89c4e58951 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Thu, 19 Sep 2024 11:11:15 -0700
Subject: [PATCH 101/148] change installed libcuvs version in GitHub Actions

---
 .github/actions/build_cmake/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index 694771b441..f3f293a5c4 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -59,7 +59,7 @@ runs:
           conda install -y -q cuda-toolkit -c "nvidia/label/cuda-12.4.0"
         # and CUDA from cuVS channel for cuVS builds
         elif [ "${{ inputs.cuvs }}" = "ON" ]; then
-          conda install -y -q libcuvs cuda-version=12.4 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-12.4.0" -c conda-forge
+          conda install -y -q libcuvs=24.08 cuda-version=12.4 cuda-toolkit -c rapidsai -c "nvidia/label/cuda-12.4.0" -c conda-forge
         fi
 
         # install test packages

From dbde0f3b95a42298a390280f333427997a5c1041 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 1 Oct 2024 15:13:22 -0700
Subject: [PATCH 102/148] add debug output to test, update pinned cuVS version to 24.10

---
 build.sh                                 | 61 ++++++++++++++++++++++++
 cmake/thirdparty/fetch_rapids.cmake      |  2 +-
 conda/faiss-gpu-cuvs/meta.yaml           |  4 +-
 faiss/gpu/impl/CuvsIVFFlat.cu            | 46 +++++++++++++-----
 faiss/gpu/test/torch_test_contrib_gpu.py | 36 +++++++++++---
 5 files changed, 129 insertions(+), 20 deletions(-)
 create mode 100755 build.sh

diff --git a/build.sh b/build.sh
new file mode 100755
index 0000000000..17106d5834
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+
+# NOTE: This file is temporary for the proof-of-concept branch and will be removed before this PR is merged
+
+BUILD_TYPE=Release
+BUILD_DIR=build/
+
+RAFT_REPO_REL=""
+EXTRA_CMAKE_ARGS=""
+set -e
+
+if [[ ${RAFT_REPO_REL} != "" ]]; then
+  RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`"
+  EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCPM_raft_SOURCE=${RAFT_REPO_PATH}"
+fi
+
+if [ "$1" == "clean" ]; then
+  rm -rf build
+  rm -rf .cache
+  exit 0
+fi
+
+if [ "$1" == "test" ]; then
+  make -C build -j test
+  exit 0
+fi
+
+if [ "$1" == "test-raft" ]; then
+  ./build/faiss/gpu/test/TestRaftIndexIVFFlat
+  exit 0
+fi
+
+mkdir -p $BUILD_DIR
+cd $BUILD_DIR
+
+cmake \
+ -DFAISS_ENABLE_GPU=ON \
+ -DFAISS_ENABLE_CUVS=ON \
+ -DFAISS_ENABLE_PYTHON=ON \
+ -DBUILD_TESTING=ON \
+ -DBUILD_SHARED_LIBS=ON \
+ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+ -DFAISS_OPT_LEVEL=avx2 \
+ -DRAFT_NVTX=OFF \
+ -DCMAKE_CUDA_ARCHITECTURES="NATIVE" \
+ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+ -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+ -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+ ${EXTRA_CMAKE_ARGS} \
+ ../
+
+#   -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
+#  -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+#  -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+
+# make -C build -j12 faiss
+cmake  --build . -j12
+# make -C build -j12 swigfaiss
+# (cd build/faiss/python && python setup.py install)
+
diff --git a/cmake/thirdparty/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake
index 3b9d9b140a..cf925a0ac1 100644
--- a/cmake/thirdparty/fetch_rapids.cmake
+++ b/cmake/thirdparty/fetch_rapids.cmake
@@ -15,7 +15,7 @@
 # or implied. See the License for the specific language governing permissions and limitations under
 # the License.
 # =============================================================================
-set(RAPIDS_VERSION "24.08")
+set(RAPIDS_VERSION "24.10")
 
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake)
     file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake
diff --git a/conda/faiss-gpu-cuvs/meta.yaml b/conda/faiss-gpu-cuvs/meta.yaml
index 6c43d19a7c..83da1fd32d 100644
--- a/conda/faiss-gpu-cuvs/meta.yaml
+++ b/conda/faiss-gpu-cuvs/meta.yaml
@@ -58,7 +58,7 @@ outputs:
         - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
         - mkl =2023  # [x86_64]
         - openblas  # [not x86_64]
-        - libcuvs =24.08
+        - libcuvs =24.10
         - cuda-version {{ cuda_constraints }}
       run:
         - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
@@ -66,7 +66,7 @@ outputs:
         - openblas  # [not x86_64]
         - cuda-cudart {{ cuda_constraints }}
         - libcublas {{ libcublas_constraints }}
-        - libcuvs =24.08
+        - libcuvs =24.10
         - cuda-version {{ cuda_constraints }}
     test:
       requires:
diff --git a/faiss/gpu/impl/CuvsIVFFlat.cu b/faiss/gpu/impl/CuvsIVFFlat.cu
index 1b73a8cfad..71c6a2719d 100644
--- a/faiss/gpu/impl/CuvsIVFFlat.cu
+++ b/faiss/gpu/impl/CuvsIVFFlat.cu
@@ -99,6 +99,8 @@ void CuvsIVFFlat::search(
     /// called updateQuantizer() to modify the cuVS index if the quantizer was
     /// modified externally
 
+    std::cout << "inside cuvs ivf flat search" << std::endl;
+
     uint32_t numQueries = queries.getSize(0);
     uint32_t cols = queries.getSize(1);
     uint32_t k_ = k;
@@ -116,11 +118,19 @@ void CuvsIVFFlat::search(
 
     auto queries_view = raft::make_device_matrix_view<const float, idx_t>(
             queries.data(), (idx_t)numQueries, (idx_t)cols);
+    raft::print_device_vector("queries", queries.data(), 25, std::cout);
     auto out_inds_view = raft::make_device_matrix_view<idx_t, idx_t>(
             outIndices.data(), (idx_t)numQueries, (idx_t)k_);
     auto out_dists_view = raft::make_device_matrix_view<float, idx_t>(
             outDistances.data(), (idx_t)numQueries, (idx_t)k_);
 
+    std::cout << "now running search";
+    raft::print_device_vector(
+            "cuvs centers before running search",
+            cuvs_index->centers().data_handle(),
+            numLists_ * dim_,
+            std::cout);
+
     cuvs::neighbors::ivf_flat::search(
             raft_handle,
             pams,
@@ -129,6 +139,11 @@ void CuvsIVFFlat::search(
             out_inds_view,
             out_dists_view);
 
+    raft::print_device_vector(
+            "outIndices before filtering",
+            outIndices.data(),
+            numQueries * k_,
+            std::cout);
     /// Identify NaN rows and mask their nearest neighbors
     auto nan_flag = raft::make_device_vector<bool>(raft_handle, numQueries);
 
@@ -159,6 +174,8 @@ void CuvsIVFFlat::search(
                     return max_val;
                 return out_dists[i];
             });
+    raft::print_device_vector(
+            "outIndices", outIndices.data(), numQueries * k_, std::cout);
 }
 
 idx_t CuvsIVFFlat::addVectors(
@@ -168,6 +185,11 @@ idx_t CuvsIVFFlat::addVectors(
     /// NB: The coarse quantizer is ignored here. The user is assumed to have
     /// called updateQuantizer() to update the cuVS index if the quantizer was
     /// modified externally
+    std::cout << "inside CuvsIVFFlat addVectors" << std::endl;
+    raft::print_device_vector(
+            "indices", indices.data(), indices.getSize(0), std::cout);
+    raft::print_device_vector(
+            "vectors", vecs.data(), vecs.getSize(1), std::cout);
 
     FAISS_ASSERT(cuvs_index != nullptr);
 
@@ -177,17 +199,14 @@ idx_t CuvsIVFFlat::addVectors(
     /// Remove rows containing NaNs
     idx_t n_rows_valid = inplaceGatherFilteredRows(resources_, vecs, indices);
 
-    cuvs_index = std::make_shared<
-            cuvs::neighbors::ivf_flat::index<float, idx_t>>(
-            cuvs::neighbors::ivf_flat::extend(
-                    raft_handle,
-                    raft::make_device_matrix_view<const float, idx_t>(
-                            vecs.data(), n_rows_valid, dim_),
-                    std::make_optional<
-                            raft::device_vector_view<const idx_t, idx_t>>(
-                            raft::make_device_vector_view<const idx_t, idx_t>(
-                                    indices.data(), n_rows_valid)),
-                    *cuvs_index));
+    cuvs::neighbors::ivf_flat::extend(
+            raft_handle,
+            raft::make_device_matrix_view<const float, idx_t>(
+                    vecs.data(), n_rows_valid, dim_),
+            std::make_optional<raft::device_vector_view<const idx_t, idx_t>>(
+                    raft::make_device_vector_view<const idx_t, idx_t>(
+                            indices.data(), n_rows_valid)),
+            cuvs_index.get());
 
     return n_rows_valid;
 }
@@ -362,6 +381,11 @@ void CuvsIVFFlat::updateQuantizer(Index* quantizer) {
                 total_elems,
                 stream);
     }
+    raft::print_device_vector(
+            "cuvs centers",
+            cuvs_index->centers().data_handle(),
+            total_elems,
+            std::cout);
 }
 
 void CuvsIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) {
diff --git a/faiss/gpu/test/torch_test_contrib_gpu.py b/faiss/gpu/test/torch_test_contrib_gpu.py
index 951fadd3aa..004858d6f8 100644
--- a/faiss/gpu/test/torch_test_contrib_gpu.py
+++ b/faiss/gpu/test/torch_test_contrib_gpu.py
@@ -9,6 +9,9 @@
 import faiss
 import faiss.contrib.torch_utils
 
+from rmm.allocators.torch import rmm_torch_allocator
+torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
+
 def to_column_major_torch(x):
     if hasattr(torch, 'contiguous_format'):
         return x.t().clone(memory_format=torch.contiguous_format).t()
@@ -69,6 +72,7 @@ def test_lookup(self):
         self.assertTrue(np.array_equal(d_torch_cpu.numpy(), d_np_cpu))
         self.assertTrue(np.array_equal(i_torch_cpu.numpy(), i_np_cpu))
 
+
     # tests train, add_with_ids
     def test_train_add_with_ids(self):
         d = 32
@@ -76,25 +80,43 @@ def test_train_add_with_ids(self):
         res = faiss.StandardGpuResources()
         res.noTempMemory()
 
-        index = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2)
+        config = faiss.GpuIndexIVFFlatConfig()
+        config.use_cuvs = True
+
+        index = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config)
+        from rmm.allocators.cupy import rmm_cupy_allocator
+        import cupy as cp
+        import numpy as np
+        cp.cuda.set_allocator(rmm_cupy_allocator)
         xb = torch.rand(1000, d, device=torch.device('cuda', 0), dtype=torch.float32)
+        # xb = cp.random.rand(1000, d, dtype=np.float32)
+        # xb_t = torch.as_tensor(xb, device="cuda")
+        print(xb)
+        # print("torchgpu")
         index.train(xb)
 
         ids = torch.arange(1000, 1000 + xb.shape[0], device=torch.device('cuda', 0), dtype=torch.int64)
+        # ids = torch.as_tensor(cp.arange(1000, 1000 + xb_t.shape[0], dtype=np.int64), device="cuda")
 
         # Test add_with_ids with torch gpu
         index.add_with_ids(xb, ids)
         _, I = index.search(xb[10:20], 1)
+        print(I)
         self.assertTrue(torch.equal(I.view(10), ids[10:20]))
 
         # Test add_with_ids with torch cpu
         index.reset()
+        # index2 = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config)
         xb_cpu = xb.cpu()
         ids_cpu = ids.cpu()
+        # xb_cpu = cp.asarray(xb)
+        # ids_cpu = cp.asarray(ids)
 
+        print("torch_cpu")
         index.train(xb_cpu)
         index.add_with_ids(xb_cpu, ids_cpu)
         _, I = index.search(xb_cpu[10:20], 1)
+        # print(I)
         self.assertTrue(torch.equal(I.view(10), ids_cpu[10:20]))
 
         # Test add_with_ids with numpy
@@ -102,16 +124,18 @@ def test_train_add_with_ids(self):
         xb_np = xb.cpu().numpy()
         ids_np = ids.cpu().numpy()
 
-        index.train(xb_np)
-        index.add_with_ids(xb_np, ids_np)
-        _, I = index.search(xb_np[10:20], 1)
-        self.assertTrue(np.array_equal(I.reshape(10), ids_np[10:20]))
+        print("torch_np")
+        # index.train(xb_np)
+        # index.add_with_ids(xb_np, ids_np)
+        # _, I = index.search(xb_np[10:20], 1)
+        # self.assertTrue(np.array_equal(I.reshape(10), ids_np[10:20]))
+        # self.assertTrue(False)
 
     # tests reconstruct, reconstruct_n
     def test_flat_reconstruct(self):
         d = 32
         res = faiss.StandardGpuResources()
-        res.noTempMemory()
+        # res.noTempMemory()
         index = faiss.GpuIndexFlatL2(res, d)
 
         xb = torch.rand(100, d, device=torch.device('cuda', 0), dtype=torch.float32)

From 3c775f7a6b2e68dd22fa3b6673ec3718a77b1724 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 1 Oct 2024 15:14:05 -0700
Subject: [PATCH 103/148] update action.yml

---
 .github/actions/build_cmake/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index f3f293a5c4..a0386b37ed 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -59,7 +59,7 @@ runs:
           conda install -y -q cuda-toolkit -c "nvidia/label/cuda-12.4.0"
         # and CUDA from cuVS channel for cuVS builds
         elif [ "${{ inputs.cuvs }}" = "ON" ]; then
-          conda install -y -q libcuvs=24.08 cuda-version=12.4 cuda-toolkit -c rapidsai -c "nvidia/label/cuda-12.4.0" -c conda-forge
+          conda install -y -q libcuvs=24.10 cuda-version=12.4 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-12.4.0" -c conda-forge
         fi
 
         # install test packages

From a17969767ef1906f7060b7aa2c2b1c8ac686fc8c Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Wed, 2 Oct 2024 13:40:03 -0700
Subject: [PATCH 104/148] rm debug statements, restore torch tests

---
 faiss/gpu/impl/CuvsIVFFlat.cu            | 27 -----------
 faiss/gpu/test/torch_test_contrib_gpu.py | 57 ++++++++----------------
 2 files changed, 18 insertions(+), 66 deletions(-)

diff --git a/faiss/gpu/impl/CuvsIVFFlat.cu b/faiss/gpu/impl/CuvsIVFFlat.cu
index 71c6a2719d..fd59b565f8 100644
--- a/faiss/gpu/impl/CuvsIVFFlat.cu
+++ b/faiss/gpu/impl/CuvsIVFFlat.cu
@@ -99,8 +99,6 @@ void CuvsIVFFlat::search(
     /// called updateQuantizer() to modify the cuVS index if the quantizer was
     /// modified externally
 
-    std::cout << "inside cuvs ivf flat search" << std::endl;
-
     uint32_t numQueries = queries.getSize(0);
     uint32_t cols = queries.getSize(1);
     uint32_t k_ = k;
@@ -118,19 +116,11 @@ void CuvsIVFFlat::search(
 
     auto queries_view = raft::make_device_matrix_view<const float, idx_t>(
             queries.data(), (idx_t)numQueries, (idx_t)cols);
-    raft::print_device_vector("queries", queries.data(), 25, std::cout);
     auto out_inds_view = raft::make_device_matrix_view<idx_t, idx_t>(
             outIndices.data(), (idx_t)numQueries, (idx_t)k_);
     auto out_dists_view = raft::make_device_matrix_view<float, idx_t>(
             outDistances.data(), (idx_t)numQueries, (idx_t)k_);
 
-    std::cout << "now running search";
-    raft::print_device_vector(
-            "cuvs centers before running search",
-            cuvs_index->centers().data_handle(),
-            numLists_ * dim_,
-            std::cout);
-
     cuvs::neighbors::ivf_flat::search(
             raft_handle,
             pams,
@@ -139,11 +129,6 @@ void CuvsIVFFlat::search(
             out_inds_view,
             out_dists_view);
 
-    raft::print_device_vector(
-            "outIndices before filtering",
-            outIndices.data(),
-            numQueries * k_,
-            std::cout);
     /// Identify NaN rows and mask their nearest neighbors
     auto nan_flag = raft::make_device_vector<bool>(raft_handle, numQueries);
 
@@ -174,8 +159,6 @@ void CuvsIVFFlat::search(
                     return max_val;
                 return out_dists[i];
             });
-    raft::print_device_vector(
-            "outIndices", outIndices.data(), numQueries * k_, std::cout);
 }
 
 idx_t CuvsIVFFlat::addVectors(
@@ -185,11 +168,6 @@ idx_t CuvsIVFFlat::addVectors(
     /// NB: The coarse quantizer is ignored here. The user is assumed to have
     /// called updateQuantizer() to update the cuVS index if the quantizer was
     /// modified externally
-    std::cout << "inside CuvsIVFFlat addVectors" << std::endl;
-    raft::print_device_vector(
-            "indices", indices.data(), indices.getSize(0), std::cout);
-    raft::print_device_vector(
-            "vectors", vecs.data(), vecs.getSize(1), std::cout);
 
     FAISS_ASSERT(cuvs_index != nullptr);
 
@@ -381,11 +359,6 @@ void CuvsIVFFlat::updateQuantizer(Index* quantizer) {
                 total_elems,
                 stream);
     }
-    raft::print_device_vector(
-            "cuvs centers",
-            cuvs_index->centers().data_handle(),
-            total_elems,
-            std::cout);
 }
 
 void CuvsIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) {
diff --git a/faiss/gpu/test/torch_test_contrib_gpu.py b/faiss/gpu/test/torch_test_contrib_gpu.py
index ad2b10bc78..bd5e2f7f10 100644
--- a/faiss/gpu/test/torch_test_contrib_gpu.py
+++ b/faiss/gpu/test/torch_test_contrib_gpu.py
@@ -13,9 +13,6 @@
 from faiss.contrib.torch import clustering
 
 
-from rmm.allocators.torch import rmm_torch_allocator
-torch.cuda.memory.change_current_allocator(rmm_torch_allocator)
-
 def to_column_major_torch(x):
     if hasattr(torch, 'contiguous_format'):
         return x.t().clone(memory_format=torch.contiguous_format).t()
@@ -76,7 +73,6 @@ def test_lookup(self):
         self.assertTrue(np.array_equal(d_torch_cpu.numpy(), d_np_cpu))
         self.assertTrue(np.array_equal(i_torch_cpu.numpy(), i_np_cpu))
 
-
     # tests train, add_with_ids
     def test_train_add_with_ids(self):
         d = 32
@@ -85,42 +81,27 @@ def test_train_add_with_ids(self):
         res.noTempMemory()
 
         config = faiss.GpuIndexIVFFlatConfig()
-        config.use_cuvs = True
+        config.use_cuvs = False
 
         index = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config)
-        from rmm.allocators.cupy import rmm_cupy_allocator
-        import cupy as cp
-        import numpy as np
-        cp.cuda.set_allocator(rmm_cupy_allocator)
         xb = torch.rand(1000, d, device=torch.device('cuda', 0), dtype=torch.float32)
-        # xb = cp.random.rand(1000, d, dtype=np.float32)
-        # xb_t = torch.as_tensor(xb, device="cuda")
-        print(xb)
-        # print("torchgpu")
         index.train(xb)
 
         ids = torch.arange(1000, 1000 + xb.shape[0], device=torch.device('cuda', 0), dtype=torch.int64)
-        # ids = torch.as_tensor(cp.arange(1000, 1000 + xb_t.shape[0], dtype=np.int64), device="cuda")
 
         # Test add_with_ids with torch gpu
         index.add_with_ids(xb, ids)
         _, I = index.search(xb[10:20], 1)
-        print(I)
         self.assertTrue(torch.equal(I.view(10), ids[10:20]))
 
         # Test add_with_ids with torch cpu
         index.reset()
-        # index2 = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config)
         xb_cpu = xb.cpu()
         ids_cpu = ids.cpu()
-        # xb_cpu = cp.asarray(xb)
-        # ids_cpu = cp.asarray(ids)
 
-        print("torch_cpu")
         index.train(xb_cpu)
         index.add_with_ids(xb_cpu, ids_cpu)
         _, I = index.search(xb_cpu[10:20], 1)
-        # print(I)
         self.assertTrue(torch.equal(I.view(10), ids_cpu[10:20]))
 
         # Test add_with_ids with numpy
@@ -128,18 +109,16 @@ def test_train_add_with_ids(self):
         xb_np = xb.cpu().numpy()
         ids_np = ids.cpu().numpy()
 
-        print("torch_np")
-        # index.train(xb_np)
-        # index.add_with_ids(xb_np, ids_np)
-        # _, I = index.search(xb_np[10:20], 1)
-        # self.assertTrue(np.array_equal(I.reshape(10), ids_np[10:20]))
-        # self.assertTrue(False)
+        index.train(xb_np)
+        index.add_with_ids(xb_np, ids_np)
+        _, I = index.search(xb_np[10:20], 1)
+        self.assertTrue(np.array_equal(I.reshape(10), ids_np[10:20]))
 
     # tests reconstruct, reconstruct_n
     def test_flat_reconstruct(self):
         d = 32
         res = faiss.StandardGpuResources()
-        # res.noTempMemory()
+        res.noTempMemory()
         index = faiss.GpuIndexFlatL2(res, d)
 
         xb = torch.rand(100, d, device=torch.device('cuda', 0), dtype=torch.float32)
@@ -191,7 +170,7 @@ def test_ivfflat_reconstruct(self):
         res = faiss.StandardGpuResources()
         res.noTempMemory()
         config = faiss.GpuIndexIVFFlatConfig()
-        config.use_cuvs = False
+        config.use_raft = False
 
         index = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config)
 
@@ -277,7 +256,7 @@ def test_sa_encode_decode(self):
         return
 
 class TestTorchUtilsKnnGpu(unittest.TestCase):
-    def test_knn_gpu(self, use_cuvs=False):
+    def test_knn_gpu(self, use_raft=False):
         torch.manual_seed(10)
         d = 32
         nb = 1024
@@ -314,7 +293,7 @@ def test_knn_gpu(self, use_cuvs=False):
                     else:
                         xb_c = xb_np
 
-                    D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_cuvs=use_cuvs)
+                    D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_raft=use_raft)
 
                     self.assertTrue(torch.equal(torch.from_numpy(I), gt_I))
                     self.assertLess((torch.from_numpy(D) - gt_D).abs().max(), 1e-4)
@@ -340,7 +319,7 @@ def test_knn_gpu(self, use_cuvs=False):
                             xb_c = to_column_major_torch(xb)
                             assert not xb_c.is_contiguous()
 
-                        D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_cuvs=use_cuvs)
+                        D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_raft=use_raft)
 
                         self.assertTrue(torch.equal(I.cpu(), gt_I))
                         self.assertLess((D.cpu() - gt_D).abs().max(), 1e-4)
@@ -348,7 +327,7 @@ def test_knn_gpu(self, use_cuvs=False):
                         # test on subset
                         try:
                             # This internally uses the current pytorch stream
-                            D, I = faiss.knn_gpu(res, xq_c[6:8], xb_c, k, use_cuvs=use_cuvs)
+                            D, I = faiss.knn_gpu(res, xq_c[6:8], xb_c, k, use_raft=use_raft)
                         except TypeError:
                             if not xq_row_major:
                                 # then it is expected
@@ -360,12 +339,12 @@ def test_knn_gpu(self, use_cuvs=False):
                         self.assertLess((D.cpu() - gt_D[6:8]).abs().max(), 1e-4)
 
     @unittest.skipUnless(
-        "CUVS" in faiss.get_compile_options(),
-        "only if cuVS is compiled in")
-    def test_knn_gpu_cuvs(self):
-        self.test_knn_gpu(use_cuvs=True)
+        "RAFT" in faiss.get_compile_options(),
+        "only if RAFT is compiled in")
+    def test_knn_gpu_raft(self):
+        self.test_knn_gpu(use_raft=True)
 
-    def test_knn_gpu_datatypes(self, use_cuvs=False):
+    def test_knn_gpu_datatypes(self, use_raft=False):
         torch.manual_seed(10)
         d = 10
         nb = 1024
@@ -388,7 +367,7 @@ def test_knn_gpu_datatypes(self, use_cuvs=False):
         D = torch.zeros(nq, k, device=xb_c.device, dtype=torch.float32)
         I = torch.zeros(nq, k, device=xb_c.device, dtype=torch.int32)
 
-        faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_cuvs=use_cuvs)
+        faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_raft=use_raft)
 
         self.assertTrue(torch.equal(I.long().cpu(), gt_I))
         self.assertLess((D.float().cpu() - gt_D).abs().max(), 1.5e-3)
@@ -400,7 +379,7 @@ def test_knn_gpu_datatypes(self, use_cuvs=False):
         xb_c = xb.half().numpy()
         xq_c = xq.half().numpy()
 
-        faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_cuvs=use_cuvs)
+        faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_raft=use_raft)
 
         self.assertTrue(torch.equal(torch.from_numpy(I).long(), gt_I))
         self.assertLess((torch.from_numpy(D) - gt_D).abs().max(), 1.5e-3)

From 5509c49e4d6c11022412e5a246b3d5b3e7216395 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 7 Oct 2024 11:37:43 -0700
Subject: [PATCH 105/148] remove find_package(raft)

---
 CMakeLists.txt |  1 -
 build.sh       | 61 --------------------------------------------------
 2 files changed, 62 deletions(-)
 delete mode 100755 build.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2e14be8d27..057088faeb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -81,7 +81,6 @@ endif()
 
 if(FAISS_ENABLE_CUVS AND NOT TARGET cuvs::cuvs)
    find_package(cuvs)
-   find_package(raft)
  endif()
 
 add_subdirectory(faiss)
diff --git a/build.sh b/build.sh
deleted file mode 100755
index 17106d5834..0000000000
--- a/build.sh
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/bin/bash
-
-# NOTE: This file is temporary for the proof-of-concept branch and will be removed before this PR is merged
-
-BUILD_TYPE=Release
-BUILD_DIR=build/
-
-RAFT_REPO_REL=""
-EXTRA_CMAKE_ARGS=""
-set -e
-
-if [[ ${RAFT_REPO_REL} != "" ]]; then
-  RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`"
-  EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCPM_raft_SOURCE=${RAFT_REPO_PATH}"
-fi
-
-if [ "$1" == "clean" ]; then
-  rm -rf build
-  rm -rf .cache
-  exit 0
-fi
-
-if [ "$1" == "test" ]; then
-  make -C build -j test
-  exit 0
-fi
-
-if [ "$1" == "test-raft" ]; then
-  ./build/faiss/gpu/test/TestRaftIndexIVFFlat
-  exit 0
-fi
-
-mkdir -p $BUILD_DIR
-cd $BUILD_DIR
-
-cmake \
- -DFAISS_ENABLE_GPU=ON \
- -DFAISS_ENABLE_CUVS=ON \
- -DFAISS_ENABLE_PYTHON=ON \
- -DBUILD_TESTING=ON \
- -DBUILD_SHARED_LIBS=ON \
- -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
- -DFAISS_OPT_LEVEL=avx2 \
- -DRAFT_NVTX=OFF \
- -DCMAKE_CUDA_ARCHITECTURES="NATIVE" \
- -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
- -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
- -DCMAKE_C_COMPILER_LAUNCHER=ccache \
- -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
- ${EXTRA_CMAKE_ARGS} \
- ../
-
-#   -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-#  -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-#  -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-
-# make -C build -j12 faiss
-cmake  --build . -j12
-# make -C build -j12 swigfaiss
-# (cd build/faiss/python && python setup.py install)
-

From 4a2228c84bd8e9a62143c489d64551f00f275d75 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 7 Oct 2024 14:14:31 -0700
Subject: [PATCH 106/148] make brute_force::search compatible with 24.10

---
 faiss/gpu/GpuDistance.cu        | 14 ++------------
 faiss/gpu/impl/CuvsFlatIndex.cu |  3 +--
 2 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu
index 4835586ad4..fcf07f8bd8 100644
--- a/faiss/gpu/GpuDistance.cu
+++ b/faiss/gpu/GpuDistance.cu
@@ -318,12 +318,7 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
             cuvs::neighbors::brute_force::index<float> idx(
                     handle, index.view(), norms_view, distance, metric_arg);
             cuvs::neighbors::brute_force::search(
-                    handle,
-                    idx,
-                    search.view(),
-                    inds.view(),
-                    dists.view(),
-                    std::nullopt);
+                    handle, idx, search.view(), inds.view(), dists.view());
         } else {
             auto index = raft::make_readonly_temporary_device_buffer<
                     const float,
@@ -362,12 +357,7 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
             cuvs::neighbors::brute_force::index<float> idx(
                     handle, index.view(), norms_view, distance, metric_arg);
             cuvs::neighbors::brute_force::search(
-                    handle,
-                    idx,
-                    search.view(),
-                    inds.view(),
-                    dists.view(),
-                    std::nullopt);
+                    handle, idx, search.view(), inds.view(), dists.view());
         }
 
         if (args.metric == MetricType::METRIC_Lp) {
diff --git a/faiss/gpu/impl/CuvsFlatIndex.cu b/faiss/gpu/impl/CuvsFlatIndex.cu
index 736203d04f..c162d67792 100644
--- a/faiss/gpu/impl/CuvsFlatIndex.cu
+++ b/faiss/gpu/impl/CuvsFlatIndex.cu
@@ -98,8 +98,7 @@ void CuvsFlatIndex::query(
 
         cuvs::neighbors::brute_force::index idx(
                 handle, index, norms_view, distance, metricArg);
-        cuvs::neighbors::brute_force::search(
-                handle, idx, search, inds, dists, std::nullopt);
+        cuvs::neighbors::brute_force::search(handle, idx, search, inds, dists);
 
         if (metric == MetricType::METRIC_Lp) {
             raft::linalg::unary_op(

From d55c55325f19f38eb19457d70dd5d73d45ccb5f7 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 7 Oct 2024 15:56:45 -0700
Subject: [PATCH 107/148] rm raft::raft linkage

---
 c_api/gpu/CMakeLists.txt      |  2 +-
 faiss/gpu/CMakeLists.txt      |  2 +-
 faiss/gpu/test/CMakeLists.txt |  2 +-
 faiss/python/CMakeLists.txt   | 11 ++++-------
 tests/CMakeLists.txt          |  1 -
 5 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/c_api/gpu/CMakeLists.txt b/c_api/gpu/CMakeLists.txt
index a8b08316e8..78ef7736fc 100644
--- a/c_api/gpu/CMakeLists.txt
+++ b/c_api/gpu/CMakeLists.txt
@@ -19,7 +19,7 @@ if (FAISS_ENABLE_ROCM)
   target_link_libraries(faiss_c PUBLIC hip::host roc::hipblas)
 else()
   find_package(CUDAToolkit REQUIRED)
-  target_link_libraries(faiss_c PUBLIC CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
+  target_link_libraries(faiss_c PUBLIC CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
 endif()
 
 add_executable(example_gpu_c EXCLUDE_FROM_ALL example_gpu_c.c)
diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
index de8f97f972..03ec63298e 100644
--- a/faiss/gpu/CMakeLists.txt
+++ b/faiss/gpu/CMakeLists.txt
@@ -334,7 +334,7 @@ else()
 
 
   find_package(CUDAToolkit REQUIRED)
-  target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:OpenMP::OpenMP_CXX>)
+  target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:OpenMP::OpenMP_CXX>)
   target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$<BOOL:${FAISS_ENABLE_CUVS}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
   target_compile_options(faiss_gpu PRIVATE
     $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all
diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt
index 18a1549249..b76d082bec 100644
--- a/faiss/gpu/test/CMakeLists.txt
+++ b/faiss/gpu/test/CMakeLists.txt
@@ -24,7 +24,7 @@ if(FAISS_ENABLE_ROCM)
   target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest hip::host)
 else()
   find_package(CUDAToolkit REQUIRED)
-  target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
+  target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
 endif()
 
 macro(faiss_gpu_test file)
diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt
index 341d5ba380..c19fd623ee 100644
--- a/faiss/python/CMakeLists.txt
+++ b/faiss/python/CMakeLists.txt
@@ -175,14 +175,11 @@ if(FAISS_ENABLE_GPU)
     find_package(CUDAToolkit REQUIRED)
     if(FAISS_ENABLE_CUVS)
       find_package(cuvs)
-      if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)
-        find_package(raft)
-      endif()
     endif()
-    target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
-    target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
-    target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
-    target_link_libraries(swigfaiss_sve PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
+    target_link_libraries(swigfaiss PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
+    target_link_libraries(swigfaiss_avx2 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
+    target_link_libraries(swigfaiss_avx512 PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
+    target_link_libraries(swigfaiss_sve PRIVATE CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
   endif()
 endif()
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index f098de7a9a..d611e3d00c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -122,7 +122,6 @@ find_package(GTest CONFIG REQUIRED)
 target_link_libraries(faiss_test PRIVATE
   OpenMP::OpenMP_CXX
   GTest::gtest_main
-  $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft>
   $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>
   $<$<BOOL:${FAISS_ENABLE_ROCM}>:hip::host>
 )

From 31c91e65e2f7d5581cc7c25c60ad13d11ac56596 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 7 Oct 2024 16:37:59 -0700
Subject: [PATCH 108/148] downgrade cmake version

---
 .github/actions/build_cmake/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index f402c2aa5a..c523757108 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -32,7 +32,7 @@ runs:
         conda update -y -q conda
         echo "$CONDA/bin" >> $GITHUB_PATH
 
-        conda install -y -q python=3.11 cmake make swig numpy scipy pytest gflags
+        conda install -y -q python=3.11 cmake=3.26.4 make swig numpy scipy pytest gflags
 
         # install base packages for ARM64
         if [ "${{ runner.arch }}" = "ARM64" ]; then

From 16e23bb6c314f4e1a2559efda27bb2e1b20d243f Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 7 Oct 2024 17:31:19 -0700
Subject: [PATCH 109/148] rm mentions of use_raft

---
 faiss/gpu/test/torch_test_contrib_gpu.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/faiss/gpu/test/torch_test_contrib_gpu.py b/faiss/gpu/test/torch_test_contrib_gpu.py
index bd5e2f7f10..60e47d2a68 100644
--- a/faiss/gpu/test/torch_test_contrib_gpu.py
+++ b/faiss/gpu/test/torch_test_contrib_gpu.py
@@ -170,7 +170,7 @@ def test_ivfflat_reconstruct(self):
         res = faiss.StandardGpuResources()
         res.noTempMemory()
         config = faiss.GpuIndexIVFFlatConfig()
-        config.use_raft = False
+        config.use_cuvs = False
 
         index = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config)
 
@@ -256,7 +256,7 @@ def test_sa_encode_decode(self):
         return
 
 class TestTorchUtilsKnnGpu(unittest.TestCase):
-    def test_knn_gpu(self, use_raft=False):
+    def test_knn_gpu(self, use_cuvs=False):
         torch.manual_seed(10)
         d = 32
         nb = 1024
@@ -293,7 +293,7 @@ def test_knn_gpu(self, use_raft=False):
                     else:
                         xb_c = xb_np
 
-                    D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_raft=use_raft)
+                    D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_cuvs=use_cuvs)
 
                     self.assertTrue(torch.equal(torch.from_numpy(I), gt_I))
                     self.assertLess((torch.from_numpy(D) - gt_D).abs().max(), 1e-4)
@@ -319,7 +319,7 @@ def test_knn_gpu(self, use_raft=False):
                             xb_c = to_column_major_torch(xb)
                             assert not xb_c.is_contiguous()
 
-                        D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_raft=use_raft)
+                        D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_cuvs=use_cuvs)
 
                         self.assertTrue(torch.equal(I.cpu(), gt_I))
                         self.assertLess((D.cpu() - gt_D).abs().max(), 1e-4)
@@ -327,7 +327,7 @@ def test_knn_gpu(self, use_raft=False):
                         # test on subset
                         try:
                             # This internally uses the current pytorch stream
-                            D, I = faiss.knn_gpu(res, xq_c[6:8], xb_c, k, use_raft=use_raft)
+                            D, I = faiss.knn_gpu(res, xq_c[6:8], xb_c, k, use_cuvs=use_cuvs)
                         except TypeError:
                             if not xq_row_major:
                                 # then it is expected
@@ -342,9 +342,9 @@ def test_knn_gpu(self, use_raft=False):
         "RAFT" in faiss.get_compile_options(),
         "only if RAFT is compiled in")
     def test_knn_gpu_raft(self):
-        self.test_knn_gpu(use_raft=True)
+        self.test_knn_gpu(use_cuvs=True)
 
-    def test_knn_gpu_datatypes(self, use_raft=False):
+    def test_knn_gpu_datatypes(self, use_cuvs=False):
         torch.manual_seed(10)
         d = 10
         nb = 1024
@@ -367,7 +367,7 @@ def test_knn_gpu_datatypes(self, use_raft=False):
         D = torch.zeros(nq, k, device=xb_c.device, dtype=torch.float32)
         I = torch.zeros(nq, k, device=xb_c.device, dtype=torch.int32)
 
-        faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_raft=use_raft)
+        faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_cuvs=use_cuvs)
 
         self.assertTrue(torch.equal(I.long().cpu(), gt_I))
         self.assertLess((D.float().cpu() - gt_D).abs().max(), 1.5e-3)
@@ -379,7 +379,7 @@ def test_knn_gpu_datatypes(self, use_raft=False):
         xb_c = xb.half().numpy()
         xq_c = xq.half().numpy()
 
-        faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_raft=use_raft)
+        faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_cuvs=use_cuvs)
 
         self.assertTrue(torch.equal(torch.from_numpy(I).long(), gt_I))
         self.assertLess((torch.from_numpy(D) - gt_D).abs().max(), 1.5e-3)

From f472eb40638e5f62d87d4da76c31f9cf4938ef1b Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 7 Oct 2024 18:10:44 -0700
Subject: [PATCH 110/148] empty commit


From 10344a157d589a798aeaf867046b2393c5993149 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 8 Oct 2024 09:02:03 -0700
Subject: [PATCH 111/148] do not symlink sys dependencies for cuvs

---
 .github/actions/build_cmake/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index c523757108..cbcf4b2fa8 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -111,7 +111,7 @@ runs:
       shell: bash
       run: ./faiss/gpu/hipify.sh
     - name: Symblink system dependencies
-      if: inputs.cuvs == 'ON' || inputs.rocm == 'ON'
+      if: inputs.rocm == 'ON'
       shell: bash
       run: |
         # symblink system libraries for HIP compiler

From b1bb9bae2dd5a905330911957e4727d3bfcfb2c5 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 8 Oct 2024 09:02:58 -0700
Subject: [PATCH 112/148] update torch test

---
 faiss/gpu/test/torch_test_contrib_gpu.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/faiss/gpu/test/torch_test_contrib_gpu.py b/faiss/gpu/test/torch_test_contrib_gpu.py
index 60e47d2a68..714cbe3674 100644
--- a/faiss/gpu/test/torch_test_contrib_gpu.py
+++ b/faiss/gpu/test/torch_test_contrib_gpu.py
@@ -339,9 +339,9 @@ def test_knn_gpu(self, use_cuvs=False):
                         self.assertLess((D.cpu() - gt_D[6:8]).abs().max(), 1e-4)
 
     @unittest.skipUnless(
-        "RAFT" in faiss.get_compile_options(),
-        "only if RAFT is compiled in")
-    def test_knn_gpu_raft(self):
+        "CUVS" in faiss.get_compile_options(),
+        "only if CUVS is compiled in")
+    def test_knn_gpu_cuvs(self):
         self.test_knn_gpu(use_cuvs=True)
 
     def test_knn_gpu_datatypes(self, use_cuvs=False):

From 3229a386519d055f8e56b3d05a9b3a33b6069077 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 8 Oct 2024 12:43:13 -0700
Subject: [PATCH 113/148] revert: run system dependency symlink step for cuvs builds again

---
 .github/actions/build_cmake/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index cbcf4b2fa8..c523757108 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -111,7 +111,7 @@ runs:
       shell: bash
       run: ./faiss/gpu/hipify.sh
     - name: Symblink system dependencies
-      if: inputs.rocm == 'ON'
+      if: inputs.cuvs == 'ON' || inputs.rocm == 'ON'
       shell: bash
       run: |
         # symblink system libraries for HIP compiler

From 3eb6a56b42e5195ef2384e53942dd737e566e0d9 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Wed, 9 Oct 2024 10:47:27 -0700
Subject: [PATCH 114/148] glibc version

---
 .github/actions/build_cmake/action.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index c523757108..e77808c03c 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -115,6 +115,7 @@ runs:
       shell: bash
       run: |
         # symblink system libraries for HIP compiler
+        sudo apt-get install -y glibc=2.31
         sudo ln -s /lib/x86_64-linux-gnu/libc.so.6 /lib64/libc.so.6
         sudo ln -s /lib/x86_64-linux-gnu/libc_nonshared.a /usr/lib64/libc_nonshared.a
         sudo ln -s /usr/lib/x86_64-linux-gnu/libpthread.so.0 /lib64/libpthread.so.0

From abfe7f42f2962c953d6e3d71efaaefcfc37f6874 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Wed, 9 Oct 2024 11:09:49 -0700
Subject: [PATCH 115/148] install cmd

---
 .github/actions/build_cmake/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index 2071601297..3144d0a8d3 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -110,7 +110,7 @@ runs:
       shell: bash
       run: |
         # symblink system libraries for HIP compiler
-        sudo apt-get install -y glibc=2.31
+        sudo apt-get install -y libc6=2.31
         sudo ln -s /lib/x86_64-linux-gnu/libc.so.6 /lib64/libc.so.6
         sudo ln -s /lib/x86_64-linux-gnu/libc_nonshared.a /usr/lib64/libc_nonshared.a
         sudo ln -s /usr/lib/x86_64-linux-gnu/libpthread.so.0 /lib64/libpthread.so.0

From 26a45f2f60e18f1b47e724a9144bca0f341d4653 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Wed, 9 Oct 2024 11:34:51 -0700
Subject: [PATCH 116/148] upgrade pkgs

---
 .github/actions/build_cmake/action.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index 3144d0a8d3..fd3c567b59 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -110,7 +110,9 @@ runs:
       shell: bash
       run: |
         # symblink system libraries for HIP compiler
-        sudo apt-get install -y libc6=2.31
+        ldd --version
+        sudo apt-get update
+        sudo apt-get upgrade
         sudo ln -s /lib/x86_64-linux-gnu/libc.so.6 /lib64/libc.so.6
         sudo ln -s /lib/x86_64-linux-gnu/libc_nonshared.a /usr/lib64/libc_nonshared.a
         sudo ln -s /usr/lib/x86_64-linux-gnu/libpthread.so.0 /lib64/libpthread.so.0

From 265fc5273cf6e23da3b8079b13e05004d1a79e9d Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Wed, 9 Oct 2024 13:43:16 -0700
Subject: [PATCH 117/148] link omp in tests

---
 faiss/gpu/test/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt
index b76d082bec..dbd9e372de 100644
--- a/faiss/gpu/test/CMakeLists.txt
+++ b/faiss/gpu/test/CMakeLists.txt
@@ -24,7 +24,7 @@ if(FAISS_ENABLE_ROCM)
   target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest hip::host)
 else()
   find_package(CUDAToolkit REQUIRED)
-  target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs>)
+  target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:OpenMP::OpenMP_CXX>)
 endif()
 
 macro(faiss_gpu_test file)

From 209048a73973d71dbef9c9e60aeb11a4a54f7c08 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Wed, 9 Oct 2024 15:27:56 -0700
Subject: [PATCH 118/148] Xcompiler

---
 faiss/gpu/CMakeLists.txt      | 1 -
 faiss/gpu/test/CMakeLists.txt | 2 ++
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
index 03ec63298e..5951824b6c 100644
--- a/faiss/gpu/CMakeLists.txt
+++ b/faiss/gpu/CMakeLists.txt
@@ -335,7 +335,6 @@ else()
 
   find_package(CUDAToolkit REQUIRED)
   target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:OpenMP::OpenMP_CXX>)
-  target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$<BOOL:${FAISS_ENABLE_CUVS}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
   target_compile_options(faiss_gpu PRIVATE
     $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all
     --expt-extended-lambda --expt-relaxed-constexpr
diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt
index dbd9e372de..108a4cbab0 100644
--- a/faiss/gpu/test/CMakeLists.txt
+++ b/faiss/gpu/test/CMakeLists.txt
@@ -25,6 +25,8 @@ if(FAISS_ENABLE_ROCM)
 else()
   find_package(CUDAToolkit REQUIRED)
   target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:OpenMP::OpenMP_CXX>)
+  target_compile_options(faiss_gpu_test_helper PUBLIC
+    $<$<BOOL:${FAISS_ENABLE_CUVS}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
 endif()
 
 macro(faiss_gpu_test file)

From 54e772c9cab6571d9c913121b428d2c93d288af7 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Thu, 10 Oct 2024 12:55:41 -0700
Subject: [PATCH 119/148] add raft::raft to tests cmake

---
 .github/actions/build_cmake/action.yml | 3 ---
 faiss/gpu/test/CMakeLists.txt          | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index fd3c567b59..8f1d607b2c 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -110,9 +110,6 @@ runs:
       shell: bash
       run: |
         # symblink system libraries for HIP compiler
-        ldd --version
-        sudo apt-get update
-        sudo apt-get upgrade
         sudo ln -s /lib/x86_64-linux-gnu/libc.so.6 /lib64/libc.so.6
         sudo ln -s /lib/x86_64-linux-gnu/libc_nonshared.a /usr/lib64/libc_nonshared.a
         sudo ln -s /usr/lib/x86_64-linux-gnu/libpthread.so.0 /lib64/libpthread.so.0
diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt
index 108a4cbab0..e22740809e 100644
--- a/faiss/gpu/test/CMakeLists.txt
+++ b/faiss/gpu/test/CMakeLists.txt
@@ -24,7 +24,7 @@ if(FAISS_ENABLE_ROCM)
   target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest hip::host)
 else()
   find_package(CUDAToolkit REQUIRED)
-  target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:OpenMP::OpenMP_CXX>)
+  target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:OpenMP::OpenMP_CXX>)
   target_compile_options(faiss_gpu_test_helper PUBLIC
     $<$<BOOL:${FAISS_ENABLE_CUVS}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
 endif()

From 4ba1389c9a218c4a7ec82f439cd34501527c707e Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Fri, 11 Oct 2024 10:37:31 -0700
Subject: [PATCH 120/148] update compile_options

---
 faiss/gpu/test/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt
index e22740809e..644704d521 100644
--- a/faiss/gpu/test/CMakeLists.txt
+++ b/faiss/gpu/test/CMakeLists.txt
@@ -25,14 +25,14 @@ if(FAISS_ENABLE_ROCM)
 else()
   find_package(CUDAToolkit REQUIRED)
   target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$<BOOL:${FAISS_ENABLE_CUVS}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_CUVS}>:cuvs::cuvs> $<$<BOOL:${FAISS_ENABLE_CUVS}>:OpenMP::OpenMP_CXX>)
-  target_compile_options(faiss_gpu_test_helper PUBLIC
-    $<$<BOOL:${FAISS_ENABLE_CUVS}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
 endif()
 
 macro(faiss_gpu_test file)
   get_filename_component(test_name ${file} NAME_WE)
   add_executable(${test_name} ${file})
   target_link_libraries(${test_name} PRIVATE faiss_gpu_test_helper)
+  target_compile_options(${test_name} PRIVATE
+    $<$<COMPILE_LANGUAGE:CUDA>:$<$<BOOL:${FAISS_ENABLE_CUVS}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
   gtest_discover_tests(${test_name})
 endmacro()
 

From 19ddf28cd8b159da9a6db4cdc2e4ccd140c09030 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Fri, 11 Oct 2024 14:55:44 -0700
Subject: [PATCH 121/148] downgrade to cuvs=24.08

---
 cmake/thirdparty/fetch_rapids.cmake |  2 +-
 conda/faiss-gpu-cuvs/meta.yaml      |  4 ++--
 faiss/gpu/GpuDistance.cu            | 14 ++++++++++++--
 faiss/gpu/impl/CuvsFlatIndex.cu     |  4 +++-
 4 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/cmake/thirdparty/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake
index cf925a0ac1..3b9d9b140a 100644
--- a/cmake/thirdparty/fetch_rapids.cmake
+++ b/cmake/thirdparty/fetch_rapids.cmake
@@ -15,7 +15,7 @@
 # or implied. See the License for the specific language governing permissions and limitations under
 # the License.
 # =============================================================================
-set(RAPIDS_VERSION "24.10")
+set(RAPIDS_VERSION "24.08")
 
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake)
     file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake
diff --git a/conda/faiss-gpu-cuvs/meta.yaml b/conda/faiss-gpu-cuvs/meta.yaml
index be2576222d..5f6ac8010f 100644
--- a/conda/faiss-gpu-cuvs/meta.yaml
+++ b/conda/faiss-gpu-cuvs/meta.yaml
@@ -58,7 +58,7 @@ outputs:
         - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
         - mkl =2023  # [x86_64]
         - openblas  # [not x86_64]
-        - libcuvs =24.10
+        - libcuvs =24.08
         - cuda-version {{ cuda_constraints }}
       run:
         - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
@@ -66,7 +66,7 @@ outputs:
         - openblas  # [not x86_64]
         - cuda-cudart {{ cuda_constraints }}
         - libcublas {{ libcublas_constraints }}
-        - libcuvs =24.10
+        - libcuvs =24.08
         - cuda-version {{ cuda_constraints }}
     test:
       requires:
diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu
index fcf07f8bd8..4835586ad4 100644
--- a/faiss/gpu/GpuDistance.cu
+++ b/faiss/gpu/GpuDistance.cu
@@ -318,7 +318,12 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
             cuvs::neighbors::brute_force::index<float> idx(
                     handle, index.view(), norms_view, distance, metric_arg);
             cuvs::neighbors::brute_force::search(
-                    handle, idx, search.view(), inds.view(), dists.view());
+                    handle,
+                    idx,
+                    search.view(),
+                    inds.view(),
+                    dists.view(),
+                    std::nullopt);
         } else {
             auto index = raft::make_readonly_temporary_device_buffer<
                     const float,
@@ -357,7 +362,12 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
             cuvs::neighbors::brute_force::index<float> idx(
                     handle, index.view(), norms_view, distance, metric_arg);
             cuvs::neighbors::brute_force::search(
-                    handle, idx, search.view(), inds.view(), dists.view());
+                    handle,
+                    idx,
+                    search.view(),
+                    inds.view(),
+                    dists.view(),
+                    std::nullopt);
         }
 
         if (args.metric == MetricType::METRIC_Lp) {
diff --git a/faiss/gpu/impl/CuvsFlatIndex.cu b/faiss/gpu/impl/CuvsFlatIndex.cu
index c162d67792..30b3a8c827 100644
--- a/faiss/gpu/impl/CuvsFlatIndex.cu
+++ b/faiss/gpu/impl/CuvsFlatIndex.cu
@@ -24,6 +24,7 @@
 #include <faiss/gpu/impl/CuvsFlatIndex.cuh>
 #include <faiss/gpu/utils/ConversionOperators.cuh>
 
+#include <optional>
 #include <vector>
 
 #include <cuvs/neighbors/brute_force.hpp>
@@ -98,7 +99,8 @@ void CuvsFlatIndex::query(
 
         cuvs::neighbors::brute_force::index idx(
                 handle, index, norms_view, distance, metricArg);
-        cuvs::neighbors::brute_force::search(handle, idx, search, inds, dists);
+        cuvs::neighbors::brute_force::search(
+                handle, idx, search, inds, dists, std::nullopt);
 
         if (metric == MetricType::METRIC_Lp) {
             raft::linalg::unary_op(

From 0ab074abcbe28608404743ae00e6f4d91b9e63fb Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Sat, 12 Oct 2024 16:26:02 -0700
Subject: [PATCH 122/148] action

---
 .github/actions/build_cmake/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index 8f1d607b2c..e3d755536b 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -54,7 +54,7 @@ runs:
           conda install -y -q cuda-toolkit -c "nvidia/label/cuda-12.4.0"
         # and CUDA from cuVS channel for cuVS builds
         elif [ "${{ inputs.cuvs }}" = "ON" ]; then
-          conda install -y -q libcuvs=24.10 cuda-version=12.4 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-12.4.0" -c conda-forge
+          conda install -y -q libcuvs=24.08 cuda-version=12.4 cuda-toolkit -c rapidsai -c "nvidia/label/cuda-12.4.0" -c conda-forge
         fi
 
         # install test packages

From 7ad8315a59bf7d5608fe756b68e5ef8c54fcbdc8 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 15 Oct 2024 09:30:36 -0700
Subject: [PATCH 123/148] upgrade cmake with libcuvs=24.08

---
 .github/actions/build_cmake/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index e3d755536b..f08a1f160a 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -32,7 +32,7 @@ runs:
         conda update -y -q conda
         echo "$CONDA/bin" >> $GITHUB_PATH
 
-        conda install -y -q python=3.11 cmake=3.26.4 make swig numpy scipy pytest gflags
+        conda install -y -q python=3.11 cmake make swig numpy scipy pytest gflags
 
         # install base packages for ARM64
         if [ "${{ runner.arch }}" = "ARM64" ]; then

From c23df0a2e658df7d4b56d58d27e57996bef68e7e Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Wed, 16 Oct 2024 15:38:35 -0700
Subject: [PATCH 124/148] one more try with 24.10

---
 cmake/thirdparty/fetch_rapids.cmake | 2 +-
 conda/faiss-gpu-cuvs/meta.yaml      | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/cmake/thirdparty/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake
index 3b9d9b140a..cf925a0ac1 100644
--- a/cmake/thirdparty/fetch_rapids.cmake
+++ b/cmake/thirdparty/fetch_rapids.cmake
@@ -15,7 +15,7 @@
 # or implied. See the License for the specific language governing permissions and limitations under
 # the License.
 # =============================================================================
-set(RAPIDS_VERSION "24.08")
+set(RAPIDS_VERSION "24.10")
 
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake)
     file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake
diff --git a/conda/faiss-gpu-cuvs/meta.yaml b/conda/faiss-gpu-cuvs/meta.yaml
index 6c43d19a7c..83da1fd32d 100644
--- a/conda/faiss-gpu-cuvs/meta.yaml
+++ b/conda/faiss-gpu-cuvs/meta.yaml
@@ -58,7 +58,7 @@ outputs:
         - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
         - mkl =2023  # [x86_64]
         - openblas  # [not x86_64]
-        - libcuvs =24.08
+        - libcuvs =24.10
         - cuda-version {{ cuda_constraints }}
       run:
         - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
@@ -66,7 +66,7 @@ outputs:
         - openblas  # [not x86_64]
         - cuda-cudart {{ cuda_constraints }}
         - libcublas {{ libcublas_constraints }}
-        - libcuvs =24.08
+        - libcuvs =24.10
         - cuda-version {{ cuda_constraints }}
     test:
       requires:

From b4989e64542df4953510067c7efdd15dfe447a56 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Wed, 16 Oct 2024 15:39:29 -0700
Subject: [PATCH 125/148] one more try with 24.10

---
 .github/actions/build_cmake/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index f08a1f160a..ad0ce995e0 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -54,7 +54,7 @@ runs:
           conda install -y -q cuda-toolkit -c "nvidia/label/cuda-12.4.0"
         # and CUDA from cuVS channel for cuVS builds
         elif [ "${{ inputs.cuvs }}" = "ON" ]; then
-          conda install -y -q libcuvs=24.08 cuda-version=12.4 cuda-toolkit -c rapidsai -c "nvidia/label/cuda-12.4.0" -c conda-forge
+          conda install -y -q libcuvs=24.10 cuda-version=12.4 cuda-toolkit -c rapidsai -c "nvidia/label/cuda-12.4.0" -c conda-forge
         fi
 
         # install test packages

From bb5cd83a287698bbb6b0a35944f7ab57fd0a87c9 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Wed, 16 Oct 2024 15:44:05 -0700
Subject: [PATCH 126/148] rm std::nullopt

---
 faiss/gpu/GpuDistance.cu        | 14 ++------------
 faiss/gpu/impl/CuvsFlatIndex.cu |  3 +--
 2 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu
index aa3f8c497b..b449c694cc 100644
--- a/faiss/gpu/GpuDistance.cu
+++ b/faiss/gpu/GpuDistance.cu
@@ -318,12 +318,7 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
             cuvs::neighbors::brute_force::index<float> idx(
                     handle, index.view(), norms_view, distance, metric_arg);
             cuvs::neighbors::brute_force::search(
-                    handle,
-                    idx,
-                    search.view(),
-                    inds.view(),
-                    dists.view(),
-                    std::nullopt);
+                    handle, idx, search.view(), inds.view(), dists.view());
         } else {
             auto index = raft::make_readonly_temporary_device_buffer<
                     const float,
@@ -362,12 +357,7 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
             cuvs::neighbors::brute_force::index<float> idx(
                     handle, index.view(), norms_view, distance, metric_arg);
             cuvs::neighbors::brute_force::search(
-                    handle,
-                    idx,
-                    search.view(),
-                    inds.view(),
-                    dists.view(),
-                    std::nullopt);
+                    handle, idx, search.view(), inds.view(), dists.view());
         }
 
         if (args.metric == MetricType::METRIC_Lp) {
diff --git a/faiss/gpu/impl/CuvsFlatIndex.cu b/faiss/gpu/impl/CuvsFlatIndex.cu
index 30b3a8c827..a2b7e1a943 100644
--- a/faiss/gpu/impl/CuvsFlatIndex.cu
+++ b/faiss/gpu/impl/CuvsFlatIndex.cu
@@ -99,8 +99,7 @@ void CuvsFlatIndex::query(
 
         cuvs::neighbors::brute_force::index idx(
                 handle, index, norms_view, distance, metricArg);
-        cuvs::neighbors::brute_force::search(
-                handle, idx, search, inds, dists, std::nullopt);
+        cuvs::neighbors::brute_force::search(handle, idx, search, inds, dists);
 
         if (metric == MetricType::METRIC_Lp) {
             raft::linalg::unary_op(

From 3361b445ee3c8cac2d47458c940a52b9b4607dfd Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Wed, 16 Oct 2024 18:01:15 -0700
Subject: [PATCH 127/148] revert changes to should_use_cuvs

---
 faiss/gpu/GpuDistance.cu | 2 +-
 faiss/gpu/GpuIndex.cu    | 5 +----
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu
index b449c694cc..fcf07f8bd8 100644
--- a/faiss/gpu/GpuDistance.cu
+++ b/faiss/gpu/GpuDistance.cu
@@ -51,7 +51,7 @@ bool should_use_cuvs(GpuDistanceParams args) {
     int dev = args.device >= 0 ? args.device : getCurrentDevice();
     auto prop = getDeviceProperties(dev);
 
-    if (device_major_version < 7)
+    if (prop.major < 7)
         return false;
 
     return args.use_cuvs;
diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu
index 39a1ce7a8f..3aeef876d4 100644
--- a/faiss/gpu/GpuIndex.cu
+++ b/faiss/gpu/GpuIndex.cu
@@ -42,13 +42,10 @@ constexpr idx_t kAddVecSize = (idx_t)512 * 1024;
 // FIXME: parameterize based on algorithm need
 constexpr idx_t kSearchVecSize = (idx_t)32 * 1024;
 
-/// Caches device major version
-extern int device_major_version;
-
 bool should_use_cuvs(GpuIndexConfig config_) {
     auto prop = getDeviceProperties(config_.device);
 
-    if (device_major_version < 7)
+    if (prop.major < 7)
         return false;
 
     return config_.use_cuvs;

From 4decfb4d1929c5af1b04ab2de2669ea853a04386 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 21 Oct 2024 16:58:47 -0700
Subject: [PATCH 128/148] pin gcc_linux-64 and sysroot_linux-64

---
 .github/actions/build_cmake/action.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index ad0ce995e0..6cc1d1d65c 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -42,7 +42,7 @@ runs:
         # install base packages for X86_64
         if [ "${{ runner.arch }}" = "X64" ]; then
           # TODO: merge this with ARM64
-          conda install -y -q -c conda-forge gxx_linux-64 sysroot_linux-64
+          conda install -y -q -c conda-forge gxx_linux-64=11.2 sysroot_linux-64=2.28
           conda install -y -q mkl=2023 mkl-devel=2023
         fi
 
@@ -54,7 +54,7 @@ runs:
           conda install -y -q cuda-toolkit -c "nvidia/label/cuda-12.4.0"
         # and CUDA from cuVS channel for cuVS builds
         elif [ "${{ inputs.cuvs }}" = "ON" ]; then
-          conda install -y -q libcuvs=24.10 cuda-version=12.4 cuda-toolkit -c rapidsai -c "nvidia/label/cuda-12.4.0" -c conda-forge
+          conda install -y -q cmake=3.26 libcuvs=24.10 cuda-version=12.4 cuda-toolkit -c rapidsai -c "nvidia/label/cuda-12.4.0" -c conda-forge
         fi
 
         # install test packages

From 929a6d80f9e536ddafb753a4ec78a226c41727d6 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 21 Oct 2024 17:00:36 -0700
Subject: [PATCH 129/148] do not symlink system deps

---
 .github/actions/build_cmake/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index 88af43cfe1..373b5e5263 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -102,7 +102,7 @@ runs:
         sudo apt-get -qq clean >/dev/null
         sudo rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
     - name: Symblink system dependencies
-      if: inputs.cuvs == 'ON' || inputs.rocm == 'ON'
+      if: inputs.rocm == 'ON'
       shell: bash
       run: |
         # symblink system libraries for HIP compiler

From 48ba59da9502382bbeef121fc64e0c6f3bef2759 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 22 Oct 2024 10:24:53 -0700
Subject: [PATCH 130/148] put conda-forge before nvidia channel

---
 .github/actions/build_cmake/action.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index 373b5e5263..9caa310841 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -42,7 +42,7 @@ runs:
         # install base packages for X86_64
         if [ "${{ runner.arch }}" = "X64" ]; then
           # TODO: merge this with ARM64
-          conda install -y -q -c conda-forge gxx_linux-64=11.2 sysroot_linux-64=2.28
+          conda install -y -q -c conda-forge gxx_linux-64=11.2 sysroot_linux-64
           conda install -y -q mkl=2023 mkl-devel=2023
         fi
 
@@ -54,7 +54,7 @@ runs:
           conda install -y -q cuda-toolkit -c "nvidia/label/cuda-12.4.0"
         # and CUDA from cuVS channel for cuVS builds
         elif [ "${{ inputs.cuvs }}" = "ON" ]; then
-          conda install -y -q cmake=3.26 libcuvs=24.10 cuda-version=12.4 cuda-toolkit -c rapidsai -c "nvidia/label/cuda-12.4.0" -c conda-forge
+          conda install -y -q cmake=3.26 libcuvs=24.10 cuda-version=12.4 cuda-toolkit -c rapidsai -c conda-forge -c "nvidia/label/cuda-12.4.0"
         fi
 
         # install test packages

From d8ebaa3cad347543fdb5ec6d0e421a18bc6a56af Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Thu, 24 Oct 2024 22:03:28 -0700
Subject: [PATCH 131/148] set device scope in cagra train

---
 faiss/gpu/GpuIndexCagra.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu
index b1c1bbef54..fe0c82b8aa 100644
--- a/faiss/gpu/GpuIndexCagra.cu
+++ b/faiss/gpu/GpuIndexCagra.cu
@@ -42,6 +42,7 @@ GpuIndexCagra::GpuIndexCagra(
 }
 
 void GpuIndexCagra::train(idx_t n, const float* x) {
+    DeviceScope scope(config_.device);
     if (this->is_trained) {
         FAISS_ASSERT(index_);
         return;

From d779cf2937c7f90b0cdd3de86c86716d50115853 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Thu, 24 Oct 2024 22:05:55 -0700
Subject: [PATCH 132/148] unpin pkgs in action

---
 .github/actions/build_cmake/action.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index 5a80fbcac1..70e2b28fbd 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -42,7 +42,7 @@ runs:
         # install base packages for X86_64
         if [ "${{ runner.arch }}" = "X64" ]; then
           # TODO: merge this with ARM64
-          conda install -y -q -c conda-forge gxx_linux-64=11.2 sysroot_linux-64
+          conda install -y -q -c conda-forge gxx_linux-64 sysroot_linux-64
           conda install -y -q mkl=2023 mkl-devel=2023
         fi
 
@@ -54,7 +54,7 @@ runs:
           conda install -y -q cuda-toolkit -c "nvidia/label/cuda-12.4.0"
         # and CUDA from cuVS channel for cuVS builds
         elif [ "${{ inputs.cuvs }}" = "ON" ]; then
-          conda install -y -q cmake=3.26 libcuvs=24.10 cuda-version=12.4 cuda-toolkit -c rapidsai -c conda-forge -c "nvidia/label/cuda-12.4.0"
+          conda install -y -q libcuvs=24.10 cuda-version=12.4 cuda-toolkit -c rapidsai -c conda-forge -c "nvidia/label/cuda-12.4.0"
         fi
 
         # install test packages

From 38e14b412d11b823d3c4b9dd64b40500b15a6168 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Fri, 25 Oct 2024 15:58:43 -0700
Subject: [PATCH 133/148] pin cmake

---
 .github/actions/build_cmake/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index 70e2b28fbd..065f4edb42 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -54,7 +54,7 @@ runs:
           conda install -y -q cuda-toolkit -c "nvidia/label/cuda-12.4.0"
         # and CUDA from cuVS channel for cuVS builds
         elif [ "${{ inputs.cuvs }}" = "ON" ]; then
-          conda install -y -q libcuvs=24.10 cuda-version=12.4 cuda-toolkit -c rapidsai -c conda-forge -c "nvidia/label/cuda-12.4.0"
+          conda install -y -q cmake=3.26 libcuvs=24.10 cuda-version=12.4 cuda-toolkit -c rapidsai -c conda-forge -c "nvidia/label/cuda-12.4.0"
         fi
 
         # install test packages

From 9de6563a40c3ce52e5b79ef9aff01652ffed4a74 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 29 Oct 2024 10:02:41 -0700
Subject: [PATCH 134/148] specify conda-forge while installing base deps

---
 .github/actions/build_cmake/action.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index 065f4edb42..b4c62b0bfd 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -32,7 +32,7 @@ runs:
         conda update -y -q conda
         echo "$CONDA/bin" >> $GITHUB_PATH
 
-        conda install -y -q python=3.11 cmake make swig numpy scipy pytest gflags
+        conda install -y -q -c conda-forge python=3.11 make swig numpy scipy pytest gflags
 
         # install base packages for ARM64
         if [ "${{ runner.arch }}" = "ARM64" ]; then
@@ -54,7 +54,7 @@ runs:
           conda install -y -q cuda-toolkit -c "nvidia/label/cuda-12.4.0"
         # and CUDA from cuVS channel for cuVS builds
         elif [ "${{ inputs.cuvs }}" = "ON" ]; then
-          conda install -y -q cmake=3.26 libcuvs=24.10 cuda-version=12.4 cuda-toolkit -c rapidsai -c conda-forge -c "nvidia/label/cuda-12.4.0"
+          conda install -y -q libcuvs=24.10 cuda-version=12.4 cuda-toolkit -c rapidsai -c conda-forge -c "nvidia/label/cuda-12.4.0"
         fi
 
         # install test packages

From 0d0d7d3325f0f018dbb4ca58daa98471f665990f Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 29 Oct 2024 21:28:18 -0700
Subject: [PATCH 135/148] install cmake from conda-forge

---
 .github/actions/build_cmake/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index b4c62b0bfd..65580e5a78 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -32,7 +32,7 @@ runs:
         conda update -y -q conda
         echo "$CONDA/bin" >> $GITHUB_PATH
 
-        conda install -y -q -c conda-forge python=3.11 make swig numpy scipy pytest gflags
+        conda install -y -q -c conda-forge cmake python=3.11 make swig numpy scipy pytest gflags
 
         # install base packages for ARM64
         if [ "${{ runner.arch }}" = "ARM64" ]; then

From 2bbdd15361afa8b7a9485aa6641ff08644d3a76b Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 29 Oct 2024 21:32:30 -0700
Subject: [PATCH 136/148] rm conda-forge

---
 .github/actions/build_cmake/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index 65580e5a78..4299fcaf81 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -32,7 +32,7 @@ runs:
         conda update -y -q conda
         echo "$CONDA/bin" >> $GITHUB_PATH
 
-        conda install -y -q -c conda-forge cmake python=3.11 make swig numpy scipy pytest gflags
+        conda install -y -q cmake python=3.11 make swig numpy scipy pytest gflags
 
         # install base packages for ARM64
         if [ "${{ runner.arch }}" = "ARM64" ]; then

From 281c2b6071790b7192ccf4a348e70ad0288f3935 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 29 Oct 2024 21:45:23 -0700
Subject: [PATCH 137/148] only install cmake from conda-forge

---
 .github/actions/build_cmake/action.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index 4299fcaf81..ab7d0d87ef 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -32,7 +32,8 @@ runs:
         conda update -y -q conda
         echo "$CONDA/bin" >> $GITHUB_PATH
 
-        conda install -y -q cmake python=3.11 make swig numpy scipy pytest gflags
+        conda install -y -q -c conda-forge cmake
+        conda install -y -q python=3.11 make swig numpy scipy pytest gflags
 
         # install base packages for ARM64
         if [ "${{ runner.arch }}" = "ARM64" ]; then

From bb68cf76dcd4167a4c95ee60adf216c6dbd91849 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 29 Oct 2024 21:49:23 -0700
Subject: [PATCH 138/148] make from conda-forge

---
 .github/actions/build_cmake/action.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index ab7d0d87ef..383298e28f 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -32,8 +32,8 @@ runs:
         conda update -y -q conda
         echo "$CONDA/bin" >> $GITHUB_PATH
 
-        conda install -y -q -c conda-forge cmake
-        conda install -y -q python=3.11 make swig numpy scipy pytest gflags
+        conda install -y -q -c conda-forge cmake make
+        conda install -y -q python=3.11 swig numpy scipy pytest gflags
 
         # install base packages for ARM64
         if [ "${{ runner.arch }}" = "ARM64" ]; then

From a8c2926c0c2f728b896a3fd70965d9db3e2ca676 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Tue, 29 Oct 2024 21:55:39 -0700
Subject: [PATCH 139/148] revert

---
 .github/actions/build_cmake/action.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index 383298e28f..f07cb78f9b 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -32,8 +32,7 @@ runs:
         conda update -y -q conda
         echo "$CONDA/bin" >> $GITHUB_PATH
 
-        conda install -y -q -c conda-forge cmake make
-        conda install -y -q python=3.11 swig numpy scipy pytest gflags
+        conda install -y -q python=3.11 cmake=3.26.4 make swig numpy scipy pytest gflags
 
         # install base packages for ARM64
         if [ "${{ runner.arch }}" = "ARM64" ]; then

From b8219728d132c1f89858663c93930fa15b28f5f5 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 4 Nov 2024 08:35:26 -0800
Subject: [PATCH 140/148] debug failing cagra test

---
 faiss/gpu/StandardGpuResources.cpp | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp
index 878cc1ce15..69a45323be 100644
--- a/faiss/gpu/StandardGpuResources.cpp
+++ b/faiss/gpu/StandardGpuResources.cpp
@@ -129,6 +129,12 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
 
     FAISS_ASSERT_MSG(
             !allocError, "GPU memory allocations not properly cleaned up");
+    
+#if defined USE_NVIDIA_CUVS
+    for (auto it = raftHandles_.begin(); it != raftHandles_.end(); it++) {
+        raftHandles_.erase(it); // Remove the current element and move to the next
+    }
+#endif
 
     for (auto& entry : defaultStreams_) {
         DeviceScope scope(entry.first);
@@ -147,7 +153,6 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
 
     for (auto& entry : asyncCopyStreams_) {
         DeviceScope scope(entry.first);
-
         CUDA_VERIFY(cudaStreamDestroy(entry.second));
     }
 
@@ -392,7 +397,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
     // Create streams
     cudaStream_t defaultStream = nullptr;
     CUDA_VERIFY(
-            cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking));
+            cudaStreamCreate(&defaultStream));
 
     defaultStreams_[device] = defaultStream;
 
@@ -402,14 +407,14 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
 
     cudaStream_t asyncCopyStream = 0;
     CUDA_VERIFY(
-            cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking));
+            cudaStreamCreate(&asyncCopyStream));
 
     asyncCopyStreams_[device] = asyncCopyStream;
 
     std::vector<cudaStream_t> deviceStreams;
     for (int j = 0; j < kNumStreams; ++j) {
         cudaStream_t stream = nullptr;
-        CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+        CUDA_VERIFY(cudaStreamCreate(&stream));
 
         deviceStreams.push_back(stream);
     }
@@ -632,12 +637,19 @@ void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
     }
 
     if (req.space == MemorySpace::Temporary) {
+        std::cout << "deallocating temp memory" << std::endl;
         tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);
     } else if (
             req.space == MemorySpace::Device ||
             req.space == MemorySpace::Unified) {
 #if defined USE_NVIDIA_CUVS
-        req.mr->deallocate_async(p, req.size, req.stream);
+        std::cout << "now doing rmm dealloc" << std::endl;
+        // sleep(10);
+        cudaPointerAttributes attributes;
+        auto err = cudaPointerGetAttributes(&attributes, p);
+        if (err == cudaSuccess && attributes.type == cudaMemoryTypeDevice) {
+            req.mr->deallocate_async(p, req.size, req.stream);
+        }
 #else
         auto err = cudaFree(p);
         FAISS_ASSERT_FMT(

From dd16e77bea20f1a04f909e9087c237dc122a97c1 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 4 Nov 2024 08:39:33 -0800
Subject: [PATCH 141/148] revert to 24.08

---
 cmake/thirdparty/fetch_rapids.cmake |  2 +-
 conda/faiss-gpu-cuvs/meta.yaml      |  4 +-
 faiss/gpu/GpuDistance.cu            |  4 +-
 faiss/gpu/StandardGpuResources.cpp  | 74 ++++++++++-------------------
 faiss/gpu/impl/CuvsFlatIndex.cu     |  2 +-
 5 files changed, 32 insertions(+), 54 deletions(-)

diff --git a/cmake/thirdparty/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake
index cf925a0ac1..3b9d9b140a 100644
--- a/cmake/thirdparty/fetch_rapids.cmake
+++ b/cmake/thirdparty/fetch_rapids.cmake
@@ -15,7 +15,7 @@
 # or implied. See the License for the specific language governing permissions and limitations under
 # the License.
 # =============================================================================
-set(RAPIDS_VERSION "24.10")
+set(RAPIDS_VERSION "24.08")
 
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake)
     file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake
diff --git a/conda/faiss-gpu-cuvs/meta.yaml b/conda/faiss-gpu-cuvs/meta.yaml
index 83da1fd32d..6c43d19a7c 100644
--- a/conda/faiss-gpu-cuvs/meta.yaml
+++ b/conda/faiss-gpu-cuvs/meta.yaml
@@ -58,7 +58,7 @@ outputs:
         - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
         - mkl =2023  # [x86_64]
         - openblas  # [not x86_64]
-        - libcuvs =24.10
+        - libcuvs =24.08
         - cuda-version {{ cuda_constraints }}
       run:
         - _openmp_mutex =4.5=2_kmp_llvm  # [x86_64]
@@ -66,7 +66,7 @@ outputs:
         - openblas  # [not x86_64]
         - cuda-cudart {{ cuda_constraints }}
         - libcublas {{ libcublas_constraints }}
-        - libcuvs =24.10
+        - libcuvs =24.08
         - cuda-version {{ cuda_constraints }}
     test:
       requires:
diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu
index 202fa31bf6..5e23c4b6bd 100644
--- a/faiss/gpu/GpuDistance.cu
+++ b/faiss/gpu/GpuDistance.cu
@@ -319,7 +319,7 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
             cuvs::neighbors::brute_force::index<float> idx(
                     handle, index.view(), norms_view, distance, metric_arg);
             cuvs::neighbors::brute_force::search(
-                    handle, idx, search.view(), inds.view(), dists.view());
+                    handle, idx, search.view(), inds.view(), dists.view(), std::nullopt);
         } else {
             auto index = raft::make_readonly_temporary_device_buffer<
                     const float,
@@ -358,7 +358,7 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
             cuvs::neighbors::brute_force::index<float> idx(
                     handle, index.view(), norms_view, distance, metric_arg);
             cuvs::neighbors::brute_force::search(
-                    handle, idx, search.view(), inds.view(), dists.view());
+                    handle, idx, search.view(), inds.view(), dists.view(), std::nullopt);
         }
 
         if (args.metric == MetricType::METRIC_Lp) {
diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp
index 69a45323be..f7c065e572 100644
--- a/faiss/gpu/StandardGpuResources.cpp
+++ b/faiss/gpu/StandardGpuResources.cpp
@@ -21,7 +21,7 @@
  * limitations under the License.
  */
 
-#if defined USE_NVIDIA_CUVS
+#if defined USE_NVIDIA_RAFT
 #include <raft/core/device_resources.hpp>
 #include <rmm/mr/device/managed_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
@@ -91,7 +91,7 @@ std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
 
 StandardGpuResourcesImpl::StandardGpuResourcesImpl()
         :
-#if defined USE_NVIDIA_CUVS
+#if defined USE_NVIDIA_RAFT
           mmr_(new rmm::mr::managed_memory_resource),
           pmr_(new rmm::mr::pinned_memory_resource),
 #endif
@@ -129,12 +129,6 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
 
     FAISS_ASSERT_MSG(
             !allocError, "GPU memory allocations not properly cleaned up");
-    
-#if defined USE_NVIDIA_CUVS
-    for (auto it = raftHandles_.begin(); it != raftHandles_.end(); it++) {
-        raftHandles_.erase(it); // Remove the current element and move to the next
-    }
-#endif
 
     for (auto& entry : defaultStreams_) {
         DeviceScope scope(entry.first);
@@ -153,6 +147,7 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
 
     for (auto& entry : asyncCopyStreams_) {
         DeviceScope scope(entry.first);
+
         CUDA_VERIFY(cudaStreamDestroy(entry.second));
     }
 
@@ -164,7 +159,7 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
     }
 
     if (pinnedMemAlloc_) {
-#if defined USE_NVIDIA_CUVS
+#if defined USE_NVIDIA_RAFT
         pmr_->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_);
 #else
         auto err = cudaFreeHost(pinnedMemAlloc_);
@@ -263,12 +258,12 @@ void StandardGpuResourcesImpl::setDefaultStream(
         if (prevStream != stream) {
             streamWait({stream}, {prevStream});
         }
-#if defined USE_NVIDIA_CUVS
+#if defined USE_NVIDIA_RAFT
         // delete the raft handle for this device, which will be initialized
         // with the updated stream during any subsequent calls to getRaftHandle
         auto it2 = raftHandles_.find(device);
         if (it2 != raftHandles_.end()) {
-            raft::resource::set_cuda_stream(it2->second, stream);
+            raftHandles_.erase(it2);
         }
 #endif
     }
@@ -288,25 +283,15 @@ void StandardGpuResourcesImpl::revertDefaultStream(int device) {
             cudaStream_t newStream = defaultStreams_[device];
 
             streamWait({newStream}, {prevStream});
-
-#if defined USE_NVIDIA_CUVS
-            // update the stream on the raft handle for this device
-            auto it2 = raftHandles_.find(device);
-            if (it2 != raftHandles_.end()) {
-                raft::resource::set_cuda_stream(it2->second, newStream);
-            }
-#endif
-        } else {
-#if defined USE_NVIDIA_CUVS
-            // delete the raft handle for this device, which will be initialized
-            // with the updated stream during any subsequent calls to
-            // getRaftHandle
-            auto it2 = raftHandles_.find(device);
-            if (it2 != raftHandles_.end()) {
-                raftHandles_.erase(it2);
-            }
-#endif
         }
+#if defined USE_NVIDIA_RAFT
+        // delete the raft handle for this device, which will be initialized
+        // with the updated stream during any subsequent calls to getRaftHandle
+        auto it2 = raftHandles_.find(device);
+        if (it2 != raftHandles_.end()) {
+            raftHandles_.erase(it2);
+        }
+#endif
     }
 
     userDefaultStreams_.erase(device);
@@ -339,7 +324,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
     // If this is the first device that we're initializing, create our
     // pinned memory allocation
     if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
-#if defined USE_NVIDIA_CUVS
+#if defined USE_NVIDIA_RAFT
         // If this is the first device that we're initializing, create our
         // pinned memory allocation
         if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
@@ -397,24 +382,24 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
     // Create streams
     cudaStream_t defaultStream = nullptr;
     CUDA_VERIFY(
-            cudaStreamCreate(&defaultStream));
+            cudaStreamCreateWithFlags(&defaultStream, cudaStreamNonBlocking));
 
     defaultStreams_[device] = defaultStream;
 
-#if defined USE_NVIDIA_CUVS
+#if defined USE_NVIDIA_RAFT
     raftHandles_.emplace(std::make_pair(device, defaultStream));
 #endif
 
     cudaStream_t asyncCopyStream = 0;
     CUDA_VERIFY(
-            cudaStreamCreate(&asyncCopyStream));
+            cudaStreamCreateWithFlags(&asyncCopyStream, cudaStreamNonBlocking));
 
     asyncCopyStreams_[device] = asyncCopyStream;
 
     std::vector<cudaStream_t> deviceStreams;
     for (int j = 0; j < kNumStreams; ++j) {
         cudaStream_t stream = nullptr;
-        CUDA_VERIFY(cudaStreamCreate(&stream));
+        CUDA_VERIFY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 
         deviceStreams.push_back(stream);
     }
@@ -467,7 +452,7 @@ cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) {
     return defaultStreams_[device];
 }
 
-#if defined USE_NVIDIA_CUVS
+#if defined USE_NVIDIA_RAFT
 raft::device_resources& StandardGpuResourcesImpl::getRaftHandle(int device) {
     initializeForDevice(device);
 
@@ -538,7 +523,7 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
         // Otherwise, we can handle this locally
         p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
     } else if (adjReq.space == MemorySpace::Device) {
-#if defined USE_NVIDIA_CUVS
+#if defined USE_NVIDIA_RAFT
         try {
             rmm::mr::device_memory_resource* current_mr =
                     rmm::mr::get_per_device_resource(
@@ -572,7 +557,7 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
         }
 #endif
     } else if (adjReq.space == MemorySpace::Unified) {
-#if defined USE_NVIDIA_CUVS
+#if defined USE_NVIDIA_RAFT
         try {
             // for now, use our own managed MR to do Unified Memory allocations.
             // TODO: change this to use the current device resource once RMM has
@@ -637,19 +622,12 @@ void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
     }
 
     if (req.space == MemorySpace::Temporary) {
-        std::cout << "deallocating temp memory" << std::endl;
         tempMemory_[device]->deallocMemory(device, req.stream, req.size, p);
     } else if (
             req.space == MemorySpace::Device ||
             req.space == MemorySpace::Unified) {
-#if defined USE_NVIDIA_CUVS
-        std::cout << "now doing rmm dealloc" << std::endl;
-        // sleep(10);
-        cudaPointerAttributes attributes;
-        auto err = cudaPointerGetAttributes(&attributes, p);
-        if (err == cudaSuccess && attributes.type == cudaMemoryTypeDevice) {
-            req.mr->deallocate_async(p, req.size, req.stream);
-        }
+#if defined USE_NVIDIA_RAFT
+        req.mr->deallocate_async(p, req.size, req.stream);
 #else
         auto err = cudaFree(p);
         FAISS_ASSERT_FMT(
@@ -742,7 +720,7 @@ cudaStream_t StandardGpuResources::getDefaultStream(int device) {
     return res_->getDefaultStream(device);
 }
 
-#if defined USE_NVIDIA_CUVS
+#if defined USE_NVIDIA_RAFT
 raft::device_resources& StandardGpuResources::getRaftHandle(int device) {
     return res_->getRaftHandle(device);
 }
@@ -761,4 +739,4 @@ void StandardGpuResources::setLogMemoryAllocations(bool enable) {
 }
 
 } // namespace gpu
-} // namespace faiss
+} // namespace faiss
\ No newline at end of file
diff --git a/faiss/gpu/impl/CuvsFlatIndex.cu b/faiss/gpu/impl/CuvsFlatIndex.cu
index a6a0e18caa..4e4afd19b4 100644
--- a/faiss/gpu/impl/CuvsFlatIndex.cu
+++ b/faiss/gpu/impl/CuvsFlatIndex.cu
@@ -100,7 +100,7 @@ void CuvsFlatIndex::query(
 
         cuvs::neighbors::brute_force::index idx(
                 handle, index, norms_view, distance, metricArg);
-        cuvs::neighbors::brute_force::search(handle, idx, search, inds, dists);
+        cuvs::neighbors::brute_force::search(handle, idx, search, inds, dists, std::nullopt);
 
         if (metric == MetricType::METRIC_Lp) {
             raft::linalg::unary_op(

From 836a476658fe96f6e3a61dcfcb4d8939d6e289bb Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 4 Nov 2024 08:40:57 -0800
Subject: [PATCH 142/148] revert to 24.08

---
 .github/actions/build_cmake/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index f07cb78f9b..d5ea2c23ad 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -54,7 +54,7 @@ runs:
           conda install -y -q cuda-toolkit -c "nvidia/label/cuda-12.4.0"
         # and CUDA from cuVS channel for cuVS builds
         elif [ "${{ inputs.cuvs }}" = "ON" ]; then
-          conda install -y -q libcuvs=24.10 cuda-version=12.4 cuda-toolkit -c rapidsai -c conda-forge -c "nvidia/label/cuda-12.4.0"
+          conda install -y -q libcuvs=24.08 cuda-version=12.4 cuda-toolkit -c rapidsai -c conda-forge -c "nvidia/label/cuda-12.4.0"
         fi
 
         # install test packages

From aa6e2606c6399c07648cf185102696e3f258bf0e Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 4 Nov 2024 08:51:12 -0800
Subject: [PATCH 143/148] style

---
 faiss/gpu/GpuDistance.cu        | 14 ++++++++++++--
 faiss/gpu/impl/CuvsFlatIndex.cu |  3 ++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu
index 5e23c4b6bd..f515067889 100644
--- a/faiss/gpu/GpuDistance.cu
+++ b/faiss/gpu/GpuDistance.cu
@@ -319,7 +319,12 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
             cuvs::neighbors::brute_force::index<float> idx(
                     handle, index.view(), norms_view, distance, metric_arg);
             cuvs::neighbors::brute_force::search(
-                    handle, idx, search.view(), inds.view(), dists.view(), std::nullopt);
+                    handle,
+                    idx,
+                    search.view(),
+                    inds.view(),
+                    dists.view(),
+                    std::nullopt);
         } else {
             auto index = raft::make_readonly_temporary_device_buffer<
                     const float,
@@ -358,7 +363,12 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) {
             cuvs::neighbors::brute_force::index<float> idx(
                     handle, index.view(), norms_view, distance, metric_arg);
             cuvs::neighbors::brute_force::search(
-                    handle, idx, search.view(), inds.view(), dists.view(), std::nullopt);
+                    handle,
+                    idx,
+                    search.view(),
+                    inds.view(),
+                    dists.view(),
+                    std::nullopt);
         }
 
         if (args.metric == MetricType::METRIC_Lp) {
diff --git a/faiss/gpu/impl/CuvsFlatIndex.cu b/faiss/gpu/impl/CuvsFlatIndex.cu
index 4e4afd19b4..a9ff7dfc63 100644
--- a/faiss/gpu/impl/CuvsFlatIndex.cu
+++ b/faiss/gpu/impl/CuvsFlatIndex.cu
@@ -100,7 +100,8 @@ void CuvsFlatIndex::query(
 
         cuvs::neighbors::brute_force::index idx(
                 handle, index, norms_view, distance, metricArg);
-        cuvs::neighbors::brute_force::search(handle, idx, search, inds, dists, std::nullopt);
+        cuvs::neighbors::brute_force::search(
+                handle, idx, search, inds, dists, std::nullopt);
 
         if (metric == MetricType::METRIC_Lp) {
             raft::linalg::unary_op(

From 32bb62b6aa9ca78666992a8371b98eee9561a5c9 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 4 Nov 2024 09:04:02 -0800
Subject: [PATCH 144/148] copyright

---
 benchs/bench_ivfflat_cuvs.py        | 2 +-
 benchs/bench_ivfpq_cuvs.py          | 2 +-
 cmake/thirdparty/fetch_rapids.cmake | 2 +-
 faiss/gpu/CMakeLists.txt            | 2 +-
 faiss/gpu/StandardGpuResources.h    | 2 +-
 faiss/gpu/impl/CuvsFlatIndex.cu     | 2 +-
 faiss/gpu/impl/CuvsFlatIndex.cuh    | 2 +-
 faiss/gpu/impl/CuvsIVFFlat.cu       | 2 +-
 faiss/gpu/impl/CuvsIVFFlat.cuh      | 2 +-
 faiss/gpu/impl/CuvsIVFPQ.cu         | 2 +-
 faiss/gpu/impl/CuvsIVFPQ.cuh        | 2 +-
 faiss/gpu/utils/CuvsUtils.cu        | 2 +-
 faiss/gpu/utils/CuvsUtils.h         | 2 +-
 13 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/benchs/bench_ivfflat_cuvs.py b/benchs/bench_ivfflat_cuvs.py
index 4815142967..3628ec7422 100644
--- a/benchs/bench_ivfflat_cuvs.py
+++ b/benchs/bench_ivfflat_cuvs.py
@@ -4,7 +4,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 #
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/benchs/bench_ivfpq_cuvs.py b/benchs/bench_ivfpq_cuvs.py
index 3ee6da2f52..7668afffea 100644
--- a/benchs/bench_ivfpq_cuvs.py
+++ b/benchs/bench_ivfpq_cuvs.py
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 #
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/cmake/thirdparty/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake
index 3b9d9b140a..8d99161071 100644
--- a/cmake/thirdparty/fetch_rapids.cmake
+++ b/cmake/thirdparty/fetch_rapids.cmake
@@ -3,7 +3,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 # =============================================================================
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
index aaca81a60d..84cb222145 100644
--- a/faiss/gpu/CMakeLists.txt
+++ b/faiss/gpu/CMakeLists.txt
@@ -5,7 +5,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 # =============================================================================
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h
index b017569789..322a341a00 100644
--- a/faiss/gpu/StandardGpuResources.h
+++ b/faiss/gpu/StandardGpuResources.h
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/faiss/gpu/impl/CuvsFlatIndex.cu b/faiss/gpu/impl/CuvsFlatIndex.cu
index a9ff7dfc63..08f63300ef 100644
--- a/faiss/gpu/impl/CuvsFlatIndex.cu
+++ b/faiss/gpu/impl/CuvsFlatIndex.cu
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/faiss/gpu/impl/CuvsFlatIndex.cuh b/faiss/gpu/impl/CuvsFlatIndex.cuh
index a29731e9f4..b856351cfa 100644
--- a/faiss/gpu/impl/CuvsFlatIndex.cuh
+++ b/faiss/gpu/impl/CuvsFlatIndex.cuh
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/faiss/gpu/impl/CuvsIVFFlat.cu b/faiss/gpu/impl/CuvsIVFFlat.cu
index 8cc5b8089e..0de7100c72 100644
--- a/faiss/gpu/impl/CuvsIVFFlat.cu
+++ b/faiss/gpu/impl/CuvsIVFFlat.cu
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/faiss/gpu/impl/CuvsIVFFlat.cuh b/faiss/gpu/impl/CuvsIVFFlat.cuh
index 1856a2adf1..72764c8446 100644
--- a/faiss/gpu/impl/CuvsIVFFlat.cuh
+++ b/faiss/gpu/impl/CuvsIVFFlat.cuh
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/faiss/gpu/impl/CuvsIVFPQ.cu b/faiss/gpu/impl/CuvsIVFPQ.cu
index 2b478609a4..2fc94de0f0 100644
--- a/faiss/gpu/impl/CuvsIVFPQ.cu
+++ b/faiss/gpu/impl/CuvsIVFPQ.cu
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/faiss/gpu/impl/CuvsIVFPQ.cuh b/faiss/gpu/impl/CuvsIVFPQ.cuh
index 9d8c92293d..e6a3e1edc4 100644
--- a/faiss/gpu/impl/CuvsIVFPQ.cuh
+++ b/faiss/gpu/impl/CuvsIVFPQ.cuh
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/faiss/gpu/utils/CuvsUtils.cu b/faiss/gpu/utils/CuvsUtils.cu
index 748cd9986b..1ec32179c6 100644
--- a/faiss/gpu/utils/CuvsUtils.cu
+++ b/faiss/gpu/utils/CuvsUtils.cu
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/faiss/gpu/utils/CuvsUtils.h b/faiss/gpu/utils/CuvsUtils.h
index fc2d791eb7..e44e5f12d5 100644
--- a/faiss/gpu/utils/CuvsUtils.h
+++ b/faiss/gpu/utils/CuvsUtils.h
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From aa64371f6138db51e0b3e5363519619fee9c8813 Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Mon, 4 Nov 2024 09:20:08 -0800
Subject: [PATCH 145/148] StandardGpuResources: rename USE_NVIDIA_RAFT guards to USE_NVIDIA_CUVS

---
 faiss/gpu/StandardGpuResources.cpp | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp
index f7c065e572..7e5819226c 100644
--- a/faiss/gpu/StandardGpuResources.cpp
+++ b/faiss/gpu/StandardGpuResources.cpp
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,7 @@
  * limitations under the License.
  */
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_CUVS
 #include <raft/core/device_resources.hpp>
 #include <rmm/mr/device/managed_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
@@ -91,7 +91,7 @@ std::string allocsToString(const std::unordered_map<void*, AllocRequest>& map) {
 
 StandardGpuResourcesImpl::StandardGpuResourcesImpl()
         :
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_CUVS
           mmr_(new rmm::mr::managed_memory_resource),
           pmr_(new rmm::mr::pinned_memory_resource),
 #endif
@@ -159,7 +159,7 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
     }
 
     if (pinnedMemAlloc_) {
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_CUVS
         pmr_->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_);
 #else
         auto err = cudaFreeHost(pinnedMemAlloc_);
@@ -258,7 +258,7 @@ void StandardGpuResourcesImpl::setDefaultStream(
         if (prevStream != stream) {
             streamWait({stream}, {prevStream});
         }
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_CUVS
         // delete the raft handle for this device, which will be initialized
         // with the updated stream during any subsequent calls to getRaftHandle
         auto it2 = raftHandles_.find(device);
@@ -284,7 +284,7 @@ void StandardGpuResourcesImpl::revertDefaultStream(int device) {
 
             streamWait({newStream}, {prevStream});
         }
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_CUVS
         // delete the raft handle for this device, which will be initialized
         // with the updated stream during any subsequent calls to getRaftHandle
         auto it2 = raftHandles_.find(device);
@@ -324,7 +324,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
     // If this is the first device that we're initializing, create our
     // pinned memory allocation
     if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_CUVS
         // If this is the first device that we're initializing, create our
         // pinned memory allocation
         if (defaultStreams_.empty() && pinnedMemSize_ > 0) {
@@ -386,7 +386,7 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
 
     defaultStreams_[device] = defaultStream;
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_CUVS
     raftHandles_.emplace(std::make_pair(device, defaultStream));
 #endif
 
@@ -452,7 +452,7 @@ cudaStream_t StandardGpuResourcesImpl::getDefaultStream(int device) {
     return defaultStreams_[device];
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_CUVS
 raft::device_resources& StandardGpuResourcesImpl::getRaftHandle(int device) {
     initializeForDevice(device);
 
@@ -523,7 +523,7 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
         // Otherwise, we can handle this locally
         p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size);
     } else if (adjReq.space == MemorySpace::Device) {
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_CUVS
         try {
             rmm::mr::device_memory_resource* current_mr =
                     rmm::mr::get_per_device_resource(
@@ -557,7 +557,7 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) {
         }
 #endif
     } else if (adjReq.space == MemorySpace::Unified) {
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_CUVS
         try {
             // for now, use our own managed MR to do Unified Memory allocations.
             // TODO: change this to use the current device resource once RMM has
@@ -626,7 +626,7 @@ void StandardGpuResourcesImpl::deallocMemory(int device, void* p) {
     } else if (
             req.space == MemorySpace::Device ||
             req.space == MemorySpace::Unified) {
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_CUVS
         req.mr->deallocate_async(p, req.size, req.stream);
 #else
         auto err = cudaFree(p);
@@ -720,7 +720,7 @@ cudaStream_t StandardGpuResources::getDefaultStream(int device) {
     return res_->getDefaultStream(device);
 }
 
-#if defined USE_NVIDIA_RAFT
+#if defined USE_NVIDIA_CUVS
 raft::device_resources& StandardGpuResources::getRaftHandle(int device) {
     return res_->getRaftHandle(device);
 }

From af3bfccb6b7953810733b5ba2ba38a5e39badc98 Mon Sep 17 00:00:00 2001
From: tarang-jain <jaintarang2015.com>
Date: Mon, 4 Nov 2024 21:56:30 -0800
Subject: [PATCH 146/148] StandardGpuResources: update raft handle stream in place; add temporary build.sh

---
 build.sh                           | 58 ++++++++++++++++++++++++++++++
 faiss/gpu/StandardGpuResources.cpp | 28 ++++++++++-----
 2 files changed, 77 insertions(+), 9 deletions(-)
 create mode 100644 build.sh

diff --git a/build.sh b/build.sh
new file mode 100644
index 0000000000..3aa16c8546
--- /dev/null
+++ b/build.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+# NOTE: This file is temporary for the proof-of-concept branch and will be removed before this PR is merged
+
+BUILD_TYPE=Release
+BUILD_DIR=build/
+
+RAFT_REPO_REL=""
+EXTRA_CMAKE_ARGS=""
+set -e
+
+if [[ ${RAFT_REPO_REL} != "" ]]; then
+  RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`"
+  EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCPM_raft_SOURCE=${RAFT_REPO_PATH}"
+fi
+
+if [ "$1" == "clean" ]; then
+  rm -rf build
+  rm -rf .cache
+  exit 0
+fi
+
+if [ "$1" == "test" ]; then
+  make -C build -j test
+  exit 0
+fi
+
+if [ "$1" == "test-raft" ]; then
+  ./build/faiss/gpu/test/TestRaftIndexIVFFlat
+  exit 0
+fi
+
+mkdir -p $BUILD_DIR
+cd $BUILD_DIR
+
+cmake \
+ -DFAISS_ENABLE_GPU=ON \
+ -DFAISS_ENABLE_CUVS=ON \
+ -DFAISS_ENABLE_PYTHON=ON \
+ -DBUILD_TESTING=ON \
+ -DBUILD_SHARED_LIBS=ON \
+ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
+ -DFAISS_OPT_LEVEL=avx2 \
+ -DRAFT_NVTX=OFF \
+ -DCMAKE_CUDA_ARCHITECTURES="NATIVE" \
+ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+ ${EXTRA_CMAKE_ARGS} \
+ ../
+
+#   -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
+#  -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+#  -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
+
+# make -C build -j12 faiss
+cmake  --build . -j60
+# make -C build -j12 swigfaiss
+# (cd build/faiss/python && python setup.py install)
+
diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp
index 7e5819226c..8724f155d4 100644
--- a/faiss/gpu/StandardGpuResources.cpp
+++ b/faiss/gpu/StandardGpuResources.cpp
@@ -263,7 +263,7 @@ void StandardGpuResourcesImpl::setDefaultStream(
         // with the updated stream during any subsequent calls to getRaftHandle
         auto it2 = raftHandles_.find(device);
         if (it2 != raftHandles_.end()) {
-            raftHandles_.erase(it2);
+            raft::resource::set_cuda_stream(it2->second, stream);
         }
 #endif
     }
@@ -283,15 +283,25 @@ void StandardGpuResourcesImpl::revertDefaultStream(int device) {
             cudaStream_t newStream = defaultStreams_[device];
 
             streamWait({newStream}, {prevStream});
-        }
+
 #if defined USE_NVIDIA_CUVS
-        // delete the raft handle for this device, which will be initialized
-        // with the updated stream during any subsequent calls to getRaftHandle
-        auto it2 = raftHandles_.find(device);
-        if (it2 != raftHandles_.end()) {
-            raftHandles_.erase(it2);
-        }
+            // update the stream on the raft handle for this device
+            auto it2 = raftHandles_.find(device);
+            if (it2 != raftHandles_.end()) {
+                raft::resource::set_cuda_stream(it2->second, newStream);
+            }
+#endif
+        } else {
+#if defined USE_NVIDIA_CUVS
+            // delete the raft handle for this device, which will be initialized
+            // with the updated stream during any subsequent calls to
+            // getRaftHandle
+            auto it2 = raftHandles_.find(device);
+            if (it2 != raftHandles_.end()) {
+                raftHandles_.erase(it2);
+            }
 #endif
+        }
     }
 
     userDefaultStreams_.erase(device);
@@ -739,4 +749,4 @@ void StandardGpuResources::setLogMemoryAllocations(bool enable) {
 }
 
 } // namespace gpu
-} // namespace faiss
\ No newline at end of file
+} // namespace faiss

From 3fb67cc48f72727c59c3be3ea74249fb3f0ece71 Mon Sep 17 00:00:00 2001
From: tarang-jain <jaintarang2015.com>
Date: Mon, 4 Nov 2024 21:57:13 -0800
Subject: [PATCH 147/148] remove temporary build.sh

---
 build.sh | 58 --------------------------------------------------------
 1 file changed, 58 deletions(-)
 delete mode 100644 build.sh

diff --git a/build.sh b/build.sh
deleted file mode 100644
index 3aa16c8546..0000000000
--- a/build.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-
-# NOTE: This file is temporary for the proof-of-concept branch and will be removed before this PR is merged
-
-BUILD_TYPE=Release
-BUILD_DIR=build/
-
-RAFT_REPO_REL=""
-EXTRA_CMAKE_ARGS=""
-set -e
-
-if [[ ${RAFT_REPO_REL} != "" ]]; then
-  RAFT_REPO_PATH="`readlink -f \"${RAFT_REPO_REL}\"`"
-  EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DCPM_raft_SOURCE=${RAFT_REPO_PATH}"
-fi
-
-if [ "$1" == "clean" ]; then
-  rm -rf build
-  rm -rf .cache
-  exit 0
-fi
-
-if [ "$1" == "test" ]; then
-  make -C build -j test
-  exit 0
-fi
-
-if [ "$1" == "test-raft" ]; then
-  ./build/faiss/gpu/test/TestRaftIndexIVFFlat
-  exit 0
-fi
-
-mkdir -p $BUILD_DIR
-cd $BUILD_DIR
-
-cmake \
- -DFAISS_ENABLE_GPU=ON \
- -DFAISS_ENABLE_CUVS=ON \
- -DFAISS_ENABLE_PYTHON=ON \
- -DBUILD_TESTING=ON \
- -DBUILD_SHARED_LIBS=ON \
- -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
- -DFAISS_OPT_LEVEL=avx2 \
- -DRAFT_NVTX=OFF \
- -DCMAKE_CUDA_ARCHITECTURES="NATIVE" \
- -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
- ${EXTRA_CMAKE_ARGS} \
- ../
-
-#   -DCMAKE_CUDA_COMPILER_LAUNCHER=ccache \
-#  -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-#  -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-
-# make -C build -j12 faiss
-cmake  --build . -j60
-# make -C build -j12 swigfaiss
-# (cd build/faiss/python && python setup.py install)
-

From da46f72cbe6c18c07f678f54b003eaf81a116e8c Mon Sep 17 00:00:00 2001
From: Tarang Jain <jaintarang2015@gmail.com>
Date: Sun, 10 Nov 2024 02:16:51 -0800
Subject: [PATCH 148/148] unpin cmake in CI; clear raftHandles_ in ~StandardGpuResourcesImpl

---
 .github/actions/build_cmake/action.yml | 2 +-
 faiss/gpu/StandardGpuResources.cpp     | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml
index d5ea2c23ad..3fadbc8eb5 100644
--- a/.github/actions/build_cmake/action.yml
+++ b/.github/actions/build_cmake/action.yml
@@ -32,7 +32,7 @@ runs:
         conda update -y -q conda
         echo "$CONDA/bin" >> $GITHUB_PATH
 
-        conda install -y -q python=3.11 cmake=3.26.4 make swig numpy scipy pytest gflags
+        conda install -y -q python=3.11 cmake make swig numpy scipy pytest gflags
 
         # install base packages for ARM64
         if [ "${{ runner.arch }}" = "ARM64" ]; then
diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp
index 8724f155d4..a91c7f693c 100644
--- a/faiss/gpu/StandardGpuResources.cpp
+++ b/faiss/gpu/StandardGpuResources.cpp
@@ -130,6 +130,10 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() {
     FAISS_ASSERT_MSG(
             !allocError, "GPU memory allocations not properly cleaned up");
 
+#if defined USE_NVIDIA_CUVS
+    raftHandles_.clear();
+#endif
+
     for (auto& entry : defaultStreams_) {
         DeviceScope scope(entry.first);