From 467f70edbfde3b5f7dbc05a6b3ea69d6c46cc219 Mon Sep 17 00:00:00 2001
From: Jeff Johnson
Date: Wed, 15 Nov 2023 11:19:02 -0800
Subject: [PATCH] Consolidate GPU IVF query tile calculation + special handling
 for large query memory requirements

Summary:
In the GPU IVF (Flat, SQ and PQ) code, the unfiltered (or partially filtered)
vector distances computed during list scanning must be staged in temporary
memory before separate kernels k-select them. When a batch of queries is
presented to an IVF index, the temporary memory needed to hold all of these
unfiltered distances can be very large, depending upon IVF characteristics
(such as the maximum number of vectors encoded in any of the IVF lists). In
that case we cannot process the entire batch of queries at once and must
instead tile over the batch, reusing the temporary memory set aside for these
distances.

The old code duplicated this roughly equivalent logic in 3 different places
(the IVFFlat/SQ code, IVFPQ with precomputed codes, and IVFPQ without
precomputed codes). Furthermore, when little or no temporary memory was
available, or when the available temporary memory was (vastly) exceeded by the
amount needed to handle a particular query, the old code enforced a minimum of
8 queries to be processed at once. In certain cases (huge IVF list imbalance),
this memory request could exceed the amount of memory that can be safely
allocated on a GPU.

This diff consolidates the original 3 separate places where this calculation
took place into 1 place in IVFUtils. The logic proceeds roughly as before,
figuring out how many queries can be processed in the available temporary
memory, except that we add a new heuristic for the case where the number of
queries that can be processed concurrently falls below 8. This can happen
either because little temporary memory is available or because the per-query
memory requirement is huge. In this case, we ignore the amount of temporary
memory available and instead see how many queries' memory requirements would
fit into a single 512 MiB allocation, which caps the request at a reasonable
amount. If even that allocation cannot satisfy a single query, we still
proceed, executing 1 query at a time (which could still exhaust GPU memory,
but that error is unavoidable).

A different heuristic based on the amount of memory actually allocatable on
the device could be used instead of this fixed 512 MiB amount, but to my
knowledge there is no guarantee that a single cudaMalloc up to that limit
would succeed (e.g., the GPU reports 3 GiB available, yet allocating all of
it in a single allocation fails), so we simply pick an amount that is a
reasonable balance between efficiency (parallelism) and memory consumption.
Note that if not enough temporary memory is available and a single 512 MiB
allocation fails, there is likely too little memory to proceed efficiently
under any scenario, as Faiss does require some headroom in terms of memory
available for scratch space.
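As a rough illustration, the consolidated heuristic boils down to the
self-contained sketch below. The standalone name chooseQueryTileSize is only
for this example; the actual implementation is getIVFQueryTileSize in
IVFUtils.cu in this diff, and sizePerQuery is assumed to be non-zero.

    #include <algorithm>
    #include <cstddef>

    // Illustrative sketch of the query tile-size heuristic described above.
    size_t chooseQueryTileSize(
            size_t numQueries,          // size of the query batch
            size_t tempMemoryAvailable, // temp memory reserved by Faiss
            size_t sizePerQuery) {      // temp bytes needed per query
        constexpr size_t kMinQueryTileSize = 8;     // desired concurrency
        constexpr size_t kMaxQueryTileSize = 65536; // max grid Y dimension
        constexpr size_t kMinMemoryAllocation = 512 * 1024 * 1024; // 512 MiB

        // How many queries fit in the temporary memory we already have?
        size_t n = std::min(tempMemoryAvailable / sizePerQuery, numQueries);
        n = std::min(n, kMaxQueryTileSize);

        if (n < numQueries && n < kMinQueryTileSize) {
            // Too little temp memory (or a huge per-query requirement):
            // ignore the temp memory limit and size the tile against a
            // single 512 MiB allocation instead.
            n = std::min(kMinMemoryAllocation / sizePerQuery, numQueries);
            // If even 512 MiB cannot hold one query, run one at a time.
            n = std::max(n, size_t(1));
        }
        return n;
    }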
Reviewed By: mdouze

Differential Revision: D45574455

fbshipit-source-id: 08f5204e3e9656627c9134d7409b9b0960f07b2d
---
 faiss/gpu/impl/IVFFlatScan.cu                 |  38 ++----
 faiss/gpu/impl/IVFUtils.cu                    | 108 ++++++++++++++++++
 faiss/gpu/impl/IVFUtils.cuh                   |  33 ++++++
 .../impl/PQScanMultiPassNoPrecomputed-inl.cuh |  48 +++-----
 faiss/gpu/impl/PQScanMultiPassPrecomputed.cu  |  46 +++-----
 5 files changed, 185 insertions(+), 88 deletions(-)

diff --git a/faiss/gpu/impl/IVFFlatScan.cu b/faiss/gpu/impl/IVFFlatScan.cu
index fb453860ba..3acee58d71 100644
--- a/faiss/gpu/impl/IVFFlatScan.cu
+++ b/faiss/gpu/impl/IVFFlatScan.cu
@@ -345,10 +345,6 @@ void runIVFFlatScan(
         GpuResources* res) {
     auto stream = res->getDefaultStreamCurrentDevice();

-    constexpr idx_t kMinQueryTileSize = 8;
-    constexpr idx_t kMaxQueryTileSize = 65536; // used as blockIdx.y dimension
-    constexpr idx_t kThrustMemSize = 16384;
-
     auto nprobe = listIds.getSize(1);

     // If the maximum list length (in terms of number of vectors) times nprobe
@@ -359,37 +355,22 @@ void runIVFFlatScan(

     // Make a reservation for Thrust to do its dirty work (global memory
     // cross-block reduction space); hopefully this is large enough.
+    constexpr idx_t kThrustMemSize = 16384;
+
     DeviceTensor thrustMem1(
             res, makeTempAlloc(AllocType::Other, stream), {kThrustMemSize});
     DeviceTensor thrustMem2(
             res, makeTempAlloc(AllocType::Other, stream), {kThrustMemSize});
     DeviceTensor* thrustMem[2] = {&thrustMem1, &thrustMem2};

-    // How much temporary storage is available?
-    // If possible, we'd like to fit within the space available.
-    size_t sizeAvailable = res->getTempMemoryAvailableCurrentDevice();
-
-    // We run two passes of heap selection
-    // This is the size of the first-level heap passes
-    constexpr idx_t kNProbeSplit = 8;
-    idx_t pass2Chunks = std::min(nprobe, kNProbeSplit);
+    // How much temporary memory would we need to handle a single query?
+    size_t sizePerQuery = getIVFPerQueryTempMemory(k, nprobe, maxListLength);

-    idx_t sizeForFirstSelectPass =
-            pass2Chunks * k * (sizeof(float) + sizeof(idx_t));
-
-    // How much temporary storage we need per each query
-    idx_t sizePerQuery = 2 * // # streams
-            ((nprobe * sizeof(idx_t) + sizeof(idx_t)) + // prefixSumOffsets
-             nprobe * maxListLength * sizeof(float) + // allDistances
-             sizeForFirstSelectPass);
-
-    idx_t queryTileSize = sizeAvailable / sizePerQuery;
-
-    if (queryTileSize < kMinQueryTileSize) {
-        queryTileSize = kMinQueryTileSize;
-    } else if (queryTileSize > kMaxQueryTileSize) {
-        queryTileSize = kMaxQueryTileSize;
-    }
+    // How many queries do we wish to run at once?
+    idx_t queryTileSize = getIVFQueryTileSize(
+            queries.getSize(0),
+            res->getTempMemoryAvailableCurrentDevice(),
+            sizePerQuery);

     // Temporary memory buffers
     // Make sure there is space prior to the start which will be 0, and
@@ -428,6 +409,7 @@
     DeviceTensor* allDistances[2] = {
             &allDistances1, &allDistances2};

+    idx_t pass2Chunks = getIVFKSelectionPass2Chunks(nprobe);
     DeviceTensor heapDistances1(
             res,
             makeTempAlloc(AllocType::Other, stream),
diff --git a/faiss/gpu/impl/IVFUtils.cu b/faiss/gpu/impl/IVFUtils.cu
index 6f90f8e41c..ba9724ce46 100644
--- a/faiss/gpu/impl/IVFUtils.cu
+++ b/faiss/gpu/impl/IVFUtils.cu
@@ -18,6 +18,114 @@ namespace faiss {
 namespace gpu {

+size_t getIVFKSelectionPass2Chunks(size_t nprobe) {
+    // We run two passes of heap selection
+    // This is the size of the second-level heap passes
+    constexpr size_t kNProbeSplit = 8;
+    return std::min(nprobe, kNProbeSplit);
+}
+
+size_t getIVFPerQueryTempMemory(size_t k, size_t nprobe, size_t maxListLength) {
+    size_t pass2Chunks = getIVFKSelectionPass2Chunks(nprobe);
+
+    size_t sizeForFirstSelectPass =
+            pass2Chunks * k * (sizeof(float) + sizeof(idx_t));
+
+    // Each IVF list being scanned concurrently needs a separate array to
+    // indicate where the per-IVF list distances are being stored via prefix
+    // sum. There is one per each nprobe, plus 1 more entry at the end
+    size_t prefixSumOffsets = nprobe * sizeof(idx_t) + sizeof(idx_t);
+
+    // Storage for all distances from all the IVF lists we are processing
+    size_t allDistances = nprobe * maxListLength * sizeof(float);
+
+    // There are 2 streams on which computations is performed (hence the 2 *)
+    return 2 * (prefixSumOffsets + allDistances + sizeForFirstSelectPass);
+}
+
+size_t getIVFPQPerQueryTempMemory(
+        size_t k,
+        size_t nprobe,
+        size_t maxListLength,
+        bool usePrecomputedCodes,
+        size_t numSubQuantizers,
+        size_t numSubQuantizerCodes) {
+    // Residual PQ distances per each IVF partition (in case we are not using
+    // precomputed codes)
+    size_t residualDistances = usePrecomputedCodes
+            ? 0
+            : (nprobe * numSubQuantizers * numSubQuantizerCodes *
+               sizeof(float));
+
+    // There are 2 streams on which computations is performed (hence the 2 *)
+    // The IVF-generic temp memory allocation already takes this
+    // multi-streaming into account, but we need to do so for the PQ residual
+    // distances too
+    return (2 * residualDistances) +
+            getIVFPerQueryTempMemory(k, nprobe, maxListLength);
+}
+
+size_t getIVFQueryTileSize(
+        size_t numQueries,
+        size_t tempMemoryAvailable,
+        size_t sizePerQuery) {
+    // Our ideal minimum number of queries that we'd like to run concurrently
+    constexpr size_t kMinQueryTileSize = 8;
+
+    // Our absolute maximum number of queries that we can run concurrently
+    // (based on max Y grid dimension)
+    constexpr size_t kMaxQueryTileSize = 65536;
+
+    // First, see how many queries we can run within the limit of our available
+    // temporary memory. If all queries can run within the temporary memory
+    // limit, we'll just use that.
+    size_t withinTempMemoryNumQueries =
+            std::min(tempMemoryAvailable / sizePerQuery, numQueries);
+
+    // However, there is a maximum cap on the number of queries that we can run
+    // at once, even if memory were unlimited (due to max Y grid dimension)
+    withinTempMemoryNumQueries =
+            std::min(withinTempMemoryNumQueries, kMaxQueryTileSize);
+
+    // However,
+    // withinTempMemoryNumQueries could be really small, or even zero
+    // (in the case where there is no temporary memory available, or the memory
+    // resources for a single query required are really large). If we are below
+    // the ideal minimum number of queries to run concurrently, then we will
+    // ignore the temporary memory limit and fall back to a general device
+    // allocation.
+    // Note that if we only had a single query, then this is ok to run as-is
+    if (withinTempMemoryNumQueries < numQueries &&
+        withinTempMemoryNumQueries < kMinQueryTileSize) {
+        // Either the amount of temporary memory available is too low, or the
+        // amount of memory needed to run a single query is really high. Ignore
+        // the temporary memory available, and always attempt to use this amount
+        // of memory for temporary results
+        //
+        // FIXME: could look at amount of memory available on the current
+        // device, but there is no guarantee that all that memory available
+        // could be done in a single allocation, so we just pick a suitably
+        // large allocation that can yield enough efficiency but something that
+        // the GPU can likely allocate.
+        constexpr size_t kMinMemoryAllocation = 512 * 1024 * 1024; // 512 MiB
+
+        size_t withinMemoryNumQueries =
+                std::min(kMinMemoryAllocation / sizePerQuery, numQueries);
+
+        // It is possible that the per-query size is incredibly huge, in which
+        // case even the 512 MiB allocation will not fit it. In this case, we
+        // have no option except to try running a single one.
+        return std::max(withinMemoryNumQueries, size_t(1));
+    } else {
+        // withinTempMemoryNumQueries cannot be > numQueries.
+        // Either:
+        // 1. == numQueries, >= kMinQueryTileSize (i.e., we can satisfy all
+        // queries in one go, or are limited by max query tile size)
+        // 2. < numQueries, >= kMinQueryTileSize (i.e., we can't satisfy all
+        // queries in one go, but we have a large enough batch to run which is
+        // ok)
+        return withinTempMemoryNumQueries;
+    }
+}
+
 // Calculates the total number of intermediate distances to consider
 // for all queries
 __global__ void getResultLengths(
diff --git a/faiss/gpu/impl/IVFUtils.cuh b/faiss/gpu/impl/IVFUtils.cuh
index f952bbfb79..4609cadbff 100644
--- a/faiss/gpu/impl/IVFUtils.cuh
+++ b/faiss/gpu/impl/IVFUtils.cuh
@@ -19,6 +19,39 @@ namespace gpu {

 class GpuResources;

+/// For the final k-selection of IVF query distances, we perform two passes.
+/// The first pass scans some number of per-IVF list distances reducing them to
+/// at most 8, then a second pass processes these <= 8 to the single final list
+/// of NN candidates
+size_t getIVFKSelectionPass2Chunks(size_t nprobe);
+
+/// Function to determine amount of temporary space that we allocate
+/// for storing basic IVF list scanning distances during query, based on the
+/// memory allocation per query. This is the memory requirement for
+/// IVFFlat/IVFSQ but IVFPQ will add some additional allocation as well (see
+/// getIVFPQPerQueryTempMemory)
+size_t getIVFPerQueryTempMemory(size_t k, size_t nprobe, size_t maxListLength);
+
+/// Function to determine amount of temporary space that we allocate
+/// for storing basic IVFPQ list scanning distances during query, based on the
+/// memory allocation per query.
+size_t getIVFPQPerQueryTempMemory(
+        size_t k,
+        size_t nprobe,
+        size_t maxListLength,
+        bool usePrecomputedCodes,
+        size_t numSubQuantizers,
+        size_t numSubQuantizerCodes);
+
+/// Based on the amount of temporary memory needed per IVF query (determined by
+/// one of the above functions) and the amount of current temporary memory
+/// available, determine how many queries we will run concurrently in a single
+/// tile so as to stay within reasonable temporary memory allocation limits.
+size_t getIVFQueryTileSize(
+        size_t numQueries,
+        size_t tempMemoryAvailable,
+        size_t sizePerQuery);
+
 /// Function for multi-pass scanning that collects the length of
 /// intermediate results for all (query, probe) pair
 void runCalcListOffsets(
diff --git a/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh b/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh
index 20b98fb9e9..a162beed85 100644
--- a/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh
+++ b/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh
@@ -550,10 +550,6 @@ void runPQScanMultiPassNoPrecomputed(
         GpuResources* res) {
     auto stream = res->getDefaultStreamCurrentDevice();

-    constexpr idx_t kMinQueryTileSize = 8;
-    constexpr idx_t kMaxQueryTileSize = 65536; // typical max gridDim.y
-    constexpr idx_t kThrustMemSize = 16384;
-
     auto nprobe = coarseIndices.getSize(1);

     // If the maximum list length (in terms of number of vectors) times nprobe
@@ -566,39 +562,28 @@ void runPQScanMultiPassNoPrecomputed(

     // Make a reservation for Thrust to do its dirty work (global memory
     // cross-block reduction space); hopefully this is large enough.
+    constexpr idx_t kThrustMemSize = 16384;
+
     DeviceTensor thrustMem1(
             res, makeTempAlloc(AllocType::Other, stream), {kThrustMemSize});
     DeviceTensor thrustMem2(
             res, makeTempAlloc(AllocType::Other, stream), {kThrustMemSize});
     DeviceTensor* thrustMem[2] = {&thrustMem1, &thrustMem2};

-    // How much temporary storage is available?
-    // If possible, we'd like to fit within the space available.
-    idx_t sizeAvailable = res->getTempMemoryAvailableCurrentDevice();
-
-    // We run two passes of heap selection
-    // This is the size of the first-level heap passes
-    constexpr idx_t kNProbeSplit = 8;
-    idx_t pass2Chunks = std::min(nprobe, kNProbeSplit);
-
-    idx_t sizeForFirstSelectPass =
-            pass2Chunks * k * (sizeof(float) + sizeof(idx_t));
-
-    // How much temporary storage we need per each query
-    idx_t sizePerQuery = 2 * // streams
-            ((nprobe * sizeof(idx_t) + sizeof(idx_t)) + // prefixSumOffsets
-             nprobe * maxListLength * sizeof(float) + // allDistances
-             // residual distances
-             nprobe * numSubQuantizers * numSubQuantizerCodes * sizeof(float) +
-             sizeForFirstSelectPass);
-
-    idx_t queryTileSize = (sizeAvailable / sizePerQuery);
-
-    if (queryTileSize < kMinQueryTileSize) {
-        queryTileSize = kMinQueryTileSize;
-    } else if (queryTileSize > kMaxQueryTileSize) {
-        queryTileSize = kMaxQueryTileSize;
-    }
+    // How much temporary memory would we need to handle a single query?
+    size_t sizePerQuery = getIVFPQPerQueryTempMemory(
+            k,
+            nprobe,
+            maxListLength,
+            false, /* no precomputed codes */
+            numSubQuantizers,
+            numSubQuantizerCodes);
+
+    // How many queries do we wish to run at once?
+    idx_t queryTileSize = getIVFQueryTileSize(
+            queries.getSize(0),
+            res->getTempMemoryAvailableCurrentDevice(),
+            sizePerQuery);

     // Temporary memory buffers
     // Make sure there is space prior to the start which will be 0, and
@@ -664,6 +649,7 @@
     DeviceTensor* allDistances[2] = {
             &allDistances1, &allDistances2};

+    idx_t pass2Chunks = getIVFKSelectionPass2Chunks(nprobe);
     DeviceTensor heapDistances1(
             res,
             makeTempAlloc(AllocType::Other, stream),
diff --git a/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu b/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu
index 404996c30e..2f31ed9fc2 100644
--- a/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu
+++ b/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu
@@ -562,10 +562,6 @@ void runPQScanMultiPassPrecomputed(
         GpuResources* res) {
     auto stream = res->getDefaultStreamCurrentDevice();

-    constexpr idx_t kMinQueryTileSize = 8;
-    constexpr idx_t kMaxQueryTileSize = 65536; // typical max gridDim.y
-    constexpr idx_t kThrustMemSize = 16384;
-
     auto nprobe = ivfListIds.getSize(1);

     // If the maximum list length (in terms of number of vectors) times nprobe
@@ -578,37 +574,28 @@ void runPQScanMultiPassPrecomputed(

     // Make a reservation for Thrust to do its dirty work (global memory
     // cross-block reduction space); hopefully this is large enough.
+    constexpr idx_t kThrustMemSize = 16384;
+
     DeviceTensor thrustMem1(
             res, makeTempAlloc(AllocType::Other, stream), {kThrustMemSize});
     DeviceTensor thrustMem2(
             res, makeTempAlloc(AllocType::Other, stream), {kThrustMemSize});
     DeviceTensor* thrustMem[2] = {&thrustMem1, &thrustMem2};

-    // How much temporary storage is available?
-    // If possible, we'd like to fit within the space available.
-    size_t sizeAvailable = res->getTempMemoryAvailableCurrentDevice();
-
-    // We run two passes of heap selection
-    // This is the size of the first-level heap passes
-    constexpr idx_t kNProbeSplit = 8;
-    idx_t pass2Chunks = std::min(nprobe, kNProbeSplit);
-
-    idx_t sizeForFirstSelectPass =
-            pass2Chunks * k * (sizeof(float) + sizeof(idx_t));
-
-    // How much temporary storage we need per each query
-    idx_t sizePerQuery = 2 * // # streams
-            ((nprobe * sizeof(idx_t) + sizeof(idx_t)) + // prefixSumOffsets
-             nprobe * maxListLength * sizeof(float) + // allDistances
-             sizeForFirstSelectPass);
-
-    idx_t queryTileSize = sizeAvailable / sizePerQuery;
-
-    if (queryTileSize < kMinQueryTileSize) {
-        queryTileSize = kMinQueryTileSize;
-    } else if (queryTileSize > kMaxQueryTileSize) {
-        queryTileSize = kMaxQueryTileSize;
-    }
+    // How much temporary memory would we need to handle a single query?
+    size_t sizePerQuery = getIVFPQPerQueryTempMemory(
+            k,
+            nprobe,
+            maxListLength,
+            true, /* precomputed codes */
+            numSubQuantizers,
+            numSubQuantizerCodes);
+
+    // How many queries do we wish to run at once?
+    idx_t queryTileSize = getIVFQueryTileSize(
+            queries.getSize(0),
+            res->getTempMemoryAvailableCurrentDevice(),
+            sizePerQuery);

     // Temporary memory buffers
     // Make sure there is space prior to the start which will be 0, and
@@ -647,6 +634,7 @@
     DeviceTensor* allDistances[2] = {
             &allDistances1, &allDistances2};

+    idx_t pass2Chunks = getIVFKSelectionPass2Chunks(nprobe);
     DeviceTensor heapDistances1(
             res,
             makeTempAlloc(AllocType::Other, stream),