diff --git a/faiss/gpu/impl/IVFFlatScan.cu b/faiss/gpu/impl/IVFFlatScan.cu
index fb453860ba..3acee58d71 100644
--- a/faiss/gpu/impl/IVFFlatScan.cu
+++ b/faiss/gpu/impl/IVFFlatScan.cu
@@ -345,10 +345,6 @@ void runIVFFlatScan(
         GpuResources* res) {
     auto stream = res->getDefaultStreamCurrentDevice();
-    constexpr idx_t kMinQueryTileSize = 8;
-    constexpr idx_t kMaxQueryTileSize = 65536; // used as blockIdx.y dimension
-    constexpr idx_t kThrustMemSize = 16384;
-
     auto nprobe = listIds.getSize(1);
     // If the maximum list length (in terms of number of vectors) times nprobe
@@ -359,37 +355,22 @@ void runIVFFlatScan(
     // Make a reservation for Thrust to do its dirty work (global memory
     // cross-block reduction space); hopefully this is large enough.
+    constexpr idx_t kThrustMemSize = 16384;
+
     DeviceTensor thrustMem1(
             res, makeTempAlloc(AllocType::Other, stream), {kThrustMemSize});
     DeviceTensor thrustMem2(
             res, makeTempAlloc(AllocType::Other, stream), {kThrustMemSize});
     DeviceTensor* thrustMem[2] = {&thrustMem1, &thrustMem2};
-    // How much temporary storage is available?
-    // If possible, we'd like to fit within the space available.
-    size_t sizeAvailable = res->getTempMemoryAvailableCurrentDevice();
-
-    // We run two passes of heap selection
-    // This is the size of the first-level heap passes
-    constexpr idx_t kNProbeSplit = 8;
-    idx_t pass2Chunks = std::min(nprobe, kNProbeSplit);
+    // How much temporary memory would we need to handle a single query?
+    size_t sizePerQuery = getIVFPerQueryTempMemory(k, nprobe, maxListLength);
-    idx_t sizeForFirstSelectPass =
-            pass2Chunks * k * (sizeof(float) + sizeof(idx_t));
-
-    // How much temporary storage we need per each query
-    idx_t sizePerQuery = 2 * // # streams
-            ((nprobe * sizeof(idx_t) + sizeof(idx_t)) + // prefixSumOffsets
-             nprobe * maxListLength * sizeof(float) + // allDistances
-             sizeForFirstSelectPass);
-
-    idx_t queryTileSize = sizeAvailable / sizePerQuery;
-
-    if (queryTileSize < kMinQueryTileSize) {
-        queryTileSize = kMinQueryTileSize;
-    } else if (queryTileSize > kMaxQueryTileSize) {
-        queryTileSize = kMaxQueryTileSize;
-    }
+    // How many queries do we wish to run at once?
+    idx_t queryTileSize = getIVFQueryTileSize(
+            queries.getSize(0),
+            res->getTempMemoryAvailableCurrentDevice(),
+            sizePerQuery);
     // Temporary memory buffers
     // Make sure there is space prior to the start which will be 0, and
@@ -428,6 +409,7 @@ void runIVFFlatScan(
     DeviceTensor* allDistances[2] = {
             &allDistances1, &allDistances2};
+    idx_t pass2Chunks = getIVFKSelectionPass2Chunks(nprobe);
     DeviceTensor heapDistances1(
             res,
             makeTempAlloc(AllocType::Other, stream),
diff --git a/faiss/gpu/impl/IVFUtils.cu b/faiss/gpu/impl/IVFUtils.cu
index 6f90f8e41c..ba9724ce46 100644
--- a/faiss/gpu/impl/IVFUtils.cu
+++ b/faiss/gpu/impl/IVFUtils.cu
@@ -18,6 +18,114 @@ namespace faiss {
 namespace gpu {
+size_t getIVFKSelectionPass2Chunks(size_t nprobe) {
+    // We run two passes of heap selection
+    // This is the size of the second-level heap passes
+    constexpr size_t kNProbeSplit = 8;
+    return std::min(nprobe, kNProbeSplit);
+}
+
+size_t getIVFPerQueryTempMemory(size_t k, size_t nprobe, size_t maxListLength) {
+    size_t pass2Chunks = getIVFKSelectionPass2Chunks(nprobe);
+
+    size_t sizeForFirstSelectPass =
+            pass2Chunks * k * (sizeof(float) + sizeof(idx_t));
+
+    // Each IVF list being scanned concurrently needs a separate array to
+    // indicate where the per-IVF list distances are being stored via prefix
+    // sum. There is one per nprobe, plus 1 more entry at the end
+    size_t prefixSumOffsets = nprobe * sizeof(idx_t) + sizeof(idx_t);
+
+    // Storage for all distances from all the IVF lists we are processing
+    size_t allDistances = nprobe * maxListLength * sizeof(float);
+
+    // There are 2 streams on which computation is performed (hence the 2 *)
+    return 2 * (prefixSumOffsets + allDistances + sizeForFirstSelectPass);
+}
+
+size_t getIVFPQPerQueryTempMemory(
+        size_t k,
+        size_t nprobe,
+        size_t maxListLength,
+        bool usePrecomputedCodes,
+        size_t numSubQuantizers,
+        size_t numSubQuantizerCodes) {
+    // Residual PQ distances for each IVF partition (only needed in case we
+    // are not using precomputed codes)
+    size_t residualDistances = usePrecomputedCodes
+            ? 0
+            : (nprobe * numSubQuantizers * numSubQuantizerCodes *
+               sizeof(float));
+
+    // There are 2 streams on which computation is performed (hence the 2 *).
+    // The IVF-generic temp memory allocation already takes this
+    // multi-streaming into account, but we need to do so for the PQ residual
+    // distances too
+    return (2 * residualDistances) +
+            getIVFPerQueryTempMemory(k, nprobe, maxListLength);
+}
+
+size_t getIVFQueryTileSize(
+        size_t numQueries,
+        size_t tempMemoryAvailable,
+        size_t sizePerQuery) {
+    // Our ideal minimum number of queries that we'd like to run concurrently
+    constexpr size_t kMinQueryTileSize = 8;
+
+    // Our absolute maximum number of queries that we can run concurrently
+    // (based on max Y grid dimension)
+    constexpr size_t kMaxQueryTileSize = 65536;
+
+    // First, see how many queries we can run within the limit of our available
+    // temporary memory. If all queries can run within the temporary memory
+    // limit, we'll just use that.
+    size_t withinTempMemoryNumQueries =
+            std::min(tempMemoryAvailable / sizePerQuery, numQueries);
+
+    // However, there is a maximum cap on the number of queries that we can run
+    // at once, even if memory were unlimited (due to max Y grid dimension)
+    withinTempMemoryNumQueries =
+            std::min(withinTempMemoryNumQueries, kMaxQueryTileSize);
+
+    // However, withinTempMemoryNumQueries could be really small, or even zero
+    // (in the case where there is no temporary memory available, or the memory
+    // required for a single query is really large). If we are below the ideal
+    // minimum number of queries to run concurrently, then we will ignore the
+    // temporary memory limit and fall back to a general device allocation.
+    // Note that if we only have a single query, then it is fine to run as-is
+    if (withinTempMemoryNumQueries < numQueries &&
+        withinTempMemoryNumQueries < kMinQueryTileSize) {
+        // Either the amount of temporary memory available is too low, or the
+        // amount of memory needed to run a single query is really high. Ignore
+        // the temporary memory available, and always attempt to use this amount
+        // of memory for temporary results
+        //
+        // FIXME: we could look at the amount of memory available on the
+        // current device, but there is no guarantee that all of it could be
+        // obtained in a single allocation, so we just pick a suitably large
+        // allocation that can yield enough efficiency and that the GPU can
+        // likely allocate.
+        constexpr size_t kMinMemoryAllocation = 512 * 1024 * 1024; // 512 MiB
+
+        size_t withinMemoryNumQueries =
+                std::min(kMinMemoryAllocation / sizePerQuery, numQueries);
+
+        // It is possible that the per-query size is incredibly huge, in which
+        // case even the 512 MiB allocation will not fit it. In this case, we
+        // have no option except to try running a single one.
+        return std::max(withinMemoryNumQueries, size_t(1));
+    } else {
+        // withinTempMemoryNumQueries cannot be > numQueries.
+        // Either:
+        // 1. == numQueries, >= kMinQueryTileSize (i.e., we can satisfy all
+        // queries in one go, or are limited by max query tile size)
+        // 2. < numQueries, >= kMinQueryTileSize (i.e., we can't satisfy all
+        // queries in one go, but we have a large enough batch to run, which
+        // is ok)
+        return withinTempMemoryNumQueries;
+    }
+}
+
 // Calculates the total number of intermediate distances to consider
 // for all queries
 __global__ void getResultLengths(
diff --git a/faiss/gpu/impl/IVFUtils.cuh b/faiss/gpu/impl/IVFUtils.cuh
index f952bbfb79..4609cadbff 100644
--- a/faiss/gpu/impl/IVFUtils.cuh
+++ b/faiss/gpu/impl/IVFUtils.cuh
@@ -19,6 +19,39 @@ namespace gpu {
 class GpuResources;
+/// For the final k-selection of IVF query distances, we perform two passes.
+/// The first pass scans some number of per-IVF-list distances, reducing them
+/// to at most 8 chunks of partial results; a second pass then reduces these
+/// <= 8 chunks to the single final list of NN candidates
size_t getIVFKSelectionPass2Chunks(size_t nprobe);
+
+/// Function to determine the amount of temporary space that we allocate
+/// for storing basic IVF list scanning distances during query, on a
+/// per-query basis. This is the memory requirement for IVFFlat/IVFSQ, but
+/// IVFPQ will add some additional allocation as well (see
+/// getIVFPQPerQueryTempMemory)
+size_t getIVFPerQueryTempMemory(size_t k, size_t nprobe, size_t maxListLength);
+
+/// Function to determine the amount of temporary space that we allocate
+/// for storing basic IVFPQ list scanning distances during query, on a
+/// per-query basis.
+size_t getIVFPQPerQueryTempMemory(
+        size_t k,
+        size_t nprobe,
+        size_t maxListLength,
+        bool usePrecomputedCodes,
+        size_t numSubQuantizers,
+        size_t numSubQuantizerCodes);
+
+/// Based on the amount of temporary memory needed per IVF query (determined by
+/// one of the above functions) and the amount of temporary memory currently
+/// available, determine how many queries we will run concurrently in a single
+/// tile so as to stay within reasonable temporary memory allocation limits.
+size_t getIVFQueryTileSize(
+        size_t numQueries,
+        size_t tempMemoryAvailable,
+        size_t sizePerQuery);
+
 /// Function for multi-pass scanning that collects the length of
 /// intermediate results for all (query, probe) pair
 void runCalcListOffsets(
diff --git a/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh b/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh
index 20b98fb9e9..a162beed85 100644
--- a/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh
+++ b/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh
@@ -550,10 +550,6 @@ void runPQScanMultiPassNoPrecomputed(
         GpuResources* res) {
     auto stream = res->getDefaultStreamCurrentDevice();
-    constexpr idx_t kMinQueryTileSize = 8;
-    constexpr idx_t kMaxQueryTileSize = 65536; // typical max gridDim.y
-    constexpr idx_t kThrustMemSize = 16384;
-
     auto nprobe = coarseIndices.getSize(1);
     // If the maximum list length (in terms of number of vectors) times nprobe
@@ -566,39 +562,28 @@ void runPQScanMultiPassNoPrecomputed(
     // Make a reservation for Thrust to do its dirty work (global memory
     // cross-block reduction space); hopefully this is large enough.
+    constexpr idx_t kThrustMemSize = 16384;
+
     DeviceTensor thrustMem1(
             res, makeTempAlloc(AllocType::Other, stream), {kThrustMemSize});
     DeviceTensor thrustMem2(
             res, makeTempAlloc(AllocType::Other, stream), {kThrustMemSize});
     DeviceTensor* thrustMem[2] = {&thrustMem1, &thrustMem2};
-    // How much temporary storage is available?
-    // If possible, we'd like to fit within the space available.
-    idx_t sizeAvailable = res->getTempMemoryAvailableCurrentDevice();
-
-    // We run two passes of heap selection
-    // This is the size of the first-level heap passes
-    constexpr idx_t kNProbeSplit = 8;
-    idx_t pass2Chunks = std::min(nprobe, kNProbeSplit);
-
-    idx_t sizeForFirstSelectPass =
-            pass2Chunks * k * (sizeof(float) + sizeof(idx_t));
-
-    // How much temporary storage we need per each query
-    idx_t sizePerQuery = 2 * // streams
-            ((nprobe * sizeof(idx_t) + sizeof(idx_t)) + // prefixSumOffsets
-             nprobe * maxListLength * sizeof(float) + // allDistances
-             // residual distances
-             nprobe * numSubQuantizers * numSubQuantizerCodes * sizeof(float) +
-             sizeForFirstSelectPass);
-
-    idx_t queryTileSize = (sizeAvailable / sizePerQuery);
-
-    if (queryTileSize < kMinQueryTileSize) {
-        queryTileSize = kMinQueryTileSize;
-    } else if (queryTileSize > kMaxQueryTileSize) {
-        queryTileSize = kMaxQueryTileSize;
-    }
+    // How much temporary memory would we need to handle a single query?
+    size_t sizePerQuery = getIVFPQPerQueryTempMemory(
+            k,
+            nprobe,
+            maxListLength,
+            false, /* no precomputed codes */
+            numSubQuantizers,
+            numSubQuantizerCodes);
+
+    // How many queries do we wish to run at once?
+    idx_t queryTileSize = getIVFQueryTileSize(
+            queries.getSize(0),
+            res->getTempMemoryAvailableCurrentDevice(),
+            sizePerQuery);
     // Temporary memory buffers
     // Make sure there is space prior to the start which will be 0, and
@@ -664,6 +649,7 @@ void runPQScanMultiPassNoPrecomputed(
     DeviceTensor* allDistances[2] = {
             &allDistances1, &allDistances2};
+    idx_t pass2Chunks = getIVFKSelectionPass2Chunks(nprobe);
     DeviceTensor heapDistances1(
             res,
             makeTempAlloc(AllocType::Other, stream),
diff --git a/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu b/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu
index 404996c30e..2f31ed9fc2 100644
--- a/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu
+++ b/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu
@@ -562,10 +562,6 @@ void runPQScanMultiPassPrecomputed(
         GpuResources* res) {
     auto stream = res->getDefaultStreamCurrentDevice();
-    constexpr idx_t kMinQueryTileSize = 8;
-    constexpr idx_t kMaxQueryTileSize = 65536; // typical max gridDim.y
-    constexpr idx_t kThrustMemSize = 16384;
-
     auto nprobe = ivfListIds.getSize(1);
     // If the maximum list length (in terms of number of vectors) times nprobe
@@ -578,37 +574,28 @@ void runPQScanMultiPassPrecomputed(
     // Make a reservation for Thrust to do its dirty work (global memory
     // cross-block reduction space); hopefully this is large enough.
+    constexpr idx_t kThrustMemSize = 16384;
+
     DeviceTensor thrustMem1(
             res, makeTempAlloc(AllocType::Other, stream), {kThrustMemSize});
     DeviceTensor thrustMem2(
             res, makeTempAlloc(AllocType::Other, stream), {kThrustMemSize});
     DeviceTensor* thrustMem[2] = {&thrustMem1, &thrustMem2};
-    // How much temporary storage is available?
-    // If possible, we'd like to fit within the space available.
-    size_t sizeAvailable = res->getTempMemoryAvailableCurrentDevice();
-
-    // We run two passes of heap selection
-    // This is the size of the first-level heap passes
-    constexpr idx_t kNProbeSplit = 8;
-    idx_t pass2Chunks = std::min(nprobe, kNProbeSplit);
-
-    idx_t sizeForFirstSelectPass =
-            pass2Chunks * k * (sizeof(float) + sizeof(idx_t));
-
-    // How much temporary storage we need per each query
-    idx_t sizePerQuery = 2 * // # streams
-            ((nprobe * sizeof(idx_t) + sizeof(idx_t)) + // prefixSumOffsets
-             nprobe * maxListLength * sizeof(float) + // allDistances
-             sizeForFirstSelectPass);
-
-    idx_t queryTileSize = sizeAvailable / sizePerQuery;
-
-    if (queryTileSize < kMinQueryTileSize) {
-        queryTileSize = kMinQueryTileSize;
-    } else if (queryTileSize > kMaxQueryTileSize) {
-        queryTileSize = kMaxQueryTileSize;
-    }
+    // How much temporary memory would we need to handle a single query?
+    size_t sizePerQuery = getIVFPQPerQueryTempMemory(
+            k,
+            nprobe,
+            maxListLength,
+            true, /* precomputed codes */
+            numSubQuantizers,
+            numSubQuantizerCodes);
+
+    // How many queries do we wish to run at once?
+    idx_t queryTileSize = getIVFQueryTileSize(
+            queries.getSize(0),
+            res->getTempMemoryAvailableCurrentDevice(),
+            sizePerQuery);
     // Temporary memory buffers
     // Make sure there is space prior to the start which will be 0, and
@@ -647,6 +634,7 @@ void runPQScanMultiPassPrecomputed(
     DeviceTensor* allDistances[2] = {
             &allDistances1, &allDistances2};
+    idx_t pass2Chunks = getIVFKSelectionPass2Chunks(nprobe);
     DeviceTensor heapDistances1(
             res,
             makeTempAlloc(AllocType::Other, stream),