diff --git a/faiss/gpu/impl/IVFFlatScan.cu b/faiss/gpu/impl/IVFFlatScan.cu
index fb453860ba..3acee58d71 100644
--- a/faiss/gpu/impl/IVFFlatScan.cu
+++ b/faiss/gpu/impl/IVFFlatScan.cu
@@ -345,10 +345,6 @@ void runIVFFlatScan(
         GpuResources* res) {
     auto stream = res->getDefaultStreamCurrentDevice();
-    constexpr idx_t kMinQueryTileSize = 8;
-    constexpr idx_t kMaxQueryTileSize = 65536; // used as blockIdx.y dimension
-    constexpr idx_t kThrustMemSize = 16384;
-
     auto nprobe = listIds.getSize(1);
     // If the maximum list length (in terms of number of vectors) times nprobe
@@ -359,37 +355,22 @@ void runIVFFlatScan(
     // Make a reservation for Thrust to do its dirty work (global memory
     // cross-block reduction space); hopefully this is large enough.
+    constexpr idx_t kThrustMemSize = 16384;
+
     DeviceTensor thrustMem1(
             res, makeTempAlloc(AllocType::Other, stream), {kThrustMemSize});
     DeviceTensor thrustMem2(
             res, makeTempAlloc(AllocType::Other, stream), {kThrustMemSize});
     DeviceTensor* thrustMem[2] = {&thrustMem1, &thrustMem2};
-    // How much temporary storage is available?
-    // If possible, we'd like to fit within the space available.
-    size_t sizeAvailable = res->getTempMemoryAvailableCurrentDevice();
-
-    // We run two passes of heap selection
-    // This is the size of the first-level heap passes
-    constexpr idx_t kNProbeSplit = 8;
-    idx_t pass2Chunks = std::min(nprobe, kNProbeSplit);
+    // How much temporary memory would we need to handle a single query?
+    size_t sizePerQuery = getIVFPerQueryTempMemory(k, nprobe, maxListLength);
-    idx_t sizeForFirstSelectPass =
-            pass2Chunks * k * (sizeof(float) + sizeof(idx_t));
-
-    // How much temporary storage we need per each query
-    idx_t sizePerQuery = 2 * // # streams
-            ((nprobe * sizeof(idx_t) + sizeof(idx_t)) + // prefixSumOffsets
-             nprobe * maxListLength * sizeof(float) + // allDistances
-             sizeForFirstSelectPass);
-
-    idx_t queryTileSize = sizeAvailable / sizePerQuery;
-
-    if (queryTileSize < kMinQueryTileSize) {
-        queryTileSize = kMinQueryTileSize;
-    } else if (queryTileSize > kMaxQueryTileSize) {
-        queryTileSize = kMaxQueryTileSize;
-    }
+    // How many queries do we wish to run at once?
+    idx_t queryTileSize = getIVFQueryTileSize(
+            queries.getSize(0),
+            res->getTempMemoryAvailableCurrentDevice(),
+            sizePerQuery);
     // Temporary memory buffers
     // Make sure there is space prior to the start which will be 0, and
@@ -428,6 +409,7 @@ void runIVFFlatScan(
     DeviceTensor* allDistances[2] = {
             &allDistances1, &allDistances2};
+    idx_t pass2Chunks = getIVFKSelectionPass2Chunks(nprobe);
     DeviceTensor heapDistances1(
             res,
             makeTempAlloc(AllocType::Other, stream),
diff --git a/faiss/gpu/impl/IVFUtils.cu b/faiss/gpu/impl/IVFUtils.cu
index 6f90f8e41c..ba9724ce46 100644
--- a/faiss/gpu/impl/IVFUtils.cu
+++ b/faiss/gpu/impl/IVFUtils.cu
@@ -18,6 +18,114 @@ namespace faiss {
 namespace gpu {
+size_t getIVFKSelectionPass2Chunks(size_t nprobe) {
+    // We run two passes of heap selection
+    // This is the size of the second-level heap passes
+    constexpr size_t kNProbeSplit = 8;
+    return std::min(nprobe, kNProbeSplit);
+}
+
+size_t getIVFPerQueryTempMemory(size_t k, size_t nprobe, size_t maxListLength) {
+    size_t pass2Chunks = getIVFKSelectionPass2Chunks(nprobe);
+
+    size_t sizeForFirstSelectPass =
+            pass2Chunks * k * (sizeof(float) + sizeof(idx_t));
+
+    // Each IVF list being scanned concurrently needs a separate array to
+    // indicate where the per-IVF list distances are being stored via prefix
+    // sum. There is one per nprobe, plus 1 more entry at the end
+    size_t prefixSumOffsets = nprobe * sizeof(idx_t) + sizeof(idx_t);
+
+    // Storage for all distances from all the IVF lists we are processing
+    size_t allDistances = nprobe * maxListLength * sizeof(float);
+
+    // There are 2 streams on which computation is performed (hence the 2 *)
+    return 2 * (prefixSumOffsets + allDistances + sizeForFirstSelectPass);
+}
+
+size_t getIVFPQPerQueryTempMemory(
+        size_t k,
+        size_t nprobe,
+        size_t maxListLength,
+        bool usePrecomputedCodes,
+        size_t numSubQuantizers,
+        size_t numSubQuantizerCodes) {
+    // Residual PQ distances for each IVF partition (only needed in case we
+    // are not using precomputed codes)
+    size_t residualDistances = usePrecomputedCodes
+            ? 0
+            : (nprobe * numSubQuantizers * numSubQuantizerCodes *
+               sizeof(float));
+
+    // There are 2 streams on which computation is performed (hence the 2 *).
+    // The IVF-generic temp memory allocation already takes this
+    // multi-streaming into account, but we need to do so for the PQ residual
+    // distances too
+    return (2 * residualDistances) +
+            getIVFPerQueryTempMemory(k, nprobe, maxListLength);
+}
+
+size_t getIVFQueryTileSize(
+        size_t numQueries,
+        size_t tempMemoryAvailable,
+        size_t sizePerQuery) {
+    // Our ideal minimum number of queries that we'd like to run concurrently
+    constexpr size_t kMinQueryTileSize = 8;
+
+    // Our absolute maximum number of queries that we can run concurrently
+    // (based on max Y grid dimension)
+    constexpr size_t kMaxQueryTileSize = 65536;
+
+    // First, see how many queries we can run within the limit of our available
+    // temporary memory. If all queries can run within the temporary memory
+    // limit, we'll just use that.
+    size_t withinTempMemoryNumQueries =
+            std::min(tempMemoryAvailable / sizePerQuery, numQueries);
+
+    // However, there is a maximum cap on the number of queries that we can run
+    // at once, even if memory were unlimited (due to max Y grid dimension)
+    withinTempMemoryNumQueries =
+            std::min(withinTempMemoryNumQueries, kMaxQueryTileSize);
+
+    // However, withinTempMemoryNumQueries could be really small, or even zero
+    // (in the case where there is no temporary memory available, or the memory
+    // required for a single query is really large). If we are below the ideal
+    // minimum number of queries to run concurrently, then we will ignore the
+    // temporary memory limit and fall back to a general device allocation.
+    // Note that if we only have a single query, then it is fine to run as-is
+    if (withinTempMemoryNumQueries < numQueries &&
+        withinTempMemoryNumQueries < kMinQueryTileSize) {
+        // Either the amount of temporary memory available is too low, or the
+        // amount of memory needed to run a single query is really high. Ignore
+        // the temporary memory available, and always attempt to use this amount
+        // of memory for temporary results
+        //
+        // FIXME: we could look at the amount of memory available on the
+        // current device, but there is no guarantee that all of it could be
+        // obtained in a single allocation, so we just pick a suitably large
+        // allocation that can yield enough efficiency and that the GPU can
+        // likely allocate.
+        constexpr size_t kMinMemoryAllocation = 512 * 1024 * 1024; // 512 MiB
+
+        size_t withinMemoryNumQueries =
+                std::min(kMinMemoryAllocation / sizePerQuery, numQueries);
+
+        // It is possible that the per-query size is incredibly huge, in which
+        // case even the 512 MiB allocation will not fit it. In this case, we
+        // have no option except to try running a single one.
+        return std::max(withinMemoryNumQueries, size_t(1));
+    } else {
+        // withinTempMemoryNumQueries cannot be > numQueries.
+        // Either:
+        // 1. == numQueries, >= kMinQueryTileSize (i.e., we can satisfy all
+        // queries in one go, or are limited by max query tile size)
+        // 2. < numQueries, >= kMinQueryTileSize (i.e., we can't satisfy all
+        // queries in one go, but we have a large enough batch to run, which
+        // is ok)
+        return withinTempMemoryNumQueries;
+    }
+}
+
 // Calculates the total number of intermediate distances to consider
 // for all queries
 __global__ void getResultLengths(
diff --git a/faiss/gpu/impl/IVFUtils.cuh b/faiss/gpu/impl/IVFUtils.cuh
index f952bbfb79..4609cadbff 100644
--- a/faiss/gpu/impl/IVFUtils.cuh
+++ b/faiss/gpu/impl/IVFUtils.cuh
@@ -19,6 +19,39 @@ namespace gpu {
 class GpuResources;
+/// For the final k-selection of IVF query distances, we perform two passes.
+/// The first pass scans some number of per-IVF-list distances, reducing them
+/// to at most 8 chunks of partial results; a second pass then reduces these
+/// <= 8 chunks to the single final list of NN candidates
size_t getIVFKSelectionPass2Chunks(size_t nprobe);
+
+/// Function to determine the amount of temporary space that we allocate
+/// for storing basic IVF list scanning distances during query, on a
+/// per-query basis. This is the memory requirement for IVFFlat/IVFSQ, but
+/// IVFPQ will add some additional allocation as well (see
+/// getIVFPQPerQueryTempMemory)
+size_t getIVFPerQueryTempMemory(size_t k, size_t nprobe, size_t maxListLength);
+
+/// Function to determine the amount of temporary space that we allocate
+/// for storing basic IVFPQ list scanning distances during query, on a
+/// per-query basis.
+size_t getIVFPQPerQueryTempMemory(
+        size_t k,
+        size_t nprobe,
+        size_t maxListLength,
+        bool usePrecomputedCodes,
+        size_t numSubQuantizers,
+        size_t numSubQuantizerCodes);
+
+/// Based on the amount of temporary memory needed per IVF query (determined by
+/// one of the above functions) and the amount of temporary memory currently
+/// available, determine how many queries we will run concurrently in a single
+/// tile so as to stay within reasonable temporary memory allocation limits.
+size_t getIVFQueryTileSize(
+        size_t numQueries,
+        size_t tempMemoryAvailable,
+        size_t sizePerQuery);
+
 /// Function for multi-pass scanning that collects the length of
 /// intermediate results for all (query, probe) pair
 void runCalcListOffsets(
diff --git a/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh b/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh
index 20b98fb9e9..a162beed85 100644
--- a/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh
+++ b/faiss/gpu/impl/PQScanMultiPassNoPrecomputed-inl.cuh
@@ -550,10 +550,6 @@ void runPQScanMultiPassNoPrecomputed(
         GpuResources* res) {
     auto stream = res->getDefaultStreamCurrentDevice();
-    constexpr idx_t kMinQueryTileSize = 8;
-    constexpr idx_t kMaxQueryTileSize = 65536; // typical max gridDim.y
-    constexpr idx_t kThrustMemSize = 16384;
-
     auto nprobe = coarseIndices.getSize(1);
     // If the maximum list length (in terms of number of vectors) times nprobe
@@ -566,39 +562,28 @@ void runPQScanMultiPassNoPrecomputed(
     // Make a reservation for Thrust to do its dirty work (global memory
     // cross-block reduction space); hopefully this is large enough.
+    constexpr idx_t kThrustMemSize = 16384;
+
     DeviceTensor thrustMem1(
             res, makeTempAlloc(AllocType::Other, stream), {kThrustMemSize});
     DeviceTensor thrustMem2(
             res, makeTempAlloc(AllocType::Other, stream), {kThrustMemSize});
     DeviceTensor* thrustMem[2] = {&thrustMem1, &thrustMem2};
-    // How much temporary storage is available?
-    // If possible, we'd like to fit within the space available.
-    idx_t sizeAvailable = res->getTempMemoryAvailableCurrentDevice();
-
-    // We run two passes of heap selection
-    // This is the size of the first-level heap passes
-    constexpr idx_t kNProbeSplit = 8;
-    idx_t pass2Chunks = std::min(nprobe, kNProbeSplit);
-
-    idx_t sizeForFirstSelectPass =
-            pass2Chunks * k * (sizeof(float) + sizeof(idx_t));
-
-    // How much temporary storage we need per each query
-    idx_t sizePerQuery = 2 * // streams
-            ((nprobe * sizeof(idx_t) + sizeof(idx_t)) + // prefixSumOffsets
-             nprobe * maxListLength * sizeof(float) + // allDistances
-             // residual distances
-             nprobe * numSubQuantizers * numSubQuantizerCodes * sizeof(float) +
-             sizeForFirstSelectPass);
-
-    idx_t queryTileSize = (sizeAvailable / sizePerQuery);
-
-    if (queryTileSize < kMinQueryTileSize) {
-        queryTileSize = kMinQueryTileSize;
-    } else if (queryTileSize > kMaxQueryTileSize) {
-        queryTileSize = kMaxQueryTileSize;
-    }
+    // How much temporary memory would we need to handle a single query?
+    size_t sizePerQuery = getIVFPQPerQueryTempMemory(
+            k,
+            nprobe,
+            maxListLength,
+            false, /* no precomputed codes */
+            numSubQuantizers,
+            numSubQuantizerCodes);
+
+    // How many queries do we wish to run at once?
+    idx_t queryTileSize = getIVFQueryTileSize(
+            queries.getSize(0),
+            res->getTempMemoryAvailableCurrentDevice(),
+            sizePerQuery);
     // Temporary memory buffers
     // Make sure there is space prior to the start which will be 0, and
@@ -664,6 +649,7 @@ void runPQScanMultiPassNoPrecomputed(
     DeviceTensor* allDistances[2] = {
             &allDistances1, &allDistances2};
+    idx_t pass2Chunks = getIVFKSelectionPass2Chunks(nprobe);
     DeviceTensor heapDistances1(
             res,
             makeTempAlloc(AllocType::Other, stream),
diff --git a/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu b/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu
index 404996c30e..2f31ed9fc2 100644
--- a/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu
+++ b/faiss/gpu/impl/PQScanMultiPassPrecomputed.cu
@@ -562,10 +562,6 @@ void runPQScanMultiPassPrecomputed(
         GpuResources* res) {
     auto stream = res->getDefaultStreamCurrentDevice();
-    constexpr idx_t kMinQueryTileSize = 8;
-    constexpr idx_t kMaxQueryTileSize = 65536; // typical max gridDim.y
-    constexpr idx_t kThrustMemSize = 16384;
-
     auto nprobe = ivfListIds.getSize(1);
     // If the maximum list length (in terms of number of vectors) times nprobe
@@ -578,37 +574,28 @@ void runPQScanMultiPassPrecomputed(
     // Make a reservation for Thrust to do its dirty work (global memory
     // cross-block reduction space); hopefully this is large enough.
+    constexpr idx_t kThrustMemSize = 16384;
+
     DeviceTensor thrustMem1(
             res, makeTempAlloc(AllocType::Other, stream), {kThrustMemSize});
     DeviceTensor thrustMem2(
             res, makeTempAlloc(AllocType::Other, stream), {kThrustMemSize});
     DeviceTensor* thrustMem[2] = {&thrustMem1, &thrustMem2};
-    // How much temporary storage is available?
-    // If possible, we'd like to fit within the space available.
-    size_t sizeAvailable = res->getTempMemoryAvailableCurrentDevice();
-
-    // We run two passes of heap selection
-    // This is the size of the first-level heap passes
-    constexpr idx_t kNProbeSplit = 8;
-    idx_t pass2Chunks = std::min(nprobe, kNProbeSplit);
-
-    idx_t sizeForFirstSelectPass =
-            pass2Chunks * k * (sizeof(float) + sizeof(idx_t));
-
-    // How much temporary storage we need per each query
-    idx_t sizePerQuery = 2 * // # streams
-            ((nprobe * sizeof(idx_t) + sizeof(idx_t)) + // prefixSumOffsets
-             nprobe * maxListLength * sizeof(float) + // allDistances
-             sizeForFirstSelectPass);
-
-    idx_t queryTileSize = sizeAvailable / sizePerQuery;
-
-    if (queryTileSize < kMinQueryTileSize) {
-        queryTileSize = kMinQueryTileSize;
-    } else if (queryTileSize > kMaxQueryTileSize) {
-        queryTileSize = kMaxQueryTileSize;
-    }
+    // How much temporary memory would we need to handle a single query?
+    size_t sizePerQuery = getIVFPQPerQueryTempMemory(
+            k,
+            nprobe,
+            maxListLength,
+            true, /* precomputed codes */
+            numSubQuantizers,
+            numSubQuantizerCodes);
+
+    // How many queries do we wish to run at once?
+    idx_t queryTileSize = getIVFQueryTileSize(
+            queries.getSize(0),
+            res->getTempMemoryAvailableCurrentDevice(),
+            sizePerQuery);
     // Temporary memory buffers
     // Make sure there is space prior to the start which will be 0, and
@@ -647,6 +634,7 @@ void runPQScanMultiPassPrecomputed(
     DeviceTensor* allDistances[2] = {
             &allDistances1, &allDistances2};
+    idx_t pass2Chunks = getIVFKSelectionPass2Chunks(nprobe);
     DeviceTensor heapDistances1(
             res,
             makeTempAlloc(AllocType::Other, stream),