Parallelize kernel compilation in FAISS (#2922)

Summary: Pull Request resolved: #2922 This parallelizes kernel compilation by taking a template function from much deeper in the stack than was previously the case and generating 128 compilation units rather than the original 8. Reviewed By: mdouze Differential Revision: D46674315 fbshipit-source-id: 830eeaf43dee2c081f735be47c809b28aa3a05f6
facebookresearch · Jun 30, 2023 · 4bfdd43 · 4bfdd43
1 parent a91a288
commit 4bfdd43
Show file tree

Hide file tree

Showing 13 changed files with 179 additions and 279 deletions.
diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
@@ -52,14 +52,6 @@ set(FAISS_GPU_SRC
   impl/PQScanMultiPassPrecomputed.cu
   impl/RemapIndices.cpp
   impl/VectorResidual.cu
-  impl/scan/IVFInterleaved1.cu
-  impl/scan/IVFInterleaved32.cu
-  impl/scan/IVFInterleaved64.cu
-  impl/scan/IVFInterleaved128.cu
-  impl/scan/IVFInterleaved256.cu
-  impl/scan/IVFInterleaved512.cu
-  impl/scan/IVFInterleaved1024.cu
-  impl/scan/IVFInterleaved2048.cu
   impl/IcmEncoder.cu
   utils/BlockSelectFloat.cu
   utils/DeviceUtils.cu
@@ -176,6 +168,74 @@ set(FAISS_GPU_HEADERS
   utils/warpselect/WarpSelectImpl.cuh
 )
 
+function(generate_ivf_interleaved_code)
+  set(SUB_CODEC_TYPE
+    "faiss::gpu::Codec<0, 1>"
+    "faiss::gpu::Codec<1, 1>"
+    "faiss::gpu::Codec<2, 1>"
+    "faiss::gpu::Codec<3, 1>"
+    "faiss::gpu::Codec<4, 1>"
+    "faiss::gpu::Codec<5, 1>"
+    "faiss::gpu::Codec<6, 1>"
+    "faiss::gpu::CodecFloat"
+  )
+
+  set(SUB_METRIC_TYPE
+    "faiss::gpu::IPDistance"
+    "faiss::gpu::L2Distance"
+  )
+
+  # Used for SUB_THREADS, SUB_NUM_WARP_Q, SUB_NUM_THREAD_Q
+  set(THREADS_AND_WARPS
+    "128|1024|8"
+    "128|1|1"
+    "128|128|3"
+    "128|256|4"
+    "128|32|2"
+    "128|512|8"
+    "128|64|3"
+    "64|2048|8"
+  )
+
+  # Traverse through the Cartesian product of X and Y
+  foreach(sub_codec ${SUB_CODEC_TYPE})
+  foreach(metric_type ${SUB_METRIC_TYPE})
+  foreach(threads_and_warps_str ${THREADS_AND_WARPS})
+    string(REPLACE "|" ";" threads_and_warps ${threads_and_warps_str})
+    list(GET threads_and_warps 0 sub_threads)
+    list(GET threads_and_warps 1 sub_num_warp_q)
+    list(GET threads_and_warps 2 sub_num_thread_q)
+
+    # Define the output file name
+    set(filename "template_${sub_codec}_${metric_type}_${sub_threads}_${sub_num_warp_q}_${sub_num_thread_q}")
+    # Remove illegal characters from filename
+    string(REGEX REPLACE "[^A-Za-z0-9_]" "" filename ${filename})
+    set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.cu")
+
+    # Read the template file
+    file(READ "${CMAKE_CURRENT_SOURCE_DIR}/impl/scan/IVFInterleavedScanKernelTemplate.cu" template_content)
+
+    # Replace the placeholders
+    string(REPLACE "SUB_CODEC_TYPE" "${sub_codec}" template_content "${template_content}")
+    string(REPLACE "SUB_METRIC_TYPE" "${metric_type}" template_content "${template_content}")
+    string(REPLACE "SUB_THREADS" "${sub_threads}" template_content "${template_content}")
+    string(REPLACE "SUB_NUM_WARP_Q" "${sub_num_warp_q}" template_content "${template_content}")
+    string(REPLACE "SUB_NUM_THREAD_Q" "${sub_num_thread_q}" template_content "${template_content}")
+
+    # Write the modified content to the output file
+    file(WRITE "${output_file}" "${template_content}")
+
+    # Add the file to the sources
+    list(APPEND FAISS_GPU_SRC "${output_file}")
+  endforeach()
+  endforeach()
+  endforeach()
+  # Propagate modified variable to the parent scope
+  set(FAISS_GPU_SRC "${FAISS_GPU_SRC}" PARENT_SCOPE)
+endfunction()
+
+generate_ivf_interleaved_code()
+
 if(FAISS_ENABLE_RAFT)
   list(APPEND FAISS_GPU_HEADERS
           impl/RaftFlatIndex.cuh)

diff --git a/faiss/gpu/impl/IVFInterleaved.cu b/faiss/gpu/impl/IVFInterleaved.cu
@@ -210,25 +210,23 @@ void runIVFInterleavedScan(
     };
 
     if (k == 1) {
-        ivf_interleaved_call(ivfInterleavedScanImpl<IVFINTERLEAVED_1_PARAMS>);
+        ivf_interleaved_call(ivfInterleavedScanImpl<128, 1, 1>);
     } else if (k <= 32) {
-        ivf_interleaved_call(ivfInterleavedScanImpl<IVFINTERLEAVED_32_PARAMS>);
+        ivf_interleaved_call(ivfInterleavedScanImpl<128, 32, 2>);
     } else if (k <= 64) {
-        ivf_interleaved_call(ivfInterleavedScanImpl<IVFINTERLEAVED_64_PARAMS>);
+        ivf_interleaved_call(ivfInterleavedScanImpl<128, 64, 3>);
     } else if (k <= 128) {
-        ivf_interleaved_call(ivfInterleavedScanImpl<IVFINTERLEAVED_128_PARAMS>);
+        ivf_interleaved_call(ivfInterleavedScanImpl<128, 128, 3>);
     } else if (k <= 256) {
-        ivf_interleaved_call(ivfInterleavedScanImpl<IVFINTERLEAVED_256_PARAMS>);
+        ivf_interleaved_call(ivfInterleavedScanImpl<128, 256, 4>);
     } else if (k <= 512) {
-        ivf_interleaved_call(ivfInterleavedScanImpl<IVFINTERLEAVED_512_PARAMS>);
+        ivf_interleaved_call(ivfInterleavedScanImpl<128, 512, 8>);
     } else if (k <= 1024) {
-        ivf_interleaved_call(
-                ivfInterleavedScanImpl<IVFINTERLEAVED_1024_PARAMS>);
+        ivf_interleaved_call(ivfInterleavedScanImpl<128, 1024, 8>);
     }
 #if GPU_MAX_SELECTION_K >= 2048
     else if (k <= 2048) {
-        ivf_interleaved_call(
-                ivfInterleavedScanImpl<IVFINTERLEAVED_2048_PARAMS>);
+        ivf_interleaved_call(ivfInterleavedScanImpl<64, 2048, 8>);
     }
 #endif
 }

diff --git a/faiss/gpu/impl/IVFInterleaved.cuh b/faiss/gpu/impl/IVFInterleaved.cuh
@@ -35,8 +35,7 @@ template <
         typename Metric,
         int ThreadsPerBlock,
         int NumWarpQ,
-        int NumThreadQ,
-        bool Residual>
+        int NumThreadQ>
 __global__ void ivfInterleavedScan(
         Tensor<float, 2, true> queries,
         Tensor<float, 3, true> residualBase,
@@ -48,7 +47,8 @@ __global__ void ivfInterleavedScan(
         int k,
         // [query][probe][k]
         Tensor<float, 3, true> distanceOut,
-        Tensor<idx_t, 3, true> indicesOut) {
+        Tensor<idx_t, 3, true> indicesOut,
+        const bool Residual) {
     extern __shared__ float smem[];
 
     constexpr int kNumWarps = ThreadsPerBlock / kWarpSize;
@@ -124,7 +124,7 @@ __global__ void ivfInterleavedScan(
             for (int dBase = 0; dBase < dimBlocks; dBase += kWarpSize) {
                 const int loadDim = dBase + laneId;
                 const float queryReg = query[loadDim];
-                [[maybe_unused]] const float residualReg =
+                const float residualReg =
                         Residual ? residualBaseSlice[loadDim] : 0;
 
                 constexpr int kUnroll = 4;
@@ -152,7 +152,7 @@ __global__ void ivfInterleavedScan(
                         decV[j] = codec.decodeNew(dBase + d, encV[j]);
                     }
 
-                    if constexpr (Residual) {
+                    if (Residual) {
 #pragma unroll
                         for (int j = 0; j < kUnroll; ++j) {
                             int d = i * kUnroll + j;
@@ -174,9 +174,9 @@ __global__ void ivfInterleavedScan(
             const bool loadDimInBounds = loadDim < dim;
 
             const float queryReg = loadDimInBounds ? query[loadDim] : 0;
-            [[maybe_unused]] const float residualReg =
-                    Residual && loadDimInBounds ? residualBaseSlice[loadDim]
-                                                : 0;
+            const float residualReg = Residual && loadDimInBounds
+                    ? residualBaseSlice[loadDim]
+                    : 0;
 
             for (int d = 0; d < dim - dimBlocks;
                  ++d, data += wordsPerVectorBlockDim) {
@@ -187,7 +187,7 @@ __global__ void ivfInterleavedScan(
                 enc = WarpPackedBits<EncodeT, Codec::kEncodeBits>::postRead(
                         laneId, enc);
                 float dec = codec.decodeNew(dimBlocks + d, enc);
-                if constexpr (Residual) {
+                if (Residual) {
                     dec += SHFL_SYNC(residualReg, d, kWarpSize);
                 }
 

diff --git a/faiss/gpu/impl/scan/IVFInterleaved1.cu b/faiss/gpu/impl/scan/IVFInterleaved1.cu
diff --git a/faiss/gpu/impl/scan/IVFInterleaved1024.cu b/faiss/gpu/impl/scan/IVFInterleaved1024.cu
diff --git a/faiss/gpu/impl/scan/IVFInterleaved128.cu b/faiss/gpu/impl/scan/IVFInterleaved128.cu
diff --git a/faiss/gpu/impl/scan/IVFInterleaved2048.cu b/faiss/gpu/impl/scan/IVFInterleaved2048.cu
diff --git a/faiss/gpu/impl/scan/IVFInterleaved256.cu b/faiss/gpu/impl/scan/IVFInterleaved256.cu
diff --git a/faiss/gpu/impl/scan/IVFInterleaved32.cu b/faiss/gpu/impl/scan/IVFInterleaved32.cu
diff --git a/faiss/gpu/impl/scan/IVFInterleaved512.cu b/faiss/gpu/impl/scan/IVFInterleaved512.cu
diff --git a/faiss/gpu/impl/scan/IVFInterleaved64.cu b/faiss/gpu/impl/scan/IVFInterleaved64.cu