Add num_multiprocessors utility (#16628)
This PR introduces a new `num_multiprocessors` utility and moves the existing `elements_per_thread` host utility from `cuda.cuh` to the new `cuda.hpp` header.

Needed by #16619.
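For context, a minimal usage sketch of the two utilities after this change. Only the `<cudf/detail/utilities/cuda.hpp>` header, `num_multiprocessors()`, and `elements_per_thread()` come from this PR; the kernel, `launch_example`, and the constants are hypothetical and only illustrate how a caller might size a launch:

```cpp
// Sketch only, not part of this PR: my_kernel, launch_example, and the cap factor are
// hypothetical; only cuda.hpp, num_multiprocessors(), and elements_per_thread() are real.
#include <cudf/detail/utilities/cuda.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <algorithm>

__global__ void my_kernel(int const* in, int* out, cudf::size_type size)
{
  // Grid-stride loop so any grid size covers all elements.
  for (auto i = static_cast<cudf::size_type>(blockIdx.x * blockDim.x + threadIdx.x); i < size;
       i += blockDim.x * gridDim.x) {
    out[i] = in[i] + 1;
  }
}

void launch_example(int const* in, int* out, cudf::size_type size)
{
  constexpr cudf::size_type block_size = 256;
  // Occupancy-based estimate of how many elements each thread should process.
  auto const per_thread         = cudf::detail::elements_per_thread(my_kernel, size, block_size);
  auto const elements_per_block = block_size * per_thread;
  auto const num_blocks         = (size + elements_per_block - 1) / elements_per_block;
  // The new utility: SM count of the current device, used here to cap the grid at a few
  // waves of blocks (the factor 32 is an arbitrary illustration).
  auto const max_blocks = cudf::detail::num_multiprocessors() * 32;
  auto const grid       = std::clamp<cudf::size_type>(num_blocks, 1, max_blocks);
  my_kernel<<<grid, block_size, 0, cudf::get_default_stream().value()>>>(in, out, size);
}
```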

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Mark Harris (https://github.com/harrism)

URL: #16628
PointKernel authored Aug 27, 2024
1 parent f1cc962 commit 2d494ed
Showing 7 changed files with 105 additions and 47 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -666,6 +666,7 @@ add_library(
src/unary/math_ops.cu
src/unary/nan_ops.cu
src/unary/null_ops.cu
src/utilities/cuda.cpp
src/utilities/cuda_memcpy.cu
src/utilities/default_stream.cpp
src/utilities/host_memory.cpp
10 changes: 3 additions & 7 deletions cpp/benchmarks/join/generate_input_tables.cuh
@@ -17,6 +17,7 @@
#pragma once

#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/cuda.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>
@@ -150,13 +151,8 @@ void generate_input_tables(key_type* const build_tbl,
CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&num_blocks_init_probe_tbl, init_probe_tbl<key_type, size_type>, block_size, 0));

-  int dev_id{-1};
-  CUDF_CUDA_TRY(cudaGetDevice(&dev_id));
-
-  int num_sms{-1};
-  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
-
-  int const num_states =
+  auto const num_sms = cudf::detail::num_multiprocessors();
+  auto const num_states =
num_sms * std::max(num_blocks_init_build_tbl, num_blocks_init_probe_tbl) * block_size;
rmm::device_uvector<curandState> devStates(num_states, cudf::get_default_stream());

1 change: 1 addition & 0 deletions cpp/include/cudf/detail/copy_if.cuh
@@ -22,6 +22,7 @@
#include <cudf/detail/gather.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/detail/utilities/cuda.hpp>
#include <cudf/null_mask.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/table/table.hpp>
29 changes: 0 additions & 29 deletions cpp/include/cudf/detail/utilities/cuda.cuh
@@ -189,35 +189,6 @@ __device__ T single_lane_block_sum_reduce(T lane_value)
return result;
}

/**
* @brief Get the number of elements that can be processed per thread.
*
* @param[in] kernel The kernel for which the elements per thread needs to be assessed
* @param[in] total_size Number of elements
* @param[in] block_size Expected block size
*
* @return cudf::size_type Elements per thread that can be processed for given specification.
*/
template <typename Kernel>
cudf::size_type elements_per_thread(Kernel kernel,
cudf::size_type total_size,
cudf::size_type block_size,
cudf::size_type max_per_thread = 32)
{
CUDF_FUNC_RANGE();

// calculate theoretical occupancy
int max_blocks = 0;
CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0));

int device = 0;
CUDF_CUDA_TRY(cudaGetDevice(&device));
int num_sms = 0;
CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
int per_thread = total_size / (max_blocks * num_sms * block_size);
return std::clamp(per_thread, 1, max_per_thread);
}

/**
* @brief Finds the smallest value not less than `number_to_round` and modulo `modulus` is
* zero. Expects modulus to be a power of 2.
59 changes: 59 additions & 0 deletions cpp/include/cudf/detail/utilities/cuda.hpp
@@ -0,0 +1,59 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>

#include <algorithm>

namespace CUDF_EXPORT cudf {
namespace detail {

/**
* @brief Get the number of multiprocessors on the device
*/
cudf::size_type num_multiprocessors();

/**
* @brief Get the number of elements that can be processed per thread.
*
* @param[in] kernel The kernel for which the elements per thread needs to be assessed
* @param[in] total_size Number of elements
* @param[in] block_size Expected block size
*
* @return cudf::size_type Elements per thread that can be processed for given specification.
*/
template <typename Kernel>
cudf::size_type elements_per_thread(Kernel kernel,
cudf::size_type total_size,
cudf::size_type block_size,
cudf::size_type max_per_thread = 32)
{
CUDF_FUNC_RANGE();

// calculate theoretical occupancy
int max_blocks = 0;
CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0));

int per_thread = total_size / (max_blocks * num_multiprocessors() * block_size);
return std::clamp(per_thread, 1, max_per_thread);
}

} // namespace detail
} // namespace CUDF_EXPORT cudf
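To make the occupancy-based sizing in `elements_per_thread` concrete (illustrative numbers, not taken from this PR): on a device reporting 108 SMs, if the occupancy query returns 4 resident blocks per SM for a 256-thread block, a 200,000,000-element input yields 200,000,000 / (4 × 108 × 256) ≈ 1808 elements per thread, which the clamp reduces to the default `max_per_thread` of 32; a 100,000-element input yields 0 from the integer division, which the clamp raises to 1.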
18 changes: 7 additions & 11 deletions cpp/src/io/comp/debrotli.cu
Expand Up @@ -58,6 +58,7 @@ THE SOFTWARE.
#include "gpuinflate.hpp"
#include "io/utilities/block_utils.cuh"

#include <cudf/detail/utilities/cuda.hpp>
#include <cudf/utilities/error.hpp>

#include <rmm/cuda_stream_view.hpp>
@@ -2047,19 +2048,14 @@
*/
size_t __host__ get_gpu_debrotli_scratch_size(int max_num_inputs)
{
-  int sm_count = 0;
-  int dev = 0;
  uint32_t max_fb_size, min_fb_size, fb_size;
-  CUDF_CUDA_TRY(cudaGetDevice(&dev));
-  if (cudaSuccess == cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev)) {
-    // printf("%d SMs on device %d\n", sm_count, dev);
-    max_num_inputs =
-      min(max_num_inputs, sm_count * 3); // no more than 3 blocks/sm at most due to 32KB smem use
-    if (max_num_inputs <= 0) {
-      max_num_inputs = sm_count * 2; // Target 2 blocks/SM by default for scratch mem computation
-    }
+  auto const sm_count = cudf::detail::num_multiprocessors();
+  // no more than 3 blocks/sm at most due to 32KB smem use
+  max_num_inputs = std::min(max_num_inputs, sm_count * 3);
+  if (max_num_inputs <= 0) {
+    max_num_inputs = sm_count * 2; // Target 2 blocks/SM by default for scratch mem computation
  }
-  max_num_inputs = min(max(max_num_inputs, 1), 512);
+  max_num_inputs = std::min(std::max(max_num_inputs, 1), 512);
// Max fb size per block occurs if all huffman tables for all 3 group types fail local_alloc()
// with num_htrees=256 (See HuffmanTreeGroupAlloc)
max_fb_size = 256 * (630 + 1080 + 920) * 2; // 1.3MB
34 changes: 34 additions & 0 deletions cpp/src/utilities/cuda.cpp
@@ -0,0 +1,34 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/detail/utilities/cuda.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>

#include <cuda_runtime.h>

namespace cudf::detail {

cudf::size_type num_multiprocessors()
{
int device = 0;
CUDF_CUDA_TRY(cudaGetDevice(&device));
int num_sms = 0;
CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
return num_sms;
}

} // namespace cudf::detail
