Merge pull request #1326 from hdelan/refactor-guess-local-worksize

[CUDA][HIP] Fix bug in guess local worksize funcs and improve local worksize guessing in HIP adapter
oneapi-src · Mar 19, 2024 · ed1f8bf · ed1f8bf
2 parents ca5c342 + 69c43b4
commit ed1f8bf
Show file tree

Hide file tree

Showing 5 changed files with 262 additions and 74 deletions.
diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
@@ -18,6 +18,7 @@
 
 #include <cmath>
 #include <cuda.h>
+#include <ur/ur.hpp>
 
 ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
  uint32_t NumEventsInWaitList,
@@ -140,12 +141,10 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
 void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
  const size_t *GlobalWorkSize, const uint32_t WorkDim,
  const size_t MaxThreadsPerBlock[3],
- ur_kernel_handle_t Kernel, uint32_t LocalSize) {
+ ur_kernel_handle_t Kernel) {
  assert(ThreadsPerBlock != nullptr);
  assert(GlobalWorkSize != nullptr);
  assert(Kernel != nullptr);
- int MinGrid, MaxBlockSize;
- size_t MaxBlockDim[3];
 
  // The below assumes a three dimensional range but this is not guaranteed by
  // UR.
@@ -154,33 +153,18 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
  GlobalSizeNormalized[i] = GlobalWorkSize[i];
  }
 
+ size_t MaxBlockDim[3];
+ MaxBlockDim[0] = MaxThreadsPerBlock[0];
  MaxBlockDim[1] = Device->getMaxBlockDimY();
  MaxBlockDim[2] = Device->getMaxBlockDimZ();
 
- UR_CHECK_ERROR(
- cuOccupancyMaxPotentialBlockSize(&MinGrid, &MaxBlockSize, Kernel->get(),
- NULL, LocalSize, MaxThreadsPerBlock[0]));
-
- ThreadsPerBlock[2] = std::min(GlobalSizeNormalized[2], MaxBlockDim[2]);
- ThreadsPerBlock[1] =
- std::min(GlobalSizeNormalized[1],
- std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
- MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]);
- ThreadsPerBlock[0] = std::min(
- MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0]));
-
- static auto IsPowerOf2 = [](size_t Value) -> bool {
- return Value && !(Value & (Value - 1));
- };
-
- // Find a local work group size that is a divisor of the global
- // work group size to produce uniform work groups.
- // Additionally, for best compute utilisation, the local size has
- // to be a power of two.
- while (0u != (GlobalSizeNormalized[0] % ThreadsPerBlock[0]) ||
- !IsPowerOf2(ThreadsPerBlock[0])) {
- --ThreadsPerBlock[0];
- }
+ int MinGrid, MaxBlockSize;
+ UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
+ &MinGrid, &MaxBlockSize, Kernel->get(), NULL, Kernel->getLocalSize(),
+ MaxThreadsPerBlock[0]));
+
+ roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
+ MaxBlockDim, MaxBlockSize);
 }
 
 // Helper to verify out-of-registers case (exceeded block max registers).
@@ -261,7 +245,7 @@ setKernelParams(const ur_context_handle_t Context,
  }
  } else {
  guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
- MaxThreadsPerBlock, Kernel, LocalSize);
+ MaxThreadsPerBlock, Kernel);
  }
  }
 

diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp
@@ -16,6 +16,8 @@
 #include "memory.hpp"
 #include "queue.hpp"
 
+#include <ur/ur.hpp>
+
 extern size_t imageElementByteSize(hipArray_Format ArrayFormat);
 
 ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
@@ -48,23 +50,29 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
  }
 }
 
-void simpleGuessLocalWorkSize(size_t *ThreadsPerBlock,
- const size_t *GlobalWorkSize,
- const size_t MaxThreadsPerBlock[3],
- ur_kernel_handle_t Kernel) {
+// Determine local work sizes that result in uniform work groups.
+// The default threadsPerBlock only requires handling the first work_dim
+// dimensions.
+void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
+ const size_t *GlobalWorkSize, const uint32_t WorkDim,
+ const size_t MaxThreadsPerBlock[3]) {
  assert(ThreadsPerBlock != nullptr);
  assert(GlobalWorkSize != nullptr);
- assert(Kernel != nullptr);
 
- std::ignore = Kernel;
+ // FIXME: The below assumes a three dimensional range but this is not
+ // guaranteed by UR.
+ size_t GlobalSizeNormalized[3] = {1, 1, 1};
+ for (uint32_t i = 0; i < WorkDim; i++) {
+ GlobalSizeNormalized[i] = GlobalWorkSize[i];
+ }
 
- ThreadsPerBlock[0] = std::min(MaxThreadsPerBlock[0], GlobalWorkSize[0]);
+ size_t MaxBlockDim[3];
+ MaxBlockDim[0] = MaxThreadsPerBlock[0];
+ MaxBlockDim[1] = Device->getMaxBlockDimY();
+ MaxBlockDim[2] = Device->getMaxBlockDimZ();
 
- // Find a local work group size that is a divisor of the global
- // work group size to produce uniform work groups.
- while (GlobalWorkSize[0] % ThreadsPerBlock[0]) {
- --ThreadsPerBlock[0];
- }
+ roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
+ MaxBlockDim, MaxThreadsPerBlock[0]);
 }
 
 namespace {
@@ -1786,8 +1794,8 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim,
  return err;
  }
  } else {
- simpleGuessLocalWorkSize(ThreadsPerBlock, GlobalWorkSize,
-  MaxThreadsPerBlock, Kernel);
+ guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
+ MaxThreadsPerBlock);
  }
  }
 

diff --git a/source/ur/ur.hpp b/source/ur/ur.hpp
@@ -321,3 +321,56 @@ template <typename T> class Result {
 private:
  std::variant<ur_result_t, T> value_or_err;
 };
+
+// Helper to make a single dim's workgroup size divide its global size.
+//
+// In/Out: ThreadsPerBlockInDim - The size of the workgroup in some dimension
+// In: GlobalWorkSizeInDim - The global size in some dimension
+static inline void
+roundToHighestFactorOfGlobalSize(size_t &ThreadsPerBlockInDim,
+ const size_t GlobalWorkSizeInDim) {
+ while (ThreadsPerBlockInDim > 1 &&
+ GlobalWorkSizeInDim % ThreadsPerBlockInDim) {
+ --ThreadsPerBlockInDim;
+ }
+}
+
+// Returns whether or not Value is a power of 2
+template <typename T> inline bool isPowerOf2(const T &Value) {
+ return Value && !(Value & (Value - 1));
+}
+
+// Helper to make sure each x, y, z dim divides the global dimension.
+// Additionally, if the inner dimension is greater than 32 it is a power of 2.
+//
+// In/Out: ThreadsPerBlock - The size of wg in 3d
+// In: GlobalSize - The global size in 3d (if dim < 3 then outer
+// dims == 1)
+// In: MaxBlockDim - The max size of block in 3d
+// In: MaxBlockSize - The max total size of block in all dimensions
+// Note: there is no WorkDim parameter; all three dims are always processed.
+static inline void roundToHighestFactorOfGlobalSizeIn3d(
+ size_t *ThreadsPerBlock, const size_t *GlobalSize,
+ const size_t *MaxBlockDim, const size_t MaxBlockSize) {
+ assert(GlobalSize[0] && "GlobalSize[0] cannot be zero");
+ assert(GlobalSize[1] && "GlobalSize[1] cannot be zero");
+ assert(GlobalSize[2] && "GlobalSize[2] cannot be zero");
+
+ ThreadsPerBlock[0] =
+ std::min(GlobalSize[0], std::min(MaxBlockSize, MaxBlockDim[0]));
+ do {
+ roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalSize[0]);
+ } while (!isPowerOf2(ThreadsPerBlock[0]) && ThreadsPerBlock[0] > 32 &&
+ --ThreadsPerBlock[0]);
+
+ ThreadsPerBlock[1] =
+ std::min(GlobalSize[1],
+ std::min(MaxBlockSize / ThreadsPerBlock[0], MaxBlockDim[1]));
+ roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalSize[1]);
+
+ ThreadsPerBlock[2] = std::min(
+ GlobalSize[2],
+ std::min(MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[0]),
+ MaxBlockDim[2]));
+ roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSize[2]);
+}
diff --git a/test/conformance/enqueue/urEnqueueKernelLaunch.cpp b/test/conformance/enqueue/urEnqueueKernelLaunch.cpp
@@ -77,53 +77,93 @@ TEST_P(urEnqueueKernelLaunchTest, InvalidWorkDimension) {
  UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
 }
 
-struct urEnqueueKernelLaunch2DTest : uur::urKernelExecutionTest {
- void SetUp() override {
- program_name = "fill_2d";
- UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
- }
-
- uint32_t val = 42;
- size_t global_size[2] = {8, 8};
- size_t global_offset[2] = {0, 0};
- size_t buffer_size = sizeof(val) * global_size[0] * global_size[1];
- size_t n_dimensions = 2;
+struct testParametersEnqueueKernel {
+ size_t X, Y, Z;
+ size_t Dims;
 };
-UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunch2DTest);
 
-TEST_P(urEnqueueKernelLaunch2DTest, Success) {
- ur_mem_handle_t buffer = nullptr;
- AddBuffer1DArg(buffer_size, &buffer);
- AddPodArg(val);
- ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
- global_offset, global_size, nullptr, 0,
- nullptr, nullptr));
- ASSERT_SUCCESS(urQueueFinish(queue));
- ValidateBuffer(buffer, buffer_size, val);
+template <typename T>
+inline std::string printKernelLaunchTestString(
+ const testing::TestParamInfo<typename T::ParamType> &info) {
+ const auto device_handle = std::get<0>(info.param);
+ const auto platform_device_name =
+ uur::GetPlatformAndDeviceName(device_handle);
+ std::stringstream test_name;
+ test_name << platform_device_name << "__" << std::get<1>(info.param).Dims
+ << "D_" << std::get<1>(info.param).X;
+ if (std::get<1>(info.param).Dims > 1) {
+ test_name << "_" << std::get<1>(info.param).Y;
+ }
+ if (std::get<1>(info.param).Dims > 2) {
+ test_name << "_" << std::get<1>(info.param).Z;
+ }
+ test_name << "";
+ return test_name.str();
 }
 
-struct urEnqueueKernelLaunch3DTest : uur::urKernelExecutionTest {
+struct urEnqueueKernelLaunchTestWithParam
+ : uur::urBaseKernelExecutionTestWithParam<testParametersEnqueueKernel> {
  void SetUp() override {
- program_name = "fill_3d";
- UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
+ global_range[0] = std::get<1>(GetParam()).X;
+ global_range[1] = std::get<1>(GetParam()).Y;
+ global_range[2] = std::get<1>(GetParam()).Z;
+ buffer_size = sizeof(val) * global_range[0];
+ n_dimensions = std::get<1>(GetParam()).Dims;
+ if (n_dimensions == 1) {
+ program_name = "fill";
+ } else if (n_dimensions == 2) {
+ program_name = "fill_2d";
+ buffer_size *= global_range[1];
+ } else {
+ assert(n_dimensions == 3);
+ program_name = "fill_3d";
+ buffer_size *= global_range[1] * global_range[2];
+ }
+ UUR_RETURN_ON_FATAL_FAILURE(
+ urBaseKernelExecutionTestWithParam::SetUp());
+ }
+
+ void TearDown() override {
+ UUR_RETURN_ON_FATAL_FAILURE(uur::urBaseKernelExecutionTestWithParam<
+ testParametersEnqueueKernel>::TearDown());
  }
 
  uint32_t val = 42;
- size_t global_size[3] = {4, 4, 4};
+ size_t global_range[3];
  size_t global_offset[3] = {0, 0, 0};
- size_t buffer_size =
- sizeof(val) * global_size[0] * global_size[1] * global_size[2];
- size_t n_dimensions = 3;
+ size_t n_dimensions;
+ size_t buffer_size;
 };
-UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunch3DTest);
 
-TEST_P(urEnqueueKernelLaunch3DTest, Success) {
+static std::vector<testParametersEnqueueKernel> test_cases{// 1D
+ {1, 1, 1, 1},
+ {31, 1, 1, 1},
+ {1027, 1, 1, 1},
+ {32, 1, 1, 1},
+ {256, 1, 1, 1},
+ // 2D
+ {1, 1, 1, 2},
+ {31, 7, 1, 2},
+ {1027, 1, 1, 2},
+ {1, 32, 1, 2},
+ {256, 79, 1, 2},
+ // 3D
+ {1, 1, 1, 3},
+ {31, 7, 1, 3},
+ {1027, 1, 19, 3},
+ {1, 53, 19, 3},
+ {256, 79, 8, 3}};
+UUR_TEST_SUITE_P(
+ urEnqueueKernelLaunchTestWithParam, testing::ValuesIn(test_cases),
+ printKernelLaunchTestString<urEnqueueKernelLaunchTestWithParam>);
+
+TEST_P(urEnqueueKernelLaunchTestWithParam, Success) {
  ur_mem_handle_t buffer = nullptr;
  AddBuffer1DArg(buffer_size, &buffer);
  AddPodArg(val);
  ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
- global_offset, global_size, nullptr, 0,
- nullptr, nullptr));
+ global_offset, global_range, nullptr,
+ 0, nullptr, nullptr));
  ASSERT_SUCCESS(urQueueFinish(queue));
  ValidateBuffer(buffer, buffer_size, val);
 }