Skip to content

Commit

Permalink
Fix bug in CUDA range calculation
Browse files Browse the repository at this point in the history
A bug in the CUDA adapter was sometimes generating Y and Z ranges that did not divide the
global Y or Z dimension. This fixes that.

Also moves some helper functions into ur/ur.hpp so that they can be reused by other adapters.
  • Loading branch information
hdelan committed Feb 23, 2024
1 parent 588615e commit bed33ec
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 28 deletions.
40 changes: 12 additions & 28 deletions source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <cmath>
#include <cuda.h>
#include <ur/ur.hpp>

ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
uint32_t NumEventsInWaitList,
Expand Down Expand Up @@ -140,12 +141,10 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
const size_t *GlobalWorkSize, const uint32_t WorkDim,
const size_t MaxThreadsPerBlock[3],
ur_kernel_handle_t Kernel, uint32_t LocalSize) {
ur_kernel_handle_t Kernel) {
assert(ThreadsPerBlock != nullptr);
assert(GlobalWorkSize != nullptr);
assert(Kernel != nullptr);
int MinGrid, MaxBlockSize;
size_t MaxBlockDim[3];

// The below assumes a three dimensional range but this is not guaranteed by
// UR.
Expand All @@ -154,33 +153,18 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
GlobalSizeNormalized[i] = GlobalWorkSize[i];
}

size_t MaxBlockDim[3];
MaxBlockDim[0] = MaxThreadsPerBlock[0];
MaxBlockDim[1] = Device->getMaxBlockDimY();
MaxBlockDim[2] = Device->getMaxBlockDimZ();

UR_CHECK_ERROR(
cuOccupancyMaxPotentialBlockSize(&MinGrid, &MaxBlockSize, Kernel->get(),
NULL, LocalSize, MaxThreadsPerBlock[0]));

ThreadsPerBlock[2] = std::min(GlobalSizeNormalized[2], MaxBlockDim[2]);
ThreadsPerBlock[1] =
std::min(GlobalSizeNormalized[1],
std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]);
ThreadsPerBlock[0] = std::min(
MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0]));

static auto IsPowerOf2 = [](size_t Value) -> bool {
return Value && !(Value & (Value - 1));
};

// Find a local work group size that is a divisor of the global
// work group size to produce uniform work groups.
// Additionally, for best compute utilisation, the local size has
// to be a power of two.
while (0u != (GlobalSizeNormalized[0] % ThreadsPerBlock[0]) ||
!IsPowerOf2(ThreadsPerBlock[0])) {
--ThreadsPerBlock[0];
}
int MinGrid, MaxBlockSize;
UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
&MinGrid, &MaxBlockSize, Kernel->get(), NULL, Kernel->getLocalSize(),
MaxThreadsPerBlock[0]));

roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
MaxBlockDim, MaxBlockSize);
}

// Helper to verify out-of-registers case (exceeded block max registers).
Expand Down Expand Up @@ -261,7 +245,7 @@ setKernelParams(const ur_context_handle_t Context,
}
} else {
guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
MaxThreadsPerBlock, Kernel, LocalSize);
MaxThreadsPerBlock, Kernel);
}
}

Expand Down
43 changes: 43 additions & 0 deletions source/ur/ur.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,3 +321,46 @@ template <typename T> class Result {
private:
std::variant<ur_result_t, T> value_or_err;
};

// Helper that shrinks a work-group extent in a single dimension until it
// evenly divides the global extent in that dimension.
//
// In/Out: ThreadsPerBlockInDim - The size of the work-group in some dimension.
//                                On return it holds the largest value that
//                                does not exceed the input and divides
//                                GlobalWorkSizeInDim. An input of zero is
//                                raised to 1.
// In: GlobalWorkSizeInDim      - The global size in the same dimension
static inline void
roundToHighestFactorOfGlobalSize(size_t &ThreadsPerBlockInDim,
                                 const size_t GlobalWorkSizeInDim) {
  // A zero divisor would make the modulo below undefined behaviour. One is
  // used instead because it divides every global size.
  if (ThreadsPerBlockInDim == 0) {
    ThreadsPerBlockInDim = 1;
    return;
  }

  // The search stops at 1 at the latest, since 1 divides every value.
  while (GlobalWorkSizeInDim % ThreadsPerBlockInDim != 0) {
    --ThreadsPerBlockInDim;
  }
}

// Tells whether Value has exactly one bit set, i.e. is a power of 2.
// Zero is reported as not being a power of 2.
template <typename T> inline bool isPowerOf2(const T &Value) {
  if (!Value) {
    return false;
  }
  // Subtracting one flips the lowest set bit and every bit below it, so the
  // AND is zero only when that lowest set bit was the only one.
  const auto WithoutLowestSetBit = Value & (Value - 1);
  return !WithoutLowestSetBit;
}

// Helper to pick a 3d work-group size in which each of the x, y, z dims
// divides the matching global dimension, and in which the x dim is
// additionally a power of 2.
//
// The dims are chosen in the order z, y, x. Each one is capped by its
// per-dimension limit and by what is left of MaxBlockSize once the dims
// already chosen are accounted for, so the product of the three results does
// not exceed MaxBlockSize whenever MaxBlockSize is at least 1.
//
// A global size or limit of zero is treated as 1 so that no division or
// modulo by zero is ever evaluated.
//
// In/Out: ThreadsPerBlock - The size of wg in 3d (all three entries are
//                           overwritten)
// In: GlobalSize          - The global size in 3d (if dim < 3 then outer
//                           dims == 1)
// In: MaxBlockDim         - The max size of block in 3d
// In: MaxBlockSize        - The max total size of block in all dimensions
static inline void roundToHighestFactorOfGlobalSizeIn3d(
    size_t *ThreadsPerBlock, const size_t *GlobalSize,
    const size_t *MaxBlockDim, const size_t MaxBlockSize) {
  // Z is capped by MaxBlockSize as well as by its own limit. Otherwise
  // MaxBlockSize / ThreadsPerBlock[2] below could be zero and leave no room
  // for the remaining dims.
  ThreadsPerBlock[2] = std::max<size_t>(
      std::min(GlobalSize[2], std::min(MaxBlockSize, MaxBlockDim[2])), 1);
  roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSize[2]);

  ThreadsPerBlock[1] = std::max<size_t>(
      std::min(GlobalSize[1],
               std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1])),
      1);
  roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalSize[1]);

  const size_t RemainingForX =
      MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]);
  ThreadsPerBlock[0] = std::max<size_t>(
      std::min(GlobalSize[0], std::min(RemainingForX, MaxBlockDim[0])), 1);

  // Make the X dim a power of 2 that divides the global X size. The value
  // has to be decremented whenever either condition fails: a divisor that is
  // not a power of 2 (e.g. 6 for a global size of 6) would otherwise be
  // re-tested forever. The loop stops at 1 at the latest, which divides
  // every size and is a power of 2.
  while (GlobalSize[0] % ThreadsPerBlock[0] != 0 ||
         !isPowerOf2(ThreadsPerBlock[0])) {
    --ThreadsPerBlock[0];
  }
}

0 comments on commit bed33ec

Please sign in to comment.