Skip to content

Commit

Permalink
Always make the inner dimension a power of 2
Browse files Browse the repository at this point in the history
The inner dimension may be in the [0], [1] or [2] index depending
on the dimensionality of the global range. This makes the inner
dimension a power of 2 whenever it is larger than 32.
  • Loading branch information
hdelan committed Mar 13, 2024
1 parent 7a5a4cf commit 4d7e829
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 9 deletions.
2 changes: 1 addition & 1 deletion source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
MaxThreadsPerBlock[0]));

roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
MaxBlockDim, MaxBlockSize);
MaxBlockDim, MaxBlockSize, WorkDim);
}

// Helper to verify out-of-registers case (exceeded block max registers).
Expand Down
3 changes: 2 additions & 1 deletion source/adapters/hip/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
MaxBlockDim[2] = Device->getMaxBlockDimZ();

roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
MaxBlockDim, MaxThreadsPerBlock[0]);
MaxBlockDim, MaxThreadsPerBlock[0],
WorkDim);
}

namespace {
Expand Down
23 changes: 16 additions & 7 deletions source/ur/ur.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -347,23 +347,32 @@ template <typename T> inline bool isPowerOf2(const T &Value) {
// dims == 1)
// In: MaxBlockDim - The max size of block in 3d
// In: MaxBlockSize - The max total size of block in all dimensions
// In: WorkDim - The workdim (1, 2 or 3)
// NOTE(review): this listing is a rendered diff, so several statements below
// appear twice: once in their earlier form and once in the form that takes
// WorkDim. Read the WorkDim form as the current one.
//
// Fills ThreadsPerBlock[0], [1] and [2] in that order. Each later entry is
// capped by what the earlier entries leave of MaxBlockSize.
static inline void roundToHighestFactorOfGlobalSizeIn3d(
size_t *ThreadsPerBlock, const size_t *GlobalSize,
const size_t *MaxBlockDim, const size_t MaxBlockSize) {
const size_t *MaxBlockDim, const size_t MaxBlockSize,
const size_t WorkDim) {
// Start index 0 at the smaller of the global size and the per-index cap.
ThreadsPerBlock[0] = std::min(GlobalSize[0], MaxBlockDim[0]);
// Make the X dim a power of 2 (the loop condition tests isPowerOf2).
// Each pass calls roundToHighestFactorOfGlobalSize (presumably rounds the
// candidate down to a divisor of GlobalSize[0] -- confirm against its
// definition). While the result is above 32 and not a power of 2, the
// candidate is decremented and rounded again; the trailing decrement also
// ends the loop if the candidate reaches 0.
do {
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalSize[0]);
} while (!isPowerOf2(ThreadsPerBlock[0]) && ThreadsPerBlock[0] > 32 &&
--ThreadsPerBlock[0]);
// WorkDim form: the power-of-2 retry applies to index 0 only when
// WorkDim == 3; for other values the rounding runs exactly once.
} while (WorkDim == 3 && !isPowerOf2(ThreadsPerBlock[0]) &&
ThreadsPerBlock[0] > 32 && --ThreadsPerBlock[0]);

// Index 1 is capped by its global size, by its own per-index cap, and by
// the share of MaxBlockSize left after index 0.
ThreadsPerBlock[1] =
std::min(GlobalSize[1],
std::min(MaxBlockSize / ThreadsPerBlock[0], MaxBlockDim[1]));
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalSize[1]);
// WorkDim form: same retry loop as index 0, active only when WorkDim == 2.
do {
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalSize[1]);
} while (WorkDim == 2 && !isPowerOf2(ThreadsPerBlock[1]) &&
ThreadsPerBlock[1] > 32 && --ThreadsPerBlock[1]);

// Index 2 is capped by its global size and by the share of MaxBlockSize
// left after indices 0 and 1.
ThreadsPerBlock[2] = std::min(
GlobalSize[2], MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[0]));
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSize[2]);

// WorkDim form: index 2 is additionally capped by MaxBlockDim[2].
GlobalSize[2],
std::min(MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[0]),
MaxBlockDim[2]));
// Same retry loop as above, active only when WorkDim == 1.
do {
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSize[2]);
} while (WorkDim == 1 && !isPowerOf2(ThreadsPerBlock[2]) &&
ThreadsPerBlock[2] > 32 && --ThreadsPerBlock[2]);
}

0 comments on commit 4d7e829

Please sign in to comment.