Skip to content

Commit

Permalink
Move some shared code to ur/ur.hpp
Browse files Browse the repository at this point in the history
Moves code to ur/ur.hpp that is used by both CUDA/HIP adapters. Perhaps there is some
better place to put this. Also replaces the use of lambdas with free
functions.
  • Loading branch information
hdelan committed Feb 20, 2024
1 parent 96c44da commit b430b82
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 44 deletions.
33 changes: 12 additions & 21 deletions source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <cmath>
#include <cuda.h>
#include <ur/ur.hpp>

ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
uint32_t NumEventsInWaitList,
Expand Down Expand Up @@ -144,8 +145,6 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
assert(ThreadsPerBlock != nullptr);
assert(GlobalWorkSize != nullptr);
assert(Kernel != nullptr);
int MinGrid, MaxBlockSize;
size_t MaxBlockDim[3];

// The below assumes a three dimensional range but this is not guaranteed by
// UR.
Expand All @@ -154,39 +153,31 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
GlobalSizeNormalized[i] = GlobalWorkSize[i];
}

size_t MaxBlockDim[3];
MaxBlockDim[0] = MaxThreadsPerBlock[0];
MaxBlockDim[1] = Device->getMaxBlockDimY();
MaxBlockDim[2] = Device->getMaxBlockDimZ();

int MinGrid, MaxBlockSize;
UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
&MinGrid, &MaxBlockSize, Kernel->get(), NULL, Kernel->getLocalSize(),
MaxThreadsPerBlock[0]));

// Helper lambda to make sure each x, y, z dim divide the global dimension.
// Can optionally specify that we want the wg size to be a power of 2 in a
// given dimension, which is useful for the X dim for performance reasons.
static auto roundToHighestFactorOfGlobalSize =
[](size_t &ThreadsPerBlockInDim, const size_t GlobalWorkSizeInDim,
bool MakePowerOfTwo) {
auto IsPowerOf2 = [](size_t Value) -> bool {
return Value && !(Value & (Value - 1));
};
while (GlobalWorkSizeInDim % ThreadsPerBlockInDim ||
(MakePowerOfTwo && !IsPowerOf2(ThreadsPerBlockInDim)))
--ThreadsPerBlockInDim;
};

ThreadsPerBlock[2] = std::min(GlobalSizeNormalized[2], MaxBlockDim[2]);
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalWorkSize[2],
false);
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalWorkSize[2]);

ThreadsPerBlock[1] =
std::min(GlobalSizeNormalized[1],
std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalWorkSize[1],
false);
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalWorkSize[1]);

MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]);
ThreadsPerBlock[0] = std::min(
MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0]));
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalWorkSize[0], true);
// Make the X dim a power of 2
do {
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalWorkSize[0]);
} while (!isPowerOf2(ThreadsPerBlock[0]));
}

// Helper to verify out-of-registers case (exceeded block max registers).
Expand Down
37 changes: 14 additions & 23 deletions source/adapters/hip/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
#include "memory.hpp"
#include "queue.hpp"

#include <ur/ur.hpp>

extern size_t imageElementByteSize(hipArray_Format ArrayFormat);

namespace {
Expand Down Expand Up @@ -59,49 +61,38 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
assert(ThreadsPerBlock != nullptr);
assert(GlobalWorkSize != nullptr);
assert(Kernel != nullptr);
int MinGrid, MaxBlockSize;
size_t MaxBlockDim[3];

// The below assumes a three dimensional range but this is not guaranteed by
// UR.
// FIXME: The below assumes a three dimensional range but this is not
// guaranteed by UR.
size_t GlobalSizeNormalized[3] = {1, 1, 1};
for (uint32_t i = 0; i < WorkDim; i++) {
GlobalSizeNormalized[i] = GlobalWorkSize[i];
}

size_t MaxBlockDim[3];
MaxBlockDim[1] = Device->getMaxBlockDimY();
MaxBlockDim[2] = Device->getMaxBlockDimZ();

int MinGrid, MaxBlockSize;
UR_CHECK_ERROR(hipOccupancyMaxPotentialBlockSize(
&MinGrid, &MaxBlockSize, Kernel->get(), Kernel->getLocalSize(),
MaxThreadsPerBlock[0]));

// Helper lambda to make sure each x, y, z dim divide the global dimension.
// Can optionally specify that we want the wg size to be a power of 2 in a
// given dimension, which is useful for the X dim for performance reasons.
static auto roundToHighestFactorOfGlobalSize =
[](size_t &ThreadsPerBlockInDim, const size_t GlobalWorkSizeInDim,
bool MakePowerOfTwo) {
auto IsPowerOf2 = [](size_t Value) -> bool {
return Value && !(Value & (Value - 1));
};
while (GlobalWorkSizeInDim % ThreadsPerBlockInDim ||
(MakePowerOfTwo && !IsPowerOf2(ThreadsPerBlockInDim)))
--ThreadsPerBlockInDim;
};

ThreadsPerBlock[2] = std::min(GlobalSizeNormalized[2], MaxBlockDim[2]);
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalWorkSize[2],
false);
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalWorkSize[2]);

ThreadsPerBlock[1] =
std::min(GlobalSizeNormalized[1],
std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalWorkSize[1],
false);
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalWorkSize[1]);

MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]);
ThreadsPerBlock[0] = std::min(
MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0]));
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalWorkSize[0], true);
// Make the X dim a power of 2
do {
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalWorkSize[0]);
} while (!IsPowerOf2(ThreadsPerBlock[0]));
}

ur_result_t setHipMemAdvise(const void *DevPtr, const size_t Size,
Expand Down
17 changes: 17 additions & 0 deletions source/ur/ur.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,3 +321,20 @@ template <typename T> class Result {
private:
std::variant<ur_result_t, T> value_or_err;
};

// Helper to make sure each x, y, z dim divide the global dimension.
//
// Decrements ThreadsPerBlockInDim until it evenly divides GlobalWorkSizeInDim,
// i.e. it ends up as the largest divisor of GlobalWorkSizeInDim that does not
// exceed the value passed in. Since 1 divides everything, the search always
// terminates for a non-zero starting value.
//
// In/Out: ThreadsPerBlockInDim - The dimension of workgroup in some dimension.
//         A value of 0 is left untouched: there is nothing to round, and
//         evaluating `x % 0` would be undefined behaviour.
// In: GlobalWorkSizeInDim - The global size in some dimension
static inline void
roundToHighestFactorOfGlobalSize(size_t &ThreadsPerBlockInDim,
                                 const size_t GlobalWorkSizeInDim) {
  // The `> 1` guard both short-circuits the trivial case and protects the
  // modulo below from a zero divisor.
  while (ThreadsPerBlockInDim > 1 &&
         GlobalWorkSizeInDim % ThreadsPerBlockInDim != 0) {
    --ThreadsPerBlockInDim;
  }
}

// Returns whether or not Value is a power of 2.
//
// Zero and negative values are never powers of two. Rejecting them up front
// also keeps `Value - 1` from overflowing when T is a signed type holding its
// minimum value, which would otherwise be undefined behaviour.
//
// In: Value - The integral value to test
template <typename T> constexpr bool isPowerOf2(const T &Value) {
  // A positive power of two has exactly one bit set, so clearing the lowest
  // set bit (Value & (Value - 1)) must leave nothing behind.
  return Value > 0 && (Value & (Value - 1)) == 0;
}

0 comments on commit b430b82

Please sign in to comment.