Skip to content

Commit

Permalink
Merge pull request #1326 from hdelan/refactor-guess-local-worksize
Browse files Browse the repository at this point in the history
[CUDA][HIP] Fix bug in guess local worksize funcs and improve local worksize guessing in HIP adapter
  • Loading branch information
kbenzie committed Mar 19, 2024
2 parents ca5c342 + 69c43b4 commit ed1f8bf
Show file tree
Hide file tree
Showing 5 changed files with 262 additions and 74 deletions.
40 changes: 12 additions & 28 deletions source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

#include <cmath>
#include <cuda.h>
#include <ur/ur.hpp>

ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
uint32_t NumEventsInWaitList,
Expand Down Expand Up @@ -140,12 +141,10 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
const size_t *GlobalWorkSize, const uint32_t WorkDim,
const size_t MaxThreadsPerBlock[3],
ur_kernel_handle_t Kernel, uint32_t LocalSize) {
ur_kernel_handle_t Kernel) {
assert(ThreadsPerBlock != nullptr);
assert(GlobalWorkSize != nullptr);
assert(Kernel != nullptr);
int MinGrid, MaxBlockSize;
size_t MaxBlockDim[3];

// The below assumes a three dimensional range but this is not guaranteed by
// UR.
Expand All @@ -154,33 +153,18 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
GlobalSizeNormalized[i] = GlobalWorkSize[i];
}

size_t MaxBlockDim[3];
MaxBlockDim[0] = MaxThreadsPerBlock[0];
MaxBlockDim[1] = Device->getMaxBlockDimY();
MaxBlockDim[2] = Device->getMaxBlockDimZ();

UR_CHECK_ERROR(
cuOccupancyMaxPotentialBlockSize(&MinGrid, &MaxBlockSize, Kernel->get(),
NULL, LocalSize, MaxThreadsPerBlock[0]));

ThreadsPerBlock[2] = std::min(GlobalSizeNormalized[2], MaxBlockDim[2]);
ThreadsPerBlock[1] =
std::min(GlobalSizeNormalized[1],
std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]);
ThreadsPerBlock[0] = std::min(
MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0]));

static auto IsPowerOf2 = [](size_t Value) -> bool {
return Value && !(Value & (Value - 1));
};

// Find a local work group size that is a divisor of the global
// work group size to produce uniform work groups.
// Additionally, for best compute utilisation, the local size has
// to be a power of two.
while (0u != (GlobalSizeNormalized[0] % ThreadsPerBlock[0]) ||
!IsPowerOf2(ThreadsPerBlock[0])) {
--ThreadsPerBlock[0];
}
int MinGrid, MaxBlockSize;
UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
&MinGrid, &MaxBlockSize, Kernel->get(), NULL, Kernel->getLocalSize(),
MaxThreadsPerBlock[0]));

roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
MaxBlockDim, MaxBlockSize);
}

// Helper to verify out-of-registers case (exceeded block max registers).
Expand Down Expand Up @@ -261,7 +245,7 @@ setKernelParams(const ur_context_handle_t Context,
}
} else {
guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
MaxThreadsPerBlock, Kernel, LocalSize);
MaxThreadsPerBlock, Kernel);
}
}

Expand Down
36 changes: 22 additions & 14 deletions source/adapters/hip/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#include "memory.hpp"
#include "queue.hpp"

#include <ur/ur.hpp>

extern size_t imageElementByteSize(hipArray_Format ArrayFormat);

ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
Expand Down Expand Up @@ -48,23 +50,29 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
}
}

void simpleGuessLocalWorkSize(size_t *ThreadsPerBlock,
const size_t *GlobalWorkSize,
const size_t MaxThreadsPerBlock[3],
ur_kernel_handle_t Kernel) {
// Determine local work sizes that result in uniform work groups.
// The default threadsPerBlock only require handling the first work_dim
// dimension.
void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
const size_t *GlobalWorkSize, const uint32_t WorkDim,
const size_t MaxThreadsPerBlock[3]) {
assert(ThreadsPerBlock != nullptr);
assert(GlobalWorkSize != nullptr);
assert(Kernel != nullptr);

std::ignore = Kernel;
// FIXME: The below assumes a three dimensional range but this is not
// guaranteed by UR.
size_t GlobalSizeNormalized[3] = {1, 1, 1};
for (uint32_t i = 0; i < WorkDim; i++) {
GlobalSizeNormalized[i] = GlobalWorkSize[i];
}

ThreadsPerBlock[0] = std::min(MaxThreadsPerBlock[0], GlobalWorkSize[0]);
size_t MaxBlockDim[3];
MaxBlockDim[0] = MaxThreadsPerBlock[0];
MaxBlockDim[1] = Device->getMaxBlockDimY();
MaxBlockDim[2] = Device->getMaxBlockDimZ();

// Find a local work group size that is a divisor of the global
// work group size to produce uniform work groups.
while (GlobalWorkSize[0] % ThreadsPerBlock[0]) {
--ThreadsPerBlock[0];
}
roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
MaxBlockDim, MaxThreadsPerBlock[0]);
}

namespace {
Expand Down Expand Up @@ -1786,8 +1794,8 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim,
return err;
}
} else {
simpleGuessLocalWorkSize(ThreadsPerBlock, GlobalWorkSize,
MaxThreadsPerBlock, Kernel);
guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
MaxThreadsPerBlock);
}
}

Expand Down
53 changes: 53 additions & 0 deletions source/ur/ur.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -321,3 +321,56 @@ template <typename T> class Result {
private:
std::variant<ur_result_t, T> value_or_err;
};

// Helper to make sure each x, y, z dim divide the global dimension.
//
// In/Out: ThreadsPerBlockInDim - The dimension of workgroup in some dimension
// In: GlobalWorkSizeInDim - The global size in some dimension
static inline void
roundToHighestFactorOfGlobalSize(size_t &ThreadsPerBlockInDim,
const size_t GlobalWorkSizeInDim) {
while (ThreadsPerBlockInDim > 1 &&
GlobalWorkSizeInDim % ThreadsPerBlockInDim) {
--ThreadsPerBlockInDim;
}
}

// Returns whether or not Value is a power of 2
template <typename T> inline bool isPowerOf2(const T &Value) {
return Value && !(Value & (Value - 1));
}

// Helper to make sure each x, y, z dim divide the global dimension.
// Additionally it makes sure that the inner dimension always is a power of 2
//
// In/Out: ThreadsPerBlock - The size of wg in 3d
// In: GlobalSize - The global size in 3d (if dim < 3 then outer
// dims == 1)
// In: MaxBlockDim - The max size of block in 3d
// In: MaxBlockSize - The max total size of block in all dimensions
// In: WorkDim - The workdim (1, 2 or 3)
static inline void roundToHighestFactorOfGlobalSizeIn3d(
size_t *ThreadsPerBlock, const size_t *GlobalSize,
const size_t *MaxBlockDim, const size_t MaxBlockSize) {
assert(GlobalSize[0] && "GlobalSize[0] cannot be zero");
assert(GlobalSize[1] && "GlobalSize[1] cannot be zero");
assert(GlobalSize[2] && "GlobalSize[2] cannot be zero");

ThreadsPerBlock[0] =
std::min(GlobalSize[0], std::min(MaxBlockSize, MaxBlockDim[0]));
do {
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalSize[0]);
} while (!isPowerOf2(ThreadsPerBlock[0]) && ThreadsPerBlock[0] > 32 &&
--ThreadsPerBlock[0]);

ThreadsPerBlock[1] =
std::min(GlobalSize[1],
std::min(MaxBlockSize / ThreadsPerBlock[0], MaxBlockDim[1]));
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalSize[1]);

ThreadsPerBlock[2] = std::min(
GlobalSize[2],
std::min(MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[0]),
MaxBlockDim[2]));
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSize[2]);
}
104 changes: 72 additions & 32 deletions test/conformance/enqueue/urEnqueueKernelLaunch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,53 +77,93 @@ TEST_P(urEnqueueKernelLaunchTest, InvalidWorkDimension) {
UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
}

struct urEnqueueKernelLaunch2DTest : uur::urKernelExecutionTest {
void SetUp() override {
program_name = "fill_2d";
UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
}

uint32_t val = 42;
size_t global_size[2] = {8, 8};
size_t global_offset[2] = {0, 0};
size_t buffer_size = sizeof(val) * global_size[0] * global_size[1];
size_t n_dimensions = 2;
struct testParametersEnqueueKernel {
size_t X, Y, Z;
size_t Dims;
};
UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunch2DTest);

TEST_P(urEnqueueKernelLaunch2DTest, Success) {
ur_mem_handle_t buffer = nullptr;
AddBuffer1DArg(buffer_size, &buffer);
AddPodArg(val);
ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
global_offset, global_size, nullptr, 0,
nullptr, nullptr));
ASSERT_SUCCESS(urQueueFinish(queue));
ValidateBuffer(buffer, buffer_size, val);
template <typename T>
inline std::string printKernelLaunchTestString(
const testing::TestParamInfo<typename T::ParamType> &info) {
const auto device_handle = std::get<0>(info.param);
const auto platform_device_name =
uur::GetPlatformAndDeviceName(device_handle);
std::stringstream test_name;
test_name << platform_device_name << "__" << std::get<1>(info.param).Dims
<< "D_" << std::get<1>(info.param).X;
if (std::get<1>(info.param).Dims > 1) {
test_name << "_" << std::get<1>(info.param).Y;
}
if (std::get<1>(info.param).Dims > 2) {
test_name << "_" << std::get<1>(info.param).Z;
}
test_name << "";
return test_name.str();
}

struct urEnqueueKernelLaunch3DTest : uur::urKernelExecutionTest {
struct urEnqueueKernelLaunchTestWithParam
: uur::urBaseKernelExecutionTestWithParam<testParametersEnqueueKernel> {
void SetUp() override {
program_name = "fill_3d";
UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
global_range[0] = std::get<1>(GetParam()).X;
global_range[1] = std::get<1>(GetParam()).Y;
global_range[2] = std::get<1>(GetParam()).Z;
buffer_size = sizeof(val) * global_range[0];
n_dimensions = std::get<1>(GetParam()).Dims;
if (n_dimensions == 1) {
program_name = "fill";
} else if (n_dimensions == 2) {
program_name = "fill_2d";
buffer_size *= global_range[1];
} else {
assert(n_dimensions == 3);
program_name = "fill_3d";
buffer_size *= global_range[1] * global_range[2];
}
UUR_RETURN_ON_FATAL_FAILURE(
urBaseKernelExecutionTestWithParam::SetUp());
}

void TearDown() override {
UUR_RETURN_ON_FATAL_FAILURE(uur::urBaseKernelExecutionTestWithParam<
testParametersEnqueueKernel>::TearDown());
}

uint32_t val = 42;
size_t global_size[3] = {4, 4, 4};
size_t global_range[3];
size_t global_offset[3] = {0, 0, 0};
size_t buffer_size =
sizeof(val) * global_size[0] * global_size[1] * global_size[2];
size_t n_dimensions = 3;
size_t n_dimensions;
size_t buffer_size;
};
UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunch3DTest);

TEST_P(urEnqueueKernelLaunch3DTest, Success) {
static std::vector<testParametersEnqueueKernel> test_cases{// 1D
{1, 1, 1, 1},
{31, 1, 1, 1},
{1027, 1, 1, 1},
{32, 1, 1, 1},
{256, 1, 1, 1},
// 2D
{1, 1, 1, 2},
{31, 7, 1, 2},
{1027, 1, 1, 2},
{1, 32, 1, 2},
{256, 79, 1, 2},
// 3D
{1, 1, 1, 3},
{31, 7, 1, 3},
{1027, 1, 19, 3},
{1, 53, 19, 3},
{256, 79, 8, 3}};
UUR_TEST_SUITE_P(
urEnqueueKernelLaunchTestWithParam, testing::ValuesIn(test_cases),
printKernelLaunchTestString<urEnqueueKernelLaunchTestWithParam>);

TEST_P(urEnqueueKernelLaunchTestWithParam, Success) {
ur_mem_handle_t buffer = nullptr;
AddBuffer1DArg(buffer_size, &buffer);
AddPodArg(val);
ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
global_offset, global_size, nullptr, 0,
nullptr, nullptr));
global_offset, global_range, nullptr,
0, nullptr, nullptr));
ASSERT_SUCCESS(urQueueFinish(queue));
ValidateBuffer(buffer, buffer_size, val);
}
Expand Down
Loading

0 comments on commit ed1f8bf

Please sign in to comment.