diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index c8ae115df3..e7ee2bf523 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -18,6 +18,7 @@
 #include
 #include
+#include <ur/ur.hpp>

 ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
                               uint32_t NumEventsInWaitList,
@@ -140,12 +141,10 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
 void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
                         const size_t *GlobalWorkSize, const uint32_t WorkDim,
                         const size_t MaxThreadsPerBlock[3],
-                        ur_kernel_handle_t Kernel, uint32_t LocalSize) {
+                        ur_kernel_handle_t Kernel) {
   assert(ThreadsPerBlock != nullptr);
   assert(GlobalWorkSize != nullptr);
   assert(Kernel != nullptr);
-  int MinGrid, MaxBlockSize;
-  size_t MaxBlockDim[3];

   // The below assumes a three dimensional range but this is not guaranteed by
   // UR.
@@ -154,33 +153,18 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
     GlobalSizeNormalized[i] = GlobalWorkSize[i];
   }

+  size_t MaxBlockDim[3];
+  MaxBlockDim[0] = MaxThreadsPerBlock[0];
   MaxBlockDim[1] = Device->getMaxBlockDimY();
   MaxBlockDim[2] = Device->getMaxBlockDimZ();

-  UR_CHECK_ERROR(
-      cuOccupancyMaxPotentialBlockSize(&MinGrid, &MaxBlockSize, Kernel->get(),
-                                       NULL, LocalSize, MaxThreadsPerBlock[0]));
-
-  ThreadsPerBlock[2] = std::min(GlobalSizeNormalized[2], MaxBlockDim[2]);
-  ThreadsPerBlock[1] =
-      std::min(GlobalSizeNormalized[1],
-               std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
-  MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]);
-  ThreadsPerBlock[0] = std::min(
-      MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0]));
-
-  static auto IsPowerOf2 = [](size_t Value) -> bool {
-    return Value && !(Value & (Value - 1));
-  };
-
-  // Find a local work group size that is a divisor of the global
-  // work group size to produce uniform work groups.
-  // Additionally, for best compute utilisation, the local size has
-  // to be a power of two.
-  while (0u != (GlobalSizeNormalized[0] % ThreadsPerBlock[0]) ||
-         !IsPowerOf2(ThreadsPerBlock[0])) {
-    --ThreadsPerBlock[0];
-  }
+  int MinGrid, MaxBlockSize;
+  UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
+      &MinGrid, &MaxBlockSize, Kernel->get(), NULL, Kernel->getLocalSize(),
+      MaxThreadsPerBlock[0]));
+
+  roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
+                                       MaxBlockDim, MaxBlockSize);
 }

 // Helper to verify out-of-registers case (exceeded block max registers).
@@ -261,7 +245,7 @@ setKernelParams(const ur_context_handle_t Context,
       }
     } else {
       guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
-                         MaxThreadsPerBlock, Kernel, LocalSize);
+                         MaxThreadsPerBlock, Kernel);
     }
   }

diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp
index 24ba905688..a5fcb9c26b 100644
--- a/source/adapters/hip/enqueue.cpp
+++ b/source/adapters/hip/enqueue.cpp
@@ -16,6 +16,8 @@
 #include "memory.hpp"
 #include "queue.hpp"

+#include <ur/ur.hpp>
+
 extern size_t imageElementByteSize(hipArray_Format ArrayFormat);

 ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
@@ -48,23 +50,29 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
   }
 }

-void simpleGuessLocalWorkSize(size_t *ThreadsPerBlock,
-                              const size_t *GlobalWorkSize,
-                              const size_t MaxThreadsPerBlock[3],
-                              ur_kernel_handle_t Kernel) {
+// Determine local work sizes that result in uniform work groups.
+// The default ThreadsPerBlock only requires handling the first work_dim
+// dimension.
+void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
+                        const size_t *GlobalWorkSize, const uint32_t WorkDim,
+                        const size_t MaxThreadsPerBlock[3]) {
   assert(ThreadsPerBlock != nullptr);
   assert(GlobalWorkSize != nullptr);
-  assert(Kernel != nullptr);

-  std::ignore = Kernel;
+  // FIXME: The below assumes a three dimensional range but this is not
+  // guaranteed by UR.
+  size_t GlobalSizeNormalized[3] = {1, 1, 1};
+  for (uint32_t i = 0; i < WorkDim; i++) {
+    GlobalSizeNormalized[i] = GlobalWorkSize[i];
+  }

-  ThreadsPerBlock[0] = std::min(MaxThreadsPerBlock[0], GlobalWorkSize[0]);
+  size_t MaxBlockDim[3];
+  MaxBlockDim[0] = MaxThreadsPerBlock[0];
+  MaxBlockDim[1] = Device->getMaxBlockDimY();
+  MaxBlockDim[2] = Device->getMaxBlockDimZ();

-  // Find a local work group size that is a divisor of the global
-  // work group size to produce uniform work groups.
-  while (GlobalWorkSize[0] % ThreadsPerBlock[0]) {
-    --ThreadsPerBlock[0];
-  }
+  roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
+                                       MaxBlockDim, MaxThreadsPerBlock[0]);
 }

 namespace {
@@ -1793,8 +1801,8 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim,
       return err;
     }
   } else {
-    simpleGuessLocalWorkSize(ThreadsPerBlock, GlobalWorkSize,
-                             MaxThreadsPerBlock, Kernel);
+    guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
+                       MaxThreadsPerBlock);
   }
 }

diff --git a/source/ur/ur.hpp b/source/ur/ur.hpp
index a849943760..48e611dda8 100644
--- a/source/ur/ur.hpp
+++ b/source/ur/ur.hpp
@@ -321,3 +321,55 @@ template class Result {
 private:
   std::variant value_or_err;
 };
+
+// Helper to make sure each x, y, z dim divides the global dimension.
+//
+// In/Out: ThreadsPerBlockInDim - The work-group size in the given dimension
+// In:     GlobalWorkSizeInDim  - The global size in the given dimension
+static inline void
+roundToHighestFactorOfGlobalSize(size_t &ThreadsPerBlockInDim,
+                                 const size_t GlobalWorkSizeInDim) {
+  while (ThreadsPerBlockInDim > 1 &&
+         GlobalWorkSizeInDim % ThreadsPerBlockInDim) {
+    --ThreadsPerBlockInDim;
+  }
+}
+
+// Returns whether or not Value is a power of 2
+template <typename T> inline bool isPowerOf2(const T &Value) {
+  return Value && !(Value & (Value - 1));
+}
+
+// Helper to make sure each x, y, z dim divides the global dimension.
+// Additionally, it makes sure that the innermost dimension is always a power of 2
+//
+// In/Out: ThreadsPerBlock - The work-group size in 3D
+// In:     GlobalSize      - The global size in 3D (if WorkDim < 3 then the
+//                           unused outer dims == 1)
+// In:     MaxBlockDim     - The max work-group size in each dimension
+// In:     MaxBlockSize    - The max total work-group size across all dimensions
+static inline void roundToHighestFactorOfGlobalSizeIn3d(
+    size_t *ThreadsPerBlock, const size_t *GlobalSize,
+    const size_t *MaxBlockDim, const size_t MaxBlockSize) {
+  assert(GlobalSize[0] && "GlobalSize[0] cannot be zero");
+  assert(GlobalSize[1] && "GlobalSize[1] cannot be zero");
+  assert(GlobalSize[2] && "GlobalSize[2] cannot be zero");
+
+  ThreadsPerBlock[0] =
+      std::min(GlobalSize[0], std::min(MaxBlockSize, MaxBlockDim[0]));
+  do {
+    roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalSize[0]);
+  } while (!isPowerOf2(ThreadsPerBlock[0]) && ThreadsPerBlock[0] > 32 &&
+           --ThreadsPerBlock[0]);
+
+  ThreadsPerBlock[1] =
+      std::min(GlobalSize[1],
+               std::min(MaxBlockSize / ThreadsPerBlock[0], MaxBlockDim[1]));
+  roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalSize[1]);
+
+  ThreadsPerBlock[2] = std::min(
+      GlobalSize[2],
+      std::min(MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[0]),
+               MaxBlockDim[2]));
+  roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSize[2]);
+}
diff --git a/test/conformance/enqueue/urEnqueueKernelLaunch.cpp b/test/conformance/enqueue/urEnqueueKernelLaunch.cpp
index c60de77af8..9217457270 100644
--- a/test/conformance/enqueue/urEnqueueKernelLaunch.cpp
+++ b/test/conformance/enqueue/urEnqueueKernelLaunch.cpp
@@ -77,53 +77,93 @@ TEST_P(urEnqueueKernelLaunchTest, InvalidWorkDimension) {
                      UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
 }

-struct urEnqueueKernelLaunch2DTest : uur::urKernelExecutionTest {
-    void SetUp() override {
-        program_name = "fill_2d";
-        UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
-    }
-
-    uint32_t val = 42;
-    size_t global_size[2] = {8, 8};
-    size_t global_offset[2] = {0, 0};
-    size_t buffer_size = sizeof(val) * global_size[0] * global_size[1];
-    size_t n_dimensions = 2;
+struct testParametersEnqueueKernel {
+    size_t X, Y, Z;
+    size_t Dims;
 };
-UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunch2DTest);

-TEST_P(urEnqueueKernelLaunch2DTest, Success) {
-    ur_mem_handle_t buffer = nullptr;
-    AddBuffer1DArg(buffer_size, &buffer);
-    AddPodArg(val);
-    ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
-                                         global_offset, global_size, nullptr, 0,
-                                         nullptr, nullptr));
-    ASSERT_SUCCESS(urQueueFinish(queue));
-    ValidateBuffer(buffer, buffer_size, val);
+template <typename T>
+inline std::string printKernelLaunchTestString(
+    const testing::TestParamInfo<typename T::ParamType> &info) {
+    const auto device_handle = std::get<0>(info.param);
+    const auto platform_device_name =
+        uur::GetPlatformAndDeviceName(device_handle);
+    std::stringstream test_name;
+    test_name << platform_device_name << "__" << std::get<1>(info.param).Dims
+              << "D_" << std::get<1>(info.param).X;
+    if (std::get<1>(info.param).Dims > 1) {
+        test_name << "_" << std::get<1>(info.param).Y;
+    }
+    if (std::get<1>(info.param).Dims > 2) {
+        test_name << "_" << std::get<1>(info.param).Z;
+    }
+    test_name << "";
+    return test_name.str();
 }

-struct urEnqueueKernelLaunch3DTest : uur::urKernelExecutionTest {
+struct urEnqueueKernelLaunchTestWithParam
+    : uur::urBaseKernelExecutionTestWithParam<testParametersEnqueueKernel> {
     void SetUp() override {
-        program_name = "fill_3d";
-        UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
+
+        global_range[0] = std::get<1>(GetParam()).X;
+        global_range[1] = std::get<1>(GetParam()).Y;
+        global_range[2] = std::get<1>(GetParam()).Z;
+        buffer_size = sizeof(val) * global_range[0];
+        n_dimensions = std::get<1>(GetParam()).Dims;
+        if (n_dimensions == 1) {
+            program_name = "fill";
+        } else if (n_dimensions == 2) {
+            program_name = "fill_2d";
+            buffer_size *= global_range[1];
+        } else {
+            assert(n_dimensions == 3);
+            program_name = "fill_3d";
+            buffer_size *= global_range[1] * global_range[2];
+        }
+        UUR_RETURN_ON_FATAL_FAILURE(
+            urBaseKernelExecutionTestWithParam::SetUp());
+    }
+
+    void TearDown() override {
+        UUR_RETURN_ON_FATAL_FAILURE(uur::urBaseKernelExecutionTestWithParam<
+                                    testParametersEnqueueKernel>::TearDown());
+    }

     uint32_t val = 42;
-    size_t global_size[3] = {4, 4, 4};
+    size_t global_range[3];
     size_t global_offset[3] = {0, 0, 0};
-    size_t buffer_size =
-        sizeof(val) * global_size[0] * global_size[1] * global_size[2];
-    size_t n_dimensions = 3;
+    size_t n_dimensions;
+    size_t buffer_size;
 };
-UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunch3DTest);

-TEST_P(urEnqueueKernelLaunch3DTest, Success) {
+static std::vector<testParametersEnqueueKernel> test_cases{// 1D
+                                                           {1, 1, 1, 1},
+                                                           {31, 1, 1, 1},
+                                                           {1027, 1, 1, 1},
+                                                           {32, 1, 1, 1},
+                                                           {256, 1, 1, 1},
+                                                           // 2D
+                                                           {1, 1, 1, 2},
+                                                           {31, 7, 1, 2},
+                                                           {1027, 1, 1, 2},
+                                                           {1, 32, 1, 2},
+                                                           {256, 79, 1, 2},
+                                                           // 3D
+                                                           {1, 1, 1, 3},
+                                                           {31, 7, 1, 3},
+                                                           {1027, 1, 19, 3},
+                                                           {1, 53, 19, 3},
+                                                           {256, 79, 8, 3}};
+UUR_TEST_SUITE_P(
+    urEnqueueKernelLaunchTestWithParam, testing::ValuesIn(test_cases),
+    printKernelLaunchTestString);
+
+TEST_P(urEnqueueKernelLaunchTestWithParam, Success) {
     ur_mem_handle_t buffer = nullptr;
     AddBuffer1DArg(buffer_size, &buffer);
     AddPodArg(val);
     ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
-                                         global_offset, global_size, nullptr, 0,
-                                         nullptr, nullptr));
+                                         global_offset, global_range, nullptr,
+                                         0, nullptr, nullptr));
     ASSERT_SUCCESS(urQueueFinish(queue));
     ValidateBuffer(buffer, buffer_size, val);
 }
diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h
index 8e0c86f9b3..0812973ae8 100644
--- a/test/conformance/testing/include/uur/fixtures.h
+++ b/test/conformance/testing/include/uur/fixtures.h
@@ -1274,6 +1274,109 @@ struct urBaseKernelExecutionTest : urBaseKernelTest {
     uint32_t current_arg_index = 0;
 };

+template <class T>
+struct urBaseKernelExecutionTestWithParam : urBaseKernelTestWithParam<T> {
+    void SetUp() override {
+        UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam<T>::SetUp());
+        UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam<T>::Build());
+        context = urBaseKernelTestWithParam<T>::context;
+        kernel = urBaseKernelTestWithParam<T>::kernel;
+        ASSERT_SUCCESS(urQueueCreate(
+            context, urBaseKernelTestWithParam<T>::device, 0, &queue));
+    }
+
+    void TearDown() override {
+        for (auto &buffer : buffer_args) {
+            ASSERT_SUCCESS(urMemRelease(buffer));
+        }
+        UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam<T>::TearDown());
+        if (queue) {
+            EXPECT_SUCCESS(urQueueRelease(queue));
+        }
+    }
+
+    // Adds a kernel arg representing a sycl buffer constructed with a 1D range.
+    void AddBuffer1DArg(size_t size, ur_mem_handle_t *out_buffer) {
+        ur_mem_handle_t mem_handle = nullptr;
+        ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, size,
+                                         nullptr, &mem_handle));
+        char zero = 0;
+        ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, mem_handle, &zero,
+                                              sizeof(zero), 0, size, 0, nullptr,
+                                              nullptr));
+        ASSERT_SUCCESS(urQueueFinish(queue));
+        ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, current_arg_index, nullptr,
+                                            mem_handle));
+
+        // SYCL device kernels have different interfaces depending on the
+        // backend being used. Typically a kernel which takes a buffer argument
+        // will take a pointer to the start of the buffer and a sycl::id param
+        // which is a struct that encodes the accessor to the buffer. However
+        // the AMD backend handles this differently and uses three separate
+        // arguments for each of the three dimensions of the accessor.
+
+        ur_platform_backend_t backend;
+        ASSERT_SUCCESS(urPlatformGetInfo(urBaseKernelTestWithParam<T>::platform,
+                                         UR_PLATFORM_INFO_BACKEND,
+                                         sizeof(backend), &backend, nullptr));
+        if (backend == UR_PLATFORM_BACKEND_HIP) {
+            // This emulates the three offset params for buffer accessor on AMD.
+            size_t val = 0;
+            ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1,
+                                               sizeof(size_t), nullptr, &val));
+            ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 2,
+                                               sizeof(size_t), nullptr, &val));
+            ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 3,
+                                               sizeof(size_t), nullptr, &val));
+            current_arg_index += 4;
+        } else {
+            // This emulates the offset struct sycl adds for a 1D buffer accessor.
+            struct {
+                size_t offsets[1] = {0};
+            } accessor;
+            ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1,
+                                               sizeof(accessor), nullptr,
+                                               &accessor));
+            current_arg_index += 2;
+        }
+
+        buffer_args.push_back(mem_handle);
+        *out_buffer = mem_handle;
+    }
+
+    template <class U> void AddPodArg(U data) {
+        ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index,
+                                           sizeof(data), nullptr, &data));
+        current_arg_index++;
+    }
+
+    // Validate the contents of `buffer` according to the given validator.
+    template <class U>
+    void ValidateBuffer(ur_mem_handle_t buffer, size_t size,
+                        std::function<bool(U)> validator) {
+        std::vector<U> read_buffer(size / sizeof(U));
+        ASSERT_SUCCESS(urEnqueueMemBufferRead(queue, buffer, true, 0, size,
+                                              read_buffer.data(), 0, nullptr,
+                                              nullptr));
+        ASSERT_TRUE(
+            std::all_of(read_buffer.begin(), read_buffer.end(), validator));
+    }
+
+    // Helper that uses the generic validate function to check for a given value.
+    template <class U>
+    void ValidateBuffer(ur_mem_handle_t buffer, size_t size, U value) {
+        auto validator = [&value](U result) -> bool { return result == value; };
+
+        ValidateBuffer<U>(buffer, size, validator);
+    }
+
+    std::vector<ur_mem_handle_t> buffer_args;
+    uint32_t current_arg_index = 0;
+    ur_context_handle_t context;
+    ur_kernel_handle_t kernel;
+    ur_queue_handle_t queue;
+};
+
 struct urKernelExecutionTest : urBaseKernelExecutionTest {
     void SetUp() {
         UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelExecutionTest::SetUp());
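
For reference, here is a small standalone sketch (not part of the patch) of what the new rounding helper computes for one of the new test cases. The helper bodies mirror the ur.hpp hunk above; the device limits used (a 1024/1024/64 per-dimension cap and a 1024-thread total block cap) are illustrative assumptions, not values taken from the patch.

// Sketch only: shows the block size the patched helper picks for a global
// size of {1027, 1, 1} under assumed device limits.
#include <algorithm>
#include <cstddef>
#include <cstdio>

static void roundToHighestFactorOfGlobalSize(size_t &ThreadsPerBlockInDim,
                                             const size_t GlobalWorkSizeInDim) {
  // Decrement until the block dimension divides the global dimension.
  while (ThreadsPerBlockInDim > 1 &&
         GlobalWorkSizeInDim % ThreadsPerBlockInDim) {
    --ThreadsPerBlockInDim;
  }
}

static bool isPowerOf2(size_t Value) { return Value && !(Value & (Value - 1)); }

// Same algorithm as roundToHighestFactorOfGlobalSizeIn3d in the ur.hpp hunk.
static void pickBlockSize(size_t *ThreadsPerBlock, const size_t *GlobalSize,
                          const size_t *MaxBlockDim,
                          const size_t MaxBlockSize) {
  ThreadsPerBlock[0] =
      std::min(GlobalSize[0], std::min(MaxBlockSize, MaxBlockDim[0]));
  do {
    roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalSize[0]);
  } while (!isPowerOf2(ThreadsPerBlock[0]) && ThreadsPerBlock[0] > 32 &&
           --ThreadsPerBlock[0]);

  ThreadsPerBlock[1] =
      std::min(GlobalSize[1],
               std::min(MaxBlockSize / ThreadsPerBlock[0], MaxBlockDim[1]));
  roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalSize[1]);

  ThreadsPerBlock[2] = std::min(
      GlobalSize[2],
      std::min(MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[0]),
               MaxBlockDim[2]));
  roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSize[2]);
}

int main() {
  const size_t GlobalSize[3] = {1027, 1, 1};      // 1027 == 13 * 79
  const size_t MaxBlockDim[3] = {1024, 1024, 64}; // assumed device limits
  size_t ThreadsPerBlock[3];
  pickBlockSize(ThreadsPerBlock, GlobalSize, MaxBlockDim,
                /*MaxBlockSize=*/1024);
  // Prints "13 1 1": every dimension divides the global size, so the launch
  // uses uniform work-groups (79 blocks of 13 threads in dimension 0). The
  // removed CUDA heuristic above, which insisted on a power-of-two divisor,
  // would have fallen all the way back to 1 thread per block for this case.
  printf("%zu %zu %zu\n", ThreadsPerBlock[0], ThreadsPerBlock[1],
         ThreadsPerBlock[2]);
}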