diff --git a/source/ur/ur.hpp b/source/ur/ur.hpp index 31d813f15b..35a9808153 100644 --- a/source/ur/ur.hpp +++ b/source/ur/ur.hpp @@ -350,20 +350,20 @@ template inline bool isPowerOf2(const T &Value) { static inline void roundToHighestFactorOfGlobalSizeIn3d( size_t *ThreadsPerBlock, const size_t *GlobalSize, const size_t *MaxBlockDim, const size_t MaxBlockSize) { - ThreadsPerBlock[2] = std::min(GlobalSize[2], MaxBlockDim[2]); + ThreadsPerBlock[0] = std::min(GlobalSize[0], MaxBlockDim[0]); + // Make the X dim a factor of 2 + do { + roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalSize[0]); + } while (!isPowerOf2(ThreadsPerBlock[0]) && ThreadsPerBlock[0] > 32 && + --ThreadsPerBlock[0]); + roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSize[2]); ThreadsPerBlock[1] = std::min(GlobalSize[1], - std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1])); + std::min(MaxBlockSize / ThreadsPerBlock[0], MaxBlockDim[1])); roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalSize[1]); - ThreadsPerBlock[0] = std::min( - GlobalSize[0], MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2])); - - // Make the X dim a factor of 2 - do { - roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalSize[0]); - } while (!isPowerOf2(ThreadsPerBlock[0]) && ThreadsPerBlock[0] > 32 && - --ThreadsPerBlock[0]); + ThreadsPerBlock[2] = std::min( + GlobalSize[2], MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[0])); } diff --git a/test/conformance/enqueue/urEnqueueKernelLaunch.cpp b/test/conformance/enqueue/urEnqueueKernelLaunch.cpp index f10175651b..e31d27133a 100644 --- a/test/conformance/enqueue/urEnqueueKernelLaunch.cpp +++ b/test/conformance/enqueue/urEnqueueKernelLaunch.cpp @@ -77,105 +77,97 @@ TEST_P(urEnqueueKernelLaunchTest, InvalidWorkDimension) { UR_RESULT_ERROR_INVALID_WORK_DIMENSION); } -#define ENQUEUE_KERNEL_LAUNCH_TEST_1D_SIZES(SIZE) \ - struct urEnqueueKernelLaunchTestSizes##SIZE : uur::urKernelExecutionTest { \ - void SetUp() override { \ - program_name = "fill"; \ - UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); \ - } \ - \ - uint32_t val = 42; \ - size_t global_size = SIZE; \ - size_t global_offset = 0; \ - size_t n_dimensions = 1; \ - }; \ - UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunchTestSizes##SIZE); \ - \ - TEST_P(urEnqueueKernelLaunchTestSizes##SIZE, Success) { \ - ur_mem_handle_t buffer = nullptr; \ - AddBuffer1DArg(sizeof(val) * global_size, &buffer); \ - AddPodArg(val); \ - ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, \ - &global_offset, &global_size, \ - nullptr, 0, nullptr, nullptr)); \ - ASSERT_SUCCESS(urQueueFinish(queue)); \ - ValidateBuffer(buffer, sizeof(val) * global_size, val); \ +struct testParametersEnqueueKernel { + size_t X, Y, Z; + size_t Dims; +}; + +template +inline std::string printKernelLaunchTestString( + const testing::TestParamInfo &info) { + const auto device_handle = std::get<0>(info.param); + const auto platform_device_name = + uur::GetPlatformAndDeviceName(device_handle); + std::stringstream test_name; + test_name << platform_device_name << "__" + << std::get<1>(info.param).Dims << "D_" + << std::get<1>(info.param).X; + if (std::get<1>(info.param).Dims > 1) { + test_name << "_" << std::get<1>(info.param).Y; + } + if (std::get<1>(info.param).Dims > 2) { + test_name << "_" << std::get<1>(info.param).Z; } -ENQUEUE_KERNEL_LAUNCH_TEST_1D_SIZES(1) -ENQUEUE_KERNEL_LAUNCH_TEST_1D_SIZES(53) -ENQUEUE_KERNEL_LAUNCH_TEST_1D_SIZES(100) -ENQUEUE_KERNEL_LAUNCH_TEST_1D_SIZES(1342) - -#define ENQUEUE_KERNEL_LAUNCH_TEST_2D_SIZES(SIZE1, SIZE2) \ - struct urEnqueueKernelLaunch2DTestSizes##SIZE1##_##SIZE2 \ - : uur::urKernelExecutionTest { \ - void SetUp() override { \ - program_name = "fill_2d"; \ - UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); \ - } \ - \ - uint32_t val = 42; \ - size_t global_size[2] = {SIZE1, SIZE2}; \ - size_t global_offset[2] = {0, 0}; \ - size_t buffer_size = sizeof(val) * global_size[0] * global_size[1]; \ - size_t n_dimensions = 2; \ - }; \ - UUR_INSTANTIATE_DEVICE_TEST_SUITE_P( \ - urEnqueueKernelLaunch2DTestSizes##SIZE1##_##SIZE2); \ - \ - TEST_P(urEnqueueKernelLaunch2DTestSizes##SIZE1##_##SIZE2, Success) { \ - ur_mem_handle_t buffer = nullptr; \ - AddBuffer1DArg(buffer_size, &buffer); \ - AddPodArg(val); \ - ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, \ - global_offset, global_size, \ - nullptr, 0, nullptr, nullptr)); \ - ASSERT_SUCCESS(urQueueFinish(queue)); \ - ValidateBuffer(buffer, buffer_size, val); \ + test_name << ""; + return test_name.str(); +} + +struct urEnqueueKernelLaunchTestWithParam + : uur::urBaseKernelExecutionTestWithParam { + void SetUp() override { + global_range[0] = std::get<1>(GetParam()).X; + global_range[1] = std::get<1>(GetParam()).Y; + global_range[2] = std::get<1>(GetParam()).Z; + buffer_size = sizeof(val) * global_range[0]; + n_dimensions = std::get<1>(GetParam()).Dims; + if (n_dimensions == 1) { + program_name = "fill"; + } else if (n_dimensions == 2) { + program_name = "fill_2d"; + buffer_size *= global_range[1]; + } else { + assert(n_dimensions == 3); + program_name = "fill_3d"; + buffer_size *= global_range[1] * global_range[2]; + } + UUR_RETURN_ON_FATAL_FAILURE( + urBaseKernelExecutionTestWithParam::SetUp()); } -ENQUEUE_KERNEL_LAUNCH_TEST_2D_SIZES(8, 8) -ENQUEUE_KERNEL_LAUNCH_TEST_2D_SIZES(1, 1) -ENQUEUE_KERNEL_LAUNCH_TEST_2D_SIZES(53, 100) -ENQUEUE_KERNEL_LAUNCH_TEST_2D_SIZES(1, 79) -ENQUEUE_KERNEL_LAUNCH_TEST_2D_SIZES(1342, 1) - -#define ENQUEUE_KERNEL_LAUNCH_TEST_3D_SIZES(SIZE1, SIZE2, SIZE3) \ - struct urEnqueueKernelLaunch3DTestSizes##SIZE1##_##SIZE2##_##SIZE3 \ - : uur::urKernelExecutionTest { \ - void SetUp() override { \ - program_name = "fill_3d"; \ - UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); \ - } \ - \ - uint32_t val = 42; \ - size_t global_size[3] = {SIZE1, SIZE2, SIZE3}; \ - size_t global_offset[3] = {0, 0, 0}; \ - size_t buffer_size = \ - sizeof(val) * global_size[0] * global_size[1] * global_size[2]; \ - size_t n_dimensions = 3; \ - }; \ - UUR_INSTANTIATE_DEVICE_TEST_SUITE_P( \ - urEnqueueKernelLaunch3DTestSizes##SIZE1##_##SIZE2##_##SIZE3); \ - \ - TEST_P(urEnqueueKernelLaunch3DTestSizes##SIZE1##_##SIZE2##_##SIZE3, \ - Success) { \ - ur_mem_handle_t buffer = nullptr; \ - AddBuffer1DArg(buffer_size, &buffer); \ - AddPodArg(val); \ - ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, \ - global_offset, global_size, \ - nullptr, 0, nullptr, nullptr)); \ - ASSERT_SUCCESS(urQueueFinish(queue)); \ - ValidateBuffer(buffer, buffer_size, val); \ + + void TearDown() override { + UUR_RETURN_ON_FATAL_FAILURE(uur::urBaseKernelExecutionTestWithParam< + testParametersEnqueueKernel>::TearDown()); } -ENQUEUE_KERNEL_LAUNCH_TEST_3D_SIZES(1, 1, 1) -ENQUEUE_KERNEL_LAUNCH_TEST_3D_SIZES(37, 1, 1) -ENQUEUE_KERNEL_LAUNCH_TEST_3D_SIZES(1, 78, 1) -ENQUEUE_KERNEL_LAUNCH_TEST_3D_SIZES(1, 1, 1025) -ENQUEUE_KERNEL_LAUNCH_TEST_3D_SIZES(37, 19, 1) -ENQUEUE_KERNEL_LAUNCH_TEST_3D_SIZES(1, 78, 91) -ENQUEUE_KERNEL_LAUNCH_TEST_3D_SIZES(18, 1, 1025) -ENQUEUE_KERNEL_LAUNCH_TEST_3D_SIZES(18, 79, 1025) + + uint32_t val = 42; + size_t global_range[3]; + size_t global_offset[3] = {0, 0, 0}; + size_t n_dimensions; + size_t buffer_size; +}; + +static std::vector test_cases{// 1D + {1, 1, 1, 1}, + {31, 1, 1, 1}, + {1027, 1, 1, 1}, + {32, 1, 1, 1}, + {256, 1, 1, 1}, + // 2D + {1, 1, 1, 2}, + {31, 7, 1, 2}, + {1027, 1, 1, 2}, + {1, 32, 1, 2}, + {256, 79, 1, 2}, + // 3D + {1, 1, 1, 3}, + {31, 7, 1, 3}, + {1027, 1, 19, 3}, + {1, 53, 19, 3}, + {256, 79, 8, 3}}; +UUR_TEST_SUITE_P( + urEnqueueKernelLaunchTestWithParam, testing::ValuesIn(test_cases), + printKernelLaunchTestString); + +TEST_P(urEnqueueKernelLaunchTestWithParam, Success) { + ur_mem_handle_t buffer = nullptr; + AddBuffer1DArg(buffer_size, &buffer); + AddPodArg(val); + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + global_offset, global_range, nullptr, + 0, nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + ValidateBuffer(buffer, buffer_size, val); +} struct urEnqueueKernelLaunchWithVirtualMemory : uur::urKernelExecutionTest { diff --git a/test/conformance/testing/include/uur/fixtures.h b/test/conformance/testing/include/uur/fixtures.h index cf01015eb4..911d43e083 100644 --- a/test/conformance/testing/include/uur/fixtures.h +++ b/test/conformance/testing/include/uur/fixtures.h @@ -1254,6 +1254,110 @@ struct urBaseKernelExecutionTest : urBaseKernelTest { uint32_t current_arg_index = 0; }; +template +struct urBaseKernelExecutionTestWithParam : urBaseKernelTestWithParam { + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam::SetUp()); + UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam::Build()); + context = urBaseKernelTestWithParam::context; + kernel = urBaseKernelTestWithParam::kernel; + ASSERT_SUCCESS(urQueueCreate( + context, urBaseKernelTestWithParam::device, 0, &queue)); + } + + void TearDown() override { + for (auto &buffer : buffer_args) { + ASSERT_SUCCESS(urMemRelease(buffer)); + } + UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelTestWithParam::TearDown()); + if (queue) { + EXPECT_SUCCESS(urQueueRelease(queue)); + } + } + + // Adds a kernel arg representing a sycl buffer constructed with a 1D range. + void AddBuffer1DArg(size_t size, ur_mem_handle_t *out_buffer) { + ur_mem_handle_t mem_handle = nullptr; + ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, size, + nullptr, &mem_handle)); + char zero = 0; + ASSERT_SUCCESS(urEnqueueMemBufferFill(queue, mem_handle, &zero, + sizeof(zero), 0, size, 0, nullptr, + nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + ASSERT_SUCCESS(urKernelSetArgMemObj(kernel, current_arg_index, nullptr, + mem_handle)); + + // SYCL device kernels have different interfaces depending on the + // backend being used. Typically a kernel which takes a buffer argument + // will take a pointer to the start of the buffer and a sycl::id param + // which is a struct that encodes the accessor to the buffer. However + // the AMD backend handles this differently and uses three separate + // arguments for each of the three dimensions of the accessor. + + ur_platform_backend_t backend; + ASSERT_SUCCESS(urPlatformGetInfo(urBaseKernelTestWithParam::platform, + UR_PLATFORM_INFO_BACKEND, + sizeof(backend), &backend, nullptr)); + if (backend == UR_PLATFORM_BACKEND_HIP) { + // this emulates the three offset params for buffer accessor on AMD. + size_t val = 0; + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1, + sizeof(size_t), nullptr, &val)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 2, + sizeof(size_t), nullptr, &val)); + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 3, + sizeof(size_t), nullptr, &val)); + current_arg_index += 4; + } else { + // This emulates the offset struct sycl adds for a 1D buffer accessor. + struct { + size_t offsets[1] = {0}; + } accessor; + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index + 1, + sizeof(accessor), nullptr, + &accessor)); + current_arg_index += 2; + } + + buffer_args.push_back(mem_handle); + *out_buffer = mem_handle; + } + + template void AddPodArg(U data) { + ASSERT_SUCCESS(urKernelSetArgValue(kernel, current_arg_index, + sizeof(data), nullptr, &data)); + current_arg_index++; + } + + // Validate the contents of `buffer` according to the given validator. + template + void ValidateBuffer(ur_mem_handle_t buffer, size_t size, + std::function validator) { + std::vector read_buffer(size / sizeof(U)); + ASSERT_SUCCESS(urEnqueueMemBufferRead(queue, buffer, true, 0, size, + read_buffer.data(), 0, nullptr, + nullptr)); + ASSERT_TRUE( + std::all_of(read_buffer.begin(), read_buffer.end(), validator)); + } + + // Helper that uses the generic validate function to check for a given value. + template + void ValidateBuffer(ur_mem_handle_t buffer, size_t size, U value) { + auto validator = [&value](U result) -> bool { return result == value; }; + + ValidateBuffer(buffer, size, validator); + } + +std::vector + buffer_args; + uint32_t current_arg_index = 0; + ur_context_handle_t context; + ur_kernel_handle_t kernel; + ur_queue_handle_t queue; +}; + struct urKernelExecutionTest : urBaseKernelExecutionTest { void SetUp() { UUR_RETURN_ON_FATAL_FAILURE(urBaseKernelExecutionTest::SetUp());