From 2f02adee5d63b28d36aa88db8f893889a7f09b05 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Wed, 3 Jan 2024 11:53:03 +0000 Subject: [PATCH] Test updating multiple kernels in a command-buffer --- test/conformance/device_code/indexers_usm.cpp | 26 ++- .../exp_command_buffer_adapter_cuda.match | 0 .../exp_command_buffer_adapter_hip.match | 13 ++ ...xp_command_buffer_adapter_level_zero.match | 0 .../exp_command_buffer_adapter_opencl.match | 0 .../exp_command_buffer/ndrange_update.cpp | 178 +++++++++++++----- .../usm_fill_kernel_update.cpp | 143 ++++++++++++++ 7 files changed, 298 insertions(+), 62 deletions(-) create mode 100644 test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match create mode 100644 test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match create mode 100644 test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero.match create mode 100644 test/conformance/exp_command_buffer/exp_command_buffer_adapter_opencl.match diff --git a/test/conformance/device_code/indexers_usm.cpp b/test/conformance/device_code/indexers_usm.cpp index 7a43f4f898..c1936bcb48 100644 --- a/test/conformance/device_code/indexers_usm.cpp +++ b/test/conformance/device_code/indexers_usm.cpp @@ -6,20 +6,23 @@ #include int main() { - const cl::sycl::range<1> globalRange(6); - const cl::sycl::range<1> localRange(3); - const cl::sycl::id<1> globalOffset(4); - const cl::sycl::nd_range<1> ndRange(globalRange, localRange, globalOffset); + const cl::sycl::range<3> globalRange(8, 8, 8); + const cl::sycl::range<3> localRange(2, 2, 2); + const cl::sycl::id<3> globalOffset(4, 4, 4); + const cl::sycl::nd_range<3> ndRange(globalRange, localRange, globalOffset); cl::sycl::queue sycl_queue; - const size_t elements_per_work_item = 9; - int *ptr = cl::sycl::malloc_shared( - globalRange[0] * elements_per_work_item, sycl_queue); + const size_t elements_per_work_item = 6; + int *ptr = cl::sycl::malloc_shared(globalRange[0] * globalRange[1] * + globalRange[2] * + elements_per_work_item, + sycl_queue); sycl_queue.submit([&](cl::sycl::handler &cgh) { cgh.parallel_for( - ndRange, [ptr](cl::sycl::nd_item<1> index) { - int *wi_ptr = ptr + index.get_global_linear_id() * 9; + ndRange, [ptr](cl::sycl::nd_item<3> index) { + int *wi_ptr = + ptr + index.get_global_linear_id() * elements_per_work_item; wi_ptr[0] = index.get_global_id(0); wi_ptr[1] = index.get_global_id(1); @@ -28,11 +31,6 @@ int main() { wi_ptr[3] = index.get_local_id(0); wi_ptr[4] = index.get_local_id(1); wi_ptr[5] = index.get_local_id(2); - - auto offset = index.get_offset(); - wi_ptr[6] = offset[0]; - wi_ptr[7] = offset[1]; - wi_ptr[8] = offset[2]; }); }); return 0; diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_cuda.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match new file mode 100644 index 0000000000..49bcaff248 --- /dev/null +++ b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_hip.match @@ -0,0 +1,13 @@ +{{OPT}}BufferFillCommandTest.UpdateParmeters/AMD_HIP_BACKEND{{.*}} +{{OPT}}BufferFillCommandTest.UpdateGlobalSize/AMD_HIP_BACKEND{{.*}} +{{OPT}}BufferFillCommandTest.SeparateUpdateCalls/AMD_HIP_BACKEND{{.*}} +{{OPT}}BufferFillCommandTest.OverrideUpdate/AMD_HIP_BACKEND{{.*}} +{{OPT}}BufferFillCommandTest.OverrideArgList/AMD_HIP_BACKEND{{.*}} +{{OPT}}USMFillCommandTest.UpdateParmeters/AMD_HIP_BACKEND{{.*}} +{{OPT}}USMFillCommandTest.UpdateExecInfo/AMD_HIP_BACKEND{{.*}} +{{OPT}}USMMultipleFillCommandTest.UpdateAllKernels/AMD_HIP_BACKEND{{.*}} +{{OPT}}BufferSaxpyKernelTests.UpdateParmeters/AMD_HIP_BACKEND{{.*}} +{{OPT}}USMSaxpyKernelTests.UpdateParmeters/AMD_HIP_BACKEND{{.*}} +{{OPT}}NDRangeUpdateTests.Update3D/AMD_HIP_BACKEND{{.*}} +{{OPT}}NDRangeUpdateTests.Update2D/AMD_HIP_BACKEND{{.*}} +{{OPT}}NDRangeUpdateTests.Update1D/AMD_HIP_BACKEND{{.*}} diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_level_zero.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/exp_command_buffer/exp_command_buffer_adapter_opencl.match b/test/conformance/exp_command_buffer/exp_command_buffer_adapter_opencl.match new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test/conformance/exp_command_buffer/ndrange_update.cpp b/test/conformance/exp_command_buffer/ndrange_update.cpp index 489033e1a4..ad61cfdbd8 100644 --- a/test/conformance/exp_command_buffer/ndrange_update.cpp +++ b/test/conformance/exp_command_buffer/ndrange_update.cpp @@ -18,7 +18,9 @@ struct NDRangeUpdateTests : uur::urExpUpdatableCommandBufferTests { GTEST_SKIP() << "Shared USM is not supported."; } - const size_t allocation_size = sizeof(int) * 3 * 3 * global_size; + const size_t allocation_size = sizeof(int) * elements_per_id * + global_size[0] * global_size[1] * + global_size[2]; ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, allocation_size, &shared_ptr)); ASSERT_NE(shared_ptr, nullptr); @@ -27,13 +29,46 @@ struct NDRangeUpdateTests : uur::urExpUpdatableCommandBufferTests { ASSERT_SUCCESS(urKernelSetArgPointer(kernel, 0, nullptr, &shared_ptr)); ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( - updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, - &global_size, &local_size, 0, nullptr, nullptr, &command_handle)); + updatable_cmd_buf_handle, kernel, n_dimensions, + global_offset.data(), global_size.data(), local_size.data(), 0, + nullptr, nullptr, &command_handle)); ASSERT_NE(command_handle, nullptr); ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); } + void Validate(std::array &globalSize, + std::array &localSize, + std::array &globalOffset) { + for (size_t i = 0; i < globalSize[0]; i++) { + for (size_t j = 0; j < globalSize[1]; j++) { + for (size_t k = 0; k < globalSize[2]; k++) { + const size_t global_linear_id = + (i * (globalSize[1] * globalSize[0]) + + j * globalSize[0] + k); + int *wi_ptr = (int *)shared_ptr + + (elements_per_id * global_linear_id); + + int global_id_1d = wi_ptr[0]; + int global_id_2d = wi_ptr[1]; + int global_id_3d = wi_ptr[2]; + + EXPECT_EQ(global_id_1d, i + globalOffset[0]); + EXPECT_EQ(global_id_2d, j + globalOffset[1]); + EXPECT_EQ(global_id_3d, k + globalOffset[2]); + + int local_id_1d = wi_ptr[3]; + int local_id_2d = wi_ptr[4]; + int local_id_3d = wi_ptr[5]; + + EXPECT_EQ(local_id_1d, i % localSize[0]); + EXPECT_EQ(local_id_2d, j % localSize[1]); + EXPECT_EQ(local_id_3d, k % localSize[2]); + } + } + } + } + void TearDown() override { if (shared_ptr) { ASSERT_SUCCESS(urUSMFree(context, shared_ptr)); @@ -43,73 +78,120 @@ struct NDRangeUpdateTests : uur::urExpUpdatableCommandBufferTests { urExpUpdatableCommandBufferTests::TearDown()); } - size_t global_size = 6; - size_t local_size = 3; - size_t global_offset = 4; - size_t n_dimensions = 1; + const size_t elements_per_id = 6; + std::array global_size = {8, 8, 8}; + std::array local_size = {2, 2, 2}; + std::array global_offset = {4, 4, 4}; + size_t n_dimensions = 3; void *shared_ptr = nullptr; ur_exp_command_buffer_command_handle_t command_handle = nullptr; }; UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(NDRangeUpdateTests); -TEST_P(NDRangeUpdateTests, Update1D) { +TEST_P(NDRangeUpdateTests, Update3D) { ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); - const size_t elements_per_id = 9; - for (size_t i = 0; i < global_size; i++) { - int *wi_ptr = (int *)shared_ptr + (i * elements_per_id); - - int global_id_1d = wi_ptr[0]; - //int global_id_2d = wi_ptr[1]; - //int global_id_3d = wi_ptr[2]; - - ASSERT_EQ(global_id_1d, i + global_offset) << i; - // ASSERT_EQ(global_id_2d, global_offset); - // ASSERT_EQ(global_id_3d, 0); - - int local_id_1d = wi_ptr[3]; - // int local_id_2d = wi_ptr[4]; - // int local_id_3d = wi_ptr[5]; + Validate(global_size, local_size, global_offset); + std::array new_local_size = {4, 4, 4}; + std::array new_global_offset = {3, 3, 3}; + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + 0, // numMemobjArgs + 0, // numPointerArgs + 0, // numValueArgs + 0, // numExecInfos + 3, // workDim; + nullptr, // pArgMemobjList + nullptr, // pArgPointerList + nullptr, // pArgValueList + nullptr, // pArgExecInfoList + new_global_offset.data(), // pGlobalWorkOffset + nullptr, // pGlobalWorkSize + new_local_size.data(), // pLocalWorkSize + }; + + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); - ASSERT_EQ(local_id_1d, i % local_size); - // ASSERT_EQ(local_id_2d, 0); - // ASSERT_EQ(local_id_3d, 0); + Validate(global_size, new_local_size, new_global_offset); +} - int global_offset_1d = wi_ptr[6]; - //int global_offset_2d = wi_ptr[7]; - //int global_offset_3d = wi_ptr[8]; +TEST_P(NDRangeUpdateTests, DISABLED_Update2D) { + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); - ASSERT_EQ(global_offset_1d, 4); - // ASSERT_EQ(global_offset_2d, 0); - // ASSERT_EQ(global_offset_3d, 0); - } + Validate(global_size, local_size, global_offset); + std::array new_global_size = {6, 6, 1}; + std::array new_local_size = {3, 3, 1}; + std::array new_global_offset = {3, 3, 0}; + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + 0, // numMemobjArgs + 0, // numPointerArgs + 0, // numValueArgs + 0, // numExecInfos + 2, // workDim; + nullptr, // pArgMemobjList + nullptr, // pArgPointerList + nullptr, // pArgValueList + nullptr, // pArgExecInfoList + new_global_offset.data(), // pGlobalWorkOffset + new_global_size.data(), // pGlobalWorkSize + new_local_size.data(), // pLocalWorkSize + }; + + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); - // TODO - // 1. update - // 3. Validate. + Validate(new_global_size, new_local_size, new_global_offset); } -TEST_P(NDRangeUpdateTests, Update2D) { +TEST_P(NDRangeUpdateTests, DISABLED_Update1D) { ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); - // TODO - // 1. Validate initial run - // 2. Update to offset/local/global work sizes in 2 dimensions - // 3. Validate. -} - -TEST_P(NDRangeUpdateTests, Update3D) { + Validate(global_size, local_size, global_offset); + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + 0, // numMemobjArgs + 0, // numPointerArgs + 0, // numValueArgs + 0, // numExecInfos + 1, // workDim + nullptr, // pArgMemobjList + nullptr, // pArgPointerList + nullptr, // pArgValueList + nullptr, // pArgExecInfoList + nullptr, // pGlobalWorkOffset + nullptr, // pGlobalWorkSize + nullptr, // pLocalWorkSize + }; + + ASSERT_SUCCESS( + urCommandBufferUpdateKernelLaunchExp(command_handle, &update_desc)); ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); - // TODO - // 1. Validate initial run - // 2. Update to offset/local/global work sizes in 3 dimensions - // 3. Validate. + std::array new_global_size = {global_size[0], 1, 1}; + std::array new_local_size = {local_size[0], 1, 1}; + std::array new_global_offset = {global_offset[0], 0, 0}; + Validate(new_global_size, new_local_size, new_global_offset); } diff --git a/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp b/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp index 315d02906c..fe5a69c1da 100644 --- a/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp +++ b/test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp @@ -194,3 +194,146 @@ TEST_P(USMFillCommandTest, UpdateExecInfo) { ValidateAllocation((int *)shared_ptr, global_size, val); } + +struct USMMultipleFillCommandTest : uur::urExpUpdatableCommandBufferTests { + void SetUp() override { + program_name = "fill_usm"; + UUR_RETURN_ON_FATAL_FAILURE(urExpUpdatableCommandBufferTests::SetUp()); + + ur_device_usm_access_capability_flags_t shared_usm_flags; + ASSERT_SUCCESS( + uur::GetDeviceUSMSingleSharedSupport(device, shared_usm_flags)); + if (!(shared_usm_flags & UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS)) { + GTEST_SKIP() << "Shared USM is not supported."; + } + + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &shared_ptr)); + ASSERT_NE(shared_ptr, nullptr); + std::memset(shared_ptr, 0, allocation_size); + + for (size_t k = 0; k < num_kernels; k++) { + void *offset_ptr = (uint32_t *)shared_ptr + (k * elements); + ASSERT_SUCCESS( + urKernelSetArgPointer(kernel, 0, nullptr, &offset_ptr)); + + uint32_t fill_val = val + k; + ASSERT_SUCCESS(urKernelSetArgValue(kernel, 1, sizeof(fill_val), + nullptr, &fill_val)); + + ASSERT_SUCCESS(urCommandBufferAppendKernelLaunchExp( + updatable_cmd_buf_handle, kernel, n_dimensions, &global_offset, + &elements, &local_size, 0, nullptr, nullptr, + &command_handles[k])); + ASSERT_NE(command_handles[k], nullptr); + } + + ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); + } + + void ValidateAllocation(int *pointer, size_t length, int val) { + for (size_t i = 0; i < length; i++) { + ASSERT_EQ(pointer[i], val); + } + } + + void TearDown() override { + if (shared_ptr) { + ASSERT_SUCCESS(urUSMFree(context, shared_ptr)); + } + + if (new_shared_ptr) { + ASSERT_SUCCESS(urUSMFree(context, new_shared_ptr)); + } + + UUR_RETURN_ON_FATAL_FAILURE( + urExpUpdatableCommandBufferTests::TearDown()); + } + + uint32_t val = 42; + size_t local_size = 4; + static constexpr size_t global_size = 64; + size_t global_offset = 0; + size_t n_dimensions = 1; + const size_t allocation_size = sizeof(val) * global_size; + + void *shared_ptr = nullptr; + void *new_shared_ptr = nullptr; + static constexpr size_t num_kernels = 8; + const size_t elements = global_size / num_kernels; + std::array + command_handles; +}; + +UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(USMMultipleFillCommandTest); + +TEST_P(USMMultipleFillCommandTest, UpdateAllKernels) { + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *output = (uint32_t *)shared_ptr; + for (size_t i = 0; i < global_size; i++) { + uint32_t expected = val + (i / elements); + ASSERT_EQ(expected, output[i]); + } + + ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr, + allocation_size, &new_shared_ptr)); + ASSERT_NE(new_shared_ptr, nullptr); + std::memset(new_shared_ptr, 0, allocation_size); + + uint32_t new_val = 33; + for (size_t k = 0; k < num_kernels; k++) { + + void *offset_ptr = (uint32_t *)new_shared_ptr + (k * elements); + ur_exp_command_buffer_update_pointer_arg_desc_t new_output_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype + nullptr, // pNext, + 0, // argIndex, + nullptr, // pProperties + &offset_ptr, // pArgValue + }; + + uint32_t new_fill_val = new_val + k; + ur_exp_command_buffer_update_value_arg_desc_t new_input_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype + nullptr, // pNext, + 1, // argIndex, + sizeof(int), // argSize, + nullptr, // pProperties + &new_fill_val, // hArgValue + }; + + ur_exp_command_buffer_update_kernel_launch_desc_t update_desc = { + UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype + nullptr, // pNext + 0, // numMemobjArgs + 1, // numPointerArgs + 1, // numValueArgs + 0, // numExecInfos + 0, // workDim; + nullptr, // pArgMemobjList + &new_output_desc, // pArgPointerList + &new_input_desc, // pArgValueList + nullptr, // pArgExecInfoList + nullptr, // pGlobalWorkOffset + nullptr, // pGlobalWorkSize + nullptr, // pLocalWorkSize + }; + + ASSERT_SUCCESS(urCommandBufferUpdateKernelLaunchExp(command_handles[k], + &update_desc)); + } + + ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); + + uint32_t *updated_output = (uint32_t *)new_shared_ptr; + for (size_t i = 0; i < global_size; i++) { + uint32_t expected = new_val + (i / elements); + ASSERT_EQ(expected, updated_output[i]) << i; + } +}