Skip to content

Commit

Permalink
Merge pull request #1629 from Bensuo/ewan/L0_update_host_wait
Browse files Browse the repository at this point in the history
Use fence rather than event for sync in L0 command-buffer update
  • Loading branch information
kbenzie authored Jun 10, 2024
2 parents ed4211c + 721d63c commit 0f118d7
Show file tree
Hide file tree
Showing 9 changed files with 298 additions and 35 deletions.
4 changes: 3 additions & 1 deletion include/ur_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -8742,7 +8742,9 @@ urCommandBufferReleaseCommandExp(
);

///////////////////////////////////////////////////////////////////////////////
/// @brief Update a kernel launch command in a finalized command-buffer.
/// @brief Update a kernel launch command in a finalized command-buffer. This
/// entry-point is synchronous and may block if the command-buffer is
/// executing when the entry-point is called.
///
/// @returns
/// - ::UR_RESULT_SUCCESS
Expand Down
2 changes: 1 addition & 1 deletion scripts/core/exp-command-buffer.yml
Original file line number Diff line number Diff line change
Expand Up @@ -900,7 +900,7 @@ returns:
- $X_RESULT_ERROR_OUT_OF_HOST_MEMORY
--- #--------------------------------------------------------------------------
type: function
desc: "Update a kernel launch command in a finalized command-buffer."
desc: "Update a kernel launch command in a finalized command-buffer. This entry-point is synchronous and may block if the command-buffer is executing when the entry-point is called."
class: $xCommandBuffer
name: UpdateKernelLaunchExp
params:
Expand Down
34 changes: 22 additions & 12 deletions source/adapters/level_zero/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,8 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_(
const ur_exp_command_buffer_desc_t *Desc, const bool IsInOrderCmdList)
: Context(Context), Device(Device), ZeCommandList(CommandList),
ZeCommandListResetEvents(CommandListResetEvents),
ZeCommandListDesc(ZeDesc), ZeFencesList(), QueueProperties(),
SyncPoints(), NextSyncPoint(0),
ZeCommandListDesc(ZeDesc), ZeFencesMap(), ZeActiveFence(nullptr),
QueueProperties(), SyncPoints(), NextSyncPoint(0),
IsUpdatable(Desc ? Desc->isUpdatable : false),
IsProfilingEnabled(Desc ? Desc->enableProfiling : false),
IsInOrderCmdList(IsInOrderCmdList) {
Expand Down Expand Up @@ -102,8 +102,9 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {
urEventReleaseInternal(Event);
}

// Release Fences allocated to command_buffer
for (auto &ZeFence : ZeFencesList) {
// Release fences allocated to command-buffer
for (auto &ZeFencePair : ZeFencesMap) {
auto &ZeFence = ZeFencePair.second;
ZE_CALL_NOCHECK(zeFenceDestroy, (ZeFence));
}

Expand Down Expand Up @@ -1053,11 +1054,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
uint32_t QueueGroupOrdinal;
auto &ZeCommandQueue = QGroup.getZeQueue(&QueueGroupOrdinal);

ze_fence_handle_t ZeFence;
ZeStruct<ze_fence_desc_t> ZeFenceDesc;

ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
CommandBuffer->ZeFencesList.push_back(ZeFence);
// If a fence has already been created for this queue, reset and reuse it;
// otherwise create a new fence for the queue.
ze_fence_handle_t &ZeFence = CommandBuffer->ZeActiveFence;
auto ZeWorkloadFenceForQueue =
CommandBuffer->ZeFencesMap.find(ZeCommandQueue);
if (ZeWorkloadFenceForQueue == CommandBuffer->ZeFencesMap.end()) {
ZeStruct<ze_fence_desc_t> ZeFenceDesc;
ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
CommandBuffer->ZeFencesMap.insert({{ZeCommandQueue, ZeFence}});
} else {
ZeFence = ZeWorkloadFenceForQueue->second;
ZE2UR_CALL(zeFenceReset, (ZeFence));
}

bool MustSignalWaitEvent = true;
if (NumEventsInWaitList) {
Expand Down Expand Up @@ -1458,10 +1467,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
MutableCommandDesc.flags = 0;

// We must synchronize mutable command list execution before mutating.
ZE2UR_CALL(zeEventHostSynchronize,
(CommandBuffer->SignalEvent->ZeEvent, UINT64_MAX));
if (ze_fence_handle_t &ZeFence = CommandBuffer->ZeActiveFence) {
ZE2UR_CALL(zeFenceHostSynchronize, (ZeFence, UINT64_MAX));
}

auto Plt = Command->CommandBuffer->Context->getPlatform();
auto Plt = CommandBuffer->Context->getPlatform();
UR_ASSERT(Plt->ZeMutableCmdListExt.Supported,
UR_RESULT_ERROR_UNSUPPORTED_FEATURE);
ZE2UR_CALL(Plt->ZeMutableCmdListExt.zexCommandListUpdateMutableCommandsExp,
Expand Down
11 changes: 7 additions & 4 deletions source/adapters/level_zero/command_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,13 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object {
ze_command_list_handle_t ZeCommandListResetEvents;
// Level Zero command list descriptor
ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
// List of Level Zero fences created when submitting a graph.
// This list is needed to release all fences retained by the
// command_buffer.
std::vector<ze_fence_handle_t> ZeFencesList;
// Level Zero fences for each queue the command-buffer has been enqueued to.
// These should be destroyed when the command-buffer is released.
std::unordered_map<ze_command_queue_handle_t, ze_fence_handle_t> ZeFencesMap;
// The Level Zero fence from the most recent enqueue of the command-buffer.
// It is always one of the fences stored in ZeFencesMap, so it does not need
// to be destroyed separately.
ze_fence_handle_t ZeActiveFence;
// Queue properties from command-buffer descriptor
// TODO: Do we need these?
ur_queue_properties_t QueueProperties;
Expand Down
4 changes: 3 additions & 1 deletion source/loader/ur_libapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8117,7 +8117,9 @@ ur_result_t UR_APICALL urCommandBufferReleaseCommandExp(
}

///////////////////////////////////////////////////////////////////////////////
/// @brief Update a kernel launch command in a finalized command-buffer.
/// @brief Update a kernel launch command in a finalized command-buffer. This
/// entry-point is synchronous and may block if the command-buffer is
/// executing when the entry-point is called.
///
/// @returns
/// - ::UR_RESULT_SUCCESS
Expand Down
4 changes: 3 additions & 1 deletion source/ur_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6868,7 +6868,9 @@ ur_result_t UR_APICALL urCommandBufferReleaseCommandExp(
}

///////////////////////////////////////////////////////////////////////////////
/// @brief Update a kernel launch command in a finalized command-buffer.
/// @brief Update a kernel launch command in a finalized command-buffer. This
/// entry-point is synchronous and may block if the command-buffer is
/// executing when the entry-point is called.
///
/// @returns
/// - ::UR_RESULT_SUCCESS
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@
{{OPT}}BufferFillCommandTest.OverrideUpdate/SYCL_NATIVE_CPU___SYCL_Native_CPU_
{{OPT}}BufferFillCommandTest.OverrideArgList/SYCL_NATIVE_CPU___SYCL_Native_CPU_
{{OPT}}USMFillCommandTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_
{{OPT}}USMFillCommandTest.UpdateBeforeEnqueue/SYCL_NATIVE_CPU___SYCL_Native_CPU_
{{OPT}}USMMultipleFillCommandTest.UpdateAllKernels/SYCL_NATIVE_CPU___SYCL_Native_CPU_
{{OPT}}BufferSaxpyKernelTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_
{{OPT}}USMSaxpyKernelTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_
{{OPT}}USMMultiSaxpyKernelTest.UpdateParameters/SYCL_NATIVE_CPU___SYCL_Native_CPU_
{{OPT}}USMMultiSaxpyKernelTest.UpdateWithoutBlocking/SYCL_NATIVE_CPU___SYCL_Native_CPU_
{{OPT}}NDRangeUpdateTest.Update3D/SYCL_NATIVE_CPU___SYCL_Native_CPU_
{{OPT}}NDRangeUpdateTest.Update2D/SYCL_NATIVE_CPU___SYCL_Native_CPU_
{{OPT}}NDRangeUpdateTest.Update1D/SYCL_NATIVE_CPU___SYCL_Native_CPU_
Expand Down
53 changes: 53 additions & 0 deletions test/conformance/exp_command_buffer/usm_fill_kernel_update.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,59 @@ TEST_P(USMFillCommandTest, UpdateParameters) {
Validate((uint32_t *)new_shared_ptr, new_global_size, new_val);
}

// Checks that a kernel command can be updated while its command-buffer has
// not been enqueued yet, and that the subsequent enqueue writes the updated
// fill value into the updated output allocation.
TEST_P(USMFillCommandTest, UpdateBeforeEnqueue) {
    // Zero-initialised shared USM allocation used as the replacement output.
    ASSERT_SUCCESS(urUSMSharedAlloc(context, device, nullptr, nullptr,
                                    allocation_size, &new_shared_ptr));
    ASSERT_NE(new_shared_ptr, nullptr);
    std::memset(new_shared_ptr, 0, allocation_size);

    // Replacement fill value, supplied as the kernel argument at index 1.
    uint32_t fill_value = 33;
    ur_exp_command_buffer_update_value_arg_desc_t value_arg_update = {
        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_VALUE_ARG_DESC, // stype
        nullptr,            // pNext
        1,                  // argIndex
        sizeof(fill_value), // argSize
        nullptr,            // pProperties
        &fill_value,        // address of the new argument value
    };

    // Replacement output pointer, supplied as the kernel argument at index 0.
    ur_exp_command_buffer_update_pointer_arg_desc_t pointer_arg_update = {
        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_POINTER_ARG_DESC, // stype
        nullptr,         // pNext
        0,               // argIndex
        nullptr,         // pProperties
        &new_shared_ptr, // address of the new USM pointer
    };

    // Only one pointer argument and one value argument change; the ND-range
    // configuration is left untouched (work dimension 0, null size arrays).
    ur_exp_command_buffer_update_kernel_launch_desc_t launch_update = {
        UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_UPDATE_KERNEL_LAUNCH_DESC, // stype
        nullptr,             // pNext
        0,                   // numNewMemObjArgs
        1,                   // numNewPointerArgs
        1,                   // numNewValueArgs
        0,                   // newWorkDim
        nullptr,             // pNewMemObjArgList
        &pointer_arg_update, // pNewPointerArgList
        &value_arg_update,   // pNewValueArgList
        nullptr,             // pNewGlobalWorkOffset
        nullptr,             // pNewGlobalWorkSize
        nullptr,             // pNewLocalWorkSize
    };

    // Apply the update first, then submit the command-buffer and wait for the
    // queue to drain.
    ASSERT_SUCCESS(
        urCommandBufferUpdateKernelLaunchExp(command_handle, &launch_update));
    ASSERT_SUCCESS(urCommandBufferEnqueueExp(updatable_cmd_buf_handle, queue, 0,
                                             nullptr, nullptr));
    ASSERT_SUCCESS(urQueueFinish(queue));

    // The new allocation should now hold the updated value across the
    // original global size.
    Validate((uint32_t *)new_shared_ptr, global_size, fill_value);
}

// Test updating a command-buffer with multiple USM fill kernel commands
struct USMMultipleFillCommandTest
: uur::command_buffer::urUpdatableCommandBufferExpExecutionTest {
Expand Down
Loading

0 comments on commit 0f118d7

Please sign in to comment.