
Commit

Merge pull request #2117 from pbalcer/fix-filter-out-same-cmdlists
Fix urEnqueueEventsWaitWithBarrier when used with interop events
pbalcer authored Sep 24, 2024
2 parents 1d1808a + 96f66e0 commit 7a2caca
Showing 3 changed files with 170 additions and 44 deletions.
104 changes: 60 additions & 44 deletions source/adapters/level_zero/event.cpp
@@ -171,48 +171,63 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
std::scoped_lock<ur_shared_mutex> lock(Queue->Mutex);

// Helper function for appending a barrier to a command list.
auto insertBarrierIntoCmdList =
[&Queue](ur_command_list_ptr_t CmdList,
const _ur_ze_event_list_t &EventWaitList,
ur_event_handle_t &Event, bool IsInternal) {
UR_CALL(createEventAndAssociateQueue(
Queue, &Event, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, CmdList,
IsInternal, false));

Event->WaitList = EventWaitList;

// For in-order queue we don't need a real barrier, just wait for
// requested events in potentially different queues and add a "barrier"
// event signal because it is already guaranteed that previous commands
// in this queue are completed when the signal is started.
//
// The only consideration here is that when profiling is used, signalEvent
// cannot be used if EventWaitList.Length == 0. In those cases, we need
// to fall back directly to a barrier to have correct timestamps. See here:
// https://spec.oneapi.io/level-zero/latest/core/api.html?highlight=appendsignalevent#_CPPv430zeCommandListAppendSignalEvent24ze_command_list_handle_t17ze_event_handle_t
//
// TODO: this and other special handling of in-order queues to be
// updated when/if Level Zero adds native support for in-order queues.
//
if (Queue->isInOrderQueue() && InOrderBarrierBySignal &&
!Queue->isProfilingEnabled()) {
// If we are using driver in order lists, then append wait on events
// is unnecessary and we can signal the event created.
if (EventWaitList.Length && !CmdList->second.IsInOrderList) {
ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
(CmdList->first, EventWaitList.Length,
EventWaitList.ZeEventList));
auto insertBarrierIntoCmdList = [&Queue](ur_command_list_ptr_t CmdList,
_ur_ze_event_list_t &EventWaitList,
ur_event_handle_t &Event,
bool IsInternal) {
UR_CALL(createEventAndAssociateQueue(Queue, &Event,
UR_COMMAND_EVENTS_WAIT_WITH_BARRIER,
CmdList, IsInternal, false));

Event->WaitList = EventWaitList;

// For in-order queue we don't need a real barrier, just wait for
// requested events in potentially different queues and add a "barrier"
// event signal because it is already guaranteed that previous commands
// in this queue are completed when the signal is started.
//
// The only consideration here is that when profiling is used, signalEvent
// cannot be used if EventWaitList.Length == 0. In those cases, we need
// to fall back directly to a barrier to have correct timestamps. See here:
// https://spec.oneapi.io/level-zero/latest/core/api.html?highlight=appendsignalevent#_CPPv430zeCommandListAppendSignalEvent24ze_command_list_handle_t17ze_event_handle_t
//
// TODO: this and other special handling of in-order queues to be
// updated when/if Level Zero adds native support for in-order queues.
//
if (Queue->isInOrderQueue() && InOrderBarrierBySignal &&
!Queue->isProfilingEnabled()) {
// If we are using driver in order lists, then append wait on events
// is unnecessary IF the cmdlists match.
if (EventWaitList.Length) {
if (CmdList->second.IsInOrderList) {
for (unsigned i = EventWaitList.Length; i-- > 0;) {
// If the event is from the same cmdlist, we can remove it
// from the waitlist.
if (EventWaitList.UrEventList[i]->CommandList == CmdList) {
EventWaitList.Length--;
if (EventWaitList.Length != i) {
std::swap(EventWaitList.UrEventList[i],
EventWaitList.UrEventList[EventWaitList.Length]);
std::swap(EventWaitList.ZeEventList[i],
EventWaitList.ZeEventList[EventWaitList.Length]);
}
}
}
ZE2UR_CALL(zeCommandListAppendSignalEvent,
(CmdList->first, Event->ZeEvent));
} else {
ZE2UR_CALL(zeCommandListAppendBarrier,
(CmdList->first, Event->ZeEvent, EventWaitList.Length,
EventWaitList.ZeEventList));
}
ZE2UR_CALL(
zeCommandListAppendWaitOnEvents,
(CmdList->first, EventWaitList.Length, EventWaitList.ZeEventList));
}
ZE2UR_CALL(zeCommandListAppendSignalEvent,
(CmdList->first, Event->ZeEvent));
} else {
ZE2UR_CALL(zeCommandListAppendBarrier,
(CmdList->first, Event->ZeEvent, EventWaitList.Length,
EventWaitList.ZeEventList));
}

return UR_RESULT_SUCCESS;
};
return UR_RESULT_SUCCESS;
};

// If the queue is in-order then each command in it effectively acts as a
// barrier, so we don't need to do anything except if we were requested
@@ -349,9 +364,9 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
// command-lists.
std::vector<ur_event_handle_t> EventWaitVector(CmdLists.size());
for (size_t I = 0; I < CmdLists.size(); ++I) {
UR_CALL(insertBarrierIntoCmdList(CmdLists[I], _ur_ze_event_list_t{},
EventWaitVector[I],
true /*IsInternal*/));
_ur_ze_event_list_t waitlist;
UR_CALL(insertBarrierIntoCmdList(
CmdLists[I], waitlist, EventWaitVector[I], true /*IsInternal*/));
}
// If there were multiple queues we need to create a "convergence" event to
// be our active barrier. This convergence event is signalled by a barrier
@@ -376,8 +391,9 @@ ur_result_t urEnqueueEventsWaitWithBarrier(
// If there is only a single queue then insert a barrier and the single
// result event can be used as our active barrier and used as the return
// event. Take into account whether output event is discarded or not.
UR_CALL(insertBarrierIntoCmdList(CmdLists[0], _ur_ze_event_list_t{},
ResultEvent, IsInternal));
_ur_ze_event_list_t waitlist;
UR_CALL(insertBarrierIntoCmdList(CmdLists[0], waitlist, ResultEvent,
IsInternal));
}

// Execute each command list so the barriers can be encountered.
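
For readers skimming the diff: for driver in-order command lists, the barrier no longer skips the wait on the incoming events altogether; it removes only the wait-list entries produced by the same command list (already ordered by that list) and still waits on the rest, such as interop events signalled from outside the queue. Below is a minimal, self-contained sketch of that swap-and-shrink pass, using hypothetical names (Ev, dropEventsFromCmdList) rather than the adapter's real types.

#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Hypothetical stand-ins for the adapter's event data: each UR event records
// the command list it came from, and a parallel array holds the matching
// native Level Zero handles (mirroring UrEventList/ZeEventList).
struct Ev {
  int cmdList;
};

// Swap-and-shrink removal of every event that belongs to currentCmdList.
// Iterating backwards means the element swapped into slot i has already been
// examined, so a single pass is enough.
void dropEventsFromCmdList(std::vector<Ev> &urEvents,
                           std::vector<int> &zeHandles, uint32_t &length,
                           int currentCmdList) {
  for (uint32_t i = length; i-- > 0;) {
    if (urEvents[i].cmdList == currentCmdList) {
      --length;
      if (length != i) {
        std::swap(urEvents[i], urEvents[length]);
        std::swap(zeHandles[i], zeHandles[length]);
      }
    }
  }
}

int main() {
  std::vector<Ev> urEvents = {{1}, {2}, {1}, {3}};
  std::vector<int> zeHandles = {10, 20, 30, 40};
  uint32_t length = 4;
  dropEventsFromCmdList(urEvents, zeHandles, length, /*currentCmdList=*/1);
  // Only the events from command lists 2 and 3 remain in the first `length` slots.
  for (uint32_t i = 0; i < length; ++i)
    std::cout << urEvents[i].cmdList << " -> " << zeHandles[i] << "\n";
  return 0;
}

Swapping into both parallel arrays keeps Length consistent with UrEventList and ZeEventList, which is what the patch does before appending the signal.
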
1 change: 1 addition & 0 deletions test/adapters/level_zero/CMakeLists.txt
@@ -15,6 +15,7 @@ if(UR_BUILD_ADAPTER_L0)
SOURCES
urProgramLink.cpp
urKernelCreateWithNativeHandle.cpp
urEventCreateWithNativeHandle.cpp
ENVIRONMENT
"UR_ADAPTERS_FORCE_LOAD=\"$<TARGET_FILE:ur_adapter_level_zero>\""
)
109 changes: 109 additions & 0 deletions test/adapters/level_zero/urEventCreateWithNativeHandle.cpp
@@ -0,0 +1,109 @@
// Copyright (C) 2024 Intel Corporation
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
// See LICENSE.TXT
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "ur_api.h"
#include "uur/checks.h"
#include "ze_api.h"
#include <chrono>
#include <cstdlib>
#include <cstring>
#include <thread>
#include <uur/fixtures.h>

using namespace std::chrono_literals;
using urLevelZeroEventNativeHandleTest = uur::urQueueTest;
UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urLevelZeroEventNativeHandleTest);

#define TEST_MEMCPY_SIZE 4096

TEST_P(urLevelZeroEventNativeHandleTest, WaitForNative) {
ze_event_pool_desc_t desc;
desc.stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC;
desc.pNext = nullptr;
desc.count = 1;
desc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;

ur_native_handle_t nativeContext;
ASSERT_SUCCESS(urContextGetNativeHandle(context, &nativeContext));

ur_native_handle_t nativeDevice;
ASSERT_SUCCESS(urDeviceGetNativeHandle(device, &nativeDevice));

ze_event_pool_handle_t pool = nullptr;

ASSERT_EQ(zeEventPoolCreate((ze_context_handle_t)nativeContext, &desc, 1,
(ze_device_handle_t *)&nativeDevice, &pool),
ZE_RESULT_SUCCESS);

ze_event_desc_t eventDesc;
eventDesc.pNext = nullptr;
eventDesc.stype = ZE_STRUCTURE_TYPE_EVENT_DESC;
eventDesc.index = 0;
eventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
eventDesc.wait = 0;

ze_event_handle_t zeEvent;
ASSERT_EQ(zeEventCreate(pool, &eventDesc, &zeEvent), ZE_RESULT_SUCCESS);

ur_event_native_properties_t pprops;
pprops.isNativeHandleOwned = false;
pprops.pNext = nullptr;
pprops.stype = UR_STRUCTURE_TYPE_EVENT_NATIVE_PROPERTIES;

ur_event_handle_t urEvent;
ASSERT_SUCCESS(urEventCreateWithNativeHandle((ur_native_handle_t)zeEvent,
context, &pprops, &urEvent));

int *src = (int *)malloc(TEST_MEMCPY_SIZE);
memset(src, 0xc, TEST_MEMCPY_SIZE);

int *dst = (int *)malloc(TEST_MEMCPY_SIZE);
memset(dst, 0, TEST_MEMCPY_SIZE);

int *dst2 = (int *)malloc(TEST_MEMCPY_SIZE);
memset(dst2, 0, TEST_MEMCPY_SIZE);

ur_event_handle_t memcpyEvent2;
ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, dst2, src, TEST_MEMCPY_SIZE,
0, nullptr, &memcpyEvent2));

ur_event_handle_t memcpyEvent3;
ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, dst2, src, TEST_MEMCPY_SIZE,
0, nullptr, &memcpyEvent3));

// Just to make the wait list contain more than one event.
ur_event_handle_t events[] = {memcpyEvent2, urEvent, memcpyEvent3};

ur_event_handle_t waitEvent;
ASSERT_SUCCESS(
urEnqueueEventsWaitWithBarrier(queue, 3, events, &waitEvent));

ur_event_handle_t memcpyEvent;
ASSERT_SUCCESS(urEnqueueUSMMemcpy(queue, false, dst, src, TEST_MEMCPY_SIZE,
1, &waitEvent, &memcpyEvent));

// urQueueFinish would hang, so we flush and then wait some time
// to make sure the GPU had plenty of time to do the memcpy.
urQueueFlush(queue);
std::this_thread::sleep_for(500ms);

ASSERT_NE(memcmp(src, dst, TEST_MEMCPY_SIZE), 0);

zeEventHostSignal(zeEvent);

urQueueFinish(queue);

ASSERT_EQ(memcmp(src, dst, TEST_MEMCPY_SIZE), 0);

free(src);
free(dst);
free(dst2);
urEventRelease(urEvent);
urEventRelease(waitEvent);
urEventRelease(memcpyEvent);
urEventRelease(memcpyEvent2);
urEventRelease(memcpyEvent3);
zeEventDestroy(zeEvent);
zeEventPoolDestroy(pool);
}
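
The test flushes and sleeps for a fixed 500 ms because urQueueFinish would block on the still-unsignalled interop event. As a hedged alternative sketch, assuming urEventGetInfo with UR_EVENT_INFO_COMMAND_EXECUTION_STATUS behaves as described in the Unified Runtime spec, the gated memcpy event could be polled instead of relying on a single blind delay; the helper below is illustrative only, not part of the committed test.

#include <chrono>
#include <thread>

#include "ur_api.h"

// Sketch only: returns true if `event` stays incomplete for the whole window.
// Assumes UR_EVENT_INFO_COMMAND_EXECUTION_STATUS and UR_EVENT_STATUS_COMPLETE
// as defined in ur_api.h.
inline bool staysIncompleteFor(ur_event_handle_t event,
                               std::chrono::milliseconds window) {
  const auto deadline = std::chrono::steady_clock::now() + window;
  while (std::chrono::steady_clock::now() < deadline) {
    ur_event_status_t status;
    if (urEventGetInfo(event, UR_EVENT_INFO_COMMAND_EXECUTION_STATUS,
                       sizeof(status), &status, nullptr) != UR_RESULT_SUCCESS) {
      return false;
    }
    if (status == UR_EVENT_STATUS_COMPLETE) {
      return false;
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(10));
  }
  return true;
}

The pre-signal memcmp check could then be paired with ASSERT_TRUE(staysIncompleteFor(memcpyEvent, 500ms)).
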
