Skip to content

Commit

Permalink
Merge pull request #1370 from winstonzhang-intel/counter-based-events
Browse files Browse the repository at this point in the history
[L0] Support for counter-based events using L0 driver
  • Loading branch information
kbenzie authored Apr 26, 2024
2 parents ee07570 + 39fcb2b commit f4a9497
Show file tree
Hide file tree
Showing 10 changed files with 143 additions and 54 deletions.
2 changes: 2 additions & 0 deletions source/adapters/level_zero/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -933,6 +933,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
MustSignalWaitEvent = false;
}
}
// Given WaitEvent was created without specifying Counting Events, then this
// event can be signalled on the host.
if (MustSignalWaitEvent) {
ZE2UR_CALL(zeEventHostSignal, (CommandBuffer->WaitEvent->ZeEvent));
}
Expand Down
33 changes: 27 additions & 6 deletions source/adapters/level_zero/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,8 @@ static const uint32_t MaxNumEventsPerPool = [] {

ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool(
ze_event_pool_handle_t &Pool, size_t &Index, bool HostVisible,
bool ProfilingEnabled, ur_device_handle_t Device) {
bool ProfilingEnabled, ur_device_handle_t Device,
bool CounterBasedEventEnabled, bool UsingImmCmdList) {
// Lock while updating event pool machinery.
std::scoped_lock<ur_mutex> Lock(ZeEventPoolCacheMutex);

Expand All @@ -481,7 +482,8 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool(
ZeDevice = Device->ZeDevice;
}
std::list<ze_event_pool_handle_t> *ZePoolCache =
getZeEventPoolCache(HostVisible, ProfilingEnabled, ZeDevice);
getZeEventPoolCache(HostVisible, ProfilingEnabled,
CounterBasedEventEnabled, UsingImmCmdList, ZeDevice);

if (!ZePoolCache->empty()) {
if (NumEventsAvailableInEventPool[ZePoolCache->front()] == 0) {
Expand All @@ -506,15 +508,27 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool(
Index = 0;
// Create one event ZePool per MaxNumEventsPerPool events
if (*ZePool == nullptr) {
ze_event_pool_counter_based_exp_desc_t counterBasedExt = {
ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC};
ZeStruct<ze_event_pool_desc_t> ZeEventPoolDesc;
ZeEventPoolDesc.count = MaxNumEventsPerPool;
ZeEventPoolDesc.flags = 0;
ZeEventPoolDesc.pNext = nullptr;
if (HostVisible)
ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
if (ProfilingEnabled)
ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
logger::debug("ze_event_pool_desc_t flags set to: {}",
ZeEventPoolDesc.flags);
if (CounterBasedEventEnabled) {
if (UsingImmCmdList) {
counterBasedExt.flags = ZE_EVENT_POOL_COUNTER_BASED_EXP_FLAG_IMMEDIATE;
} else {
counterBasedExt.flags =
ZE_EVENT_POOL_COUNTER_BASED_EXP_FLAG_NON_IMMEDIATE;
}
ZeEventPoolDesc.pNext = &counterBasedExt;
}

std::vector<ze_device_handle_t> ZeDevices;
if (ZeDevice) {
Expand All @@ -540,14 +554,18 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool(
}

ur_event_handle_t ur_context_handle_t_::getEventFromContextCache(
bool HostVisible, bool WithProfiling, ur_device_handle_t Device) {
bool HostVisible, bool WithProfiling, ur_device_handle_t Device,
bool CounterBasedEventEnabled) {
std::scoped_lock<ur_mutex> Lock(EventCacheMutex);
auto Cache = getEventCache(HostVisible, WithProfiling, Device);
if (Cache->empty())
return nullptr;

auto It = Cache->begin();
ur_event_handle_t Event = *It;
if (Event->CounterBasedEventsEnabled != CounterBasedEventEnabled) {
return nullptr;
}
Cache->erase(It);
// We have to reset event before using it.
Event->reset();
Expand Down Expand Up @@ -579,13 +597,16 @@ ur_context_handle_t_::decrementUnreleasedEventsInPool(ur_event_handle_t Event) {
}

ze_device_handle_t ZeDevice = nullptr;
bool UsingImmediateCommandlists =
!Event->UrQueue || Event->UrQueue->UsingImmCmdLists;

if (!Event->IsMultiDevice && Event->UrQueue) {
ZeDevice = Event->UrQueue->Device->ZeDevice;
}

std::list<ze_event_pool_handle_t> *ZePoolCache = getZeEventPoolCache(
Event->isHostVisible(), Event->isProfilingEnabled(), ZeDevice);
Event->isHostVisible(), Event->isProfilingEnabled(),
Event->CounterBasedEventsEnabled, UsingImmediateCommandlists, ZeDevice);

// Put the empty pool to the cache of the pools.
if (NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0)
Expand Down Expand Up @@ -683,8 +704,8 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList(
// Make sure to acquire the lock before checking the size, or there
// will be a race condition.
std::scoped_lock<ur_mutex> Lock(Queue->Context->ZeCommandListCacheMutex);
// Under mutex since operator[] does insertion on the first usage for every
// unique ZeDevice.
// Under mutex since operator[] does insertion on the first usage for
// every unique ZeDevice.
auto &ZeCommandListCache =
UseCopyEngine
? Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice]
Expand Down
85 changes: 55 additions & 30 deletions source/adapters/level_zero/context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,9 +146,9 @@ struct ur_context_handle_t_ : _ur_object {
// head.
//
// Cache of event pools to which host-visible events are added to.
std::vector<std::list<ze_event_pool_handle_t>> ZeEventPoolCache{4};
std::vector<std::list<ze_event_pool_handle_t>> ZeEventPoolCache{12};
std::vector<std::unordered_map<ze_device_handle_t, size_t>>
ZeEventPoolCacheDeviceMap{4};
ZeEventPoolCacheDeviceMap{12};

// This map will be used to determine if a pool is full or not
// by storing number of empty slots available in the pool.
Expand Down Expand Up @@ -199,48 +199,73 @@ struct ur_context_handle_t_ : _ur_object {
ur_result_t getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &, size_t &,
bool HostVisible,
bool ProfilingEnabled,
ur_device_handle_t Device);
ur_device_handle_t Device,
bool CounterBasedEventEnabled,
bool UsingImmCmdList);

// Get ur_event_handle_t from cache.
ur_event_handle_t getEventFromContextCache(bool HostVisible,
bool WithProfiling,
ur_device_handle_t Device);
ur_device_handle_t Device,
bool CounterBasedEventEnabled);

// Add ur_event_handle_t to cache.
void addEventToContextCache(ur_event_handle_t);

// Discriminates the event-pool caches kept per context. NOTE: enumerator
// order is significant — the numeric value is used as an index pair into
// ZeEventPoolCache / ZeEventPoolCacheDeviceMap (slots CacheType * 2 and
// CacheType * 2 + 1, profiling vs. non-profiling), and those containers are
// sized 12 = 6 cache types x 2. Do not reorder or insert enumerators
// without resizing the caches accordingly.
enum EventPoolCacheType {
  HostVisibleCacheType,   // host-visible, regular (non-counter-based) events
  HostInvisibleCacheType, // device-only, regular (non-counter-based) events
  // Counter-based events split further by command-list type, since the
  // pool must be created with the matching IMMEDIATE / NON_IMMEDIATE flag.
  HostVisibleCounterBasedRegularCacheType,
  HostInvisibleCounterBasedRegularCacheType,
  HostVisibleCounterBasedImmediateCacheType,
  HostInvisibleCounterBasedImmediateCacheType
};

std::list<ze_event_pool_handle_t> *
getZeEventPoolCache(bool HostVisible, bool WithProfiling,
bool CounterBasedEventEnabled, bool UsingImmediateCmdList,
ze_device_handle_t ZeDevice) {
if (HostVisible) {
if (ZeDevice) {
auto ZeEventPoolCacheMap = WithProfiling
? &ZeEventPoolCacheDeviceMap[0]
: &ZeEventPoolCacheDeviceMap[1];
if (ZeEventPoolCacheMap->find(ZeDevice) == ZeEventPoolCacheMap->end()) {
ZeEventPoolCache.emplace_back();
ZeEventPoolCacheMap->insert(
std::make_pair(ZeDevice, ZeEventPoolCache.size() - 1));
}
return &ZeEventPoolCache[(*ZeEventPoolCacheMap)[ZeDevice]];
} else {
return WithProfiling ? &ZeEventPoolCache[0] : &ZeEventPoolCache[1];
EventPoolCacheType CacheType;

calculateCacheIndex(HostVisible, CounterBasedEventEnabled,
UsingImmediateCmdList, CacheType);
if (ZeDevice) {
auto ZeEventPoolCacheMap =
WithProfiling ? &ZeEventPoolCacheDeviceMap[CacheType * 2]
: &ZeEventPoolCacheDeviceMap[CacheType * 2 + 1];
if (ZeEventPoolCacheMap->find(ZeDevice) == ZeEventPoolCacheMap->end()) {
ZeEventPoolCache.emplace_back();
ZeEventPoolCacheMap->insert(
std::make_pair(ZeDevice, ZeEventPoolCache.size() - 1));
}
return &ZeEventPoolCache[(*ZeEventPoolCacheMap)[ZeDevice]];
} else {
if (ZeDevice) {
auto ZeEventPoolCacheMap = WithProfiling
? &ZeEventPoolCacheDeviceMap[2]
: &ZeEventPoolCacheDeviceMap[3];
if (ZeEventPoolCacheMap->find(ZeDevice) == ZeEventPoolCacheMap->end()) {
ZeEventPoolCache.emplace_back();
ZeEventPoolCacheMap->insert(
std::make_pair(ZeDevice, ZeEventPoolCache.size() - 1));
}
return &ZeEventPoolCache[(*ZeEventPoolCacheMap)[ZeDevice]];
} else {
return WithProfiling ? &ZeEventPoolCache[2] : &ZeEventPoolCache[3];
}
return WithProfiling ? &ZeEventPoolCache[CacheType * 2]
: &ZeEventPoolCache[CacheType * 2 + 1];
}
}

// Map the three event properties onto the EventPoolCacheType used to index
// the per-context event-pool caches. First split on whether counter-based
// events are in use, then (for counter-based) on the command-list flavor;
// host visibility picks between the paired enumerators in every branch.
// Always succeeds; the ur_result_t return matches the adapter's convention.
ur_result_t calculateCacheIndex(bool HostVisible,
                                bool CounterBasedEventEnabled,
                                bool UsingImmediateCmdList,
                                EventPoolCacheType &CacheType) {
  if (!CounterBasedEventEnabled) {
    // Regular events ignore the command-list type.
    CacheType = HostVisible ? HostVisibleCacheType : HostInvisibleCacheType;
  } else if (UsingImmediateCmdList) {
    CacheType = HostVisible ? HostVisibleCounterBasedImmediateCacheType
                            : HostInvisibleCounterBasedImmediateCacheType;
  } else {
    CacheType = HostVisible ? HostVisibleCounterBasedRegularCacheType
                            : HostInvisibleCounterBasedRegularCacheType;
  }
  return UR_RESULT_SUCCESS;
}

// Decrement number of events living in the pool upon event destroy
Expand Down
29 changes: 18 additions & 11 deletions source/adapters/level_zero/event.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
if (OutEvent) {
Queue->LastCommandEvent = reinterpret_cast<ur_event_handle_t>(*OutEvent);

ZE2UR_CALL(zeEventHostSignal, ((*OutEvent)->ZeEvent));
if (!(*OutEvent)->CounterBasedEventsEnabled)
ZE2UR_CALL(zeEventHostSignal, ((*OutEvent)->ZeEvent));
(*OutEvent)->Completed = true;
}
}
Expand Down Expand Up @@ -766,7 +767,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urExtEventCreate(
UR_CALL(EventCreate(Context, nullptr, false, true, Event));

(*Event)->RefCountExternal++;
ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent));
if (!(*Event)->CounterBasedEventsEnabled)
ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent));
return UR_RESULT_SUCCESS;
}

Expand All @@ -784,7 +786,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle(
UR_CALL(EventCreate(Context, nullptr, false, true, Event));

(*Event)->RefCountExternal++;
ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent));
if (!(*Event)->CounterBasedEventsEnabled)
ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent));
return UR_RESULT_SUCCESS;
}

Expand Down Expand Up @@ -1061,9 +1064,11 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked,
//
ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue,
bool IsMultiDevice, bool HostVisible,
ur_event_handle_t *RetEvent) {
ur_event_handle_t *RetEvent,
bool CounterBasedEventEnabled) {

bool ProfilingEnabled = !Queue || Queue->isProfilingEnabled();
bool UsingImmediateCommandlists = !Queue || Queue->UsingImmCmdLists;

ur_device_handle_t Device = nullptr;

Expand All @@ -1072,7 +1077,7 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue,
}

if (auto CachedEvent = Context->getEventFromContextCache(
HostVisible, ProfilingEnabled, Device)) {
HostVisible, ProfilingEnabled, Device, CounterBasedEventEnabled)) {
*RetEvent = CachedEvent;
return UR_RESULT_SUCCESS;
}
Expand All @@ -1083,14 +1088,15 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue,
size_t Index = 0;

if (auto Res = Context->getFreeSlotInExistingOrNewPool(
ZeEventPool, Index, HostVisible, ProfilingEnabled, Device))
ZeEventPool, Index, HostVisible, ProfilingEnabled, Device,
CounterBasedEventEnabled, UsingImmediateCommandlists))
return Res;

ZeStruct<ze_event_desc_t> ZeEventDesc;
ZeEventDesc.index = Index;
ZeEventDesc.wait = 0;

if (HostVisible) {
if (HostVisible || CounterBasedEventEnabled) {
ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
} else {
//
Expand All @@ -1115,7 +1121,7 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue,
} catch (...) {
return UR_RESULT_ERROR_UNKNOWN;
}

(*RetEvent)->CounterBasedEventsEnabled = CounterBasedEventEnabled;
if (HostVisible)
(*RetEvent)->HostVisibleEvent =
reinterpret_cast<ur_event_handle_t>(*RetEvent);
Expand All @@ -1137,8 +1143,8 @@ ur_result_t ur_event_handle_t_::reset() {

if (!isHostVisible())
HostVisibleEvent = nullptr;

ZE2UR_CALL(zeEventHostReset, (ZeEvent));
if (!CounterBasedEventsEnabled)
ZE2UR_CALL(zeEventHostReset, (ZeEvent));
return UR_RESULT_SUCCESS;
}

Expand Down Expand Up @@ -1339,7 +1345,8 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(

zeCommandListAppendWaitOnEvents(ZeCommandList, 1u,
&EventList[I]->ZeEvent);
zeEventHostSignal(MultiDeviceZeEvent);
if (!MultiDeviceEvent->CounterBasedEventsEnabled)
zeEventHostSignal(MultiDeviceZeEvent);

UR_CALL(Queue->executeCommandList(CommandList, /* IsBlocking */ false,
/* OkToBatchCommand */ true));
Expand Down
5 changes: 4 additions & 1 deletion source/adapters/level_zero/event.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ extern "C" {
ur_result_t urEventReleaseInternal(ur_event_handle_t Event);
ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue,
bool IsMultiDevice, bool HostVisible,
ur_event_handle_t *RetEvent);
ur_event_handle_t *RetEvent,
bool CounterBasedEventEnabled = false);
} // extern "C"

// This is an experimental option that allows to disable caching of events in
Expand Down Expand Up @@ -226,6 +227,8 @@ struct ur_event_handle_t_ : _ur_object {
// completion batch for this event. Only used for out-of-order immediate
// command lists.
std::optional<ur_completion_batch_it> completionBatch;
// Keeps track of whether we are using Counter-based Events.
bool CounterBasedEventsEnabled = false;
};

// Helper function to implement zeHostSynchronize.
Expand Down
8 changes: 5 additions & 3 deletions source/adapters/level_zero/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -944,7 +944,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
}

// Signal this event
ZE2UR_CALL(zeEventHostSignal, (ZeEvent));
if (!(*Event)->CounterBasedEventsEnabled)
ZE2UR_CALL(zeEventHostSignal, (ZeEvent));
(*Event)->Completed = true;
return UR_RESULT_SUCCESS;
}
Expand Down Expand Up @@ -1078,8 +1079,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
if (Buffer->MapHostPtr)
memcpy(ZeHandleDst + MapInfo.Offset, MappedPtr, MapInfo.Size);

// Signal this event
ZE2UR_CALL(zeEventHostSignal, (ZeEvent));
// Signal this event if it is not using counter based events
if (!(*Event)->CounterBasedEventsEnabled)
ZE2UR_CALL(zeEventHostSignal, (ZeEvent));
(*Event)->Completed = true;
return UR_RESULT_SUCCESS;
}
Expand Down
8 changes: 8 additions & 0 deletions source/adapters/level_zero/platform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,14 @@ ur_result_t ur_platform_handle_t_::initialize() {
ZeDriverModuleProgramExtensionFound = true;
}
}
// Check if extension is available for Counting Events.
if (strncmp(extension.name, ZE_EVENT_POOL_COUNTER_BASED_EXP_NAME,
strlen(ZE_EVENT_POOL_COUNTER_BASED_EXP_NAME) + 1) == 0) {
if (extension.version ==
ZE_EVENT_POOL_COUNTER_BASED_EXP_VERSION_CURRENT) {
ZeDriverEventPoolCountingEventsExtensionFound = true;
}
}
zeDriverExtensionMap[extension.name] = extension.version;
}

Expand Down
1 change: 1 addition & 0 deletions source/adapters/level_zero/platform.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ struct ur_platform_handle_t_ : public _ur_platform {
// Flags to tell whether various Level Zero platform extensions are available.
bool ZeDriverGlobalOffsetExtensionFound{false};
bool ZeDriverModuleProgramExtensionFound{false};
bool ZeDriverEventPoolCountingEventsExtensionFound{false};

// Cache UR devices for reuse
std::vector<std::unique_ptr<ur_device_handle_t_>> URDevicesCache;
Expand Down
Loading

0 comments on commit f4a9497

Please sign in to comment.