From 7145df448e1e2c2e6364bd2dc2f329e477305de9 Mon Sep 17 00:00:00 2001 From: "Zhang, Winston" Date: Sat, 23 Mar 2024 01:05:28 +0000 Subject: [PATCH] [L0] Support for counter-based events using L0 driver Signed-off-by: Zhang, Winston --- source/adapters/level_zero/context.cpp | 24 +++++++-- source/adapters/level_zero/context.hpp | 71 +++++++++++++++----------- source/adapters/level_zero/device.cpp | 33 ++---------- source/adapters/level_zero/device.hpp | 2 - source/adapters/level_zero/event.cpp | 14 ++--- source/adapters/level_zero/event.hpp | 8 ++- source/adapters/level_zero/queue.cpp | 50 +++++++++++------- source/adapters/level_zero/queue.hpp | 5 ++ 8 files changed, 118 insertions(+), 89 deletions(-) diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index c4b5423adb..4953c721e0 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -468,7 +468,8 @@ static const uint32_t MaxNumEventsPerPool = [] { ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( ze_event_pool_handle_t &Pool, size_t &Index, bool HostVisible, - bool ProfilingEnabled, ur_device_handle_t Device) { + bool ProfilingEnabled, ur_device_handle_t Device, + bool CounterBasedEventEnabled, bool UsingImmCmdList) { // Lock while updating event pool machinery. std::scoped_lock Lock(ZeEventPoolCacheMutex); @@ -477,8 +478,8 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( if (Device) { ZeDevice = Device->ZeDevice; } - std::list *ZePoolCache = - getZeEventPoolCache(HostVisible, ProfilingEnabled, ZeDevice); + std::list *ZePoolCache = getZeEventPoolCache( + HostVisible, ProfilingEnabled, CounterBasedEventEnabled, ZeDevice); if (!ZePoolCache->empty()) { if (NumEventsAvailableInEventPool[ZePoolCache->front()] == 0) { @@ -510,6 +511,18 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE; if (ProfilingEnabled) ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP; + if (CounterBasedEventEnabled) { + ZeEventPoolDesc.flags |= ZE_EVENT_POOL_FLAG_HOST_VISIBLE; + ze_event_pool_counter_based_exp_desc_t counterBasedExt = { + ZE_STRUCTURE_TYPE_COUNTER_BASED_EVENT_POOL_EXP_DESC}; + if (UsingImmCmdList) { + counterBasedExt.flags = ZE_EVENT_POOL_COUNTER_BASED_EXP_FLAG_IMMEDIATE; + } else { + counterBasedExt.flags = + ZE_EVENT_POOL_COUNTER_BASED_EXP_FLAG_NON_IMMEDIATE; + } + ZeEventPoolDesc.pNext = &counterBasedExt; + } urPrint("ze_event_pool_desc_t flags set to: %d\n", ZeEventPoolDesc.flags); std::vector ZeDevices; @@ -580,8 +593,9 @@ ur_context_handle_t_::decrementUnreleasedEventsInPool(ur_event_handle_t Event) { ZeDevice = Event->UrQueue->Device->ZeDevice; } - std::list *ZePoolCache = getZeEventPoolCache( - Event->isHostVisible(), Event->isProfilingEnabled(), ZeDevice); + std::list *ZePoolCache = + getZeEventPoolCache(Event->isHostVisible(), Event->isProfilingEnabled(), + Event->usingCounterBasedEvents(), ZeDevice); // Put the empty pool to the cache of the pools. if (NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp index 8cb1d5369f..bebc76470d 100644 --- a/source/adapters/level_zero/context.hpp +++ b/source/adapters/level_zero/context.hpp @@ -141,9 +141,9 @@ struct ur_context_handle_t_ : _ur_object { // head. // // Cache of event pools to which host-visible events are added to. - std::vector> ZeEventPoolCache{4}; + std::vector> ZeEventPoolCache{8}; std::vector> - ZeEventPoolCacheDeviceMap{4}; + ZeEventPoolCacheDeviceMap{8}; // This map will be used to determine if a pool is full or not // by storing number of empty slots available in the pool. @@ -194,7 +194,9 @@ struct ur_context_handle_t_ : _ur_object { ur_result_t getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &, size_t &, bool HostVisible, bool ProfilingEnabled, - ur_device_handle_t Device); + ur_device_handle_t Device, + bool CounterBasedEventEnabled, + bool UsingImmCmdList); // Get ur_event_handle_t from cache. ur_event_handle_t getEventFromContextCache(bool HostVisible, @@ -204,38 +206,49 @@ struct ur_context_handle_t_ : _ur_object { // Add ur_event_handle_t to cache. void addEventToContextCache(ur_event_handle_t); + enum EventPoolCacheType { + HostVisible, + HostInvisible, + HostVisibleCounterBased, + HostInvisibleCounterBased + }; + std::list * getZeEventPoolCache(bool HostVisible, bool WithProfiling, + bool CounterBasedEventEnabled, ze_device_handle_t ZeDevice) { - if (HostVisible) { - if (ZeDevice) { - auto ZeEventPoolCacheMap = WithProfiling - ? &ZeEventPoolCacheDeviceMap[0] - : &ZeEventPoolCacheDeviceMap[1]; - if (ZeEventPoolCacheMap->find(ZeDevice) == ZeEventPoolCacheMap->end()) { - ZeEventPoolCache.emplace_back(); - ZeEventPoolCacheMap->insert( - std::make_pair(ZeDevice, ZeEventPoolCache.size() - 1)); - } - return &ZeEventPoolCache[(*ZeEventPoolCacheMap)[ZeDevice]]; - } else { - return WithProfiling ? &ZeEventPoolCache[0] : &ZeEventPoolCache[1]; + EventPoolCacheType CacheType; + + calculateCacheIndex(HostVisible, CounterBasedEventEnabled, CacheType); + if (ZeDevice) { + auto ZeEventPoolCacheMap = + WithProfiling ? &ZeEventPoolCacheDeviceMap[CacheType] + : &ZeEventPoolCacheDeviceMap[CacheType + 1]; + if (ZeEventPoolCacheMap->find(ZeDevice) == ZeEventPoolCacheMap->end()) { + ZeEventPoolCache.emplace_back(); + ZeEventPoolCacheMap->insert( + std::make_pair(ZeDevice, ZeEventPoolCache.size() - 1)); } + return &ZeEventPoolCache[(*ZeEventPoolCacheMap)[ZeDevice]]; } else { - if (ZeDevice) { - auto ZeEventPoolCacheMap = WithProfiling - ? &ZeEventPoolCacheDeviceMap[2] - : &ZeEventPoolCacheDeviceMap[3]; - if (ZeEventPoolCacheMap->find(ZeDevice) == ZeEventPoolCacheMap->end()) { - ZeEventPoolCache.emplace_back(); - ZeEventPoolCacheMap->insert( - std::make_pair(ZeDevice, ZeEventPoolCache.size() - 1)); - } - return &ZeEventPoolCache[(*ZeEventPoolCacheMap)[ZeDevice]]; - } else { - return WithProfiling ? &ZeEventPoolCache[2] : &ZeEventPoolCache[3]; - } + return WithProfiling ? &ZeEventPoolCache[CacheType] + : &ZeEventPoolCache[CacheType + 1]; + } + } + + ur_result_t calculateCacheIndex(bool HostVisible, + bool CounterBasedEventEnabled, + EventPoolCacheType &CacheType) { + if (CounterBasedEventEnabled && HostVisible) { + CacheType = HostVisibleCounterBased; + } else if (CounterBasedEventEnabled && !HostVisible) { + CacheType = HostInvisibleCounterBased; + } else if (!CounterBasedEventEnabled && HostVisible) { + CacheType = HostVisible; + } else { + CacheType = HostInvisible; } + return UR_RESULT_SUCCESS; } // Decrement number of events living in the pool upon event destroy diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index f4d7f95e0b..fcb4fd392c 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -917,22 +917,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( } case UR_DEVICE_INFO_COMMAND_BUFFER_SUPPORT_EXP: return ReturnValue(true); - case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: { - // TODO: Level Zero API allows to check support for all sub-features: - // ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS, - // ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_COUNT, - // ZE_MUTABLE_COMMAND_EXP_FLAG_GROUP_SIZE, - // ZE_MUTABLE_COMMAND_EXP_FLAG_GLOBAL_OFFSET, - // ZE_MUTABLE_COMMAND_EXP_FLAG_SIGNAL_EVENT, - // ZE_MUTABLE_COMMAND_EXP_FLAG_WAIT_EVENTS - // but UR has only one property to check the mutable command lists feature - // support. For now return true if kernel arguments can be updated. - auto KernelArgUpdateSupport = - Device->ZeDeviceMutableCmdListsProperties->mutableCommandFlags & - ZE_MUTABLE_COMMAND_EXP_FLAG_KERNEL_ARGUMENTS; - return ReturnValue(KernelArgUpdateSupport && - Device->Platform->ZeMutableCmdListExt.Supported); - } + case UR_DEVICE_INFO_COMMAND_BUFFER_UPDATE_SUPPORT_EXP: + return ReturnValue(false); case UR_DEVICE_INFO_BINDLESS_IMAGES_SUPPORT_EXP: return ReturnValue(true); case UR_DEVICE_INFO_BINDLESS_IMAGES_SHARED_USM_SUPPORT_EXP: @@ -1074,7 +1060,7 @@ bool ur_device_handle_t_::useDriverInOrderLists() { static const bool UseDriverInOrderLists = [] { const char *UrRet = std::getenv("UR_L0_USE_DRIVER_INORDER_LISTS"); if (!UrRet) - return true; + return false; return std::atoi(UrRet) != 0; }(); @@ -1156,15 +1142,6 @@ ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, (ZeDevice, &Count, &Properties)); }; - ZeDeviceMutableCmdListsProperties.Compute = - [ZeDevice]( - ZeStruct &Properties) { - ze_device_properties_t P; - P.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; - P.pNext = &Properties; - ZE_CALL_NOCHECK(zeDeviceGetProperties, (ZeDevice, &P)); - }; - ImmCommandListUsed = this->useImmediateCommandLists(); uint32_t numQueueGroups = 0; @@ -1478,7 +1455,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( // a valid Level Zero device. ur_device_handle_t Dev = nullptr; - if (const auto *platforms = GlobalAdapter->PlatformCache->get_value()) { + if (const auto *platforms = Adapter.PlatformCache->get_value()) { for (const auto &p : *platforms) { Dev = p->getDeviceFromNativeHandle(ZeDevice); if (Dev) { @@ -1489,7 +1466,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( } } } else { - return GlobalAdapter->PlatformCache->get_error(); + return Adapter.PlatformCache->get_error(); } if (Dev == nullptr) diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp index 484890670b..a57a97d38d 100644 --- a/source/adapters/level_zero/device.hpp +++ b/source/adapters/level_zero/device.hpp @@ -195,6 +195,4 @@ struct ur_device_handle_t_ : _ur_object { ZeCache> ZeDeviceCacheProperties; ZeCache> ZeDeviceIpVersionExt; ZeCache ZeGlobalMemSize; - ZeCache> - ZeDeviceMutableCmdListsProperties; }; diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index c9d1c7d6b4..651c2b9ea2 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -1057,7 +1057,8 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked, // ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, bool IsMultiDevice, bool HostVisible, - ur_event_handle_t *RetEvent) { + ur_event_handle_t *RetEvent, + bool CounterBasedEventEnabled) { bool ProfilingEnabled = !Queue || Queue->isProfilingEnabled(); @@ -1079,14 +1080,15 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, size_t Index = 0; if (auto Res = Context->getFreeSlotInExistingOrNewPool( - ZeEventPool, Index, HostVisible, ProfilingEnabled, Device)) + ZeEventPool, Index, HostVisible, ProfilingEnabled, Device, + CounterBasedEventEnabled, Queue->UsingImmCmdLists)) return Res; ZeStruct ZeEventDesc; ZeEventDesc.index = Index; ZeEventDesc.wait = 0; - if (HostVisible) { + if (HostVisible || CounterBasedEventEnabled) { ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; } else { // @@ -1111,7 +1113,7 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, } catch (...) { return UR_RESULT_ERROR_UNKNOWN; } - + (*RetEvent)->CounterBasedEventsEnabled = CounterBasedEventEnabled; if (HostVisible) (*RetEvent)->HostVisibleEvent = reinterpret_cast(*RetEvent); @@ -1132,8 +1134,8 @@ ur_result_t ur_event_handle_t_::reset() { if (!isHostVisible()) HostVisibleEvent = nullptr; - - ZE2UR_CALL(zeEventHostReset, (ZeEvent)); + if (!usingCounterBasedEvents()) + ZE2UR_CALL(zeEventHostReset, (ZeEvent)); return UR_RESULT_SUCCESS; } diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp index c266de8c0d..86ce5b46e0 100644 --- a/source/adapters/level_zero/event.hpp +++ b/source/adapters/level_zero/event.hpp @@ -31,7 +31,8 @@ extern "C" { ur_result_t urEventReleaseInternal(ur_event_handle_t Event); ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, bool IsMultiDevice, bool HostVisible, - ur_event_handle_t *RetEvent); + ur_event_handle_t *RetEvent, + bool CounterBasedEventEnabled = false); } // extern "C" // This is an experimental option that allows to disable caching of events in @@ -222,6 +223,11 @@ struct ur_event_handle_t_ : _ur_object { // Get the host-visible event or create one and enqueue its signal. ur_result_t getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent); + + // Keeps track of whether we are using Counter-based Events. + bool CounterBasedEventsEnabled = false; + + bool usingCounterBasedEvents() const { return CounterBasedEventsEnabled; } }; // Helper function to implement zeHostSynchronize. diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index 187f4f75f9..60445e9d69 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -569,7 +569,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle( // Maybe this is not completely correct. uint32_t NumEntries = 1; ur_platform_handle_t Platform{}; - ur_adapter_handle_t AdapterHandle = GlobalAdapter; + ur_adapter_handle_t AdapterHandle = &Adapter; UR_CALL(urPlatformGet(&AdapterHandle, 1, NumEntries, &Platform, nullptr)); ur_device_handle_t UrDevice = Device; @@ -955,6 +955,16 @@ ur_queue_handle_t_::ur_queue_handle_t_( ComputeCommandBatch.QueueBatchSize = ZeCommandListBatchComputeConfig.startSize(); CopyCommandBatch.QueueBatchSize = ZeCommandListBatchCopyConfig.startSize(); + + static const bool useDriverCounterBasedEvents = [] { + const char *UrRet = std::getenv("UR_L0_USE_DRIVER_COUNTER_BASED_EVENTS"); + if (!UrRet) + return false; + return std::atoi(UrRet) != 0; + }(); + this->counterBasedEventsEnabled = isInOrderQueue() && + Device->useDriverInOrderLists() && + useDriverCounterBasedEvents; } void ur_queue_handle_t_::adjustBatchSizeForFullBatch(bool IsCopy) { @@ -1236,7 +1246,8 @@ bool ur_queue_handle_t_::doReuseDiscardedEvents() { ur_result_t ur_queue_handle_t_::resetDiscardedEvent(ur_command_list_ptr_t CommandList) { - if (LastCommandEvent && LastCommandEvent->IsDiscarded) { + if (!usingCounterBasedEvents() && LastCommandEvent && + LastCommandEvent->IsDiscarded) { ZE2UR_CALL(zeCommandListAppendBarrier, (CommandList->first, nullptr, 1, &(LastCommandEvent->ZeEvent))); ZE2UR_CALL(zeCommandListAppendEventReset, @@ -1366,6 +1377,10 @@ bool ur_queue_handle_t_::isInOrderQueue() const { 0); } +bool ur_queue_handle_t_::usingCounterBasedEvents() const { + return this->counterBasedEventsEnabled; +} + // Helper function to perform the necessary cleanup of the events from reset cmd // list. ur_result_t CleanupEventListFromResetCmdList( @@ -1519,7 +1534,8 @@ ur_result_t createEventAndAssociateQueue(ur_queue_handle_t Queue, if (*Event == nullptr) UR_CALL(EventCreate(Queue->Context, Queue, IsMultiDevice, - HostVisible.value(), Event)); + HostVisible.value(), Event, + Queue->usingCounterBasedEvents())); (*Event)->UrQueue = Queue; (*Event)->CommandType = CommandType; @@ -1861,19 +1877,18 @@ ur_result_t ur_queue_handle_t_::createCommandList( ze_command_list_handle_t ZeCommandList; uint32_t QueueGroupOrdinal; + ZeStruct ZeCommandListDesc; + if (usingCounterBasedEvents()) { + ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_IN_ORDER; + ZeCommandListDesc.stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC; + } auto &QGroup = getQueueGroup(UseCopyEngine); auto &ZeCommandQueue = ForcedCmdQueue ? *ForcedCmdQueue : QGroup.getZeQueue(&QueueGroupOrdinal); if (ForcedCmdQueue) QueueGroupOrdinal = QGroup.getCmdQueueOrdinal(ZeCommandQueue); - ZeStruct ZeCommandListDesc; ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; - - if (Device->useDriverInOrderLists() && isInOrderQueue()) { - ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_IN_ORDER; - } - ZE2UR_CALL(zeCommandListCreate, (Context->ZeContext, Device->ZeDevice, &ZeCommandListDesc, &ZeCommandList)); @@ -1883,9 +1898,10 @@ ur_result_t ur_queue_handle_t_::createCommandList( std::tie(CommandList, std::ignore) = CommandListMap.insert( std::pair( ZeCommandList, {ZeFence, false, false, ZeCommandQueue, ZeQueueDesc})); - - UR_CALL(insertStartBarrierIfDiscardEventsMode(CommandList)); - UR_CALL(insertActiveBarriers(CommandList, UseCopyEngine)); + if (!usingCounterBasedEvents()) { + UR_CALL(insertStartBarrierIfDiscardEventsMode(CommandList)); + UR_CALL(insertActiveBarriers(CommandList, UseCopyEngine)); + } return UR_RESULT_SUCCESS; } @@ -1987,13 +2003,11 @@ ur_command_list_ptr_t &ur_queue_handle_t_::ur_queue_group_t::getImmCmdList() { Priority = "High"; } - // Evaluate performance of explicit usage for "0" index. - if (QueueIndex != 0) { - ZeCommandQueueDesc.flags |= ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY; - } - if (Queue->Device->useDriverInOrderLists() && Queue->isInOrderQueue()) { - ZeCommandQueueDesc.flags |= ZE_COMMAND_QUEUE_FLAG_IN_ORDER; + ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_IN_ORDER; + } else if (QueueIndex != 0) { + // Evaluate performance of explicit usage for "0" index. + ZeCommandQueueDesc.flags = ZE_COMMAND_QUEUE_FLAG_EXPLICIT_ONLY; } // Check if context's command list cache has an immediate command list with diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp index 06751e03c1..4fd800c4d4 100644 --- a/source/adapters/level_zero/queue.hpp +++ b/source/adapters/level_zero/queue.hpp @@ -234,6 +234,9 @@ struct ur_queue_handle_t_ : _ur_object { // Keeps the properties of this queue. ur_queue_flags_t Properties; + // Keeps track of whether we are using Counter-based Events + bool counterBasedEventsEnabled = false; + // Map of all command lists used in this queue. ur_command_list_map_t CommandListMap; @@ -398,6 +401,8 @@ struct ur_queue_handle_t_ : _ur_object { // Returns true if the queue is a in-order queue. bool isInOrderQueue() const; + bool usingCounterBasedEvents() const; + // Returns true if the queue has discard events property. bool isDiscardEvents() const;