From 05e3e6d841a9029113e740d12877ece2a8a53cda Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Mon, 19 Feb 2024 16:06:56 +0000 Subject: [PATCH] [EXP][Command-Buffer] Optimize L0 command buffer submission - Adds command buffer property to explicitly enable profiling - Add ability to enforce use of in-order command lists --- include/ur_api.h | 3 + include/ur_print.hpp | 10 + scripts/core/EXP-COMMAND-BUFFER.rst | 8 +- scripts/core/exp-command-buffer.yml | 6 + source/adapters/level_zero/command_buffer.cpp | 398 ++++++++++++------ source/adapters/level_zero/command_buffer.hpp | 10 +- source/adapters/level_zero/event.cpp | 8 +- source/adapters/level_zero/event.hpp | 1 + source/adapters/level_zero/queue.cpp | 2 +- 9 files changed, 301 insertions(+), 145 deletions(-) diff --git a/include/ur_api.h b/include/ur_api.h index 19ba599c7d..bfd9131a1f 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -7892,6 +7892,9 @@ typedef struct ur_exp_command_buffer_desc_t { ///< ::UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC const void *pNext; ///< [in][optional] pointer to extension-specific structure ur_bool_t isUpdatable; ///< [in] Commands in a finalized command-buffer can be updated. + ur_bool_t isInOrder; ///< [in] Commands in a command-buffer may be executed in-order without + ///< explicit dependencies. + ur_bool_t enableProfiling; ///< [in] Command-buffer profiling is enabled. 
} ur_exp_command_buffer_desc_t; diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 649f9f63cb..be1a8dedf4 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -9401,6 +9401,16 @@ inline std::ostream &operator<<(std::ostream &os, const struct ur_exp_command_bu os << (params.isUpdatable); + os << ", "; + os << ".isInOrder = "; + + os << (params.isInOrder); + + os << ", "; + os << ".enableProfiling = "; + + os << (params.enableProfiling); + os << "}"; return os; } diff --git a/scripts/core/EXP-COMMAND-BUFFER.rst b/scripts/core/EXP-COMMAND-BUFFER.rst index 0143b72c77..b99a13340e 100644 --- a/scripts/core/EXP-COMMAND-BUFFER.rst +++ b/scripts/core/EXP-COMMAND-BUFFER.rst @@ -77,9 +77,13 @@ Command-Buffer Creation Command-Buffers are tied to a specific ${x}_context_handle_t and ${x}_device_handle_t. ${x}CommandBufferCreateExp optionally takes a descriptor to provide additional properties for how the command-buffer should be -constructed. The only unique member defined in ${x}_exp_command_buffer_desc_t -is ``isUpdatable``, which should be set to ``true`` to support :ref:`updating +constructed. The members defined in ${x}_exp_command_buffer_desc_t are: +* ``isUpdatable``, which should be set to ``true`` to support :ref:`updating command-buffer commands`. +* ``isInOrder``, which should be set to ``true`` to enable commands enqueued to +a command-buffer to be executed in an in-order fashion where possible. +* ``enableProfiling``, which should be set to ``true`` to enable profiling of +the command-buffer. Command-buffers are reference counted and can be retained and released by calling ${x}CommandBufferRetainExp and ${x}CommandBufferReleaseExp respectively. 
diff --git a/scripts/core/exp-command-buffer.yml b/scripts/core/exp-command-buffer.yml index d2292ceb22..c277de228d 100644 --- a/scripts/core/exp-command-buffer.yml +++ b/scripts/core/exp-command-buffer.yml @@ -113,6 +113,12 @@ members: - type: $x_bool_t name: isUpdatable desc: "[in] Commands in a finalized command-buffer can be updated." + - type: $x_bool_t + name: isInOrder + desc: "[in] Commands in a command-buffer may be executed in-order without explicit dependencies." + - type: $x_bool_t + name: enableProfiling + desc: "[in] Command-buffer profiling is enabled." --- #-------------------------------------------------------------------------- type: struct desc: "Descriptor type for updating a kernel command memobj argument." diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 36cf76d111..b277a47427 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -14,17 +14,44 @@ https://github.com/intel/llvm/blob/sycl/sycl/doc/design/CommandGraph.md#level-zero */ +namespace { +/// Checks the version of the level-zero driver. +/// @param Context Execution context +/// @param VersionMajor Major version number to compare to. +/// @param VersionMinor Minor version number to compare to. +/// @param VersionBuild Build version number to compare to. 
+/// @return true if the version of the driver is higher than or equal to the +/// compared version +bool IsDriverVersionNewerOrSimilar(ur_context_handle_t Context, + uint32_t VersionMajor, uint32_t VersionMinor, + uint32_t VersionBuild) { + ZeStruct ZeDriverProperties; + ZE2UR_CALL(zeDriverGetProperties, + (Context->getPlatform()->ZeDriver, &ZeDriverProperties)); + uint32_t DriverVersion = ZeDriverProperties.driverVersion; + auto DriverVersionMajor = (DriverVersion & 0xFF000000) >> 24; + auto DriverVersionMinor = (DriverVersion & 0x00FF0000) >> 16; + auto DriverVersionBuild = DriverVersion & 0x0000FFFF; + + return ((DriverVersionMajor >= VersionMajor) && + (DriverVersionMinor >= VersionMinor) && + (DriverVersionBuild >= VersionBuild)); +} +}; // namespace + ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( ur_context_handle_t Context, ur_device_handle_t Device, ze_command_list_handle_t CommandList, ze_command_list_handle_t CommandListResetEvents, ZeStruct ZeDesc, - const ur_exp_command_buffer_desc_t *Desc) + const ur_exp_command_buffer_desc_t *Desc, const bool IsInOrderCmdList) : Context(Context), Device(Device), ZeCommandList(CommandList), ZeCommandListResetEvents(CommandListResetEvents), ZeCommandListDesc(ZeDesc), ZeFencesList(), QueueProperties(), SyncPoints(), NextSyncPoint(0), - IsUpdatable(Desc ? Desc->isUpdatable : false) { + IsUpdatable(Desc ? Desc->isUpdatable : false), + IsProfilingEnabled(Desc ? Desc->enableProfiling : false), + IsInOrderCmdList(IsInOrderCmdList) { urContextRetain(Context); urDeviceRetain(Device); } @@ -75,6 +102,35 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { for (auto &ZeFence : ZeFencesList) { ZE_CALL_NOCHECK(zeFenceDestroy, (ZeFence)); } + + auto ReleaseIndirectMem = [](ur_kernel_handle_t Kernel) { + if (IndirectAccessTrackingEnabled) { + // urKernelRelease is called by CleanupCompletedEvent(Event) as soon as + // kernel execution has finished. 
This is the place where we need to + // release memory allocations. If kernel is not in use (not submitted by + // some other thread) then release referenced memory allocations. As a + // result, memory can be deallocated and context can be removed from + // container in the platform. That's why we need to lock a mutex here. + ur_platform_handle_t Platform = Kernel->Program->Context->getPlatform(); + std::scoped_lock ContextsLock(Platform->ContextsMutex); + + if (--Kernel->SubmissionsCount == 0) { + // Kernel is not submitted for execution, release referenced memory + // allocations. + for (auto &MemAlloc : Kernel->MemAllocs) { + // std::pair *, Hash + USMFreeHelper(MemAlloc->second.Context, MemAlloc->first, + MemAlloc->second.OwnNativeHandle); + } + Kernel->MemAllocs.clear(); + } + } + }; + + for (auto &AssociatedKernel : KernelsList) { + ReleaseIndirectMem(AssociatedKernel); + urKernelRelease(AssociatedKernel); + } } ur_exp_command_buffer_command_handle_t_:: @@ -227,14 +283,10 @@ static ur_result_t getEventsFromSyncPoints( if (!SyncPointWaitList || NumSyncPointsInWaitList == 0) return UR_RESULT_SUCCESS; - // Map of ur_exp_command_buffer_sync_point_t to ur_event_handle_t defining - // the event associated with each sync-point - auto SyncPoints = CommandBuffer->SyncPoints; - // For each sync-point add associated L0 event to the return list. 
for (size_t i = 0; i < NumSyncPointsInWaitList; i++) { - if (auto EventHandle = SyncPoints.find(SyncPointWaitList[i]); - EventHandle != SyncPoints.end()) { + if (auto EventHandle = CommandBuffer->SyncPoints.find(SyncPointWaitList[i]); + EventHandle != CommandBuffer->SyncPoints.end()) { ZeEventList.push_back(EventHandle->second->ZeEvent); } else { return UR_RESULT_ERROR_INVALID_VALUE; @@ -251,27 +303,33 @@ static ur_result_t enqueueCommandBufferMemCopyHelper( void *Dst, const void *Src, size_t Size, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, ur_exp_command_buffer_sync_point_t *SyncPoint) { - std::vector ZeEventList; - UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, - SyncPointWaitList, ZeEventList)); - - ur_event_handle_t LaunchEvent; - UR_CALL( - EventCreate(CommandBuffer->Context, nullptr, false, false, &LaunchEvent)); - LaunchEvent->CommandType = CommandType; + if (CommandBuffer->IsInOrderCmdList) { + ZE2UR_CALL( + zeCommandListAppendMemoryCopy, + (CommandBuffer->ZeCommandList, Dst, Src, Size, nullptr, 0, nullptr)); - // Get sync point and register the event with it. - *SyncPoint = CommandBuffer->GetNextSyncPoint(); - CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); + urPrint("calling zeCommandListAppendMemoryCopy()"); + } else { + std::vector ZeEventList; + ur_event_handle_t LaunchEvent; + UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, ZeEventList)); + UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, false, + !CommandBuffer->IsProfilingEnabled, &LaunchEvent)); + LaunchEvent->CommandType = CommandType; - ZE2UR_CALL(zeCommandListAppendMemoryCopy, - (CommandBuffer->ZeCommandList, Dst, Src, Size, - LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data())); + // Get sync point and register the event with it. 
+ *SyncPoint = CommandBuffer->GetNextSyncPoint(); + CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); - urPrint("calling zeCommandListAppendMemoryCopy() with" - " ZeEvent %#" PRIxPTR "\n", - ur_cast(LaunchEvent->ZeEvent)); + ZE2UR_CALL(zeCommandListAppendMemoryCopy, + (CommandBuffer->ZeCommandList, Dst, Src, Size, + LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data())); + urPrint("calling zeCommandListAppendMemoryCopy() with" + " ZeEvent %#" PRIxPTR "\n", + ur_cast(LaunchEvent->ZeEvent)); + } return UR_RESULT_SUCCESS; } @@ -317,27 +375,36 @@ static ur_result_t enqueueCommandBufferMemCopyRectHelper( const ze_copy_region_t ZeDstRegion = {DstOriginX, DstOriginY, DstOriginZ, Width, Height, Depth}; - std::vector ZeEventList; - UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, - SyncPointWaitList, ZeEventList)); + if (CommandBuffer->IsInOrderCmdList) { + ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion, + (CommandBuffer->ZeCommandList, Dst, &ZeDstRegion, DstPitch, + DstSlicePitch, Src, &ZeSrcRegion, SrcPitch, SrcSlicePitch, + nullptr, 0, nullptr)); - ur_event_handle_t LaunchEvent; - UR_CALL( - EventCreate(CommandBuffer->Context, nullptr, false, false, &LaunchEvent)); - LaunchEvent->CommandType = CommandType; + urPrint("calling zeCommandListAppendMemoryCopyRegion()"); + } else { + std::vector ZeEventList; + UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, ZeEventList)); + + ur_event_handle_t LaunchEvent; + UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, false, + !CommandBuffer->IsProfilingEnabled, &LaunchEvent)); + LaunchEvent->CommandType = CommandType; - // Get sync point and register the event with it. - *SyncPoint = CommandBuffer->GetNextSyncPoint(); - CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); + // Get sync point and register the event with it. 
+ *SyncPoint = CommandBuffer->GetNextSyncPoint(); + CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); - ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion, - (CommandBuffer->ZeCommandList, Dst, &ZeDstRegion, DstPitch, - DstSlicePitch, Src, &ZeSrcRegion, SrcPitch, SrcSlicePitch, - LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data())); + ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion, + (CommandBuffer->ZeCommandList, Dst, &ZeDstRegion, DstPitch, + DstSlicePitch, Src, &ZeSrcRegion, SrcPitch, SrcSlicePitch, + LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data())); - urPrint("calling zeCommandListAppendMemoryCopyRegion() with" - " ZeEvent %#" PRIxPTR "\n", - ur_cast(LaunchEvent->ZeEvent)); + urPrint("calling zeCommandListAppendMemoryCopyRegion() with" + " ZeEvent %#" PRIxPTR "\n", + ur_cast(LaunchEvent->ZeEvent)); + } return UR_RESULT_SUCCESS; } @@ -361,26 +428,34 @@ static ur_result_t enqueueCommandBufferFillHelper( .ZeProperties.maxMemoryFillPatternSize, UR_RESULT_ERROR_INVALID_VALUE); - std::vector ZeEventList; - UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, - SyncPointWaitList, ZeEventList)); + if (CommandBuffer->IsInOrderCmdList) { + ZE2UR_CALL(zeCommandListAppendMemoryFill, + (CommandBuffer->ZeCommandList, Ptr, Pattern, PatternSize, Size, + nullptr, 0, nullptr)); + + urPrint("calling zeCommandListAppendMemoryFill()"); + } else { + std::vector ZeEventList; + UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, ZeEventList)); - ur_event_handle_t LaunchEvent; - UR_CALL( - EventCreate(CommandBuffer->Context, nullptr, false, true, &LaunchEvent)); - LaunchEvent->CommandType = CommandType; + ur_event_handle_t LaunchEvent; + UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, true, + !CommandBuffer->IsProfilingEnabled, &LaunchEvent)); + LaunchEvent->CommandType = CommandType; - // Get sync point and register the event with it. 
- *SyncPoint = CommandBuffer->GetNextSyncPoint(); - CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); + // Get sync point and register the event with it. + *SyncPoint = CommandBuffer->GetNextSyncPoint(); + CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); - ZE2UR_CALL(zeCommandListAppendMemoryFill, - (CommandBuffer->ZeCommandList, Ptr, Pattern, PatternSize, Size, - LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data())); + ZE2UR_CALL(zeCommandListAppendMemoryFill, + (CommandBuffer->ZeCommandList, Ptr, Pattern, PatternSize, Size, + LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data())); - urPrint("calling zeCommandListAppendMemoryFill() with" - " ZeEvent %#lx\n", - ur_cast(LaunchEvent->ZeEvent)); + urPrint("calling zeCommandListAppendMemoryFill() with" + " ZeEvent %#lx\n", + ur_cast(LaunchEvent->ZeEvent)); + } return UR_RESULT_SUCCESS; } @@ -389,6 +464,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, const ur_exp_command_buffer_desc_t *CommandBufferDesc, ur_exp_command_buffer_handle_t *CommandBuffer) { + // In-order command-lists are not available in old driver version. + bool CompatibleDriver = IsDriverVersionNewerOrSimilar(Context, 1, 3, 28454); + const bool IsInOrder = + CompatibleDriver + ? (CommandBufferDesc ? CommandBufferDesc->isInOrder : false) + : false; + // Force compute queue type for now. Copy engine types may be better suited // for host to device copies. uint32_t QueueGroupOrdinal = @@ -404,10 +486,11 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, (Context->ZeContext, Device->ZeDevice, &ZeCommandListDesc, &ZeCommandListResetEvents)); - // Dependencies between commands are explicitly enforced by sync points when - // enqueuing. 
Consequently, relax the command ordering in the command list - // can enable the backend to further optimize the workload - ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING; + // For non-linear graph, dependencies between commands are explicitly enforced + // by sync points when enqueuing. Consequently, relax the command ordering in + // the command list can enable the backend to further optimize the workload + ZeCommandListDesc.flags = IsInOrder ? ZE_COMMAND_LIST_FLAG_IN_ORDER + : ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING; ZeStruct ZeMutableCommandListDesc; if (CommandBufferDesc && CommandBufferDesc->isUpdatable) { @@ -423,7 +506,7 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, try { *CommandBuffer = new ur_exp_command_buffer_handle_t_( Context, Device, ZeCommandList, ZeCommandListResetEvents, - ZeCommandListDesc, CommandBufferDesc); + ZeCommandListDesc, CommandBufferDesc, IsInOrder); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) { @@ -434,10 +517,13 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, // on command-buffer enqueue. auto RetCommandBuffer = *CommandBuffer; UR_CALL(EventCreate(Context, nullptr, false, false, + !RetCommandBuffer->IsProfilingEnabled, &RetCommandBuffer->SignalEvent)); UR_CALL(EventCreate(Context, nullptr, false, false, + !RetCommandBuffer->IsProfilingEnabled, &RetCommandBuffer->WaitEvent)); UR_CALL(EventCreate(Context, nullptr, false, false, + !RetCommandBuffer->IsProfilingEnabled, &RetCommandBuffer->AllResetEvent)); // Add prefix commands @@ -489,11 +575,24 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) { (CommandBuffer->ZeCommandListResetEvents, CommandBuffer->AllResetEvent->ZeEvent)); - // Wait for all the user added commands to complete, and signal the - // command-buffer signal-event when they are done. 
- ZE2UR_CALL(zeCommandListAppendBarrier, - (CommandBuffer->ZeCommandList, CommandBuffer->SignalEvent->ZeEvent, - NumEvents, CommandBuffer->ZeEventsList.data())); + if (CommandBuffer->IsInOrderCmdList) { + ZE2UR_CALL( + zeCommandListAppendSignalEvent, + (CommandBuffer->ZeCommandList, CommandBuffer->SignalEvent->ZeEvent)); + } else { + // Create a list of events for our signal event to wait on + const size_t NumEvents = CommandBuffer->SyncPoints.size(); + std::vector WaitEventList{NumEvents}; + for (size_t i = 0; i < NumEvents; i++) { + WaitEventList[i] = CommandBuffer->SyncPoints[i]->ZeEvent; + } + + // Wait for all the user added commands to complete, and signal the + // command-buffer signal-event when they are done. + ZE2UR_CALL(zeCommandListAppendBarrier, (CommandBuffer->ZeCommandList, + CommandBuffer->SignalEvent->ZeEvent, + NumEvents, WaitEventList.data())); + } // Close the command lists and have them ready for dispatch. ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandList)); @@ -551,21 +650,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( ZE2UR_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2])); - std::vector ZeEventList; - UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, - SyncPointWaitList, ZeEventList)); - ur_event_handle_t LaunchEvent; - UR_CALL( - EventCreate(CommandBuffer->Context, nullptr, false, false, &LaunchEvent)); - LaunchEvent->CommandType = UR_COMMAND_KERNEL_LAUNCH; - - if (SyncPoint) { - // Get sync point and register the event with it. - *SyncPoint = CommandBuffer->GetNextSyncPoint(); - CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); - } - - LaunchEvent->CommandData = (void *)Kernel; + CommandBuffer->KernelsList.push_back(Kernel); // Increment the reference count of the Kernel and indicate that the Kernel // is in use. 
Once the event has been signaled, the code in // CleanupCompletedEvent(Event) will do a urKernelRelease to update the @@ -601,14 +686,36 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( return UR_RESULT_ERROR_UNKNOWN; } - ZE2UR_CALL(zeCommandListAppendLaunchKernel, - (CommandBuffer->ZeCommandList, Kernel->ZeKernel, - &ZeThreadGroupDimensions, LaunchEvent->ZeEvent, - ZeEventList.size(), ZeEventList.data())); + if (CommandBuffer->IsInOrderCmdList) { + ZE2UR_CALL(zeCommandListAppendLaunchKernel, + (CommandBuffer->ZeCommandList, Kernel->ZeKernel, + &ZeThreadGroupDimensions, nullptr, 0, nullptr)); - urPrint("calling zeCommandListAppendLaunchKernel() with" - " ZeEvent %#" PRIxPTR "\n", - ur_cast(LaunchEvent->ZeEvent)); + urPrint("calling zeCommandListAppendLaunchKernel()"); + } else { + std::vector ZeEventList; + UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, ZeEventList)); + ur_event_handle_t LaunchEvent; + UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, false, + !CommandBuffer->IsProfilingEnabled, &LaunchEvent)); + LaunchEvent->CommandType = UR_COMMAND_KERNEL_LAUNCH; + + if (SyncPoint) { + // Get sync point and register the event with it. 
+ *SyncPoint = CommandBuffer->GetNextSyncPoint(); + CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); + } + + ZE2UR_CALL(zeCommandListAppendLaunchKernel, + (CommandBuffer->ZeCommandList, Kernel->ZeKernel, + &ZeThreadGroupDimensions, LaunchEvent->ZeEvent, + ZeEventList.size(), ZeEventList.data())); + + urPrint("calling zeCommandListAppendLaunchKernel() with" + " ZeEvent %#" PRIxPTR "\n", + ur_cast(LaunchEvent->ZeEvent)); + } return UR_RESULT_SUCCESS; } @@ -758,34 +865,41 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( ur_exp_command_buffer_sync_point_t *SyncPoint) { std::ignore = Flags; - std::vector ZeEventList; - UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, - SyncPointWaitList, ZeEventList)); - - if (NumSyncPointsInWaitList) { - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (CommandBuffer->ZeCommandList, NumSyncPointsInWaitList, - ZeEventList.data())); - } + if (CommandBuffer->IsInOrderCmdList) { + // Add the prefetch command to the command buffer. + // Note that L0 does not handle migration flags. + ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, + (CommandBuffer->ZeCommandList, Mem, Size)); + } else { + std::vector ZeEventList; + UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, ZeEventList)); + + if (NumSyncPointsInWaitList) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (CommandBuffer->ZeCommandList, NumSyncPointsInWaitList, + ZeEventList.data())); + } - ur_event_handle_t LaunchEvent; - UR_CALL( - EventCreate(CommandBuffer->Context, nullptr, false, true, &LaunchEvent)); - LaunchEvent->CommandType = UR_COMMAND_USM_PREFETCH; + ur_event_handle_t LaunchEvent; + UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, true, + !CommandBuffer->IsProfilingEnabled, &LaunchEvent)); + LaunchEvent->CommandType = UR_COMMAND_USM_PREFETCH; - // Get sync point and register the event with it. 
- *SyncPoint = CommandBuffer->GetNextSyncPoint(); - CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); + // Get sync point and register the event with it. + *SyncPoint = CommandBuffer->GetNextSyncPoint(); + CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); - // Add the prefetch command to the command buffer. - // Note that L0 does not handle migration flags. - ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, - (CommandBuffer->ZeCommandList, Mem, Size)); + // Add the prefetch command to the command buffer. + // Note that L0 does not handle migration flags. + ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, + (CommandBuffer->ZeCommandList, Mem, Size)); - // Level Zero does not have a completion "event" with the prefetch API, - // so manually add command to signal our event. - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (CommandBuffer->ZeCommandList, LaunchEvent->ZeEvent)); + // Level Zero does not have a completion "event" with the prefetch API, + // so manually add command to signal our event. 
+ ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CommandBuffer->ZeCommandList, LaunchEvent->ZeEvent)); + } return UR_RESULT_SUCCESS; } @@ -822,33 +936,39 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMAdviseExp( ze_memory_advice_t ZeAdvice = static_cast(Value); - std::vector ZeEventList; - UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, - SyncPointWaitList, ZeEventList)); - - if (NumSyncPointsInWaitList) { - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (CommandBuffer->ZeCommandList, NumSyncPointsInWaitList, - ZeEventList.data())); - } + if (CommandBuffer->IsInOrderCmdList) { + ZE2UR_CALL(zeCommandListAppendMemAdvise, + (CommandBuffer->ZeCommandList, CommandBuffer->Device->ZeDevice, + Mem, Size, ZeAdvice)); + } else { + std::vector ZeEventList; + UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, + SyncPointWaitList, ZeEventList)); + + if (NumSyncPointsInWaitList) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (CommandBuffer->ZeCommandList, NumSyncPointsInWaitList, + ZeEventList.data())); + } - ur_event_handle_t LaunchEvent; - UR_CALL( - EventCreate(CommandBuffer->Context, nullptr, false, true, &LaunchEvent)); - LaunchEvent->CommandType = UR_COMMAND_USM_ADVISE; + ur_event_handle_t LaunchEvent; + UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, true, + !CommandBuffer->IsProfilingEnabled, &LaunchEvent)); + LaunchEvent->CommandType = UR_COMMAND_USM_ADVISE; - // Get sync point and register the event with it. - *SyncPoint = CommandBuffer->GetNextSyncPoint(); - CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); + // Get sync point and register the event with it. 
+ *SyncPoint = CommandBuffer->GetNextSyncPoint(); + CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent); - ZE2UR_CALL(zeCommandListAppendMemAdvise, - (CommandBuffer->ZeCommandList, CommandBuffer->Device->ZeDevice, - Mem, Size, ZeAdvice)); + ZE2UR_CALL(zeCommandListAppendMemAdvise, + (CommandBuffer->ZeCommandList, CommandBuffer->Device->ZeDevice, + Mem, Size, ZeAdvice)); - // Level Zero does not have a completion "event" with the advise API, - // so manually add command to signal our event. - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (CommandBuffer->ZeCommandList, LaunchEvent->ZeEvent)); + // Level Zero does not have a completion "event" with the advise API, + // so manually add command to signal our event. + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CommandBuffer->ZeCommandList, LaunchEvent->ZeEvent)); + } return UR_RESULT_SUCCESS; } @@ -973,7 +1093,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( Queue, &RetEvent, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, SignalCommandList, false, false, true)); - if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE)) { + if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE) && + (!CommandBuffer->IsInOrderCmdList) && + (CommandBuffer->IsProfilingEnabled)) { // Multiple submissions of a command buffer implies that we need to save // the event timestamps before resubmiting the command buffer. 
We // therefore copy the these timestamps in a dedicated USM memory section diff --git a/source/adapters/level_zero/command_buffer.hpp b/source/adapters/level_zero/command_buffer.hpp index 67f4afd54c..642451c0e3 100644 --- a/source/adapters/level_zero/command_buffer.hpp +++ b/source/adapters/level_zero/command_buffer.hpp @@ -30,7 +30,7 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ze_command_list_handle_t CommandList, ze_command_list_handle_t CommandListResetEvents, ZeStruct ZeDesc, - const ur_exp_command_buffer_desc_t *Desc); + const ur_exp_command_buffer_desc_t *Desc, const bool IsInOrderCmdList); ~ur_exp_command_buffer_handle_t_(); @@ -82,6 +82,14 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { bool IsUpdatable = false; // Indicates if command buffer was finalized. bool IsFinalized = false; + // Command-buffer profiling is enabled. + bool IsProfilingEnabled = false; + // Command-buffer can be submitted to an in-order command-list. + bool IsInOrderCmdList = false; + // List of kernels. + // This list is needed to release all kernels retained by the + // command_buffer. + std::vector KernelsList; }; struct ur_exp_command_buffer_command_handle_t_ : public _ur_object { diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index c9d1c7d6b4..165a29659f 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -760,7 +760,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urExtEventCreate( ur_event_handle_t *Event ///< [out] pointer to the handle of the event object created. 
) { - UR_CALL(EventCreate(Context, nullptr, false, true, Event)); + UR_CALL(EventCreate(Context, nullptr, false, true, false, Event)); (*Event)->RefCountExternal++; ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent)); @@ -778,7 +778,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( // we dont have urEventCreate, so use this check for now to know that // the call comes from urEventCreate() if (NativeEvent == nullptr) { - UR_CALL(EventCreate(Context, nullptr, false, true, Event)); + UR_CALL(EventCreate(Context, nullptr, false, true, false, Event)); (*Event)->RefCountExternal++; ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent)); @@ -1057,9 +1057,11 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked, // ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, bool IsMultiDevice, bool HostVisible, + bool ForceDisableProfiling, ur_event_handle_t *RetEvent) { - bool ProfilingEnabled = !Queue || Queue->isProfilingEnabled(); + bool ProfilingEnabled = + ForceDisableProfiling ? 
false : (!Queue || Queue->isProfilingEnabled()); ur_device_handle_t Device = nullptr; diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp index c266de8c0d..a927fcf096 100644 --- a/source/adapters/level_zero/event.hpp +++ b/source/adapters/level_zero/event.hpp @@ -31,6 +31,7 @@ extern "C" { ur_result_t urEventReleaseInternal(ur_event_handle_t Event); ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, bool IsMultiDevice, bool HostVisible, + bool ForceDisableProfiling, ur_event_handle_t *RetEvent); } // extern "C" diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index 187f4f75f9..6b51416c1b 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -1519,7 +1519,7 @@ ur_result_t createEventAndAssociateQueue(ur_queue_handle_t Queue, if (*Event == nullptr) UR_CALL(EventCreate(Queue->Context, Queue, IsMultiDevice, - HostVisible.value(), Event)); + HostVisible.value(), false, Event)); (*Event)->UrQueue = Queue; (*Event)->CommandType = CommandType;