From 6d5860944fcc8dc08a4026fa63a61fc215c5cde7 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Tue, 5 Mar 2024 13:45:38 +0000 Subject: [PATCH 01/11] [CMake] Support external adapter source dirs Allow the configuration of an external source directory for any of the existing adapters. This is done via a CMake cache variable which, if set, is used as the `source_dir` argument to `add_subdirectory()`; the `binary_dir` argument will also be set to the same location as a normal in-tree build. This is done via the addition of the new `add_ur_adapter_subdirectory()` helper function which also hoists the logic to enable/disable each adapter. --- CMakeLists.txt | 10 ++++++++++ source/adapters/CMakeLists.txt | 26 +++++++++++++++++++++----- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c09b5f0a3c..e2b2f1dc86 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,6 +52,16 @@ set(UR_SYCL_LIBRARY_DIR "" CACHE PATH set(UR_CONFORMANCE_TARGET_TRIPLES "" CACHE STRING "List of sycl targets to build CTS device binaries for") set(UR_CONFORMANCE_AMD_ARCH "" CACHE STRING "AMD device target ID to build CTS binaries for") +set(UR_ADAPTER_LEVEL_ZERO_SOURCE_DIR "" CACHE PATH + "Path to external 'level_zero' adapter source dir") +set(UR_ADAPTER_OPENCL_SOURCE_DIR "" CACHE PATH + "Path to external 'opencl' adapter source dir") +set(UR_ADAPTER_CUDA_SOURCE_DIR "" CACHE PATH + "Path to external 'cuda' adapter source dir") +set(UR_ADAPTER_HIP_SOURCE_DIR "" CACHE PATH + "Path to external 'hip' adapter source dir") +set(UR_ADAPTER_NATIVE_CPU_SOURCE_DIR "" CACHE PATH + "Path to external 'native_cpu' adapter source dir") include(Assertions) diff --git a/source/adapters/CMakeLists.txt b/source/adapters/CMakeLists.txt index f0c9b71200..71b9baafa2 100644 --- a/source/adapters/CMakeLists.txt +++ b/source/adapters/CMakeLists.txt @@ -30,21 +30,37 @@ endfunction() add_subdirectory(null) +function(add_ur_adapter_subdirectory name) + string(TOUPPER ${name} 
NAME) + if(UR_ADAPTER_${NAME}_SOURCE_DIR) + if(NOT IS_DIRECTORY ${UR_ADAPTER_${NAME}_SOURCE_DIR}) + message(FATAL_ERROR + "UR_ADAPTER_${NAME}_SOURCE_DIR is not a directory: " + "${UR_ADAPTER_${NAME}_SOURCE_DIR}") + endif() + add_subdirectory( + "${UR_ADAPTER_${NAME}_SOURCE_DIR}" + "${CMAKE_CURRENT_BINARY_DIR}/${name}") + else() + add_subdirectory(${name}) + endif() +endfunction() + if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_ALL) - add_subdirectory(level_zero) + add_ur_adapter_subdirectory(level_zero) endif() if(UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_ALL) - add_subdirectory(cuda) + add_ur_adapter_subdirectory(cuda) endif() if(UR_BUILD_ADAPTER_HIP OR UR_BUILD_ADAPTER_ALL) - add_subdirectory(hip) + add_ur_adapter_subdirectory(hip) endif() if(UR_BUILD_ADAPTER_OPENCL OR UR_BUILD_ADAPTER_ALL) - add_subdirectory(opencl) + add_ur_adapter_subdirectory(opencl) endif() if(UR_BUILD_ADAPTER_NATIVE_CPU OR UR_BUILD_ADAPTER_ALL) - add_subdirectory(native_cpu) + add_ur_adapter_subdirectory(native_cpu) endif() From e9f855d478c25d6f1d75ad3d3687a3a7e4747297 Mon Sep 17 00:00:00 2001 From: "Neil R. Spruit" Date: Mon, 11 Mar 2024 10:58:35 -0700 Subject: [PATCH 02/11] [L0] Support for urUsmP2PPeerAccessGetInfoExp to query p2p access info Signed-off-by: Neil R. 
Spruit --- source/adapters/level_zero/usm_p2p.cpp | 37 +++++++++++++++++++++----- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/source/adapters/level_zero/usm_p2p.cpp b/source/adapters/level_zero/usm_p2p.cpp index dc59bbcc4b..7a9e4e3b9b 100644 --- a/source/adapters/level_zero/usm_p2p.cpp +++ b/source/adapters/level_zero/usm_p2p.cpp @@ -35,11 +35,36 @@ UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( ur_exp_peer_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { - std::ignore = commandDevice; - std::ignore = peerDevice; - std::ignore = propName; - UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); - // Zero return value indicates that all of the queries currently return false. - return ReturnValue(uint32_t{0}); + + bool propertyValue = false; + switch (propName) { + case UR_EXP_PEER_INFO_UR_PEER_ACCESS_SUPPORTED: { + bool p2pAccessSupported = false; + ze_device_p2p_properties_t p2pProperties; + ZE2UR_CALL(zeDeviceGetP2PProperties, + (commandDevice->ZeDevice, peerDevice->ZeDevice, &p2pProperties)); + if (p2pProperties.flags & ZE_DEVICE_P2P_PROPERTY_FLAG_ACCESS) { + p2pAccessSupported = true; + } + ze_bool_t p2pDeviceSupported = false; + ZE2UR_CALL( + zeDeviceCanAccessPeer, + (commandDevice->ZeDevice, peerDevice->ZeDevice, &p2pDeviceSupported)); + propertyValue = p2pAccessSupported && p2pDeviceSupported; + break; + } + case UR_EXP_PEER_INFO_UR_PEER_ATOMICS_SUPPORTED: { + ze_device_p2p_properties_t p2pProperties; + ZE2UR_CALL(zeDeviceGetP2PProperties, + (commandDevice->ZeDevice, peerDevice->ZeDevice, &p2pProperties)); + propertyValue = p2pProperties.flags & ZE_DEVICE_P2P_PROPERTY_FLAG_ATOMICS; + break; + } + default: { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + return ReturnValue(propertyValue); } From 0446c65e869d5098fa006d0035a5b00240400bee Mon Sep 17 00:00:00 2001 From: Konrad Kusiak Date: Wed, 21 Feb 2024 14:58:54 +0000 Subject: [PATCH 03/11] Fixed issue with function pointer 
typedefs for windows build --- source/adapters/cuda/tracing.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/source/adapters/cuda/tracing.cpp b/source/adapters/cuda/tracing.cpp index e3acf03165..1552e5e236 100644 --- a/source/adapters/cuda/tracing.cpp +++ b/source/adapters/cuda/tracing.cpp @@ -27,20 +27,20 @@ using tracing_event_t = xpti_td *; using subscriber_handle_t = CUpti_SubscriberHandle; -using cuptiSubscribe_fn = CUPTIAPI -CUptiResult (*)(CUpti_SubscriberHandle *subscriber, CUpti_CallbackFunc callback, - void *userdata); +using cuptiSubscribe_fn = + CUptiResult(CUPTIAPI *)(CUpti_SubscriberHandle *subscriber, + CUpti_CallbackFunc callback, void *userdata); -using cuptiUnsubscribe_fn = CUPTIAPI -CUptiResult (*)(CUpti_SubscriberHandle subscriber); +using cuptiUnsubscribe_fn = + CUptiResult(CUPTIAPI *)(CUpti_SubscriberHandle subscriber); -using cuptiEnableDomain_fn = CUPTIAPI -CUptiResult (*)(uint32_t enable, CUpti_SubscriberHandle subscriber, - CUpti_CallbackDomain domain); +using cuptiEnableDomain_fn = + CUptiResult(CUPTIAPI *)(uint32_t enable, CUpti_SubscriberHandle subscriber, + CUpti_CallbackDomain domain); -using cuptiEnableCallback_fn = CUPTIAPI -CUptiResult (*)(uint32_t enable, CUpti_SubscriberHandle subscriber, - CUpti_CallbackDomain domain, CUpti_CallbackId cbid); +using cuptiEnableCallback_fn = + CUptiResult(CUPTIAPI *)(uint32_t enable, CUpti_SubscriberHandle subscriber, + CUpti_CallbackDomain domain, CUpti_CallbackId cbid); #define LOAD_CUPTI_SYM(p, lib, x) \ p.x = (cupti##x##_fn)ur_loader::LibLoader::getFunctionPtr(lib.get(), \ From cb7b1262f3c0df42d9b16e67dc27b4fb75a301b9 Mon Sep 17 00:00:00 2001 From: Maosu Zhao Date: Tue, 12 Mar 2024 15:48:36 +0800 Subject: [PATCH 04/11] [OCL] Gracefully tear down adapter in case that some globals have been released Sometimes the adapter may exit before the SYCL library; if the SYCL library calls urAdapterRelease later, it may cause a segmentation fault since some globals have already been 
released. --- source/adapters/opencl/adapter.cpp | 43 +++++++++++++++++++++++------- source/adapters/opencl/common.hpp | 2 +- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/source/adapters/opencl/adapter.cpp b/source/adapters/opencl/adapter.cpp index 8ae1e77755..c9e0e8af20 100644 --- a/source/adapters/opencl/adapter.cpp +++ b/source/adapters/opencl/adapter.cpp @@ -10,23 +10,40 @@ #include "common.hpp" +#ifdef _WIN32 +#include +#endif + struct ur_adapter_handle_t_ { std::atomic RefCount = 0; std::mutex Mutex; }; -ur_adapter_handle_t_ adapter{}; +static ur_adapter_handle_t_ *adapter = new ur_adapter_handle_t_(); + +static void globalAdapterShutdown() { + if (cl_ext::ExtFuncPtrCache) { + delete cl_ext::ExtFuncPtrCache; + cl_ext::ExtFuncPtrCache = nullptr; + } + if (adapter) { + delete adapter; + adapter = nullptr; + } +} UR_APIEXPORT ur_result_t UR_APICALL urAdapterGet(uint32_t NumEntries, ur_adapter_handle_t *phAdapters, uint32_t *pNumAdapters) { if (NumEntries > 0 && phAdapters) { - std::lock_guard Lock{adapter.Mutex}; - if (adapter.RefCount++ == 0) { - cl_ext::ExtFuncPtrCache = std::make_unique(); + std::lock_guard Lock{adapter->Mutex}; + if (adapter->RefCount++ == 0) { + cl_ext::ExtFuncPtrCache = new cl_ext::ExtFuncPtrCacheT(); } - *phAdapters = &adapter; + *phAdapters = adapter; + + atexit(globalAdapterShutdown); } if (pNumAdapters) { @@ -37,14 +54,20 @@ urAdapterGet(uint32_t NumEntries, ur_adapter_handle_t *phAdapters, } UR_APIEXPORT ur_result_t UR_APICALL urAdapterRetain(ur_adapter_handle_t) { - ++adapter.RefCount; + ++adapter->RefCount; return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urAdapterRelease(ur_adapter_handle_t) { - std::lock_guard Lock{adapter.Mutex}; - if (--adapter.RefCount == 0) { - cl_ext::ExtFuncPtrCache.reset(); + // Check first if the adapter is valid pointer + if (adapter) { + std::lock_guard Lock{adapter->Mutex}; + if (--adapter->RefCount == 0) { + if (cl_ext::ExtFuncPtrCache) { + delete 
cl_ext::ExtFuncPtrCache; + cl_ext::ExtFuncPtrCache = nullptr; + } + } } return UR_RESULT_SUCCESS; } @@ -68,7 +91,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t, case UR_ADAPTER_INFO_BACKEND: return ReturnValue(UR_ADAPTER_BACKEND_OPENCL); case UR_ADAPTER_INFO_REFERENCE_COUNT: - return ReturnValue(adapter.RefCount.load()); + return ReturnValue(adapter->RefCount.load()); default: return UR_RESULT_ERROR_INVALID_ENUMERATION; } diff --git a/source/adapters/opencl/common.hpp b/source/adapters/opencl/common.hpp index 0667cd3d17..43d1c12b1e 100644 --- a/source/adapters/opencl/common.hpp +++ b/source/adapters/opencl/common.hpp @@ -349,7 +349,7 @@ struct ExtFuncPtrCacheT { // piTeardown to avoid issues with static destruction order (a user application // might have static objects that indirectly access this cache in their // destructor). -inline std::unique_ptr ExtFuncPtrCache; +inline ExtFuncPtrCacheT *ExtFuncPtrCache; // USM helper function to get an extension function pointer template From aaa0458b782ea6183b21ead1ff14a947afe40a30 Mon Sep 17 00:00:00 2001 From: Maosu Zhao Date: Tue, 12 Mar 2024 20:02:49 +0800 Subject: [PATCH 05/11] Remove unused header file --- source/adapters/opencl/adapter.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/source/adapters/opencl/adapter.cpp b/source/adapters/opencl/adapter.cpp index c9e0e8af20..c25319ce67 100644 --- a/source/adapters/opencl/adapter.cpp +++ b/source/adapters/opencl/adapter.cpp @@ -10,10 +10,6 @@ #include "common.hpp" -#ifdef _WIN32 -#include -#endif - struct ur_adapter_handle_t_ { std::atomic RefCount = 0; std::mutex Mutex; From 26682290a43125ba48e10978bfa6a4c063293030 Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Tue, 20 Feb 2024 16:52:34 +0000 Subject: [PATCH 06/11] [EXP][CMDBUF] Move event reset commands to dedicated cmd-list Create a new command-list to reset all the events of the enqueued graph cmd. 
This allows us to move the enqueueing of reset commands from the command-buffer enqueue function to the finalize function, and allows us to perform this task only once. Move signal event reset from main cmd-list to reset cmd-list. Move the documentation to DPC++ design doc. --- source/adapters/level_zero/command_buffer.cpp | 178 +++++++----------- source/adapters/level_zero/command_buffer.hpp | 18 +- 2 files changed, 82 insertions(+), 114 deletions(-) diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 7dc2a42fd6..ced2d0286b 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -10,88 +10,18 @@ #include "command_buffer.hpp" #include "ur_level_zero.hpp" -/* Command-buffer Extension - - The UR interface for submitting a UR command-buffer takes a list - of events to wait on, and returns an event representing the completion of - that particular submission of the command-buffer. - - However, in `zeCommandQueueExecuteCommandLists` there are no parameters to - take a waitlist and also the only sync primitive returned is to block on - host. - - In order to get the UR command-buffer enqueue semantics we want with L0 - this adapter adds extra commands to the L0 command-list representing a - UR command-buffer. - - Prefix - Commands added to the start of the L0 command-list by L0 adapter. - Suffix - Commands added to the end of the L0 command-list by L0 adapter. - - These extra commands operate on L0 event synchronisation primitives used by - the command-list to interact with the external UR wait-list and UR return - event required for the enqueue interface. - - The `ur_exp_command_buffer_handle_t` class for this adapter contains a - SignalEvent which signals the completion of the command-list in the suffix, - and is reset in the prefix. This signal is detected by a new UR return event - created on UR command-buffer enqueue. 
- - There is also a WaitEvent used by the `ur_exp_command_buffer_handle_t` class - in the prefix to wait on any dependencies passed in the enqueue wait-list. - This WaitEvent is reset at the end of the suffix, along with reset commands - to reset the L0 events used to implement the UR sync-points. - - ┌──────────┬────────────────────────────────────────────────┬─────────┐ - │ Prefix │ Commands added to UR command-buffer by UR user │ Suffix │ - └──────────┴────────────────────────────────────────────────┴─────────┘ - - ┌───────────────────┬──────────────┐──────────────────────────────┐ - Prefix │Reset signal event │ Reset events │ Barrier waiting on wait event│ - └───────────────────┴──────────────┘──────────────────────────────┘ - - ┌─────────────────────────────────────────────┐──────────────┐ - Suffix │Barrier waiting on sync-point event, │ Query CMD │ - │signaling the UR command-buffer signal event │ Timestamps │ - └─────────────────────────────────────────────┘──────────────┘ - - For a call to `urCommandBufferEnqueueExp` with an event_list `EL`, - command-buffer `CB`, and return event `RE` our implementation has to create - and submit two new command-lists for the above approach to work. One before - the command-list with extra commands associated with `CB`, and the other - after `CB`. - - Command-list created on `urCommandBufferEnqueueExp` to execution before `CB`: - ┌───────────────────────────────────────────────────────────┐ - │Barrier on `EL` than signals `CB` WaitEvent when completed │ - └───────────────────────────────────────────────────────────┘ - - Command-list created on `urCommandBufferEnqueueExp` to execution after `CB`: - ┌─────────────────────────────────────────────────────────────┐ - │Barrier on `CB` SignalEvent that signals `RE` when completed │ - └─────────────────────────────────────────────────────────────┘ - -Drawbacks ---------- - -There are two drawbacks to this approach: - -1. 
We use 3x the command-list resources, if there are many UR command-buffers -in flight, this may exhaust L0 driver resources. - -2. Each command list is submitted individually with a -`ur_queue_handle_t_::executeCommandList` call which introduces serialization in -the submission pipeline that is heavier than having a barrier or a -waitForEvents on the same list. Resulting in additional latency when executing -graphs. - +/* L0 Command-buffer Extension Doc see: +https://github.com/intel/llvm/blob/sycl/sycl/doc/design/CommandGraph.md#level-zero */ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( ur_context_handle_t Context, ur_device_handle_t Device, ze_command_list_handle_t CommandList, + ze_command_list_handle_t CommandListResetEvents, ZeStruct ZeDesc, const ur_exp_command_buffer_desc_t *Desc) : Context(Context), Device(Device), ZeCommandList(CommandList), + ZeCommandListResetEvents(CommandListResetEvents), ZeCommandListDesc(ZeDesc), ZeFencesList(), QueueProperties(), SyncPoints(), NextSyncPoint(0) { (void)Desc; @@ -114,6 +44,12 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandList)); } + // Release the memory allocated to the CommandListResetEvents stored in the + // command_buffer + if (ZeCommandListResetEvents) { + ZE_CALL_NOCHECK(zeCommandListDestroy, (ZeCommandListResetEvents)); + } + // Release additional signal and wait events used by command_buffer if (SignalEvent) { CleanupCompletedEvent(SignalEvent, false); @@ -123,6 +59,10 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { CleanupCompletedEvent(WaitEvent, false); urEventReleaseInternal(WaitEvent); } + if (AllResetEvent) { + CleanupCompletedEvent(AllResetEvent, false); + urEventReleaseInternal(AllResetEvent); + } // Release events added to the command_buffer for (auto &Sync : SyncPoints) { @@ -434,6 +374,13 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, ZeStruct 
ZeCommandListDesc; ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal; + + ze_command_list_handle_t ZeCommandListResetEvents; + // Create a command-list for reseting the events associated to enqueued cmd. + ZE2UR_CALL(zeCommandListCreate, + (Context->ZeContext, Device->ZeDevice, &ZeCommandListDesc, + &ZeCommandListResetEvents)); + // Dependencies between commands are explicitly enforced by sync points when // enqueuing. Consequently, relax the command ordering in the command list // can enable the backend to further optimize the workload @@ -446,7 +393,8 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, &ZeCommandListDesc, &ZeCommandList)); try { *CommandBuffer = new ur_exp_command_buffer_handle_t_( - Context, Device, ZeCommandList, ZeCommandListDesc, CommandBufferDesc); + Context, Device, ZeCommandList, ZeCommandListResetEvents, + ZeCommandListDesc, CommandBufferDesc); } catch (const std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } catch (...) 
{ @@ -460,13 +408,19 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, &RetCommandBuffer->SignalEvent)); UR_CALL(EventCreate(Context, nullptr, false, false, &RetCommandBuffer->WaitEvent)); + UR_CALL(EventCreate(Context, nullptr, false, false, + &RetCommandBuffer->AllResetEvent)); // Add prefix commands - ZE2UR_CALL(zeCommandListAppendEventReset, - (ZeCommandList, RetCommandBuffer->SignalEvent->ZeEvent)); + ZE2UR_CALL( + zeCommandListAppendEventReset, + (ZeCommandListResetEvents, RetCommandBuffer->SignalEvent->ZeEvent)); + std::vector PrecondEvents = { + RetCommandBuffer->WaitEvent->ZeEvent, + RetCommandBuffer->AllResetEvent->ZeEvent}; ZE2UR_CALL( zeCommandListAppendBarrier, - (ZeCommandList, nullptr, 1, &RetCommandBuffer->WaitEvent->ZeEvent)); + (ZeCommandList, nullptr, PrecondEvents.size(), PrecondEvents.data())); return UR_RESULT_SUCCESS; } @@ -488,20 +442,29 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t CommandBuffer) { UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) { // Create a list of events for our signal event to wait on + // This loop also resets the L0 events we use for command-buffer internal + // sync-points to the non-signaled state. + // This is required for multiple submissions. const size_t NumEvents = CommandBuffer->SyncPoints.size(); - std::vector WaitEventList{NumEvents}; for (size_t i = 0; i < NumEvents; i++) { - WaitEventList[i] = CommandBuffer->SyncPoints[i]->ZeEvent; + auto ZeEvent = CommandBuffer->SyncPoints[i]->ZeEvent; + CommandBuffer->ZeEventsList.push_back(ZeEvent); + ZE2UR_CALL(zeCommandListAppendEventReset, + (CommandBuffer->ZeCommandListResetEvents, ZeEvent)); } + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (CommandBuffer->ZeCommandListResetEvents, + CommandBuffer->AllResetEvent->ZeEvent)); // Wait for all the user added commands to complete, and signal the // command-buffer signal-event when they are done. 
ZE2UR_CALL(zeCommandListAppendBarrier, (CommandBuffer->ZeCommandList, CommandBuffer->SignalEvent->ZeEvent, - NumEvents, WaitEventList.data())); + NumEvents, CommandBuffer->ZeEventsList.data())); - // Close the command list and have it ready for dispatch. + // Close the command lists and have them ready for dispatch. ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandList)); + ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeCommandListResetEvents)); return UR_RESULT_SUCCESS; } @@ -875,26 +838,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence)); CommandBuffer->ZeFencesList.push_back(ZeFence); - // Create command-list to execute before `CommandListPtr` and will signal - // when `EventWaitList` dependencies are complete. - ur_command_list_ptr_t WaitCommandList{}; - UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList, false, - false)); - - // Create a list of events of all the events that compose the command buffer - // workload. - // This loop also resets the L0 events we use for command-buffer internal - // sync-points to the non-signaled state. - // This is required for multiple submissions. - const size_t NumEvents = CommandBuffer->SyncPoints.size(); - std::vector WaitEventList{NumEvents}; - for (size_t i = 0; i < NumEvents; i++) { - auto ZeEvent = CommandBuffer->SyncPoints[i]->ZeEvent; - WaitEventList[i] = ZeEvent; - ZE2UR_CALL(zeCommandListAppendEventReset, - (WaitCommandList->first, ZeEvent)); - } - bool MustSignalWaitEvent = true; if (NumEventsInWaitList) { _ur_ze_event_list_t TmpWaitList; @@ -909,18 +852,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( CommandBuffer->WaitEvent->WaitList.insert(TmpWaitList); if (!CommandBuffer->WaitEvent->WaitList.isEmpty()) { + // Create command-list to execute before `CommandListPtr` and will signal + // when `EventWaitList` dependencies are complete. 
+ ur_command_list_ptr_t WaitCommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList(Queue, WaitCommandList, + false, false)); + ZE2UR_CALL(zeCommandListAppendBarrier, (WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent, CommandBuffer->WaitEvent->WaitList.Length, CommandBuffer->WaitEvent->WaitList.ZeEventList)); + Queue->executeCommandList(WaitCommandList, false, false); MustSignalWaitEvent = false; } } if (MustSignalWaitEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent)); + ZE2UR_CALL(zeEventHostSignal, (CommandBuffer->WaitEvent->ZeEvent)); } - Queue->executeCommandList(WaitCommandList, false, false); + + // Submit reset events command-list. This command-list is of a batch + // command-list type, regardless of the UR Queue type. We therefore need to + // submit the list directly using the Level-Zero API to avoid type mismatches + // if using UR functions. + ZE2UR_CALL( + zeCommandQueueExecuteCommandLists, + (ZeCommandQueue, 1, &CommandBuffer->ZeCommandListResetEvents, nullptr)); // Submit main command-list. This command-list is of a batch command-list // type, regardless of the UR Queue type. We therefore need to submit the list @@ -940,6 +896,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( // submission dependencies have been satisfied. ZE2UR_CALL(zeCommandListAppendEventReset, (SignalCommandList->first, CommandBuffer->WaitEvent->ZeEvent)); + // Reset the all-reset-event for the UR command-buffer that is signaled when + // all events of the main command-list have been reset. + ZE2UR_CALL(zeCommandListAppendEventReset, + (SignalCommandList->first, CommandBuffer->AllResetEvent->ZeEvent)); if (Event) { UR_CALL(createEventAndAssociateQueue( @@ -955,14 +915,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( // engine to recover these timestamps. 
command_buffer_profiling_t *Profiling = new command_buffer_profiling_t(); - Profiling->NumEvents = WaitEventList.size(); + Profiling->NumEvents = CommandBuffer->ZeEventsList.size(); Profiling->Timestamps = new ze_kernel_timestamp_result_t[Profiling->NumEvents]; ZE2UR_CALL(zeCommandListAppendQueryKernelTimestamps, - (SignalCommandList->first, WaitEventList.size(), - WaitEventList.data(), (void *)Profiling->Timestamps, 0, - RetEvent->ZeEvent, 1, + (SignalCommandList->first, CommandBuffer->ZeEventsList.size(), + CommandBuffer->ZeEventsList.data(), + (void *)Profiling->Timestamps, 0, RetEvent->ZeEvent, 1, &(CommandBuffer->SignalEvent->ZeEvent))); RetEvent->CommandData = static_cast(Profiling); diff --git a/source/adapters/level_zero/command_buffer.hpp b/source/adapters/level_zero/command_buffer.hpp index 9fa0149d22..843d9d3f37 100644 --- a/source/adapters/level_zero/command_buffer.hpp +++ b/source/adapters/level_zero/command_buffer.hpp @@ -25,11 +25,12 @@ struct command_buffer_profiling_t { }; struct ur_exp_command_buffer_handle_t_ : public _ur_object { - ur_exp_command_buffer_handle_t_(ur_context_handle_t Context, - ur_device_handle_t Device, - ze_command_list_handle_t CommandList, - ZeStruct ZeDesc, - const ur_exp_command_buffer_desc_t *Desc); + ur_exp_command_buffer_handle_t_( + ur_context_handle_t Context, ur_device_handle_t Device, + ze_command_list_handle_t CommandList, + ze_command_list_handle_t CommandListResetEvents, + ZeStruct ZeDesc, + const ur_exp_command_buffer_desc_t *Desc); ~ur_exp_command_buffer_handle_t_(); @@ -49,6 +50,8 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { ur_device_handle_t Device; // Level Zero command list handle ze_command_list_handle_t ZeCommandList; + // Level Zero command list handle + ze_command_list_handle_t ZeCommandListResetEvents; // Level Zero command list descriptor ZeStruct ZeCommandListDesc; // List of Level Zero fences created when submitting a graph. 
@@ -64,10 +67,15 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object { // Next sync_point value (may need to consider ways to reuse values if 32-bits // is not enough) ur_exp_command_buffer_sync_point_t NextSyncPoint; + // List of Level Zero events associated to submitted commands. + std::vector ZeEventsList; // Event which will signals the most recent execution of the command-buffer // has finished ur_event_handle_t SignalEvent = nullptr; // Event which a command-buffer waits on until the wait-list dependencies // passed to a command-buffer enqueue have been satisfied. ur_event_handle_t WaitEvent = nullptr; + // Event which a command-buffer waits on until the main command-list event + // have been reset. + ur_event_handle_t AllResetEvent = nullptr; }; From aa98f7acd8b8eb99c4b2927f10a6e29df9f6bec4 Mon Sep 17 00:00:00 2001 From: Maosu Zhao Date: Wed, 13 Mar 2024 13:32:32 +0800 Subject: [PATCH 07/11] Fix pre-ci failures --- source/adapters/opencl/adapter.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/source/adapters/opencl/adapter.cpp b/source/adapters/opencl/adapter.cpp index c25319ce67..4c1bb6bca1 100644 --- a/source/adapters/opencl/adapter.cpp +++ b/source/adapters/opencl/adapter.cpp @@ -15,7 +15,7 @@ struct ur_adapter_handle_t_ { std::mutex Mutex; }; -static ur_adapter_handle_t_ *adapter = new ur_adapter_handle_t_(); +static ur_adapter_handle_t_ *adapter = nullptr; static void globalAdapterShutdown() { if (cl_ext::ExtFuncPtrCache) { @@ -32,14 +32,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGet(uint32_t NumEntries, ur_adapter_handle_t *phAdapters, uint32_t *pNumAdapters) { if (NumEntries > 0 && phAdapters) { + // Sometimes urAdaterGet may be called after the library already been torn + // down, we also need to create a temporary handle for it. 
+ if (!adapter) { + adapter = new ur_adapter_handle_t_(); + atexit(globalAdapterShutdown); + } + std::lock_guard Lock{adapter->Mutex}; if (adapter->RefCount++ == 0) { cl_ext::ExtFuncPtrCache = new cl_ext::ExtFuncPtrCacheT(); } *phAdapters = adapter; - - atexit(globalAdapterShutdown); } if (pNumAdapters) { From 3ca422a21e9f230fac614b53d237877a1c845498 Mon Sep 17 00:00:00 2001 From: Sean Stirling Date: Fri, 19 Jan 2024 16:30:26 +0000 Subject: [PATCH 08/11] [Bindless][CUDA] Mipmap interop Extends the CUDA adapter to allow for mipmap interop with bindless images --- source/adapters/cuda/image.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/source/adapters/cuda/image.cpp b/source/adapters/cuda/image.cpp index dc08af248a..8d2610626e 100644 --- a/source/adapters/cuda/image.cpp +++ b/source/adapters/cuda/image.cpp @@ -1006,17 +1006,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp( ArrayDesc.Format = format; CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC mipmapDesc = {}; - mipmapDesc.numLevels = 1; + mipmapDesc.numLevels = pImageDesc->numMipLevel; mipmapDesc.arrayDesc = ArrayDesc; + // External memory is mapped to a CUmipmappedArray + // If desired, a CUarray is retrieved from the mipmaps 0th level CUmipmappedArray memMipMap; UR_CHECK_ERROR(cuExternalMemoryGetMappedMipmappedArray( &memMipMap, (CUexternalMemory)hInteropMem, &mipmapDesc)); - CUarray memArray; - UR_CHECK_ERROR(cuMipmappedArrayGetLevel(&memArray, memMipMap, 0)); + if (pImageDesc->numMipLevel > 1) { + *phImageMem = (ur_exp_image_mem_handle_t)memMipMap; + } else { + CUarray memArray; + UR_CHECK_ERROR(cuMipmappedArrayGetLevel(&memArray, memMipMap, 0)); - *phImageMem = (ur_exp_image_mem_handle_t)memArray; + *phImageMem = (ur_exp_image_mem_handle_t)memArray; + } } catch (ur_result_t Err) { return Err; From 8714b853229ee6b34c2abd17ba4c7f443e410a50 Mon Sep 17 00:00:00 2001 From: Patryk Kaminski Date: Wed, 13 Mar 2024 17:31:12 +0100 Subject: [PATCH 09/11] 
Disable fuzz tests on ubuntu-22.04 runner --- .github/workflows/cmake.yml | 89 ++++++++++++++++++----------------- .github/workflows/nightly.yml | 7 +-- 2 files changed, 49 insertions(+), 47 deletions(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index d8650eb29a..8ce93b3867 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -115,50 +115,51 @@ jobs: working-directory: ${{github.workspace}}/build run: ctest -C ${{matrix.build_type}} --output-on-failure -L "umf|loader|validation|tracing|unit|urtrace" - fuzztest-build: - name: Build and run quick fuzztest scenarios - strategy: - matrix: - build_type: [Debug, Release] - compiler: [{c: clang, cxx: clang++}] - - runs-on: 'ubuntu-22.04' - - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - name: Install pip packages - run: pip install -r third_party/requirements.txt - - - name: Download DPC++ - run: | - sudo apt install libncurses5 - wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/sycl-nightly%2F20230626/dpcpp-compiler.tar.gz - tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz - - - name: Setup DPC++ - run: | - source ${{github.workspace}}/dpcpp_compiler/startup.sh - - - name: Configure CMake - run: > - cmake - -B${{github.workspace}}/build - -DCMAKE_C_COMPILER=${{matrix.compiler.c}} - -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} - -DUR_ENABLE_TRACING=ON - -DCMAKE_BUILD_TYPE=${{matrix.build_type}} - -DUR_BUILD_TESTS=ON - -DUR_USE_ASAN=ON - -DUR_USE_UBSAN=ON - -DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++ - - - name: Build - run: cmake --build ${{github.workspace}}/build -j $(nproc) - - - name: Fuzz test - working-directory: ${{github.workspace}}/build - run: ctest -C ${{matrix.build_type}} --output-on-failure -L "fuzz-short" + # Disable short fuzz tests until the ubuntu-22.04 runner is fixed + # fuzztest-build: + # name: Build and run quick fuzztest scenarios 
+ # strategy: + # matrix: + # build_type: [Debug, Release] + # compiler: [{c: clang, cxx: clang++}] + + # runs-on: 'ubuntu-22.04' + + # steps: + # - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + + # - name: Install pip packages + # run: pip install -r third_party/requirements.txt + + # - name: Download DPC++ + # run: | + # sudo apt install libncurses5 + # wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/sycl-nightly%2F20230626/dpcpp-compiler.tar.gz + # tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz + + # - name: Setup DPC++ + # run: | + # source ${{github.workspace}}/dpcpp_compiler/startup.sh + + # - name: Configure CMake + # run: > + # cmake + # -B${{github.workspace}}/build + # -DCMAKE_C_COMPILER=${{matrix.compiler.c}} + # -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} + # -DUR_ENABLE_TRACING=ON + # -DCMAKE_BUILD_TYPE=${{matrix.build_type}} + # -DUR_BUILD_TESTS=ON + # -DUR_USE_ASAN=ON + # -DUR_USE_UBSAN=ON + # -DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++ + + # - name: Build + # run: cmake --build ${{github.workspace}}/build -j $(nproc) + + # - name: Fuzz test + # working-directory: ${{github.workspace}}/build + # run: ctest -C ${{matrix.build_type}} --output-on-failure -L "fuzz-short" adapter-build-hw: name: Build - Adapters on HW diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 3deade6603..311fdac40a 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -48,6 +48,7 @@ jobs: LD_LIBRARY_PATH=${{github.workspace}}/dpcpp_compiler/lib cmake --build ${{github.workspace}}/build -j $(nproc) - - name: Fuzz long test - working-directory: ${{github.workspace}}/build - run: ctest -C ${{matrix.build_type}} --output-on-failure -L "fuzz-long" + # Disable long fuzz tests until the ubuntu-22.04 runner is fixed + # - name: Fuzz long test + # working-directory: ${{github.workspace}}/build + # run: ctest -C 
${{matrix.build_type}} --output-on-failure -L "fuzz-long" From b064aef6acf1009705c8f6d77bda27606c3bcc9d Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Fri, 2 Feb 2024 16:34:25 +0000 Subject: [PATCH 10/11] [SYCL][Graph] Improve CUDA Fill op implementation. Adjustment of value pointer size according to pattern size. Large patterns are now broken into 1-byte chunks, as in the regular implementation. --- source/adapters/cuda/command_buffer.cpp | 77 ++++++++++++++++++++----- 1 file changed, 61 insertions(+), 16 deletions(-) diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index d3f270c701..f1f7507e21 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -170,7 +170,6 @@ static ur_result_t enqueueCommandBufferFillHelper( try { const size_t N = Size / PatternSize; - auto Value = *static_cast(Pattern); auto DstPtr = DstType == CU_MEMORYTYPE_DEVICE ? *static_cast(DstDevice) : (CUdeviceptr)DstDevice; @@ -183,9 +182,27 @@ static ur_result_t enqueueCommandBufferFillHelper( NodeParams.elementSize = PatternSize; NodeParams.height = N; NodeParams.pitch = PatternSize; - NodeParams.value = Value; NodeParams.width = 1; + // pattern size in bytes + switch (PatternSize) { + case 1: { + auto Value = *static_cast(Pattern); + NodeParams.value = Value; + break; + } + case 2: { + auto Value = *static_cast(Pattern); + NodeParams.value = Value; + break; + } + case 4: { + auto Value = *static_cast(Pattern); + NodeParams.value = Value; + break; + } + } + UR_CHECK_ERROR(cuGraphAddMemsetNode( &GraphNode, CommandBuffer->CudaGraph, DepsList.data(), DepsList.size(), &NodeParams, CommandBuffer->Device->getContext())); @@ -198,29 +215,54 @@ static ur_result_t enqueueCommandBufferFillHelper( // CUDA has no memset functions that allow setting values more than 4 // bytes. UR API lets you pass an arbitrary "pattern" to the buffer // fill, which can be more than 4 bytes. 
We must break up the pattern - // into 4 byte values, and set the buffer using multiple strided calls. - // This means that one cuGraphAddMemsetNode call is made for every 4 bytes - // in the pattern. + // into 1 byte values, and set the buffer using multiple strided calls. + // This means that one cuGraphAddMemsetNode call is made for every 1 + // byte in the pattern. + + size_t NumberOfSteps = PatternSize / sizeof(uint8_t); - size_t NumberOfSteps = PatternSize / sizeof(uint32_t); + // Shared pointer that will point to the last node created + std::shared_ptr GraphNodePtr; + // Create a new node + CUgraphNode GraphNodeFirst; + // Update NodeParam + CUDA_MEMSET_NODE_PARAMS NodeParamsStepFirst = {}; + NodeParamsStepFirst.dst = DstPtr; + NodeParamsStepFirst.elementSize = sizeof(uint32_t); + NodeParamsStepFirst.height = Size / sizeof(uint32_t); + NodeParamsStepFirst.pitch = sizeof(uint32_t); + NodeParamsStepFirst.value = *static_cast(Pattern); + NodeParamsStepFirst.width = 1; - // we walk up the pattern in 4-byte steps, and call cuMemset for each - // 4-byte chunk of the pattern. - for (auto Step = 0u; Step < NumberOfSteps; ++Step) { + UR_CHECK_ERROR(cuGraphAddMemsetNode( + &GraphNodeFirst, CommandBuffer->CudaGraph, DepsList.data(), + DepsList.size(), &NodeParamsStepFirst, + CommandBuffer->Device->getContext())); + + // Get sync point and register the cuNode with it. + *SyncPoint = CommandBuffer->AddSyncPoint( + std::make_shared(GraphNodeFirst)); + + DepsList.clear(); + DepsList.push_back(GraphNodeFirst); + + // we walk up the pattern in 1-byte steps, and call cuMemset for each + // 1-byte chunk of the pattern. 
+ for (auto Step = 4u; Step < NumberOfSteps; ++Step) { // take 4 bytes of the pattern - auto Value = *(static_cast(Pattern) + Step); + auto Value = *(static_cast(Pattern) + Step); // offset the pointer to the part of the buffer we want to write to - auto OffsetPtr = DstPtr + (Step * sizeof(uint32_t)); + auto OffsetPtr = DstPtr + (Step * sizeof(uint8_t)); // Create a new node CUgraphNode GraphNode; // Update NodeParam CUDA_MEMSET_NODE_PARAMS NodeParamsStep = {}; NodeParamsStep.dst = (CUdeviceptr)OffsetPtr; - NodeParamsStep.elementSize = 4; - NodeParamsStep.height = N; - NodeParamsStep.pitch = PatternSize; + NodeParamsStep.elementSize = sizeof(uint8_t); + NodeParamsStep.height = Size / NumberOfSteps; + NodeParamsStep.pitch = NumberOfSteps * sizeof(uint8_t); NodeParamsStep.value = Value; NodeParamsStep.width = 1; @@ -229,9 +271,12 @@ static ur_result_t enqueueCommandBufferFillHelper( DepsList.size(), &NodeParamsStep, CommandBuffer->Device->getContext())); + GraphNodePtr = std::make_shared(GraphNode); // Get sync point and register the cuNode with it. - *SyncPoint = CommandBuffer->addSyncPoint( - std::make_shared(GraphNode)); + *SyncPoint = CommandBuffer->AddSyncPoint(GraphNodePtr); + + DepsList.clear(); + DepsList.push_back(*GraphNodePtr.get()); } } } catch (ur_result_t Err) { From ef72b3f8b593c5585e842e9307969e1b1e5df09e Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Mon, 11 Mar 2024 15:02:11 +0000 Subject: [PATCH 11/11] Fixup rebase issue --- source/adapters/cuda/command_buffer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/adapters/cuda/command_buffer.cpp b/source/adapters/cuda/command_buffer.cpp index f1f7507e21..d9d980073a 100644 --- a/source/adapters/cuda/command_buffer.cpp +++ b/source/adapters/cuda/command_buffer.cpp @@ -240,7 +240,7 @@ static ur_result_t enqueueCommandBufferFillHelper( CommandBuffer->Device->getContext())); // Get sync point and register the cuNode with it. 
- *SyncPoint = CommandBuffer->AddSyncPoint( + *SyncPoint = CommandBuffer->addSyncPoint( std::make_shared(GraphNodeFirst)); DepsList.clear(); @@ -273,7 +273,7 @@ static ur_result_t enqueueCommandBufferFillHelper( GraphNodePtr = std::make_shared(GraphNode); // Get sync point and register the cuNode with it. - *SyncPoint = CommandBuffer->AddSyncPoint(GraphNodePtr); + *SyncPoint = CommandBuffer->addSyncPoint(GraphNodePtr); DepsList.clear(); DepsList.push_back(*GraphNodePtr.get());