diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 377a1a2500..a62f865427 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -164,7 +164,6 @@ jobs: matrix: adapter: [ {name: CUDA, triplet: nvptx64-nvidia-cuda}, - {name: HIP, triplet: amdgcn-amd-amdhsa}, {name: L0, triplet: spir64} ] build_type: [Debug, Release] @@ -209,15 +208,7 @@ jobs: working-directory: ${{github.workspace}}/build run: ctest -C ${{matrix.build_type}} --output-on-failure -L "adapter-specific" --timeout 180 - # Temporarily disabling platform test for L0, because of hang - # See issue: #824 - - name: Test L0 adapter - if: matrix.adapter.name == 'L0' - working-directory: ${{github.workspace}}/build - run: ctest -C ${{matrix.build_type}} --output-on-failure -L "conformance" -E "platform-adapter_level_zero" --timeout 180 - - name: Test adapters - if: matrix.adapter.name != 'L0' working-directory: ${{github.workspace}}/build run: ctest -C ${{matrix.build_type}} --output-on-failure -L "conformance" --timeout 180 diff --git a/CMakeLists.txt b/CMakeLists.txt index 994848a008..0e5d1dc877 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception cmake_minimum_required(VERSION 3.14.0 FATAL_ERROR) -project(unified-runtime VERSION 0.8.6) +project(unified-runtime VERSION 0.8.7) include(GNUInstallDirs) include(CheckCXXSourceCompiles) diff --git a/source/adapters/level_zero/adapter.cpp b/source/adapters/level_zero/adapter.cpp index 5b9f39e743..1097101035 100644 --- a/source/adapters/level_zero/adapter.cpp +++ b/source/adapters/level_zero/adapter.cpp @@ -11,8 +11,6 @@ #include "adapter.hpp" #include "ur_level_zero.hpp" -ur_adapter_handle_t_ Adapter{}; - UR_APIEXPORT ur_result_t UR_APICALL urInit(ur_device_init_flags_t DeviceFlags, ///< [in] device initialization flags. @@ -24,15 +22,95 @@ urInit(ur_device_init_flags_t return UR_RESULT_SUCCESS; } -ur_result_t adapterStateTeardown() { - // reclaim ur_platform_handle_t objects here since we don't have - // urPlatformRelease. - for (ur_platform_handle_t Platform : *URPlatformsCache) { - delete Platform; +ur_result_t initPlatforms(PlatformVec &platforms) noexcept try { + uint32_t ZeDriverCount = 0; + ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, nullptr)); + if (ZeDriverCount == 0) { + return UR_RESULT_SUCCESS; + } + + std::vector ZeDrivers; + ZeDrivers.resize(ZeDriverCount); + + ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, ZeDrivers.data())); + for (uint32_t I = 0; I < ZeDriverCount; ++I) { + auto platform = std::make_unique(ZeDrivers[I]); + UR_CALL(platform->initialize()); + + // Save a copy in the cache for future uses. + platforms.push_back(std::move(platform)); } - delete URPlatformsCache; - delete URPlatformsCacheMutex; + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t adapterStateInit() { return UR_RESULT_SUCCESS; } + +ur_adapter_handle_t_::ur_adapter_handle_t_() { + + Adapter.PlatformCache.Compute = [](Result &result) { + static std::once_flag ZeCallCountInitialized; + try { + std::call_once(ZeCallCountInitialized, []() { + if (UrL0LeaksDebug) { + ZeCallCount = new std::map; + } + }); + } catch (...) { + result = exceptionToResult(std::current_exception()); + return; + } + + // initialize level zero only once. + if (Adapter.ZeResult == std::nullopt) { + // Setting these environment variables before running zeInit will enable + // the validation layer in the Level Zero loader. + if (UrL0Debug & UR_L0_DEBUG_VALIDATION) { + setEnvVar("ZE_ENABLE_VALIDATION_LAYER", "1"); + setEnvVar("ZE_ENABLE_PARAMETER_VALIDATION", "1"); + } + if (getenv("SYCL_ENABLE_PCI") != nullptr) { + urPrint( + "WARNING: SYCL_ENABLE_PCI is deprecated and no longer needed.\n"); + } + + // TODO: We can still safely recover if something goes wrong during the + // init. Implement handling segfault using sigaction. + + // We must only initialize the driver once, even if urPlatformGet() is + // called multiple times. Declaring the return value as "static" ensures + // it's only called once. + Adapter.ZeResult = ZE_CALL_NOCHECK(zeInit, (ZE_INIT_FLAG_GPU_ONLY)); + } + assert(Adapter.ZeResult != + std::nullopt); // verify that level-zero is initialized + PlatformVec platforms; + + // Absorb the ZE_RESULT_ERROR_UNINITIALIZED and just return 0 Platforms. + if (*Adapter.ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { + result = std::move(platforms); + return; + } + if (*Adapter.ZeResult != ZE_RESULT_SUCCESS) { + urPrint("zeInit: Level Zero initialization failure\n"); + result = ze2urResult(*Adapter.ZeResult); + return; + } + + ur_result_t err = initPlatforms(platforms); + if (err == UR_RESULT_SUCCESS) { + result = std::move(platforms); + } else { + result = err; + } + }; +} + +ur_adapter_handle_t_ Adapter{}; + +ur_result_t adapterStateTeardown() { bool LeakFound = false; // Print the balance of various create/destroy native calls. @@ -144,9 +222,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGet( ) { if (NumEntries > 0 && Adapters) { std::lock_guard Lock{Adapter.Mutex}; - // TODO: Some initialization that happens in urPlatformsGet could be moved - // here for when RefCount reaches 1 - Adapter.RefCount++; + if (Adapter.RefCount++ == 0) { + adapterStateInit(); + } *Adapters = &Adapter; } diff --git a/source/adapters/level_zero/adapter.hpp b/source/adapters/level_zero/adapter.hpp index 22bb032d75..0942db852a 100644 --- a/source/adapters/level_zero/adapter.hpp +++ b/source/adapters/level_zero/adapter.hpp @@ -10,10 +10,19 @@ #include #include +#include +#include +#include + +using PlatformVec = std::vector>; struct ur_adapter_handle_t_ { + ur_adapter_handle_t_(); std::atomic RefCount = 0; std::mutex Mutex; + + std::optional ZeResult; + ZeCache> PlatformCache; }; extern ur_adapter_handle_t_ Adapter; diff --git a/source/adapters/level_zero/command_buffer.cpp b/source/adapters/level_zero/command_buffer.cpp index 7ba3cfae4d..cb5fcc72d9 100644 --- a/source/adapters/level_zero/command_buffer.cpp +++ b/source/adapters/level_zero/command_buffer.cpp @@ -295,7 +295,8 @@ static ur_result_t enqueueCommandBufferMemCopyHelper( SyncPointWaitList, ZeEventList)); ur_event_handle_t LaunchEvent; - UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, &LaunchEvent)); + UR_CALL( + EventCreate(CommandBuffer->Context, nullptr, false, false, &LaunchEvent)); LaunchEvent->CommandType = CommandType; // Get sync point and register the event with it. @@ -360,7 +361,8 @@ static ur_result_t enqueueCommandBufferMemCopyRectHelper( SyncPointWaitList, ZeEventList)); ur_event_handle_t LaunchEvent; - UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, &LaunchEvent)); + UR_CALL( + EventCreate(CommandBuffer->Context, nullptr, false, false, &LaunchEvent)); LaunchEvent->CommandType = CommandType; // Get sync point and register the event with it. @@ -409,8 +411,10 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device, // Create signal & wait events to be used in the command-list for sync // on command-buffer enqueue. auto RetCommandBuffer = *CommandBuffer; - UR_CALL(EventCreate(Context, nullptr, false, &RetCommandBuffer->SignalEvent)); - UR_CALL(EventCreate(Context, nullptr, false, &RetCommandBuffer->WaitEvent)); + UR_CALL(EventCreate(Context, nullptr, false, false, + &RetCommandBuffer->SignalEvent)); + UR_CALL(EventCreate(Context, nullptr, false, false, + &RetCommandBuffer->WaitEvent)); // Add prefix commands ZE2UR_CALL(zeCommandListAppendEventReset, @@ -519,7 +523,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp( UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList, SyncPointWaitList, ZeEventList)); ur_event_handle_t LaunchEvent; - UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, &LaunchEvent)); + UR_CALL( + EventCreate(CommandBuffer->Context, nullptr, false, false, &LaunchEvent)); LaunchEvent->CommandType = UR_COMMAND_KERNEL_LAUNCH; // Get sync point and register the event with it. @@ -754,12 +759,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp( // Create a command-list to signal RetEvent on completion ur_command_list_ptr_t SignalCommandList{}; if (Event) { - UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList, - false, false)); - - UR_CALL(createEventAndAssociateQueue(Queue, &RetEvent, - UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, - SignalCommandList, false)); + UR_CALL(createEventAndAssociateQueue( + Queue, &RetEvent, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP, + SignalCommandList, false, false, true)); ZE2UR_CALL(zeCommandListAppendBarrier, (SignalCommandList->first, RetEvent->ZeEvent, 1, diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index 2bd893b043..f36442b491 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -471,12 +471,17 @@ static const uint32_t MaxNumEventsPerPool = [] { ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( ze_event_pool_handle_t &Pool, size_t &Index, bool HostVisible, - bool ProfilingEnabled) { + bool ProfilingEnabled, ur_device_handle_t Device) { // Lock while updating event pool machinery. std::scoped_lock Lock(ZeEventPoolCacheMutex); + ze_device_handle_t ZeDevice = nullptr; + + if (Device) { + ZeDevice = Device->ZeDevice; + } std::list *ZePoolCache = - getZeEventPoolCache(HostVisible, ProfilingEnabled); + getZeEventPoolCache(HostVisible, ProfilingEnabled, ZeDevice); if (!ZePoolCache->empty()) { if (NumEventsAvailableInEventPool[ZePoolCache->front()] == 0) { @@ -511,9 +516,14 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( urPrint("ze_event_pool_desc_t flags set to: %d\n", ZeEventPoolDesc.flags); std::vector ZeDevices; - std::for_each( - Devices.begin(), Devices.end(), - [&](const ur_device_handle_t &D) { ZeDevices.push_back(D->ZeDevice); }); + if (ZeDevice) { + ZeDevices.push_back(ZeDevice); + } else { + std::for_each(Devices.begin(), Devices.end(), + [&](const ur_device_handle_t &D) { + ZeDevices.push_back(D->ZeDevice); + }); + } ZE2UR_CALL(zeEventPoolCreate, (ZeContext, &ZeEventPoolDesc, ZeDevices.size(), &ZeDevices[0], ZePool)); @@ -528,11 +538,10 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool( return UR_RESULT_SUCCESS; } -ur_event_handle_t -ur_context_handle_t_::getEventFromContextCache(bool HostVisible, - bool WithProfiling) { +ur_event_handle_t ur_context_handle_t_::getEventFromContextCache( + bool HostVisible, bool WithProfiling, ur_device_handle_t Device) { std::scoped_lock Lock(EventCacheMutex); - auto Cache = getEventCache(HostVisible, WithProfiling); + auto Cache = getEventCache(HostVisible, WithProfiling, Device); if (Cache->empty()) return nullptr; @@ -546,8 +555,14 @@ ur_context_handle_t_::getEventFromContextCache(bool HostVisible, void ur_context_handle_t_::addEventToContextCache(ur_event_handle_t Event) { std::scoped_lock Lock(EventCacheMutex); - auto Cache = - getEventCache(Event->isHostVisible(), Event->isProfilingEnabled()); + ur_device_handle_t Device = nullptr; + + if (!Event->IsMultiDevice && Event->UrQueue) { + Device = Event->UrQueue->Device; + } + + auto Cache = getEventCache(Event->isHostVisible(), + Event->isProfilingEnabled(), Device); Cache->emplace_back(Event); } @@ -562,8 +577,14 @@ ur_context_handle_t_::decrementUnreleasedEventsInPool(ur_event_handle_t Event) { return UR_RESULT_SUCCESS; } - std::list *ZePoolCache = - getZeEventPoolCache(Event->isHostVisible(), Event->isProfilingEnabled()); + ze_device_handle_t ZeDevice = nullptr; + + if (!Event->IsMultiDevice && Event->UrQueue) { + ZeDevice = Event->UrQueue->Device->ZeDevice; + } + + std::list *ZePoolCache = getZeEventPoolCache( + Event->isHostVisible(), Event->isProfilingEnabled(), ZeDevice); // Put the empty pool to the cache of the pools. if (NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0) diff --git a/source/adapters/level_zero/context.hpp b/source/adapters/level_zero/context.hpp index 96935d470e..2c80ff0e33 100644 --- a/source/adapters/level_zero/context.hpp +++ b/source/adapters/level_zero/context.hpp @@ -142,6 +142,9 @@ struct ur_context_handle_t_ : _ur_object { // // Cache of event pools to which host-visible events are added to. std::vector> ZeEventPoolCache{4}; + std::vector *>> + ZeEventPoolCacheDeviceMap{4}; // This map will be used to determine if a pool is full or not // by storing number of empty slots available in the pool. @@ -163,6 +166,9 @@ struct ur_context_handle_t_ : _ur_object { // Caches for events. std::vector> EventCaches{4}; + std::vector< + std::unordered_map *>> + EventCachesDeviceMap{4}; // Initialize the PI context. ur_result_t initialize(); @@ -188,20 +194,46 @@ struct ur_context_handle_t_ : _ur_object { // slot for an event with profiling capabilities. ur_result_t getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &, size_t &, bool HostVisible, - bool ProfilingEnabled); + bool ProfilingEnabled, + ur_device_handle_t Device); // Get ur_event_handle_t from cache. ur_event_handle_t getEventFromContextCache(bool HostVisible, - bool WithProfiling); + bool WithProfiling, + ur_device_handle_t Device); // Add ur_event_handle_t to cache. void addEventToContextCache(ur_event_handle_t); - auto getZeEventPoolCache(bool HostVisible, bool WithProfiling) { - if (HostVisible) - return WithProfiling ? &ZeEventPoolCache[0] : &ZeEventPoolCache[1]; - else - return WithProfiling ? &ZeEventPoolCache[2] : &ZeEventPoolCache[3]; + auto getZeEventPoolCache(bool HostVisible, bool WithProfiling, + ze_device_handle_t ZeDevice) { + if (HostVisible) { + if (ZeDevice) { + auto ZeEventPoolCacheMap = WithProfiling + ? &ZeEventPoolCacheDeviceMap[0] + : &ZeEventPoolCacheDeviceMap[1]; + if (ZeEventPoolCacheMap->find(ZeDevice) == ZeEventPoolCacheMap->end()) { + ZeEventPoolCache.emplace_back(); + (*ZeEventPoolCacheMap)[ZeDevice] = &ZeEventPoolCache.back(); + } + return (*ZeEventPoolCacheMap)[ZeDevice]; + } else { + return WithProfiling ? &ZeEventPoolCache[0] : &ZeEventPoolCache[1]; + } + } else { + if (ZeDevice) { + auto ZeEventPoolCacheMap = WithProfiling + ? &ZeEventPoolCacheDeviceMap[2] + : &ZeEventPoolCacheDeviceMap[3]; + if (ZeEventPoolCacheMap->find(ZeDevice) == ZeEventPoolCacheMap->end()) { + ZeEventPoolCache.emplace_back(); + (*ZeEventPoolCacheMap)[ZeDevice] = &ZeEventPoolCache.back(); + } + return (*ZeEventPoolCacheMap)[ZeDevice]; + } else { + return WithProfiling ? &ZeEventPoolCache[2] : &ZeEventPoolCache[3]; + } + } } // Decrement number of events living in the pool upon event destroy @@ -240,11 +272,33 @@ struct ur_context_handle_t_ : _ur_object { private: // Get the cache of events for a provided scope and profiling mode. - auto getEventCache(bool HostVisible, bool WithProfiling) { - if (HostVisible) - return WithProfiling ? &EventCaches[0] : &EventCaches[1]; - else - return WithProfiling ? &EventCaches[2] : &EventCaches[3]; + auto getEventCache(bool HostVisible, bool WithProfiling, + ur_device_handle_t Device) { + if (HostVisible) { + if (Device) { + auto EventCachesMap = + WithProfiling ? &EventCachesDeviceMap[0] : &EventCachesDeviceMap[1]; + if (EventCachesMap->find(Device) == EventCachesMap->end()) { + EventCaches.emplace_back(); + (*EventCachesMap)[Device] = &EventCaches.back(); + } + return (*EventCachesMap)[Device]; + } else { + return WithProfiling ? &EventCaches[0] : &EventCaches[1]; + } + } else { + if (Device) { + auto EventCachesMap = + WithProfiling ? &EventCachesDeviceMap[2] : &EventCachesDeviceMap[3]; + if (EventCachesMap->find(Device) == EventCachesMap->end()) { + EventCaches.emplace_back(); + (*EventCachesMap)[Device] = &EventCaches.back(); + } + return (*EventCachesMap)[Device]; + } else { + return WithProfiling ? &EventCaches[2] : &EventCaches[3]; + } + } } }; diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index c132e28738..7633c723f9 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "device.hpp" +#include "adapter.hpp" #include "ur_level_zero.hpp" #include "ur_util.hpp" #include @@ -1318,21 +1319,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceCreateWithNativeHandle( // Level Zero devices when we initialized the platforms/devices cache, so the // "NativeHandle" must already be in the cache. If it is not, this must not be // a valid Level Zero device. - // - // TODO: maybe we should populate cache of platforms if it wasn't already. - // For now assert that is was populated. - UR_ASSERT(URPlatformCachePopulated, UR_RESULT_ERROR_INVALID_VALUE); - const std::lock_guard Lock{*URPlatformsCacheMutex}; ur_device_handle_t Dev = nullptr; - for (ur_platform_handle_t ThePlatform : *URPlatformsCache) { - Dev = ThePlatform->getDeviceFromNativeHandle(ZeDevice); - if (Dev) { - // Check that the input Platform, if was given, matches the found one. - UR_ASSERT(!Platform || Platform == ThePlatform, - UR_RESULT_ERROR_INVALID_PLATFORM); - break; + if (const auto *platforms = Adapter.PlatformCache->get_value()) { + for (const auto &p : *platforms) { + Dev = p->getDeviceFromNativeHandle(ZeDevice); + if (Dev) { + // Check that the input Platform, if was given, matches the found one. + UR_ASSERT(!Platform || Platform == p.get(), + UR_RESULT_ERROR_INVALID_PLATFORM); + break; + } } + } else { + return Adapter.PlatformCache->get_error(); } if (Dev == nullptr) diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index 3cfac2cb45..2d969c1a02 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -75,7 +75,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( bool IsInternal = OutEvent == nullptr; ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_EVENTS_WAIT, - CommandList, IsInternal)); + CommandList, IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; @@ -102,9 +102,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( std::scoped_lock lock(Queue->Mutex); if (OutEvent) { - UR_CALL(createEventAndAssociateQueue( - Queue, OutEvent, UR_COMMAND_EVENTS_WAIT, Queue->CommandListMap.end(), - /* IsInternal */ false)); + UR_CALL(createEventAndAssociateQueue(Queue, OutEvent, + UR_COMMAND_EVENTS_WAIT, + Queue->CommandListMap.end(), false, + /* IsInternal */ false)); } UR_CALL(Queue->synchronize()); @@ -156,7 +157,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier( ur_event_handle_t &Event, bool IsInternal) { UR_CALL(createEventAndAssociateQueue( Queue, &Event, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, CmdList, - IsInternal)); + IsInternal, false)); Event->WaitList = EventWaitList; @@ -538,7 +539,8 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent( // Create a "proxy" host-visible event. UR_CALL(createEventAndAssociateQueue( UrQueue, &HostVisibleEvent, UR_EXT_COMMAND_TYPE_USER, CommandList, - /* IsInternal */ false, /* HostVisible */ true)); + /* IsInternal */ false, /* IsMultiDevice */ false, + /* HostVisible */ true)); ZE2UR_CALL(zeCommandListAppendWaitOnEvents, (CommandList->first, 1, &ZeEvent)); @@ -684,7 +686,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urExtEventCreate( ur_event_handle_t *Event ///< [out] pointer to the handle of the event object created. ) { - UR_CALL(EventCreate(Context, nullptr, true, Event)); + UR_CALL(EventCreate(Context, nullptr, false, true, Event)); (*Event)->RefCountExternal++; ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent)); @@ -702,7 +704,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventCreateWithNativeHandle( // we dont have urEventCreate, so use this check for now to know that // the call comes from urEventCreate() if (NativeEvent == nullptr) { - UR_CALL(EventCreate(Context, nullptr, true, Event)); + UR_CALL(EventCreate(Context, nullptr, false, true, Event)); (*Event)->RefCountExternal++; ZE2UR_CALL(zeEventHostSignal, ((*Event)->ZeEvent)); @@ -971,12 +973,19 @@ ur_result_t CleanupCompletedEvent(ur_event_handle_t Event, bool QueueLocked, // a host-visible pool. // ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, - bool HostVisible, ur_event_handle_t *RetEvent) { + bool IsMultiDevice, bool HostVisible, + ur_event_handle_t *RetEvent) { bool ProfilingEnabled = !Queue || Queue->isProfilingEnabled(); - if (auto CachedEvent = - Context->getEventFromContextCache(HostVisible, ProfilingEnabled)) { + ur_device_handle_t Device = nullptr; + + if (!IsMultiDevice && Queue) { + Device = Queue->Device; + } + + if (auto CachedEvent = Context->getEventFromContextCache( + HostVisible, ProfilingEnabled, Device)) { *RetEvent = CachedEvent; return UR_RESULT_SUCCESS; } @@ -987,7 +996,7 @@ ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, size_t Index = 0; if (auto Res = Context->getFreeSlotInExistingOrNewPool( - ZeEventPool, Index, HostVisible, ProfilingEnabled)) + ZeEventPool, Index, HostVisible, ProfilingEnabled, Device)) return Res; ZeStruct ZeEventDesc; @@ -1189,9 +1198,45 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList( } std::shared_lock Lock(EventList[I]->Mutex); - this->ZeEventList[TmpListLength] = EventList[I]->ZeEvent; - this->UrEventList[TmpListLength] = EventList[I]; - this->UrEventList[TmpListLength]->RefCount.increment(); + + if (Queue && Queue->Device != CurQueue->Device && + !EventList[I]->IsMultiDevice) { + ze_event_handle_t MultiDeviceZeEvent = nullptr; + ur_event_handle_t MultiDeviceEvent; + bool IsInternal = true; + bool IsMultiDevice = true; + + ur_command_list_ptr_t CommandList{}; + UR_CALL(Queue->Context->getAvailableCommandList(Queue, CommandList, + false, true)); + + UR_CALL(createEventAndAssociateQueue( + Queue, &MultiDeviceEvent, EventList[I]->CommandType, CommandList, + IsInternal, IsMultiDevice)); + MultiDeviceZeEvent = MultiDeviceEvent->ZeEvent; + const auto &ZeCommandList = CommandList->first; + EventList[I]->RefCount.increment(); + + zeCommandListAppendWaitOnEvents(ZeCommandList, 1u, + &EventList[I]->ZeEvent); + zeEventHostSignal(MultiDeviceZeEvent); + + UR_CALL(Queue->executeCommandList(CommandList, /* IsBlocking */ false, + /* OkToBatchCommand */ true)); + + // Acquire lock of newly created MultiDeviceEvent to increase it's + // RefCount + std::shared_lock Lock(MultiDeviceEvent->Mutex); + + this->ZeEventList[TmpListLength] = MultiDeviceZeEvent; + this->UrEventList[TmpListLength] = MultiDeviceEvent; + this->UrEventList[TmpListLength]->RefCount.increment(); + } else { + this->ZeEventList[TmpListLength] = EventList[I]->ZeEvent; + this->UrEventList[TmpListLength] = EventList[I]; + this->UrEventList[TmpListLength]->RefCount.increment(); + } + TmpListLength += 1; } } diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp index d4e975012c..c266de8c0d 100644 --- a/source/adapters/level_zero/event.hpp +++ b/source/adapters/level_zero/event.hpp @@ -30,7 +30,8 @@ extern "C" { ur_result_t urEventReleaseInternal(ur_event_handle_t Event); ur_result_t EventCreate(ur_context_handle_t Context, ur_queue_handle_t Queue, - bool HostVisible, ur_event_handle_t *RetEvent); + bool IsMultiDevice, bool HostVisible, + ur_event_handle_t *RetEvent); } // extern "C" // This is an experimental option that allows to disable caching of events in @@ -190,6 +191,11 @@ struct ur_event_handle_t_ : _ur_object { // plugin. bool IsDiscarded = {false}; + // Indicates that this event is needed to be visible by multiple devices. + // When possible, allocate Event from single device pool for optimal + // performance + bool IsMultiDevice = {false}; + // Besides each PI object keeping a total reference count in // _ur_object::RefCount we keep special track of the event *external* // references. This way we are able to tell when the event is not referenced diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp index 3b3fc7b154..b36c309092 100644 --- a/source/adapters/level_zero/kernel.cpp +++ b/source/adapters/level_zero/kernel.cpp @@ -206,7 +206,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_KERNEL_LAUNCH, - CommandList, IsInternal)); + CommandList, IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; diff --git a/source/adapters/level_zero/memory.cpp b/source/adapters/level_zero/memory.cpp index fa3ef18e47..f5b1bf6956 100644 --- a/source/adapters/level_zero/memory.cpp +++ b/source/adapters/level_zero/memory.cpp @@ -67,7 +67,7 @@ ur_result_t enqueueMemCopyHelper(ur_command_t CommandType, bool IsInternal = OutEvent == nullptr; ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, - IsInternal)); + IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; @@ -117,7 +117,7 @@ ur_result_t enqueueMemCopyRectHelper( bool IsInternal = OutEvent == nullptr; ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, - IsInternal)); + IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; @@ -227,7 +227,7 @@ static ur_result_t enqueueMemFillHelper(ur_command_t CommandType, bool IsInternal = OutEvent == nullptr; ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, - IsInternal)); + IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; @@ -361,7 +361,7 @@ static ur_result_t enqueueMemImageCommandHelper( bool IsInternal = OutEvent == nullptr; ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, CommandType, CommandList, - IsInternal)); + IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; @@ -911,9 +911,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap( UR_CALL(TmpWaitList.createAndRetainUrZeEventList( NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine)); - UR_CALL( - createEventAndAssociateQueue(Queue, Event, UR_COMMAND_MEM_BUFFER_MAP, - Queue->CommandListMap.end(), IsInternal)); + UR_CALL(createEventAndAssociateQueue( + Queue, Event, UR_COMMAND_MEM_BUFFER_MAP, Queue->CommandListMap.end(), + IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; @@ -1071,7 +1071,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap( UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_MEM_UNMAP, Queue->CommandListMap.end(), - IsInternal)); + IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; } @@ -1262,7 +1262,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( bool IsInternal = OutEvent == nullptr; ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_USM_PREFETCH, - CommandList, IsInternal)); + CommandList, IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; @@ -1318,7 +1318,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMAdvise( bool IsInternal = OutEvent == nullptr; ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent; UR_CALL(createEventAndAssociateQueue(Queue, Event, UR_COMMAND_USM_ADVISE, - CommandList, IsInternal)); + CommandList, IsInternal, false)); ZeEvent = (*Event)->ZeEvent; (*Event)->WaitList = TmpWaitList; diff --git a/source/adapters/level_zero/platform.cpp b/source/adapters/level_zero/platform.cpp index b7680b1638..1b70f56910 100644 --- a/source/adapters/level_zero/platform.cpp +++ b/source/adapters/level_zero/platform.cpp @@ -27,101 +27,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urPlatformGet( uint32_t *NumPlatforms ///< [out][optional] returns the total number of ///< platforms available. ) { - static std::once_flag ZeCallCountInitialized; - try { - std::call_once(ZeCallCountInitialized, []() { - if (UrL0LeaksDebug) { - ZeCallCount = new std::map; - } - }); - } catch (const std::bad_alloc &) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; - } - - // Setting these environment variables before running zeInit will enable the - // validation layer in the Level Zero loader. - if (UrL0Debug & UR_L0_DEBUG_VALIDATION) { - setEnvVar("ZE_ENABLE_VALIDATION_LAYER", "1"); - setEnvVar("ZE_ENABLE_PARAMETER_VALIDATION", "1"); - } - - if (getenv("SYCL_ENABLE_PCI") != nullptr) { - urPrint("WARNING: SYCL_ENABLE_PCI is deprecated and no longer needed.\n"); - } - - // TODO: We can still safely recover if something goes wrong during the init. - // Implement handling segfault using sigaction. - - // We must only initialize the driver once, even if urPlatformGet() is called - // multiple times. Declaring the return value as "static" ensures it's only - // called once. - static ze_result_t ZeResult = - ZE_CALL_NOCHECK(zeInit, (ZE_INIT_FLAG_GPU_ONLY)); - - // Absorb the ZE_RESULT_ERROR_UNINITIALIZED and just return 0 Platforms. - if (ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) { - UR_ASSERT(NumEntries == 0, UR_RESULT_ERROR_INVALID_VALUE); - if (NumPlatforms) - *NumPlatforms = 0; - return UR_RESULT_SUCCESS; - } - - if (ZeResult != ZE_RESULT_SUCCESS) { - urPrint("zeInit: Level Zero initialization failure\n"); - return ze2urResult(ZeResult); - } - - // Cache ur_platform_handle_t for reuse in the future - // It solves two problems; - // 1. sycl::platform equality issue; we always return the same - // ur_platform_handle_t - // 2. performance; we can save time by immediately return from cache. - // - - const std::lock_guard Lock{*URPlatformsCacheMutex}; - if (!URPlatformCachePopulated) { - try { - // Level Zero does not have concept of Platforms, but Level Zero driver is - // the closest match. - uint32_t ZeDriverCount = 0; - ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, nullptr)); - if (ZeDriverCount == 0) { - URPlatformCachePopulated = true; - } else { - std::vector ZeDrivers; - ZeDrivers.resize(ZeDriverCount); - - ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, ZeDrivers.data())); - for (uint32_t I = 0; I < ZeDriverCount; ++I) { - auto Platform = new ur_platform_handle_t_(ZeDrivers[I]); - // Save a copy in the cache for future uses. - URPlatformsCache->push_back(Platform); - - UR_CALL(Platform->initialize()); - } - URPlatformCachePopulated = true; + // Platform handles are cached for reuse. This is to ensure consistent + // handle pointers across invocations and to improve retrieval performance. + if (const auto *cached_platforms = Adapter.PlatformCache->get_value(); + cached_platforms) { + uint32_t nplatforms = (uint32_t)cached_platforms->size(); + if (NumPlatforms) { + *NumPlatforms = nplatforms; + } + if (Platforms) { + for (uint32_t i = 0; i < std::min(nplatforms, NumEntries); ++i) { + Platforms[i] = cached_platforms->at(i).get(); } - } catch (const std::bad_alloc &) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } catch (...) { - return UR_RESULT_ERROR_UNKNOWN; } - } - - // Populate returned platforms from the cache. - if (Platforms) { - UR_ASSERT(NumEntries <= URPlatformsCache->size(), - UR_RESULT_ERROR_INVALID_PLATFORM); - std::copy_n(URPlatformsCache->begin(), NumEntries, Platforms); - } - - if (NumPlatforms) { - if (*NumPlatforms == 0) - *NumPlatforms = URPlatformsCache->size(); - else - *NumPlatforms = (std::min)(URPlatformsCache->size(), (size_t)NumEntries); + } else { + return Adapter.PlatformCache->get_error(); } return UR_RESULT_SUCCESS; diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index 29f3483089..7487e4dc63 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -1126,7 +1126,8 @@ ur_queue_handle_t_::executeCommandList(ur_command_list_ptr_t CommandList, auto Res = createEventAndAssociateQueue( reinterpret_cast(this), &HostVisibleEvent, UR_EXT_COMMAND_TYPE_USER, CommandList, - /* IsInternal */ false, /* HostVisible */ true); + /* IsInternal */ false, /* IsMultiDevice */ true, + /* HostVisible */ true); if (Res) return Res; @@ -1260,8 +1261,19 @@ ur_queue_handle_t_::resetDiscardedEvent(ur_command_list_ptr_t CommandList) { } ur_result_t ur_queue_handle_t_::addEventToQueueCache(ur_event_handle_t Event) { - auto Cache = Event->isHostVisible() ? &EventCaches[0] : &EventCaches[1]; - Cache->emplace_back(Event); + if (!Event->IsMultiDevice && Event->UrQueue) { + auto Device = Event->UrQueue->Device; + auto EventCachesMap = Event->isHostVisible() ? &EventCachesDeviceMap[0] + : &EventCachesDeviceMap[1]; + if (EventCachesMap->find(Device) == EventCachesMap->end()) { + EventCaches.emplace_back(); + (*EventCachesMap)[Device] = &EventCaches.back(); + } + (*EventCachesMap)[Device]->emplace_back(Event); + } else { + auto Cache = Event->isHostVisible() ? &EventCaches[0] : &EventCaches[1]; + Cache->emplace_back(Event); + } return UR_RESULT_SUCCESS; } @@ -1438,8 +1450,20 @@ ur_result_t ur_queue_handle_t_::synchronize() { return UR_RESULT_SUCCESS; } -ur_event_handle_t ur_queue_handle_t_::getEventFromQueueCache(bool HostVisible) { - auto Cache = HostVisible ? &EventCaches[0] : &EventCaches[1]; +ur_event_handle_t ur_queue_handle_t_::getEventFromQueueCache(bool IsMultiDevice, + bool HostVisible) { + std::list *Cache; + + if (!IsMultiDevice) { + auto Device = this->Device; + Cache = HostVisible ? EventCachesDeviceMap[0][Device] + : EventCachesDeviceMap[1][Device]; + if (!Cache) { + return nullptr; + } + } else { + Cache = HostVisible ? &EventCaches[0] : &EventCaches[1]; + } // If we don't have any events, return nullptr. // If we have only a single event then it was used by the last command and we @@ -1464,13 +1488,15 @@ ur_event_handle_t ur_queue_handle_t_::getEventFromQueueCache(bool HostVisible) { // \param CommandList is the command list where the event is added // \param IsInternal tells if the event is internal, i.e. visible in the L0 // plugin only. +// \param IsMultiDevice tells if the event must be created in the multi-device +// visible pool. // \param HostVisible tells if the event must be created in the // host-visible pool. If not set then this function will decide. ur_result_t createEventAndAssociateQueue(ur_queue_handle_t Queue, ur_event_handle_t *Event, ur_command_t CommandType, ur_command_list_ptr_t CommandList, - bool IsInternal, + bool IsInternal, bool IsMultiDevice, std::optional HostVisible) { if (!HostVisible.has_value()) { @@ -1479,15 +1505,18 @@ ur_result_t createEventAndAssociateQueue(ur_queue_handle_t Queue, } // If event is discarded then try to get event from the queue cache. - *Event = - IsInternal ? Queue->getEventFromQueueCache(HostVisible.value()) : nullptr; + *Event = IsInternal ? Queue->getEventFromQueueCache(IsMultiDevice, + HostVisible.value()) + : nullptr; if (*Event == nullptr) - UR_CALL(EventCreate(Queue->Context, Queue, HostVisible.value(), Event)); + UR_CALL(EventCreate(Queue->Context, Queue, IsMultiDevice, + HostVisible.value(), Event)); (*Event)->UrQueue = Queue; (*Event)->CommandType = CommandType; (*Event)->IsDiscarded = IsInternal; + (*Event)->IsMultiDevice = IsMultiDevice; (*Event)->CommandList = CommandList; // Discarded event doesn't own ze_event, it is used by multiple // ur_event_handle_t objects. We destroy corresponding ze_event by releasing @@ -1563,7 +1592,8 @@ ur_result_t ur_queue_handle_t_::signalEventFromCmdListIfLastEventDiscarded( UR_CALL(createEventAndAssociateQueue( reinterpret_cast(this), &Event, UR_EXT_COMMAND_TYPE_USER, CommandList, - /* IsInternal */ false, /* HostVisible */ false)); + /* IsInternal */ false, /* IsMultiDevice */ true, + /* HostVisible */ false)); UR_CALL(urEventReleaseInternal(Event)); LastCommandEvent = Event; @@ -1876,7 +1906,7 @@ ur_queue_handle_t_::insertActiveBarriers(ur_command_list_ptr_t &CmdList, if (auto Res = createEventAndAssociateQueue( reinterpret_cast(this), &Event, UR_EXT_COMMAND_TYPE_USER, CmdList, - /*IsInternal*/ true)) + /* IsInternal */ true, /* IsMultiDevice */ true)) return Res; Event->WaitList = ActiveBarriersWaitList; diff --git a/source/adapters/level_zero/queue.hpp b/source/adapters/level_zero/queue.hpp index 8022c45e0e..b255e5963e 100644 --- a/source/adapters/level_zero/queue.hpp +++ b/source/adapters/level_zero/queue.hpp @@ -343,6 +343,9 @@ struct ur_queue_handle_t_ : _ur_object { // inside all command lists in the queue as described in the 2-event model. // Leftover events in the cache are relased at the queue destruction. std::vector> EventCaches{2}; + std::vector< + std::unordered_map *>> + EventCachesDeviceMap{2}; // adjust the queue's batch size, knowing that the current command list // is being closed with a full batch. @@ -417,7 +420,8 @@ struct ur_queue_handle_t_ : _ur_object { // two times in a row and have to do round-robin between two events. Otherwise // it picks an event from the beginning of the cache and returns it. Event // from the last command is always appended to the end of the list. - ur_event_handle_t getEventFromQueueCache(bool HostVisible); + ur_event_handle_t getEventFromQueueCache(bool IsMultiDevice, + bool HostVisible); // Returns true if an OpenCommandList has commands that need to be submitted. // If IsCopy is 'true', then the OpenCommandList containing copy commands is @@ -532,13 +536,14 @@ struct ur_queue_handle_t_ : _ur_object { // \param CommandList is the command list where the event is added // \param IsInternal tells if the event is internal, i.e. visible in the L0 // plugin only. +// \param IsMultiDevice Indicates that this event must be visible by +// multiple devices. // \param ForceHostVisible tells if the event must be created in // the host-visible pool -ur_result_t -createEventAndAssociateQueue(ur_queue_handle_t Queue, ur_event_handle_t *Event, - ur_command_t CommandType, - ur_command_list_ptr_t CommandList, bool IsInternal, - std::optional HostVisible = std::nullopt); +ur_result_t createEventAndAssociateQueue( + ur_queue_handle_t Queue, ur_event_handle_t *Event, ur_command_t CommandType, + ur_command_list_ptr_t CommandList, bool IsInternal, bool IsMultiDevice, + std::optional HostVisible = std::nullopt); // Helper function to perform the necessary cleanup of the events from reset cmd // list. diff --git a/source/ur/ur.cpp b/source/ur/ur.cpp index 4de87d53c2..dad6312d57 100644 --- a/source/ur/ur.cpp +++ b/source/ur/ur.cpp @@ -22,9 +22,3 @@ bool PrintTrace = [] { } return false; }(); - -// Apparatus for maintaining immutable cache of platforms. -std::vector *URPlatformsCache = - new std::vector; -SpinLock *URPlatformsCacheMutex = new SpinLock; -bool URPlatformCachePopulated = false; diff --git a/source/ur/ur.hpp b/source/ur/ur.hpp index 0437d719ba..43e35a0055 100644 --- a/source/ur/ur.hpp +++ b/source/ur/ur.hpp @@ -19,10 +19,13 @@ #include #include #include +#include #include #include +#include "ur_util.hpp" + template To ur_cast(From Value) { // TODO: see if more sanity checks are possible. assert(sizeof(From) == sizeof(To)); @@ -176,16 +179,6 @@ struct _ur_platform {}; // Controls tracing UR calls from within the UR itself. extern bool PrintTrace; -// Apparatus for maintaining immutable cache of platforms. -// -// Note we only create a simple pointer variables such that C++ RT won't -// deallocate them automatically at the end of the main program. -// The heap memory allocated for these global variables reclaimed only at -// explicit tear-down. -extern std::vector *URPlatformsCache; -extern SpinLock *URPlatformsCacheMutex; -extern bool URPlatformCachePopulated; - // The getInfo*/ReturnHelper facilities provide shortcut way of // writing return bytes for the various getInfo APIs. namespace ur { @@ -295,3 +288,23 @@ class UrReturnHelper { void *param_value; size_t *param_value_size_ret; }; + +template class Result { +public: + Result(ur_result_t err) : value_or_err(err) {} + Result(T value) : value_or_err(std::move(value)) {} + Result() : value_or_err(UR_RESULT_ERROR_UNINITIALIZED) {} + + bool is_err() { return std::holds_alternative(value_or_err); } + explicit operator bool() const { return !is_err(); } + + const T *get_value() { return std::get_if(&value_or_err); } + + ur_result_t get_error() { + auto *err = std::get_if(&value_or_err); + return err ? *err : UR_RESULT_SUCCESS; + } + +private: + std::variant value_or_err; +}; diff --git a/test/conformance/platform/platform_adapter_level_zero.match b/test/conformance/platform/platform_adapter_level_zero.match index e69de29bb2..df63fbef05 100644 --- a/test/conformance/platform/platform_adapter_level_zero.match +++ b/test/conformance/platform/platform_adapter_level_zero.match @@ -0,0 +1 @@ +urPlatformGetTest.InvalidNumEntries