From 91f4cdcfc28543a167ef98f23db47840c2342f18 Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Sat, 12 Oct 2024 16:16:14 +0300 Subject: [PATCH] Queue::finish fence fix * event record uses regular command list enqueue. Finishing the queue in this case needs to wait not only on the last event but also on this fence. --- src/backend/Level0/CHIPBackendLevel0.cc | 26 +++++++++++++++++++++- src/backend/Level0/CHIPBackendLevel0.hh | 8 +++++++ src/backend/Level0/zeHipErrorConversion.hh | 7 ++++++ 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/src/backend/Level0/CHIPBackendLevel0.cc b/src/backend/Level0/CHIPBackendLevel0.cc index 986b16cca..8e8448fe9 100644 --- a/src/backend/Level0/CHIPBackendLevel0.cc +++ b/src/backend/Level0/CHIPBackendLevel0.cc @@ -1472,6 +1472,25 @@ void CHIPQueueLevel0::finish() { if (LastEvent) LastEvent->wait(); + if (ZeFenceLast_) { + auto BackendLz = static_cast(Backend); + LOCK(BackendLz->EventsMtx); + LOCK(BackendLz->ActiveCmdListsMtx); + auto it = std::find_if(BackendLz->ActiveCmdLists.begin(), + BackendLz->ActiveCmdLists.end(), + [this](const auto &CmdList) { + return CmdList->getFence() == ZeFenceLast_; + }); + + if (it != BackendLz->ActiveCmdLists.end()) { + (*it)->wait(); + BackendLz->ActiveCmdLists.erase(it); + return; + } + + ZeFenceLast_ = nullptr; + } + if (zeCmdQOwnership_) { zeStatus = zeCommandQueueSynchronize(ZeCmdQ_, ChipEnvVars.getL0EventTimeout() * 1e9); @@ -1486,7 +1505,11 @@ void CHIPQueueLevel0::finish() { void CHIPQueueLevel0::executeCommandList( Borrowed &CmdList, std::shared_ptr Event) { updateLastEvent(Event); - CmdList->execute(getCmdQueue()); + + CmdList->execute(getCmdQueue()); // creates fence + ZeFenceLast_ = CmdList->getFence(); + assert(ZeFenceLast_ && "Fence pointer is null"); + auto BackendLz = static_cast(Backend); LOCK(BackendLz->ActiveCmdListsMtx); BackendLz->ActiveCmdLists.push_back(std::move(CmdList)); @@ -1624,6 +1647,7 @@ void CHIPBackendLevel0::uninitialize() { EventMonitor_->Stop = true; } 
EventMonitor_->join(); + ActiveCmdLists.clear(); return; } diff --git a/src/backend/Level0/CHIPBackendLevel0.hh b/src/backend/Level0/CHIPBackendLevel0.hh index 899afff37..61a5796f8 100644 --- a/src/backend/Level0/CHIPBackendLevel0.hh +++ b/src/backend/Level0/CHIPBackendLevel0.hh @@ -213,7 +213,14 @@ public: CHIPERR_CHECK_LOG_AND_ABORT("Failed to create command list"); } + void wait() { + zeStatus = zeFenceHostSynchronize(ZeFence_, UINT64_MAX); + CHIPERR_CHECK_LOG_AND_THROW_TABLE(zeFenceHostSynchronize); + } + bool reset() { + zeStatus = zeFenceReset(ZeFence_); + CHIPERR_CHECK_LOG_AND_ABORT("Failed to reset fence"); zeStatus = zeCommandListReset(ZeCmdList_); CHIPERR_CHECK_LOG_AND_ABORT("Failed to reset command list"); return true; @@ -253,6 +260,7 @@ protected: ze_device_handle_t ZeDev_; CHIPDeviceLevel0 *ChipDevLz_; CHIPContextLevel0 *ChipCtxLz_; + ze_fence_handle_t ZeFenceLast_ = nullptr; // The shared memory buffer void *SharedBuf_; diff --git a/src/backend/Level0/zeHipErrorConversion.hh b/src/backend/Level0/zeHipErrorConversion.hh index 0b0ba6fe5..0fa2ff168 100644 --- a/src/backend/Level0/zeHipErrorConversion.hh +++ b/src/backend/Level0/zeHipErrorConversion.hh @@ -318,6 +318,13 @@ const std::unordered_map ZE_HIP_ERROR_MAPS = { {ZE_RESULT_ERROR_DEVICE_LOST, hipErrorNotInitialized}, {ZE_RESULT_ERROR_INVALID_NULL_HANDLE, hipErrorInvalidResourceHandle}, {ZE_RESULT_NOT_READY, hipErrorNotReady}}}, + {(void *)&zeFenceHostSynchronize, + {{ZE_RESULT_SUCCESS, hipSuccess}, + {ZE_RESULT_ERROR_UNINITIALIZED, hipErrorNotInitialized}, + {ZE_RESULT_ERROR_DEVICE_LOST, hipErrorNotInitialized}, + {ZE_RESULT_ERROR_INVALID_NULL_HANDLE, hipErrorInvalidResourceHandle}, + {ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT, hipErrorInvalidValue}, + {ZE_RESULT_NOT_READY, hipErrorNotReady}}}, {(void *)&zeCommandListAppendMemoryFill, {{ZE_RESULT_SUCCESS, hipSuccess}, {ZE_RESULT_ERROR_UNINITIALIZED, hipErrorNotInitialized},