Skip to content

Commit

Permalink
Queue::finish fence fix
Browse files Browse the repository at this point in the history
* event record uses regular command list enqueue. Finishing the queue in this case needs to wait not only on last even but on this fence.
  • Loading branch information
pvelesko committed Oct 13, 2024
1 parent 93a8085 commit 91f4cdc
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 1 deletion.
26 changes: 25 additions & 1 deletion src/backend/Level0/CHIPBackendLevel0.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1472,6 +1472,25 @@ void CHIPQueueLevel0::finish() {
if (LastEvent)
LastEvent->wait();

if (ZeFenceLast_) {
auto BackendLz = static_cast<CHIPBackendLevel0 *>(Backend);

Check warning on line 1476 in src/backend/Level0/CHIPBackendLevel0.cc

View workflow job for this annotation

GitHub Actions / cpp-linter

src/backend/Level0/CHIPBackendLevel0.cc:1476:5 [readability-qualified-auto]

'auto BackendLz' can be declared as 'auto *BackendLz'
LOCK(BackendLz->EventsMtx);
LOCK(BackendLz->ActiveCmdListsMtx);
auto it = std::find_if(BackendLz->ActiveCmdLists.begin(),

Check warning on line 1479 in src/backend/Level0/CHIPBackendLevel0.cc

View workflow job for this annotation

GitHub Actions / cpp-linter

src/backend/Level0/CHIPBackendLevel0.cc:1479:10 [readability-identifier-length]

variable name 'it' is too short, expected at least 3 characters

Check warning on line 1479 in src/backend/Level0/CHIPBackendLevel0.cc

View workflow job for this annotation

GitHub Actions / cpp-linter

src/backend/Level0/CHIPBackendLevel0.cc:1479:10 [readability-identifier-naming]

invalid case style for local variable 'it'
BackendLz->ActiveCmdLists.end(),
[this](const auto &CmdList) {
return CmdList->getFence() == ZeFenceLast_;
});

if (it != BackendLz->ActiveCmdLists.end()) {
(*it)->wait();
BackendLz->ActiveCmdLists.erase(it);
return;
}

ZeFenceLast_ = nullptr;
}

if (zeCmdQOwnership_) {
zeStatus = zeCommandQueueSynchronize(ZeCmdQ_,
ChipEnvVars.getL0EventTimeout() * 1e9);
Expand All @@ -1486,7 +1505,11 @@ void CHIPQueueLevel0::finish() {
void CHIPQueueLevel0::executeCommandList(
Borrowed<FencedCmdList> &CmdList, std::shared_ptr<chipstar::Event> Event) {
updateLastEvent(Event);
CmdList->execute(getCmdQueue());

CmdList->execute(getCmdQueue()); // creates fence
ZeFenceLast_ = CmdList->getFence();
assert(ZeFenceLast_ && "Fence pointer is null");

auto BackendLz = static_cast<CHIPBackendLevel0 *>(Backend);

Check warning on line 1513 in src/backend/Level0/CHIPBackendLevel0.cc

View workflow job for this annotation

GitHub Actions / cpp-linter

src/backend/Level0/CHIPBackendLevel0.cc:1513:3 [readability-qualified-auto]

'auto BackendLz' can be declared as 'auto *BackendLz'
LOCK(BackendLz->ActiveCmdListsMtx);
BackendLz->ActiveCmdLists.push_back(std::move(CmdList));
Expand Down Expand Up @@ -1624,6 +1647,7 @@ void CHIPBackendLevel0::uninitialize() {
EventMonitor_->Stop = true;
}
EventMonitor_->join();
ActiveCmdLists.clear();
return;

Check warning on line 1651 in src/backend/Level0/CHIPBackendLevel0.cc

View workflow job for this annotation

GitHub Actions / cpp-linter

src/backend/Level0/CHIPBackendLevel0.cc:1651:3 [readability-redundant-control-flow]

redundant return statement at the end of a function with a void return type
}

Expand Down
8 changes: 8 additions & 0 deletions src/backend/Level0/CHIPBackendLevel0.hh
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,14 @@ public:
CHIPERR_CHECK_LOG_AND_ABORT("Failed to create command list");
}

void wait() {

Check warning on line 216 in src/backend/Level0/CHIPBackendLevel0.hh

View workflow job for this annotation

GitHub Actions / cpp-linter

src/backend/Level0/CHIPBackendLevel0.hh:216:8 [readability-convert-member-functions-to-static]

method 'wait' can be made static
zeStatus = zeFenceHostSynchronize(ZeFence_, UINT64_MAX);
CHIPERR_CHECK_LOG_AND_THROW_TABLE(zeFenceHostSynchronize);
}

bool reset() {

Check warning on line 221 in src/backend/Level0/CHIPBackendLevel0.hh

View workflow job for this annotation

GitHub Actions / cpp-linter

src/backend/Level0/CHIPBackendLevel0.hh:221:8 [modernize-use-trailing-return-type]

use a trailing return type for this function

Check warning on line 221 in src/backend/Level0/CHIPBackendLevel0.hh

View workflow job for this annotation

GitHub Actions / cpp-linter

src/backend/Level0/CHIPBackendLevel0.hh:221:8 [readability-convert-member-functions-to-static]

method 'reset' can be made static
zeStatus = zeFenceReset(ZeFence_);
CHIPERR_CHECK_LOG_AND_ABORT("Failed to reset fence");
zeStatus = zeCommandListReset(ZeCmdList_);
CHIPERR_CHECK_LOG_AND_ABORT("Failed to reset command list");
return true;
Expand Down Expand Up @@ -253,6 +260,7 @@ protected:
ze_device_handle_t ZeDev_;
CHIPDeviceLevel0 *ChipDevLz_;
CHIPContextLevel0 *ChipCtxLz_;
ze_fence_handle_t ZeFenceLast_ = nullptr;

// The shared memory buffer
void *SharedBuf_;
Expand Down
7 changes: 7 additions & 0 deletions src/backend/Level0/zeHipErrorConversion.hh
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,13 @@ const std::unordered_map<void *, ze_hip_error_map_t> ZE_HIP_ERROR_MAPS = {
{ZE_RESULT_ERROR_DEVICE_LOST, hipErrorNotInitialized},
{ZE_RESULT_ERROR_INVALID_NULL_HANDLE, hipErrorInvalidResourceHandle},
{ZE_RESULT_NOT_READY, hipErrorNotReady}}},
{(void *)&zeFenceHostSynchronize,
{{ZE_RESULT_SUCCESS, hipSuccess},
{ZE_RESULT_ERROR_UNINITIALIZED, hipErrorNotInitialized},
{ZE_RESULT_ERROR_DEVICE_LOST, hipErrorNotInitialized},
{ZE_RESULT_ERROR_INVALID_NULL_HANDLE, hipErrorInvalidResourceHandle},
{ZE_RESULT_ERROR_INVALID_SYNCHRONIZATION_OBJECT, hipErrorInvalidValue},
{ZE_RESULT_NOT_READY, hipErrorNotReady}}},
{(void *)&zeCommandListAppendMemoryFill,
{{ZE_RESULT_SUCCESS, hipSuccess},
{ZE_RESULT_ERROR_UNINITIALIZED, hipErrorNotInitialized},
Expand Down

0 comments on commit 91f4cdc

Please sign in to comment.