Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Candidate for the v0.8.7 release tag #1329

Merged
merged 5 commits into from
Feb 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 0 additions & 9 deletions .github/workflows/cmake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,6 @@ jobs:
matrix:
adapter: [
{name: CUDA, triplet: nvptx64-nvidia-cuda},
{name: HIP, triplet: amdgcn-amd-amdhsa},
{name: L0, triplet: spir64}
]
build_type: [Debug, Release]
Expand Down Expand Up @@ -209,15 +208,7 @@ jobs:
working-directory: ${{github.workspace}}/build
run: ctest -C ${{matrix.build_type}} --output-on-failure -L "adapter-specific" --timeout 180

# Temporarily disabling platform test for L0, because of hang
# See issue: #824
- name: Test L0 adapter
if: matrix.adapter.name == 'L0'
working-directory: ${{github.workspace}}/build
run: ctest -C ${{matrix.build_type}} --output-on-failure -L "conformance" -E "platform-adapter_level_zero" --timeout 180

- name: Test adapters
if: matrix.adapter.name != 'L0'
working-directory: ${{github.workspace}}/build
run: ctest -C ${{matrix.build_type}} --output-on-failure -L "conformance" --timeout 180

Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

cmake_minimum_required(VERSION 3.14.0 FATAL_ERROR)
project(unified-runtime VERSION 0.8.6)
project(unified-runtime VERSION 0.8.7)

include(GNUInstallDirs)
include(CheckCXXSourceCompiles)
Expand Down
102 changes: 90 additions & 12 deletions source/adapters/level_zero/adapter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,6 @@
#include "adapter.hpp"
#include "ur_level_zero.hpp"

ur_adapter_handle_t_ Adapter{};

UR_APIEXPORT ur_result_t UR_APICALL
urInit(ur_device_init_flags_t
DeviceFlags, ///< [in] device initialization flags.
Expand All @@ -24,15 +22,95 @@ urInit(ur_device_init_flags_t
return UR_RESULT_SUCCESS;
}

ur_result_t adapterStateTeardown() {
// reclaim ur_platform_handle_t objects here since we don't have
// urPlatformRelease.
for (ur_platform_handle_t Platform : *URPlatformsCache) {
delete Platform;
ur_result_t initPlatforms(PlatformVec &platforms) noexcept try {
uint32_t ZeDriverCount = 0;
ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, nullptr));
if (ZeDriverCount == 0) {
return UR_RESULT_SUCCESS;
}

std::vector<ze_driver_handle_t> ZeDrivers;
ZeDrivers.resize(ZeDriverCount);

ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, ZeDrivers.data()));
for (uint32_t I = 0; I < ZeDriverCount; ++I) {
auto platform = std::make_unique<ur_platform_handle_t_>(ZeDrivers[I]);
UR_CALL(platform->initialize());

// Save a copy in the cache for future uses.
platforms.push_back(std::move(platform));
}
delete URPlatformsCache;
delete URPlatformsCacheMutex;
return UR_RESULT_SUCCESS;
} catch (...) {
return exceptionToResult(std::current_exception());
}

// Adapter-state initialization hook. It currently performs no work and always
// returns UR_RESULT_SUCCESS.
ur_result_t adapterStateInit() { return UR_RESULT_SUCCESS; }

// Constructor: installs the callback that PlatformCache uses to compute the
// list of platforms. The callback initializes Level Zero (at most once, guarded
// by Adapter.ZeResult), then either stores the discovered platforms or an error
// code into `result`.
//
// NOTE(review): the body goes through the global `Adapter` object rather than
// `this`; presumably the global is the only instance of this type — confirm.
ur_adapter_handle_t_::ur_adapter_handle_t_() {

Adapter.PlatformCache.Compute = [](Result<PlatformVec> &result) {
// Allocate the ZeCallCount map once, and only when leak debugging
// (UrL0LeaksDebug) is enabled.
static std::once_flag ZeCallCountInitialized;
try {
std::call_once(ZeCallCountInitialized, []() {
if (UrL0LeaksDebug) {
ZeCallCount = new std::map<std::string, int>;
}
});
} catch (...) {
// Convert any exception raised above into a result code and stop.
result = exceptionToResult(std::current_exception());
return;
}

// Initialize Level Zero only once: zeInit is skipped when a previous result
// is already stored in Adapter.ZeResult.
if (Adapter.ZeResult == std::nullopt) {
// Setting these environment variables before running zeInit will enable
// the validation layer in the Level Zero loader.
if (UrL0Debug & UR_L0_DEBUG_VALIDATION) {
setEnvVar("ZE_ENABLE_VALIDATION_LAYER", "1");
setEnvVar("ZE_ENABLE_PARAMETER_VALIDATION", "1");
}

if (getenv("SYCL_ENABLE_PCI") != nullptr) {
urPrint(
"WARNING: SYCL_ENABLE_PCI is deprecated and no longer needed.\n");
}

// TODO: We can still safely recover if something goes wrong during the
// init. Implement handling segfault using sigaction.

// We must only initialize the driver once, even if urPlatformGet() is
// called multiple times. The zeInit result is stored in Adapter.ZeResult,
// and the std::nullopt check above prevents zeInit from being called again.
Adapter.ZeResult = ZE_CALL_NOCHECK(zeInit, (ZE_INIT_FLAG_GPU_ONLY));
}
assert(Adapter.ZeResult !=
std::nullopt); // verify that level-zero is initialized
PlatformVec platforms;

// Absorb the ZE_RESULT_ERROR_UNINITIALIZED and just return 0 Platforms.
if (*Adapter.ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) {
result = std::move(platforms);
return;
}
// Any other zeInit failure is reported and translated to a UR error code.
if (*Adapter.ZeResult != ZE_RESULT_SUCCESS) {
urPrint("zeInit: Level Zero initialization failure\n");
result = ze2urResult(*Adapter.ZeResult);
return;
}

// Level Zero is initialized: populate the platform list, storing either the
// platforms or the error returned by initPlatforms.
ur_result_t err = initPlatforms(platforms);
if (err == UR_RESULT_SUCCESS) {
result = std::move(platforms);
} else {
result = err;
}
};
}

ur_adapter_handle_t_ Adapter{};

ur_result_t adapterStateTeardown() {
bool LeakFound = false;

// Print the balance of various create/destroy native calls.
Expand Down Expand Up @@ -144,9 +222,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGet(
) {
if (NumEntries > 0 && Adapters) {
std::lock_guard<std::mutex> Lock{Adapter.Mutex};
// TODO: Some initialization that happens in urPlatformsGet could be moved
// here for when RefCount reaches 1
Adapter.RefCount++;
if (Adapter.RefCount++ == 0) {
adapterStateInit();
}
*Adapters = &Adapter;
}

Expand Down
9 changes: 9 additions & 0 deletions source/adapters/level_zero/adapter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,19 @@

#include <atomic>
#include <mutex>
#include <optional>
#include <ur/ur.hpp>
#include <ze_api.h>

using PlatformVec = std::vector<std::unique_ptr<ur_platform_handle_t_>>;

// State of the Level Zero adapter. A single global instance, `Adapter`, is
// declared below.
struct ur_adapter_handle_t_ {
// Installs the PlatformCache compute callback.
ur_adapter_handle_t_();
// Reference count; starts at 0.
std::atomic<uint32_t> RefCount = 0;
std::mutex Mutex;

// Result of zeInit; std::nullopt until zeInit has been called.
std::optional<ze_result_t> ZeResult;
// Cache holding either the platform list or an error result.
ZeCache<Result<PlatformVec>> PlatformCache;
};

extern ur_adapter_handle_t_ Adapter;
24 changes: 13 additions & 11 deletions source/adapters/level_zero/command_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,8 @@ static ur_result_t enqueueCommandBufferMemCopyHelper(
SyncPointWaitList, ZeEventList));

ur_event_handle_t LaunchEvent;
UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, &LaunchEvent));
UR_CALL(
EventCreate(CommandBuffer->Context, nullptr, false, false, &LaunchEvent));
LaunchEvent->CommandType = CommandType;

// Get sync point and register the event with it.
Expand Down Expand Up @@ -360,7 +361,8 @@ static ur_result_t enqueueCommandBufferMemCopyRectHelper(
SyncPointWaitList, ZeEventList));

ur_event_handle_t LaunchEvent;
UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, &LaunchEvent));
UR_CALL(
EventCreate(CommandBuffer->Context, nullptr, false, false, &LaunchEvent));
LaunchEvent->CommandType = CommandType;

// Get sync point and register the event with it.
Expand Down Expand Up @@ -409,8 +411,10 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
// Create signal & wait events to be used in the command-list for sync
// on command-buffer enqueue.
auto RetCommandBuffer = *CommandBuffer;
UR_CALL(EventCreate(Context, nullptr, false, &RetCommandBuffer->SignalEvent));
UR_CALL(EventCreate(Context, nullptr, false, &RetCommandBuffer->WaitEvent));
UR_CALL(EventCreate(Context, nullptr, false, false,
&RetCommandBuffer->SignalEvent));
UR_CALL(EventCreate(Context, nullptr, false, false,
&RetCommandBuffer->WaitEvent));

// Add prefix commands
ZE2UR_CALL(zeCommandListAppendEventReset,
Expand Down Expand Up @@ -519,7 +523,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
UR_CALL(getEventsFromSyncPoints(CommandBuffer, NumSyncPointsInWaitList,
SyncPointWaitList, ZeEventList));
ur_event_handle_t LaunchEvent;
UR_CALL(EventCreate(CommandBuffer->Context, nullptr, false, &LaunchEvent));
UR_CALL(
EventCreate(CommandBuffer->Context, nullptr, false, false, &LaunchEvent));
LaunchEvent->CommandType = UR_COMMAND_KERNEL_LAUNCH;

// Get sync point and register the event with it.
Expand Down Expand Up @@ -754,12 +759,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
// Create a command-list to signal RetEvent on completion
ur_command_list_ptr_t SignalCommandList{};
if (Event) {
UR_CALL(Queue->Context->getAvailableCommandList(Queue, SignalCommandList,
false, false));

UR_CALL(createEventAndAssociateQueue(Queue, &RetEvent,
UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP,
SignalCommandList, false));
UR_CALL(createEventAndAssociateQueue(
Queue, &RetEvent, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP,
SignalCommandList, false, false, true));
Comment on lines -757 to +764
Copy link
Contributor

@againull againull Feb 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Merge conflict was resolved incorrectly here. getAvailableCommandList should have been left as is. Only createEventAndAssociateQueue should have been changed accordingly.

Please see againull@f9ac087#diff-ab021f9451ccb6dd66e3df853d0e123b2f898233a1f609b1e698bef4f6011c71R762-R767

for reference.

Graph tests are failing because of this.


ZE2UR_CALL(zeCommandListAppendBarrier,
(SignalCommandList->first, RetEvent->ZeEvent, 1,
Expand Down
47 changes: 34 additions & 13 deletions source/adapters/level_zero/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -471,12 +471,17 @@ static const uint32_t MaxNumEventsPerPool = [] {

ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool(
ze_event_pool_handle_t &Pool, size_t &Index, bool HostVisible,
bool ProfilingEnabled) {
bool ProfilingEnabled, ur_device_handle_t Device) {
// Lock while updating event pool machinery.
std::scoped_lock<ur_mutex> Lock(ZeEventPoolCacheMutex);

ze_device_handle_t ZeDevice = nullptr;

if (Device) {
ZeDevice = Device->ZeDevice;
}
std::list<ze_event_pool_handle_t> *ZePoolCache =
getZeEventPoolCache(HostVisible, ProfilingEnabled);
getZeEventPoolCache(HostVisible, ProfilingEnabled, ZeDevice);

if (!ZePoolCache->empty()) {
if (NumEventsAvailableInEventPool[ZePoolCache->front()] == 0) {
Expand Down Expand Up @@ -511,9 +516,14 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool(
urPrint("ze_event_pool_desc_t flags set to: %d\n", ZeEventPoolDesc.flags);

std::vector<ze_device_handle_t> ZeDevices;
std::for_each(
Devices.begin(), Devices.end(),
[&](const ur_device_handle_t &D) { ZeDevices.push_back(D->ZeDevice); });
if (ZeDevice) {
ZeDevices.push_back(ZeDevice);
} else {
std::for_each(Devices.begin(), Devices.end(),
[&](const ur_device_handle_t &D) {
ZeDevices.push_back(D->ZeDevice);
});
}

ZE2UR_CALL(zeEventPoolCreate, (ZeContext, &ZeEventPoolDesc,
ZeDevices.size(), &ZeDevices[0], ZePool));
Expand All @@ -528,11 +538,10 @@ ur_result_t ur_context_handle_t_::getFreeSlotInExistingOrNewPool(
return UR_RESULT_SUCCESS;
}

ur_event_handle_t
ur_context_handle_t_::getEventFromContextCache(bool HostVisible,
bool WithProfiling) {
ur_event_handle_t ur_context_handle_t_::getEventFromContextCache(
bool HostVisible, bool WithProfiling, ur_device_handle_t Device) {
std::scoped_lock<ur_mutex> Lock(EventCacheMutex);
auto Cache = getEventCache(HostVisible, WithProfiling);
auto Cache = getEventCache(HostVisible, WithProfiling, Device);
if (Cache->empty())
return nullptr;

Expand All @@ -546,8 +555,14 @@ ur_context_handle_t_::getEventFromContextCache(bool HostVisible,

void ur_context_handle_t_::addEventToContextCache(ur_event_handle_t Event) {
std::scoped_lock<ur_mutex> Lock(EventCacheMutex);
auto Cache =
getEventCache(Event->isHostVisible(), Event->isProfilingEnabled());
ur_device_handle_t Device = nullptr;

if (!Event->IsMultiDevice && Event->UrQueue) {
Device = Event->UrQueue->Device;
}

auto Cache = getEventCache(Event->isHostVisible(),
Event->isProfilingEnabled(), Device);
Cache->emplace_back(Event);
}

Expand All @@ -562,8 +577,14 @@ ur_context_handle_t_::decrementUnreleasedEventsInPool(ur_event_handle_t Event) {
return UR_RESULT_SUCCESS;
}

std::list<ze_event_pool_handle_t> *ZePoolCache =
getZeEventPoolCache(Event->isHostVisible(), Event->isProfilingEnabled());
ze_device_handle_t ZeDevice = nullptr;

if (!Event->IsMultiDevice && Event->UrQueue) {
ZeDevice = Event->UrQueue->Device->ZeDevice;
}

std::list<ze_event_pool_handle_t> *ZePoolCache = getZeEventPoolCache(
Event->isHostVisible(), Event->isProfilingEnabled(), ZeDevice);

// Put the empty pool to the cache of the pools.
if (NumEventsUnreleasedInEventPool[Event->ZeEventPool] == 0)
Expand Down
Loading
Loading