
Merge branch 'main' into event-wait-with-good-context
hdelan committed Mar 14, 2024
2 parents 3d3407a + d99d5f7 commit 0223877
Showing 33 changed files with 449 additions and 145 deletions.
22 changes: 15 additions & 7 deletions .github/workflows/e2e_core.yml
@@ -66,10 +66,6 @@ jobs:
ls -la ./
rm -rf ./* || true
- uses: xt0rted/pull-request-comment-branch@d97294d304604fa98a2600a6e2f916a84b596dc7 # v2.0.0
id: comment-branch
if: ${{ always() && inputs.trigger != 'schedule' }}

- name: Add comment to PR
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
if: ${{ always() && inputs.trigger != 'schedule' }}
@@ -90,7 +86,18 @@ jobs:
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
with:
path: ur-repo
ref: ${{ steps.comment-branch.outputs.head_ref }}

# On the issue_comment trigger (for PRs) we need to fetch a special ref for
# the PR's merge commit. Note that this ref may be absent if the PR is already merged.
- name: Fetch PR's merge commit
if: ${{ inputs.trigger != 'schedule' }}
working-directory: ${{github.workspace}}/ur-repo
env:
PR_NO: ${{github.event.issue.number}}
run: |
git fetch -- https://github.com/${{github.repository}} +refs/pull/${PR_NO}/*:refs/remotes/origin/pr/${PR_NO}/*
git checkout origin/pr/${PR_NO}/merge
git rev-parse origin/pr/${PR_NO}/merge
- name: Checkout SYCL
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
@@ -191,8 +198,9 @@ jobs:
script: |
const adapter = '${{ matrix.adapter.name }}';
const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}';
const status = '${{ steps.tests.outcome }}';
const body = `E2E ${adapter} build: \n${url}\n Status: ${status}`;
const test_status = '${{ steps.tests.outcome }}';
const job_status = '${{ job.status }}';
const body = `E2E ${adapter} build:\n${url}\nJob status: ${job_status}. Test status: ${test_status}`;
github.rest.issues.createComment({
issue_number: context.issue.number,
3 changes: 3 additions & 0 deletions CMakeLists.txt
@@ -53,6 +53,9 @@ set(UR_CONFORMANCE_TARGET_TRIPLES "" CACHE STRING
"List of sycl targets to build CTS device binaries for")
set(UR_CONFORMANCE_AMD_ARCH "" CACHE STRING "AMD device target ID to build CTS binaries for")

# There's little reason not to generate the compile_commands.json
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

include(Assertions)

set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
4 changes: 2 additions & 2 deletions scripts/ctest_parser.py
100644 → 100755
@@ -33,8 +33,8 @@ def summarize_results(results):
total_failed = len(results['Failed'])
total_crashed = total - (total_passed + total_skipped + total_failed)

pass_rate_incl_skipped = percent(total_passed, total)
pass_rate_excl_skipped = percent(total_passed, total - total_skipped)
pass_rate_incl_skipped = percent(total_passed + total_skipped, total)
pass_rate_excl_skipped = percent(total_passed, total)

skipped_rate = percent(total_skipped, total)
failed_rate = percent(total_failed, total)
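
The corrected rate arithmetic above is easy to check with a small standalone sketch. The counts below are made up, and percent() mirrors the presumed behaviour of the Python helper; this is an illustration, not part of the repository.

#include <cstdio>

// Stand-in for the Python percent() helper used in ctest_parser.py.
static double percent(int part, int whole) {
  return whole == 0 ? 0.0 : 100.0 * part / whole;
}

int main() {
  // Assumed example counts, not taken from a real CTS run.
  const int total = 100, passed = 80, skipped = 10, failed = 5;
  // Skipped tests now count towards the inclusive pass rate.
  const double incl = percent(passed + skipped, total); // 90%
  // The rate excluding skipped passes is reported against the full total.
  const double excl = percent(passed, total); // 80%
  std::printf("incl %.1f%%, excl %.1f%%, failed %.1f%%\n", incl, excl,
              percent(failed, total));
  return 0;
}
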
18 changes: 10 additions & 8 deletions source/adapters/cuda/command_buffer.cpp
@@ -66,17 +66,20 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {
cuGraphDestroy(CudaGraph);

// Release the memory allocated to the CudaGraphExec
cuGraphExecDestroy(CudaGraphExec);
if (CudaGraphExec) {
cuGraphExecDestroy(CudaGraphExec);
}
}

ur_exp_command_buffer_command_handle_t_::
ur_exp_command_buffer_command_handle_t_(
ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel,
std::shared_ptr<CUgraphNode> Node, CUDA_KERNEL_NODE_PARAMS Params,
std::shared_ptr<CUgraphNode> &&Node, CUDA_KERNEL_NODE_PARAMS Params,
uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr,
const size_t *GlobalWorkSizePtr, const size_t *LocalWorkSizePtr)
: CommandBuffer(CommandBuffer), Kernel(Kernel), Node(Node), Params(Params),
WorkDim(WorkDim), RefCountInternal(1), RefCountExternal(1) {
: CommandBuffer(CommandBuffer), Kernel(Kernel), Node{std::move(Node)},
Params(Params), WorkDim(WorkDim), RefCountInternal(1),
RefCountExternal(1) {
CommandBuffer->incrementInternalReferenceCount();

const size_t CopySize = sizeof(size_t) * WorkDim;
@@ -365,7 +368,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
try {
// Set node param structure with the kernel related data
auto &ArgIndices = hKernel->getArgIndices();
CUDA_KERNEL_NODE_PARAMS NodeParams;
CUDA_KERNEL_NODE_PARAMS NodeParams = {};
NodeParams.func = CuFunc;
NodeParams.gridDimX = BlocksPerGrid[0];
NodeParams.gridDimY = BlocksPerGrid[1];
@@ -375,7 +378,6 @@
NodeParams.blockDimZ = ThreadsPerBlock[2];
NodeParams.sharedMemBytes = LocalSize;
NodeParams.kernelParams = const_cast<void **>(ArgIndices.data());
NodeParams.extra = nullptr;

// Create and add a new kernel node to the Cuda graph
UR_CHECK_ERROR(cuGraphAddKernelNode(&GraphNode, hCommandBuffer->CudaGraph,
@@ -392,8 +394,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
}

auto NewCommand = new ur_exp_command_buffer_command_handle_t_{
hCommandBuffer, hKernel, NodeSP, NodeParams,
workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize};
hCommandBuffer, hKernel, std::move(NodeSP), NodeParams,
workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize};

NewCommand->incrementInternalReferenceCount();
hCommandBuffer->CommandHandles.push_back(NewCommand);
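
Two of the fixes above follow a common CUDA driver-API pattern: value-initialize parameter structs so later-added fields (such as extra) cannot carry garbage, and only destroy an executable graph handle that was actually instantiated. Below is a minimal sketch of that pattern under those assumptions; the function and variable names are illustrative, not the adapter's.

#include <cuda.h>

// Sketch: build one kernel node with a zero-initialized params struct, then
// tear the graph down safely even if instantiation never happened.
static void buildAndDestroy(CUgraph Graph, CUgraphExec GraphExec,
                            CUfunction Func, void **KernelArgs) {
  CUDA_KERNEL_NODE_PARAMS NodeParams = {}; // every field, including extra, starts zeroed
  NodeParams.func = Func;
  NodeParams.gridDimX = NodeParams.gridDimY = NodeParams.gridDimZ = 1;
  NodeParams.blockDimX = NodeParams.blockDimY = NodeParams.blockDimZ = 1;
  NodeParams.kernelParams = KernelArgs;

  CUgraphNode Node;
  (void)cuGraphAddKernelNode(&Node, Graph, nullptr, 0, &NodeParams);

  // A null exec handle means the graph was never instantiated; calling
  // cuGraphExecDestroy on it would just report an error, so guard it.
  if (GraphExec) {
    (void)cuGraphExecDestroy(GraphExec);
  }
  (void)cuGraphDestroy(Graph);
}
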
2 changes: 1 addition & 1 deletion source/adapters/cuda/command_buffer.hpp
@@ -183,7 +183,7 @@ static inline const char *getUrResultString(ur_result_t Result) {
struct ur_exp_command_buffer_command_handle_t_ {
ur_exp_command_buffer_command_handle_t_(
ur_exp_command_buffer_handle_t CommandBuffer, ur_kernel_handle_t Kernel,
std::shared_ptr<CUgraphNode> Node, CUDA_KERNEL_NODE_PARAMS Params,
std::shared_ptr<CUgraphNode> &&Node, CUDA_KERNEL_NODE_PARAMS Params,
uint32_t WorkDim, const size_t *GlobalWorkOffsetPtr,
const size_t *GlobalWorkSizePtr, const size_t *LocalWorkSizePtr);

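
Switching the Node parameter to an rvalue reference lets the caller hand the shared_ptr over with std::move, so construction transfers ownership instead of paying for an extra atomic refcount increment/decrement. A self-contained sketch of the idea with placeholder types (not the adapter's real classes):

#include <memory>
#include <utility>

struct GraphNodeTag {}; // stand-in for the CUgraphNode bookkeeping object

struct CommandHandle {
  // Taking shared_ptr by rvalue reference means the caller's std::move
  // transfers ownership directly into the member; no refcount bump occurs.
  explicit CommandHandle(std::shared_ptr<GraphNodeTag> &&Node)
      : Node{std::move(Node)} {}

  std::shared_ptr<GraphNodeTag> Node;
};

int main() {
  auto NodeSP = std::make_shared<GraphNodeTag>();
  CommandHandle Cmd{std::move(NodeSP)}; // NodeSP is left empty here
  return Cmd.Node ? 0 : 1;
}
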
14 changes: 10 additions & 4 deletions source/adapters/cuda/image.cpp
@@ -1006,17 +1006,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp(
ArrayDesc.Format = format;

CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC mipmapDesc = {};
mipmapDesc.numLevels = 1;
mipmapDesc.numLevels = pImageDesc->numMipLevel;
mipmapDesc.arrayDesc = ArrayDesc;

// External memory is mapped to a CUmipmappedArray
// If desired, a CUarray is retrieved from the mipmap's 0th level
CUmipmappedArray memMipMap;
UR_CHECK_ERROR(cuExternalMemoryGetMappedMipmappedArray(
&memMipMap, (CUexternalMemory)hInteropMem, &mipmapDesc));

CUarray memArray;
UR_CHECK_ERROR(cuMipmappedArrayGetLevel(&memArray, memMipMap, 0));
if (pImageDesc->numMipLevel > 1) {
*phImageMem = (ur_exp_image_mem_handle_t)memMipMap;
} else {
CUarray memArray;
UR_CHECK_ERROR(cuMipmappedArrayGetLevel(&memArray, memMipMap, 0));

*phImageMem = (ur_exp_image_mem_handle_t)memArray;
*phImageMem = (ur_exp_image_mem_handle_t)memArray;
}

} catch (ur_result_t Err) {
return Err;
22 changes: 11 additions & 11 deletions source/adapters/cuda/tracing.cpp
@@ -27,20 +27,20 @@
using tracing_event_t = xpti_td *;
using subscriber_handle_t = CUpti_SubscriberHandle;

using cuptiSubscribe_fn = CUPTIAPI
CUptiResult (*)(CUpti_SubscriberHandle *subscriber, CUpti_CallbackFunc callback,
void *userdata);
using cuptiSubscribe_fn =
CUptiResult(CUPTIAPI *)(CUpti_SubscriberHandle *subscriber,
CUpti_CallbackFunc callback, void *userdata);

using cuptiUnsubscribe_fn = CUPTIAPI
CUptiResult (*)(CUpti_SubscriberHandle subscriber);
using cuptiUnsubscribe_fn =
CUptiResult(CUPTIAPI *)(CUpti_SubscriberHandle subscriber);

using cuptiEnableDomain_fn = CUPTIAPI
CUptiResult (*)(uint32_t enable, CUpti_SubscriberHandle subscriber,
CUpti_CallbackDomain domain);
using cuptiEnableDomain_fn =
CUptiResult(CUPTIAPI *)(uint32_t enable, CUpti_SubscriberHandle subscriber,
CUpti_CallbackDomain domain);

using cuptiEnableCallback_fn = CUPTIAPI
CUptiResult (*)(uint32_t enable, CUpti_SubscriberHandle subscriber,
CUpti_CallbackDomain domain, CUpti_CallbackId cbid);
using cuptiEnableCallback_fn =
CUptiResult(CUPTIAPI *)(uint32_t enable, CUpti_SubscriberHandle subscriber,
CUpti_CallbackDomain domain, CUpti_CallbackId cbid);

#define LOAD_CUPTI_SYM(p, lib, x) \
p.x = (cupti##x##_fn)ur_loader::LibLoader::getFunctionPtr(lib.get(), \
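
The tracing change corrects where the calling-convention macro sits in a function-pointer alias: it must qualify the pointer itself, not prefix the return type. A small sketch of the difference using a hypothetical MYAPI macro (CUPTIAPI typically expands to a convention such as __stdcall on Windows and to nothing elsewhere):

// Hypothetical stand-in for a calling-convention macro like CUPTIAPI.
#if defined(_WIN32)
#define MYAPI __stdcall
#else
#define MYAPI
#endif

// Wrong: placed before the return type, the convention does not bind to the
// pointer and is rejected or ignored by some toolchains:
//   using callback_fn = MYAPI int (*)(int);
// Right: the convention qualifies the pointer declarator.
using callback_fn = int(MYAPI *)(int);

static int MYAPI square(int x) { return x * x; }

int main() {
  callback_fn fn = &square;
  return fn(3) == 9 ? 0 : 1;
}
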
6 changes: 3 additions & 3 deletions source/adapters/hip/command_buffer.cpp
@@ -50,7 +50,7 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_(
ur_context_handle_t hContext, ur_device_handle_t hDevice, bool IsUpdatable)
: Context(hContext), Device(hDevice),
IsUpdatable(IsUpdatable), HIPGraph{nullptr}, HIPGraphExec{nullptr},
RefCountInternal{1}, RefCountExternal{1} {
RefCountInternal{1}, RefCountExternal{1}, NextSyncPoint{0} {
urContextRetain(hContext);
urDeviceRetain(hDevice);
}
@@ -65,11 +65,11 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {
UR_TRACE(urDeviceRelease(Device));

// Release the memory allocated to the HIPGraph
UR_CHECK_ERROR(hipGraphDestroy(HIPGraph));
(void)hipGraphDestroy(HIPGraph);

// Release the memory allocated to the HIPGraphExec
if (HIPGraphExec) {
UR_CHECK_ERROR(hipGraphExecDestroy(HIPGraphExec));
(void)hipGraphExecDestroy(HIPGraphExec);
}
}

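
Replacing UR_CHECK_ERROR with a cast to void in the destructor reflects a general C++ rule: destructors run during stack unwinding and must not throw or abort, so failures from cleanup calls are deliberately ignored. A minimal sketch of that pattern with a placeholder RAII holder (not the adapter's actual type):

#include <hip/hip_runtime.h>

// Sketch: a RAII holder whose destructor must not throw. Errors from the
// HIP teardown calls are intentionally discarded rather than routed through
// a macro that could throw or terminate during unwinding.
struct GraphHolder {
  hipGraph_t Graph = nullptr;
  hipGraphExec_t GraphExec = nullptr;

  ~GraphHolder() {
    if (Graph) {
      (void)hipGraphDestroy(Graph);
    }
    if (GraphExec) {
      (void)hipGraphExecDestroy(GraphExec);
    }
  }
};
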
6 changes: 3 additions & 3 deletions source/adapters/hip/command_buffer.hpp
@@ -254,8 +254,8 @@ struct ur_exp_command_buffer_handle_t_ {
~ur_exp_command_buffer_handle_t_();

void registerSyncPoint(ur_exp_command_buffer_sync_point_t SyncPoint,
std::shared_ptr<hipGraphNode_t> HIPNode) {
SyncPoints[SyncPoint] = HIPNode;
std::shared_ptr<hipGraphNode_t> &&HIPNode) {
SyncPoints[SyncPoint] = std::move(HIPNode);
NextSyncPoint++;
}

@@ -269,7 +269,7 @@ struct ur_exp_command_buffer_handle_t_ {
ur_exp_command_buffer_sync_point_t
addSyncPoint(std::shared_ptr<hipGraphNode_t> HIPNode) {
ur_exp_command_buffer_sync_point_t SyncPoint = NextSyncPoint;
registerSyncPoint(SyncPoint, HIPNode);
registerSyncPoint(SyncPoint, std::move(HIPNode));
return SyncPoint;
}
uint32_t incrementInternalReferenceCount() noexcept {
9 changes: 8 additions & 1 deletion source/adapters/level_zero/CMakeLists.txt
@@ -1,4 +1,4 @@
# Copyright (C) 2022 Intel Corporation
# Copyright (C) 2022-2024 Intel Corporation
# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
# See LICENSE.TXT
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
@@ -122,6 +122,13 @@ add_ur_adapter(${TARGET_NAME}
${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp
)

if(NOT WIN32)
target_sources(ur_adapter_level_zero
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/adapter_lib_init_linux.cpp
)
endif()

# TODO: fix level_zero adapter conversion warnings
target_compile_options(${TARGET_NAME} PRIVATE
$<$<CXX_COMPILER_ID:MSVC>:/wd4805 /wd4244>
72 changes: 53 additions & 19 deletions source/adapters/level_zero/adapter.cpp
@@ -11,6 +11,14 @@
#include "adapter.hpp"
#include "ur_level_zero.hpp"

// Due to multiple DLLMain definitions with SYCL, the Global Adapter is
// initialized at variable creation.
#if defined(_WIN32)
ur_adapter_handle_t_ *GlobalAdapter = new ur_adapter_handle_t_();
#else
ur_adapter_handle_t_ *GlobalAdapter;
#endif

ur_result_t initPlatforms(PlatformVec &platforms) noexcept try {
uint32_t ZeDriverCount = 0;
ZE2UR_CALL(zeDriverGet, (&ZeDriverCount, nullptr));
@@ -37,8 +45,7 @@ ur_result_t initPlatforms(PlatformVec &platforms) noexcept try {
ur_result_t adapterStateInit() { return UR_RESULT_SUCCESS; }

ur_adapter_handle_t_::ur_adapter_handle_t_() {

Adapter.PlatformCache.Compute = [](Result<PlatformVec> &result) {
PlatformCache.Compute = [](Result<PlatformVec> &result) {
static std::once_flag ZeCallCountInitialized;
try {
std::call_once(ZeCallCountInitialized, []() {
@@ -52,7 +59,7 @@ ur_adapter_handle_t_::ur_adapter_handle_t_() {
}

// initialize level zero only once.
if (Adapter.ZeResult == std::nullopt) {
if (GlobalAdapter->ZeResult == std::nullopt) {
// Setting these environment variables before running zeInit will enable
// the validation layer in the Level Zero loader.
if (UrL0Debug & UR_L0_DEBUG_VALIDATION) {
@@ -71,20 +78,21 @@
// We must only initialize the driver once, even if urPlatformGet() is
// called multiple times. Declaring the return value as "static" ensures
// it's only called once.
Adapter.ZeResult = ZE_CALL_NOCHECK(zeInit, (ZE_INIT_FLAG_GPU_ONLY));
GlobalAdapter->ZeResult =
ZE_CALL_NOCHECK(zeInit, (ZE_INIT_FLAG_GPU_ONLY));
}
assert(Adapter.ZeResult !=
assert(GlobalAdapter->ZeResult !=
std::nullopt); // verify that level-zero is initialized
PlatformVec platforms;

// Absorb the ZE_RESULT_ERROR_UNINITIALIZED and just return 0 Platforms.
if (*Adapter.ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) {
if (*GlobalAdapter->ZeResult == ZE_RESULT_ERROR_UNINITIALIZED) {
result = std::move(platforms);
return;
}
if (*Adapter.ZeResult != ZE_RESULT_SUCCESS) {
if (*GlobalAdapter->ZeResult != ZE_RESULT_SUCCESS) {
urPrint("zeInit: Level Zero initialization failure\n");
result = ze2urResult(*Adapter.ZeResult);
result = ze2urResult(*GlobalAdapter->ZeResult);
return;
}

@@ -97,7 +105,11 @@ ur_adapter_handle_t_::ur_adapter_handle_t_() {
};
}

ur_adapter_handle_t_ Adapter{};
void globalAdapterOnDemandCleanup() {
if (GlobalAdapter) {
delete GlobalAdapter;
}
}

ur_result_t adapterStateTeardown() {
bool LeakFound = false;
@@ -184,6 +196,11 @@ ur_result_t adapterStateTeardown() {
}
if (LeakFound)
return UR_RESULT_ERROR_INVALID_MEM_OBJECT;
// Due to multiple DLLMain definitions with SYCL, register a cleanup for the
// Global Adapter once its refcount reaches 0
#if defined(_WIN32)
std::atexit(globalAdapterOnDemandCleanup);
#endif

return UR_RESULT_SUCCESS;
}
Expand All @@ -203,11 +220,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGet(
///< adapters available.
) {
if (NumEntries > 0 && Adapters) {
std::lock_guard<std::mutex> Lock{Adapter.Mutex};
if (Adapter.RefCount++ == 0) {
adapterStateInit();
if (GlobalAdapter) {
std::lock_guard<std::mutex> Lock{GlobalAdapter->Mutex};
if (GlobalAdapter->RefCount++ == 0) {
adapterStateInit();
}
} else {
// If urAdapterGet is called after the library has started up or has been
// torn down, temporarily create a new Adapter handle and register a new
// cleanup.
GlobalAdapter = new ur_adapter_handle_t_();
std::lock_guard<std::mutex> Lock{GlobalAdapter->Mutex};
if (GlobalAdapter->RefCount++ == 0) {
adapterStateInit();
}
std::atexit(globalAdapterOnDemandCleanup);
}
*Adapters = &Adapter;
*Adapters = GlobalAdapter;
}

if (NumAdapters) {
@@ -218,17 +247,22 @@
}

UR_APIEXPORT ur_result_t UR_APICALL urAdapterRelease(ur_adapter_handle_t) {
std::lock_guard<std::mutex> Lock{Adapter.Mutex};
if (--Adapter.RefCount == 0) {
return adapterStateTeardown();
// Check first if the Adapter pointer is valid
if (GlobalAdapter) {
std::lock_guard<std::mutex> Lock{GlobalAdapter->Mutex};
if (--GlobalAdapter->RefCount == 0) {
return adapterStateTeardown();
}
}

return UR_RESULT_SUCCESS;
}

UR_APIEXPORT ur_result_t UR_APICALL urAdapterRetain(ur_adapter_handle_t) {
std::lock_guard<std::mutex> Lock{Adapter.Mutex};
Adapter.RefCount++;
if (GlobalAdapter) {
std::lock_guard<std::mutex> Lock{GlobalAdapter->Mutex};
GlobalAdapter->RefCount++;
}

return UR_RESULT_SUCCESS;
}
@@ -257,7 +291,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t,
case UR_ADAPTER_INFO_BACKEND:
return ReturnValue(UR_ADAPTER_BACKEND_LEVEL_ZERO);
case UR_ADAPTER_INFO_REFERENCE_COUNT:
return ReturnValue(Adapter.RefCount.load());
return ReturnValue(GlobalAdapter->RefCount.load());
default:
return UR_RESULT_ERROR_INVALID_ENUMERATION;
}
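
The Level Zero rework swaps a global adapter object for a heap-allocated pointer: on Windows it is created at load time to avoid conflicting DLLMain definitions, recreated on demand if urAdapterGet arrives after teardown, and freed through std::atexit. A stripped-down sketch of that lifetime pattern with placeholder types (the real adapter carries much more state):

#include <atomic>
#include <cstdlib>
#include <mutex>

// Placeholder for the real adapter state.
struct Adapter {
  std::mutex Mutex;
  std::atomic<unsigned> RefCount{0};
};

// On Windows the global would be constructed at library load; elsewhere it
// starts out null and is created lazily on first use.
static Adapter *GlobalAdapter = nullptr;

static void cleanupGlobalAdapter() {
  delete GlobalAdapter;
  GlobalAdapter = nullptr;
}

Adapter *acquireAdapter() {
  if (!GlobalAdapter) {
    // Late (or post-teardown) request: recreate the handle and make sure it
    // is eventually freed when the process exits.
    GlobalAdapter = new Adapter();
    std::atexit(cleanupGlobalAdapter);
  }
  std::lock_guard<std::mutex> Lock{GlobalAdapter->Mutex};
  ++GlobalAdapter->RefCount;
  return GlobalAdapter;
}
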
(Diffs for the remaining changed files did not load and are not shown here.)
