Skip to content

Commit

Permalink
[CUDA} Dynamically load the CUPTI library when tracing
Browse files Browse the repository at this point in the history
  • Loading branch information
pasaulais authored and kbenzie committed Jan 23, 2024
1 parent 6fb1e54 commit 311afb9
Show file tree
Hide file tree
Showing 4 changed files with 175 additions and 22 deletions.
4 changes: 4 additions & 0 deletions source/adapters/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,10 @@ else()
message(WARNING "CUDA adapter USM pools are disabled, set UMF_ENABLE_POOL_TRACKING to enable them")
endif()

if (CUDA_cupti_LIBRARY)
target_compile_definitions("ur_adapter_cuda" PRIVATE CUPTI_LIB_PATH="${CUDA_cupti_LIBRARY}")
endif()

target_link_libraries(${TARGET_NAME} PRIVATE
${PROJECT_NAME}::headers
${PROJECT_NAME}::common
Expand Down
12 changes: 7 additions & 5 deletions source/adapters/cuda/adapter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,12 @@
#include <ur_api.h>

#include "common.hpp"

void enableCUDATracing();
void disableCUDATracing();
#include "tracing.hpp"

struct ur_adapter_handle_t_ {
std::atomic<uint32_t> RefCount = 0;
std::mutex Mutex;
struct cuda_tracing_context_t_ *TracingCtx = nullptr;
};

ur_adapter_handle_t_ adapter{};
Expand All @@ -28,7 +27,8 @@ urAdapterGet(uint32_t NumEntries, ur_adapter_handle_t *phAdapters,
if (NumEntries > 0 && phAdapters) {
std::lock_guard<std::mutex> Lock{adapter.Mutex};
if (adapter.RefCount++ == 0) {
enableCUDATracing();
adapter.TracingCtx = createCUDATracingContext();
enableCUDATracing(adapter.TracingCtx);
}

*phAdapters = &adapter;
Expand All @@ -50,7 +50,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterRetain(ur_adapter_handle_t) {
UR_APIEXPORT ur_result_t UR_APICALL urAdapterRelease(ur_adapter_handle_t) {
std::lock_guard<std::mutex> Lock{adapter.Mutex};
if (--adapter.RefCount == 0) {
disableCUDATracing();
disableCUDATracing(adapter.TracingCtx);
freeCUDATracingContext(adapter.TracingCtx);
adapter.TracingCtx = nullptr;
}
return UR_RESULT_SUCCESS;
}
Expand Down
157 changes: 140 additions & 17 deletions source/adapters/cuda/tracing.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,29 +16,72 @@
#include <cuda.h>
#ifdef XPTI_ENABLE_INSTRUMENTATION
#include <cupti.h>
#include <dlfcn.h>
#endif // XPTI_ENABLE_INSTRUMENTATION

#include "tracing.hpp"
#include <exception>
#include <iostream>

#ifdef XPTI_ENABLE_INSTRUMENTATION
using tracing_event_t = xpti_td *;
using subscriber_handle_t = CUpti_SubscriberHandle;

using cuptiSubscribe_fn = CUPTIAPI
CUptiResult (*)(CUpti_SubscriberHandle *subscriber, CUpti_CallbackFunc callback,
void *userdata);

using cuptiUnsubscribe_fn = CUPTIAPI
CUptiResult (*)(CUpti_SubscriberHandle subscriber);

using cuptiEnableDomain_fn = CUPTIAPI
CUptiResult (*)(uint32_t enable, CUpti_SubscriberHandle subscriber,
CUpti_CallbackDomain domain);

using cuptiEnableCallback_fn = CUPTIAPI
CUptiResult (*)(uint32_t enable, CUpti_SubscriberHandle subscriber,
CUpti_CallbackDomain domain, CUpti_CallbackId cbid);

#define LOAD_CUPTI_SYM(p, x) \
p->x = (cupti##x##_fn)dlsym(p->Library, "cupti" #x);

#else
using tracing_event_t = void *;
using subscriber_handle_t = void *;
using cuptiSubscribe_fn = void *;
using cuptiUnsubscribe_fn = void *;
using cuptiEnableDomain_fn = void *;
using cuptiEnableCallback_fn = void *;
#endif // XPTI_ENABLE_INSTRUMENTATION

struct cuda_tracing_context_t_ {
tracing_event_t CallEvent = nullptr;
tracing_event_t DebugEvent = nullptr;
subscriber_handle_t Subscriber = nullptr;
void *Library = nullptr;
cuptiSubscribe_fn Subscribe = nullptr;
cuptiUnsubscribe_fn Unsubscribe = nullptr;
cuptiEnableDomain_fn EnableDomain = nullptr;
cuptiEnableCallback_fn EnableCallback = nullptr;
};

#ifdef XPTI_ENABLE_INSTRUMENTATION
constexpr auto CUDA_CALL_STREAM_NAME = "sycl.experimental.cuda.call";
constexpr auto CUDA_DEBUG_STREAM_NAME = "sycl.experimental.cuda.debug";

thread_local uint64_t CallCorrelationID = 0;
thread_local uint64_t DebugCorrelationID = 0;

static xpti_td *GCallEvent = nullptr;
static xpti_td *GDebugEvent = nullptr;

constexpr auto GVerStr = "0.1";
constexpr int GMajVer = 0;
constexpr int GMinVer = 1;

static void cuptiCallback(void *, CUpti_CallbackDomain, CUpti_CallbackId CBID,
const void *CBData) {
static void cuptiCallback(void *UserData, CUpti_CallbackDomain,
CUpti_CallbackId CBID, const void *CBData) {
if (xptiTraceEnabled()) {
const auto *CBInfo = static_cast<const CUpti_CallbackData *>(CBData);
cuda_tracing_context_t_ *Ctx =
static_cast<cuda_tracing_context_t_ *>(UserData);

if (CBInfo->callbackSite == CUPTI_API_ENTER) {
CallCorrelationID = xptiGetUniqueId();
Expand All @@ -57,22 +100,94 @@ static void cuptiCallback(void *, CUpti_CallbackDomain, CUpti_CallbackId CBID,
uint8_t CallStreamID = xptiRegisterStream(CUDA_CALL_STREAM_NAME);
uint8_t DebugStreamID = xptiRegisterStream(CUDA_DEBUG_STREAM_NAME);

xptiNotifySubscribers(CallStreamID, TraceType, GCallEvent, nullptr,
xptiNotifySubscribers(CallStreamID, TraceType, Ctx->CallEvent, nullptr,
CallCorrelationID, FuncName);

xpti::function_with_args_t Payload{
FuncID, FuncName, const_cast<void *>(CBInfo->functionParams),
CBInfo->functionReturnValue, CBInfo->context};
xptiNotifySubscribers(DebugStreamID, TraceTypeArgs, GDebugEvent, nullptr,
DebugCorrelationID, &Payload);
xptiNotifySubscribers(DebugStreamID, TraceTypeArgs, Ctx->DebugEvent,
nullptr, DebugCorrelationID, &Payload);
}
}
#endif

cuda_tracing_context_t_ *createCUDATracingContext() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
if (!xptiTraceEnabled())
return nullptr;
return new cuda_tracing_context_t_;
#else
return nullptr;
#endif // XPTI_ENABLE_INSTRUMENTATION
}

void freeCUDATracingContext(cuda_tracing_context_t_ *Ctx) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
unloadCUDATracingLibrary(Ctx);
delete Ctx;
#else
(void)Ctx;
#endif // XPTI_ENABLE_INSTRUMENTATION
}

bool loadCUDATracingLibrary(cuda_tracing_context_t_ *Ctx) {
#if defined(XPTI_ENABLE_INSTRUMENTATION) && defined(CUPTI_LIB_PATH)
if (!Ctx)
return false;
if (Ctx->Library)
return true;
Ctx->Library = dlopen(CUPTI_LIB_PATH, RTLD_NOW);
if (!Ctx->Library)
return false;
LOAD_CUPTI_SYM(Ctx, Subscribe)
LOAD_CUPTI_SYM(Ctx, Unsubscribe)
LOAD_CUPTI_SYM(Ctx, EnableDomain)
LOAD_CUPTI_SYM(Ctx, EnableCallback)
if (!Ctx->Subscribe || !Ctx->Unsubscribe || !Ctx->EnableDomain ||
!Ctx->EnableCallback) {
unloadCUDATracingLibrary(Ctx);
return false;
}
return true;
#else
(void)Ctx;
return false;
#endif // XPTI_ENABLE_INSTRUMENTATION && CUPTI_LIB_PATH
}

void unloadCUDATracingLibrary(cuda_tracing_context_t_ *Ctx) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
if (!Ctx || !Ctx->Library)
return;
Ctx->Subscribe = nullptr;
Ctx->Unsubscribe = nullptr;
Ctx->EnableDomain = nullptr;
Ctx->EnableCallback = nullptr;
dlclose(Ctx->Library);
Ctx->Library = nullptr;
#else
(void)Ctx;
#endif // XPTI_ENABLE_INSTRUMENTATION
}

void enableCUDATracing() {
#ifdef XPTI_ENABLE_INSTRUMENTATION
if (!xptiTraceEnabled())
return;
static cuda_tracing_context_t_ *Ctx = nullptr;
if (!Ctx)
Ctx = createCUDATracingContext();
enableCUDATracing(Ctx);
#endif
}

void enableCUDATracing(cuda_tracing_context_t_ *Ctx) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
if (!Ctx || !xptiTraceEnabled())
return;
else if (!loadCUDATracingLibrary(Ctx))
return;

xptiRegisterStream(CUDA_CALL_STREAM_NAME);
xptiInitialize(CUDA_CALL_STREAM_NAME, GMajVer, GMinVer, GVerStr);
Expand All @@ -81,31 +196,39 @@ void enableCUDATracing() {

uint64_t Dummy;
xpti::payload_t CUDAPayload("CUDA Plugin Layer");
GCallEvent =
Ctx->CallEvent =
xptiMakeEvent("CUDA Plugin Layer", &CUDAPayload,
xpti::trace_algorithm_event, xpti_at::active, &Dummy);

xpti::payload_t CUDADebugPayload("CUDA Plugin Debug Layer");
GDebugEvent =
Ctx->DebugEvent =
xptiMakeEvent("CUDA Plugin Debug Layer", &CUDADebugPayload,
xpti::trace_algorithm_event, xpti_at::active, &Dummy);

CUpti_SubscriberHandle Subscriber;
cuptiSubscribe(&Subscriber, cuptiCallback, nullptr);
cuptiEnableDomain(1, Subscriber, CUPTI_CB_DOMAIN_DRIVER_API);
cuptiEnableCallback(0, Subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
Ctx->Subscribe(&Ctx->Subscriber, cuptiCallback, Ctx);
Ctx->EnableDomain(1, Ctx->Subscriber, CUPTI_CB_DOMAIN_DRIVER_API);
Ctx->EnableCallback(0, Ctx->Subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuGetErrorString);
cuptiEnableCallback(0, Subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
Ctx->EnableCallback(0, Ctx->Subscriber, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuGetErrorName);
#else
(void)Ctx;
#endif
}

void disableCUDATracing() {
void disableCUDATracing(cuda_tracing_context_t_ *Ctx) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
if (!xptiTraceEnabled())
if (!Ctx || !xptiTraceEnabled())
return;

if (Ctx->Subscriber) {
Ctx->Unsubscribe(Ctx->Subscriber);
Ctx->Subscriber = nullptr;
}

xptiFinalize(CUDA_CALL_STREAM_NAME);
xptiFinalize(CUDA_DEBUG_STREAM_NAME);
#else
(void)Ctx;
#endif // XPTI_ENABLE_INSTRUMENTATION
}
24 changes: 24 additions & 0 deletions source/adapters/cuda/tracing.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
//===--------- tracing.hpp - CUDA Host API Tracing -------------------------==//
//
// Copyright (C) 2023 Intel Corporation
//
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
// Exceptions. See LICENSE.TXT
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

struct cuda_tracing_context_t_;

cuda_tracing_context_t_ *createCUDATracingContext();
void freeCUDATracingContext(cuda_tracing_context_t_ *Ctx);

bool loadCUDATracingLibrary(cuda_tracing_context_t_ *Ctx);
void unloadCUDATracingLibrary(cuda_tracing_context_t_ *Ctx);

void enableCUDATracing(cuda_tracing_context_t_ *Ctx);
void disableCUDATracing(cuda_tracing_context_t_ *Ctx);

// Deprecated. Will be removed once pi_cuda has been updated to use the variant
// that takes a context pointer.
void enableCUDATracing();

0 comments on commit 311afb9

Please sign in to comment.