Skip to content

Commit

Permalink
[SYCL] Implement sycl_ext_oneapi_profiling_tag extension (#12838)
Browse files Browse the repository at this point in the history
This commit adds the implementation of the sycl_ext_oneapi_profiling_tag
extension. Moving the extension to experimental will happen in a
follow-up patch.

---------

Signed-off-by: Larsen, Steffen <steffen.larsen@intel.com>
  • Loading branch information
steffenlarsen authored May 30, 2024
1 parent da379ec commit 42fc5e9
Show file tree
Hide file tree
Showing 34 changed files with 407 additions and 5 deletions.
5 changes: 3 additions & 2 deletions llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def AspectExt_oneapi_graph : Aspect<"ext_oneapi_graph">;
def AspectExt_intel_fpga_task_sequence : Aspect<"ext_intel_fpga_task_sequence">;
def AspectExt_oneapi_limited_graph : Aspect<"ext_oneapi_limited_graph">;
def AspectExt_oneapi_private_alloca : Aspect<"ext_oneapi_private_alloca">;
def AspectExt_oneapi_queue_profiling_tag : Aspect<"ext_oneapi_queue_profiling_tag">;
// Deprecated aspects
def AspectInt64_base_atomics : Aspect<"int64_base_atomics">;
def AspectInt64_extended_atomics : Aspect<"int64_extended_atomics">;
Expand Down Expand Up @@ -138,8 +139,8 @@ def : TargetInfo<"__TestAspectList",
AspectExt_oneapi_ballot_group, AspectExt_oneapi_fixed_size_group, AspectExt_oneapi_opportunistic_group,
AspectExt_oneapi_tangle_group, AspectExt_intel_matrix, AspectExt_oneapi_is_composite, AspectExt_oneapi_is_component,
AspectExt_oneapi_graph, AspectExt_intel_fpga_task_sequence, AspectExt_oneapi_limited_graph,
AspectExt_oneapi_private_alloca],
[]>;
AspectExt_oneapi_private_alloca, AspectExt_oneapi_queue_profiling_tag],
[]>;
// This definition serves the only purpose of testing whether the deprecated aspect list defined in here and in SYCL RT
// match.
def : TargetInfo<"__TestDeprecatedAspectList",
Expand Down
7 changes: 7 additions & 0 deletions sycl/include/sycl/detail/cg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ class CG {
CopyImage = 23,
SemaphoreWait = 24,
SemaphoreSignal = 25,
ProfilingTag = 26,
};

struct StorageInitHelper {
Expand Down Expand Up @@ -344,6 +345,12 @@ class CGBarrier : public CG {
MEventsWaitWithBarrier(std::move(EventsWaitWithBarrier)) {}
};

class CGProfilingTag : public CG {
public:
CGProfilingTag(CG::StorageInitHelper CGData, detail::code_location loc = {})
: CG(CG::ProfilingTag, std::move(CGData), std::move(loc)) {}
};

/// "Copy 2D USM" command group class.
class CGCopy2DUSM : public CG {
void *MSrc;
Expand Down
1 change: 1 addition & 0 deletions sycl/include/sycl/detail/pi.def
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ _PI_API(piEventRetain)
_PI_API(piEventRelease)
_PI_API(piextEventGetNativeHandle)
_PI_API(piextEventCreateWithNativeHandle)
_PI_API(piEnqueueTimestampRecordingExp)
// Sampler
_PI_API(piSamplerCreate)
_PI_API(piSamplerGetInfo)
Expand Down
11 changes: 10 additions & 1 deletion sycl/include/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,9 +174,11 @@
// - PI_EXT_ONEAPI_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D
// 15.51 Removed ret_mem argument from piextMemUnsampledImageCreate and
// piextMemSampledImageCreate
// 15.52 Added piEnqueueTimestampRecordingExp and
// PI_EXT_ONEAPI_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT.

#define _PI_H_VERSION_MAJOR 15
#define _PI_H_VERSION_MINOR 51
#define _PI_H_VERSION_MINOR 52

#define _PI_STRING_HELPER(a) #a
#define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
Expand Down Expand Up @@ -484,6 +486,9 @@ typedef enum {
PI_EXT_ONEAPI_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_2D = 0x2011A,
PI_EXT_ONEAPI_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_USM = 0x2011B,
PI_EXT_ONEAPI_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D = 0x2011C,

// Timestamp enqueue
PI_EXT_ONEAPI_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT = 0x2011D,
} _pi_device_info;

typedef enum {
Expand Down Expand Up @@ -1774,6 +1779,10 @@ __SYCL_EXPORT pi_result piEventRetain(pi_event event);

__SYCL_EXPORT pi_result piEventRelease(pi_event event);

__SYCL_EXPORT pi_result piEnqueueTimestampRecordingExp(
pi_queue queue, pi_bool blocking, pi_uint32 num_events_in_wait_list,
const pi_event *event_wait_list, pi_event *event);

/// Gets the native handle of a PI event object.
///
/// \param event is the PI event to get the native handle of.
Expand Down
10 changes: 10 additions & 0 deletions sycl/include/sycl/device_aspect_macros.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,11 @@
#define __SYCL_ALL_DEVICES_HAVE_ext_oneapi_bindless_sampled_image_fetch_3d__ 0
#endif

#ifndef __SYCL_ALL_DEVICES_HAVE_ext_oneapi_queue_profiling_tag__
// __SYCL_ASPECT(ext_oneapi_queue_profiling_tag, 73)
#define __SYCL_ALL_DEVICES_HAVE_ext_oneapi_queue_profiling_tag__ 0
#endif

#ifndef __SYCL_ANY_DEVICE_HAS_host__
// __SYCL_ASPECT(host, 0)
#define __SYCL_ANY_DEVICE_HAS_host__ 0
Expand Down Expand Up @@ -740,3 +745,8 @@
//__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_3d, 72)
#define __SYCL_ANY_DEVICE_HAS_ext_oneapi_bindless_sampled_image_fetch_3d__ 0
#endif

#ifndef __SYCL_ANY_DEVICE_HAS_ext_oneapi_queue_profiling_tag__
// __SYCL_ASPECT(ext_oneapi_queue_profiling_tag, 73)
#define __SYCL_ANY_DEVICE_HAS_ext_oneapi_queue_profiling_tag__ 0
#endif
44 changes: 44 additions & 0 deletions sycl/include/sycl/ext/oneapi/experimental/profiling_tag.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
//==--------- profiling_tag.hpp --- SYCL profiling tag extension -----------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

#include <sycl/aspects.hpp>
#include <sycl/event.hpp>
#include <sycl/handler.hpp>
#include <sycl/properties/queue_properties.hpp>
#include <sycl/queue.hpp>

namespace sycl {
inline namespace _V1 {
namespace ext::oneapi::experimental {

inline event submit_profiling_tag(queue &Queue,
const sycl::detail::code_location &CodeLoc =
sycl::detail::code_location::current()) {
if (Queue.get_device().has(aspect::ext_oneapi_queue_profiling_tag)) {
return Queue.submit(
[=](handler &CGH) {
sycl::detail::HandlerAccess::internalProfilingTagImpl(CGH);
},
CodeLoc);
}

// If it is not supported natively on the device, we use another path if
// profiling is enabled.
if (!Queue.has_property<sycl::property::queue::enable_profiling>())
throw sycl::exception(
make_error_code(errc::invalid),
"Device must either have aspect::ext_oneapi_queue_profiling_tag or the "
"queue must have profiling enabled.");
return Queue.ext_oneapi_submit_barrier();
}

} // namespace ext::oneapi::experimental
} // namespace _V1
} // namespace sycl
18 changes: 18 additions & 0 deletions sycl/include/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ class stream_impl;
template <typename DataT, int Dimensions, access::mode AccessMode,
access::target AccessTarget, access::placeholder IsPlaceholder>
class image_accessor;
class HandlerAccess;
template <typename RetType, typename Func, typename Arg>
static Arg member_ptr_helper(RetType (Func::*)(Arg) const);

Expand Down Expand Up @@ -3676,12 +3677,29 @@ class __SYCL_EXPORT handler {
// Set that an ND Range was used during a call to parallel_for
void setNDRangeUsed(bool Value);

inline void internalProfilingTagImpl() {
throwIfActionIsCreated();
setType(detail::CG::ProfilingTag);
}

friend class detail::HandlerAccess;

protected:
/// Registers event dependencies in this command group.
void depends_on(const detail::EventImplPtr &Event);
/// Registers event dependencies in this command group.
void depends_on(const std::vector<detail::EventImplPtr> &Events);
};

namespace detail {
class HandlerAccess {
public:
static void internalProfilingTagImpl(handler &Handler) {
Handler.internalProfilingTagImpl();
}
};
} // namespace detail

} // namespace _V1
} // namespace sycl

Expand Down
1 change: 1 addition & 0 deletions sycl/include/sycl/info/aspects.def
Original file line number Diff line number Diff line change
Expand Up @@ -67,3 +67,4 @@ __SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_2d_usm, 69)
__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_2d, 70)
__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_3d_usm, 71)
__SYCL_ASPECT(ext_oneapi_bindless_sampled_image_fetch_3d, 72)
__SYCL_ASPECT(ext_oneapi_queue_profiling_tag, 73)
1 change: 1 addition & 0 deletions sycl/include/sycl/sycl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@
#include <sycl/ext/oneapi/experimental/group_load_store.hpp>
#include <sycl/ext/oneapi/experimental/opportunistic_group.hpp>
#include <sycl/ext/oneapi/experimental/prefetch.hpp>
#include <sycl/ext/oneapi/experimental/profiling_tag.hpp>
#include <sycl/ext/oneapi/experimental/root_group.hpp>
#include <sycl/ext/oneapi/experimental/tangle_group.hpp>
#include <sycl/ext/oneapi/filter_selector.hpp>
Expand Down
8 changes: 8 additions & 0 deletions sycl/plugins/cuda/pi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -631,6 +631,14 @@ pi_result piextEventCreateWithNativeHandle(pi_native_handle NativeHandle,
OwnNativeHandle, Event);
}

pi_result piEnqueueTimestampRecordingExp(pi_queue Queue, pi_bool Blocking,
pi_uint32 NumEventsInWaitList,
const pi_event *EventWaitList,
pi_event *Event) {
return pi2ur::piEnqueueTimestampRecordingExp(
Queue, Blocking, NumEventsInWaitList, EventWaitList, Event);
}

pi_result piSamplerCreate(pi_context Context,
const pi_sampler_properties *SamplerProperties,
pi_sampler *RetSampler) {
Expand Down
8 changes: 8 additions & 0 deletions sycl/plugins/hip/pi_hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -634,6 +634,14 @@ pi_result piextEventCreateWithNativeHandle(pi_native_handle NativeHandle,
OwnNativeHandle, Event);
}

pi_result piEnqueueTimestampRecordingExp(pi_queue Queue, pi_bool Blocking,
pi_uint32 NumEventsInWaitList,
const pi_event *EventWaitList,
pi_event *Event) {
return pi2ur::piEnqueueTimestampRecordingExp(
Queue, Blocking, NumEventsInWaitList, EventWaitList, Event);
}

pi_result piSamplerCreate(pi_context Context,
const pi_sampler_properties *SamplerProperties,
pi_sampler *RetSampler) {
Expand Down
8 changes: 8 additions & 0 deletions sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -650,6 +650,14 @@ pi_result piextEventCreateWithNativeHandle(pi_native_handle NativeHandle,
OwnNativeHandle, Event);
}

pi_result piEnqueueTimestampRecordingExp(pi_queue Queue, pi_bool Blocking,
pi_uint32 NumEventsInWaitList,
const pi_event *EventWaitList,
pi_event *Event) {
return pi2ur::piEnqueueTimestampRecordingExp(
Queue, Blocking, NumEventsInWaitList, EventWaitList, Event);
}

//
// Sampler
//
Expand Down
8 changes: 8 additions & 0 deletions sycl/plugins/native_cpu/pi_native_cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -628,6 +628,14 @@ pi_result piextEventCreateWithNativeHandle(pi_native_handle NativeHandle,
OwnNativeHandle, Event);
}

pi_result piEnqueueTimestampRecordingExp(pi_queue Queue, pi_bool Blocking,
pi_uint32 NumEventsInWaitList,
const pi_event *EventWaitList,
pi_event *Event) {
return pi2ur::piEnqueueTimestampRecordingExp(
Queue, Blocking, NumEventsInWaitList, EventWaitList, Event);
}

//
// Sampler
//
Expand Down
8 changes: 8 additions & 0 deletions sycl/plugins/opencl/pi_opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,14 @@ pi_result piextEventCreateWithNativeHandle(pi_native_handle NativeHandle,
OwnNativeHandle, Event);
}

pi_result piEnqueueTimestampRecordingExp(pi_queue Queue, pi_bool Blocking,
pi_uint32 NumEventsInWaitList,
const pi_event *EventWaitList,
pi_event *Event) {
return pi2ur::piEnqueueTimestampRecordingExp(
Queue, Blocking, NumEventsInWaitList, EventWaitList, Event);
}

pi_result piSamplerCreate(pi_context Context,
const pi_sampler_properties *SamplerProperties,
pi_sampler *RetSampler) {
Expand Down
20 changes: 20 additions & 0 deletions sycl/plugins/unified_runtime/pi2ur.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1299,6 +1299,9 @@ inline pi_result piDeviceGetInfo(pi_device Device, pi_device_info ParamName,
PI_TO_UR_MAP_DEVICE_INFO(
PI_EXT_ONEAPI_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT,
UR_DEVICE_INFO_INTEROP_SEMAPHORE_EXPORT_SUPPORT_EXP)
PI_TO_UR_MAP_DEVICE_INFO(
PI_EXT_ONEAPI_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT,
UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP)
PI_TO_UR_MAP_DEVICE_INFO(PI_EXT_INTEL_DEVICE_INFO_ESIMD_SUPPORT,
UR_DEVICE_INFO_ESIMD_SUPPORT)
PI_TO_UR_MAP_DEVICE_INFO(PI_EXT_ONEAPI_DEVICE_INFO_COMPONENT_DEVICES,
Expand Down Expand Up @@ -4362,6 +4365,23 @@ inline pi_result piEventRelease(pi_event Event) {
return PI_SUCCESS;
}

inline pi_result piEnqueueTimestampRecordingExp(pi_queue Queue,
pi_bool Blocking,
pi_uint32 NumEventsInWaitList,
const pi_event *EventWaitList,
pi_event *Event) {

ur_queue_handle_t UrQueue = reinterpret_cast<ur_queue_handle_t>(Queue);
const ur_event_handle_t *UrEventWaitList =
reinterpret_cast<const ur_event_handle_t *>(EventWaitList);
ur_event_handle_t *UREvent = reinterpret_cast<ur_event_handle_t *>(Event);

HANDLE_ERRORS(urEnqueueTimestampRecordingExp(
UrQueue, Blocking, NumEventsInWaitList, UrEventWaitList, UREvent));

return PI_SUCCESS;
}

// Events
///////////////////////////////////////////////////////////////////////////////

Expand Down
8 changes: 8 additions & 0 deletions sycl/plugins/unified_runtime/pi_unified_runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -687,6 +687,13 @@ __SYCL_EXPORT pi_result piextEventCreateWithNativeHandle(
OwnNativeHandle, Event);
}

__SYCL_EXPORT pi_result piEnqueueTimestampRecordingExp(
pi_queue Queue, pi_bool Blocking, pi_uint32 NumEventsInWaitList,
const pi_event *EventWaitList, pi_event *Event) {
return pi2ur::piEnqueueTimestampRecordingExp(
Queue, Blocking, NumEventsInWaitList, EventWaitList, Event);
}

__SYCL_EXPORT pi_result piEnqueueMemImageFill(
pi_queue Queue, pi_mem Image, const void *FillColor, const size_t *Origin,
const size_t *Region, pi_uint32 NumEventsInWaitList,
Expand Down Expand Up @@ -1516,6 +1523,7 @@ __SYCL_EXPORT pi_result piPluginInit(pi_plugin *PluginInit) {
_PI_API(piextEventGetNativeHandle)
_PI_API(piEventGetProfilingInfo)
_PI_API(piEventCreate)
_PI_API(piEnqueueTimestampRecordingExp)

_PI_API(piSamplerCreate)
_PI_API(piSamplerGetInfo)
Expand Down
8 changes: 8 additions & 0 deletions sycl/source/detail/device_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -741,6 +741,14 @@ bool device_impl::has(aspect Aspect) const {
return be == sycl::backend::ext_oneapi_level_zero ||
be == sycl::backend::opencl;
}
case aspect::ext_oneapi_queue_profiling_tag: {
pi_bool support = PI_FALSE;
bool call_successful =
getPlugin()->call_nocheck<detail::PiApiKind::piDeviceGetInfo>(
MDevice, PI_EXT_ONEAPI_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT,
sizeof(pi_bool), &support, nullptr) == PI_SUCCESS;
return call_successful && support;
}
}
throw runtime_error("This device aspect has not been implemented yet.",
PI_ERROR_INVALID_DEVICE);
Expand Down
10 changes: 8 additions & 2 deletions sycl/source/detail/event_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ void event_impl::checkProfilingPreconditions() const {
"Profiling information is unavailable as the event "
"has no associated queue.");
}
if (!MIsProfilingEnabled) {
if (!MIsProfilingEnabled && !MProfilingTagEvent) {
throw sycl::exception(
make_error_code(sycl::errc::invalid),
"Profiling information is unavailable as the queue associated with "
Expand All @@ -302,6 +302,12 @@ template <>
uint64_t
event_impl::get_profiling_info<info::event_profiling::command_submit>() {
checkProfilingPreconditions();
if (isProfilingTagEvent()) {
// For profiling tag events we rely on the submission time reported as
// the start time has undefined behavior.
return get_event_profiling_info<info::event_profiling::command_submit>(
this->getHandleRef(), this->getPlugin());
}

// The delay between the submission and the actual start of a CommandBuffer
// can be short. Consequently, the submission time, which is based on
Expand Down Expand Up @@ -558,7 +564,7 @@ void event_impl::cleanDepEventsThroughOneLevel() {
}

void event_impl::setSubmissionTime() {
if (!MIsProfilingEnabled)
if (!MIsProfilingEnabled && !MProfilingTagEvent)
return;
if (!MFallbackProfiling) {
if (QueueImplPtr Queue = MQueue.lock()) {
Expand Down
8 changes: 8 additions & 0 deletions sycl/source/detail/event_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,10 @@ class event_impl {

void setEnqueued() { MIsEnqueued = true; }

void markAsProfilingTagEvent() { MProfilingTagEvent = true; }

bool isProfilingTagEvent() const noexcept { return MProfilingTagEvent; }

protected:
// When instrumentation is enabled emits trace event for event wait begin and
// returns the telemetry event generated for the wait
Expand Down Expand Up @@ -408,6 +412,10 @@ class event_impl {
// (if any) associated with that submission is stored here.
sycl::detail::pi::PiExtCommandBufferCommand MCommandBufferCommand = nullptr;

// Signifies whether this event is the result of a profiling tag command. This
// allows for profiling, even if the queue does not have profiling enabled.
bool MProfilingTagEvent = false;

friend std::vector<sycl::detail::pi::PiEvent>
getOrWaitEvents(std::vector<sycl::event> DepEvents,
std::shared_ptr<sycl::detail::context_impl> Context);
Expand Down
Loading

0 comments on commit 42fc5e9

Please sign in to comment.