Skip to content

Commit

Permalink
[SYCL] Implement max_num_work_groups from the launch queries extension (
Browse files Browse the repository at this point in the history
#14333)

This PR implements the `max_num_work_groups ` query from the
`sycl_ext_oneapi_launch_queries` extension.

Additionally, this PR introduces changes that overload
`ext_oneapi_get_info` for another kernel-queue-specific query -
`max_num_work_group_sync` to take extra parameters for local work-group
size and dynamic local memory size (in bytes) in order to allow users to
pass those runtime resource limiting factors to the query, so they are
taken into account in the final group count suggestion.
  • Loading branch information
GeorgeWeb committed Sep 11, 2024
1 parent 729d6f6 commit 81aacfa
Show file tree
Hide file tree
Showing 16 changed files with 424 additions and 37 deletions.
14 changes: 7 additions & 7 deletions sycl/cmake/modules/FetchUnifiedRuntime.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,13 @@ if(SYCL_UR_USE_FETCH_CONTENT)
endfunction()

set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
# commit 8c9dd7e464a99ebbfb238ac2dabefc3ac77baea5
# Merge: a99dbcee 3abe18cf
# Author: Piotr Balcer <piotr.balcer@intel.com>
# Date: Fri Sep 6 17:21:17 2024 +0200
# Merge pull request #1820 from pbalcer/static-linking
# Add support for static linking of the L0 adapter
set(UNIFIED_RUNTIME_TAG 8c9dd7e464a99ebbfb238ac2dabefc3ac77baea5)
# commit eb63d1a21729f6928bb6cccc5f92856b0690aca6
# Merge: e26bba51 45a781f4
# Author: Omar Ahmed <omar.ahmed@codeplay.com>
# Date: Tue Sep 10 12:08:57 2024 +0100
# Merge pull request #1796 from GeorgeWeb/georgi/ur_kernel_max_active_wgs
# [CUDA] Implement urKernelSuggestMaxCooperativeGroupCountExp for Cuda
set(UNIFIED_RUNTIME_TAG eb63d1a21729f6928bb6cccc5f92856b0690aca6)

set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
# Due to the use of dependentloadflag and no installer for UMF and hwloc we need
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,9 +204,11 @@ otherwise it is 0.
|Returns the maximum number of work-groups, when the kernel is submitted to the
specified queue with the specified work-group size and the specified amount of
dynamic work-group local memory (in bytes), accounting for any kernel
properties or features. If the kernel can be submitted to the specified queue
without an error, the minimum value returned by this query is 1, otherwise it
is 0.
properties or features. If the specified work-group size is 0, which is
invalid, then the implementation will throw a synchronous exception with the
`errc::invalid` error code. If the kernel can be submitted to the specified
queue without an error, the minimum value returned by this query is 1,
otherwise it is 0.

|===

Expand Down
12 changes: 12 additions & 0 deletions sycl/include/sycl/detail/info_desc_helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ template <typename T> struct is_queue_info_desc : std::false_type {};
template <typename T> struct is_kernel_info_desc : std::false_type {};
template <typename T>
struct is_kernel_device_specific_info_desc : std::false_type {};
template <typename T>
struct is_kernel_queue_specific_info_desc : std::false_type {};
template <typename T> struct is_event_info_desc : std::false_type {};
template <typename T> struct is_event_profiling_info_desc : std::false_type {};
// Normally we would just use std::enable_if to limit valid get_info template
Expand Down Expand Up @@ -134,6 +136,16 @@ struct IsKernelInfo<info::kernel_device_specific::ext_codeplay_num_regs>
#include <sycl/info/ext_intel_device_traits.def>
#include <sycl/info/ext_oneapi_device_traits.def>
#undef __SYCL_PARAM_TRAITS_SPEC

#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT, PiCode) \
template <> \
struct is_##DescType##_info_desc<Namespace::info::DescType::Desc> \
: std::true_type { \
using return_type = Namespace::info::DescType::Desc::return_type; \
};
#include <sycl/info/ext_oneapi_kernel_queue_specific_traits.def>
#undef __SYCL_PARAM_TRAITS_SPEC

#define __SYCL_PARAM_TRAITS_SPEC(DescType, Desc, ReturnT, PiCode) \
template <> \
struct is_backend_info_desc<info::DescType::Desc> : std::true_type { \
Expand Down
9 changes: 2 additions & 7 deletions sycl/include/sycl/ext/oneapi/experimental/root_group.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,8 @@ namespace sycl {
inline namespace _V1 {
namespace ext::oneapi::experimental {

namespace info::kernel_queue_specific {
// TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension once
// #7598 is merged.
struct max_num_work_group_sync {
using return_type = size_t;
};
} // namespace info::kernel_queue_specific
// See 'sycl/info/kernel_device_specific_traits.def' for the kernel
// device-specific properties that relate to 'root_group'.

template <int Dimensions> class root_group {
public:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// TODO: Revisit 'max_num_work_group_sync' and align it with the
// 'sycl_ext_oneapi_forward_progress' extension once #7598 is merged.
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t,)
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t,)
2 changes: 2 additions & 0 deletions sycl/include/sycl/info/info_desc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,8 @@ struct work_item_progress_capabilities;
#include <sycl/info/ext_codeplay_device_traits.def>
#include <sycl/info/ext_intel_device_traits.def>
#include <sycl/info/ext_oneapi_device_traits.def>
#include <sycl/info/ext_oneapi_kernel_queue_specific_traits.def>

#undef __SYCL_PARAM_TRAITS_SPEC
#undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC
} // namespace _V1
Expand Down
24 changes: 22 additions & 2 deletions sycl/include/sycl/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,9 +159,29 @@ class __SYCL_EXPORT kernel : public detail::OwnerLessBase<kernel> {
get_info(const device &Device, const range<3> &WGSize) const;

// TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension
// once #7598 is merged.
// once #7598 is merged. (regarding the 'max_num_work_group_sync' query)

/// Query queue/launch-specific information from a kernel using the
/// info::kernel_queue_specific descriptor for a specific Queue.
///
/// \param Queue is a valid SYCL queue.
/// \return depends on information being queried.
template <typename Param>
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
ext_oneapi_get_info(queue Queue) const;

/// Query queue/launch-specific information from a kernel using the
/// info::kernel_queue_specific descriptor for a specific Queue and values.
/// max_num_work_groups is the only valid descriptor for this function.
///
/// \param Queue is a valid SYCL queue.
/// \param WorkGroupSize is the work-group size the number of work-groups is
/// requested for.
/// \return depends on information being queried.
template <typename Param>
typename Param::return_type ext_oneapi_get_info(const queue &q) const;
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
size_t DynamicLocalMemorySize) const;

private:
/// Constructs a SYCL kernel object from a valid kernel_impl instance.
Expand Down
32 changes: 32 additions & 0 deletions sycl/source/detail/kernel_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,38 @@ void kernel_impl::checkIfValidForNumArgsInfoQuery() const {
"interoperability function or to query a device built-in kernel");
}

bool kernel_impl::exceedsOccupancyResourceLimits(
const device &Device, const range<3> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
// Respect occupancy limits for WorkGroupSize and DynamicLocalMemorySize.
// Generally, exceeding hardware resource limits will yield in an error when
// the kernel is launched.
const size_t MaxWorkGroupSize =
get_info<info::kernel_device_specific::work_group_size>(Device);
const size_t MaxLocalMemorySizeInBytes =
Device.get_info<info::device::local_mem_size>();

if (WorkGroupSize.size() > MaxWorkGroupSize)
return true;

if (DynamicLocalMemorySize > MaxLocalMemorySizeInBytes)
return true;

// It will be impossible to launch a kernel for Cuda when the hardware limit
// for the 32-bit registers page file size is exceeded.
if (Device.get_backend() == backend::ext_oneapi_cuda) {
const uint32_t RegsPerWorkItem =
get_info<info::kernel_device_specific::ext_codeplay_num_regs>(Device);
const uint32_t MaxRegsPerWorkGroup =
Device.get_info<ext::codeplay::experimental::info::device::
max_registers_per_work_group>();
if ((MaxWorkGroupSize * RegsPerWorkItem) > MaxRegsPerWorkGroup)
return true;
}

return false;
}

template <>
typename info::platform::version::return_type
kernel_impl::get_backend_info<info::platform::version>() const {
Expand Down
90 changes: 80 additions & 10 deletions sycl/source/detail/kernel_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,26 @@ class kernel_impl {
typename Param::return_type get_info(const device &Device,
const range<3> &WGSize) const;

/// Query queue/launch-specific information from a kernel using the
/// info::kernel_queue_specific descriptor for a specific Queue.
///
/// \param Queue is a valid SYCL queue.
/// \return depends on information being queried.
template <typename Param>
typename Param::return_type ext_oneapi_get_info(queue Queue) const;

/// Query queue/launch-specific information from a kernel using the
/// info::kernel_queue_specific descriptor for a specific Queue and values.
/// max_num_work_groups is the only valid descriptor for this function.
///
/// \param Queue is a valid SYCL queue.
/// \param WorkGroupSize is the work-group size the number of work-groups is
/// requested for.
/// \return depends on information being queried.
template <typename Param>
typename Param::return_type ext_oneapi_get_info(const queue &q) const;
typename Param::return_type
ext_oneapi_get_info(queue Queue, const range<3> &MaxWorkGroupSize,
size_t DynamicLocalMemorySize) const;

/// Get a constant reference to a raw kernel object.
///
Expand Down Expand Up @@ -171,6 +189,12 @@ class kernel_impl {

bool isBuiltInKernel(const device &Device) const;
void checkIfValidForNumArgsInfoQuery() const;

/// Check if the occupancy limits are exceeded for the given kernel launch
/// configuration.
bool exceedsOccupancyResourceLimits(const device &Device,
const range<3> &WorkGroupSize,
size_t DynamicLocalMemorySize) const;
};

template <typename Param>
Expand Down Expand Up @@ -217,20 +241,66 @@ kernel_impl::get_info(const device &Device,
getPlugin());
}

namespace syclex = ext::oneapi::experimental;

template <>
inline typename ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync::return_type
inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
return_type
kernel_impl::ext_oneapi_get_info<
ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync>(const queue &Queue) const {
syclex::info::kernel_queue_specific::max_num_work_groups>(
queue Queue, const range<3> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
if (WorkGroupSize.size() == 0)
throw exception(sycl::make_error_code(errc::invalid),
"The launch work-group size cannot be zero.");

const auto &Plugin = getPlugin();
const auto &Handle = getHandleRef();
auto Device = Queue.get_device();

uint32_t GroupCount{0};
if (auto Result = Plugin->call_nocheck<
UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
Handle, WorkGroupSize.size(), DynamicLocalMemorySize, &GroupCount);
Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
// The feature is supported. Check for other errors and throw if any.
Plugin->checkUrResult(Result);
return GroupCount;
}

// Fallback. If the backend API is unsupported, this query will return either
// 0 or 1 based on the kernel resource usage and the user-requested resources.
return exceedsOccupancyResourceLimits(Device, WorkGroupSize,
DynamicLocalMemorySize)
? 0
: 1;
}

template <>
inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
return_type
kernel_impl::ext_oneapi_get_info<
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
queue Queue, const range<3> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
return ext_oneapi_get_info<
syclex::info::kernel_queue_specific::max_num_work_groups>(
Queue, WorkGroupSize, DynamicLocalMemorySize);
}

template <>
inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
return_type
kernel_impl::ext_oneapi_get_info<
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
queue Queue) const {
auto Device = Queue.get_device();
const auto MaxWorkGroupSize =
Queue.get_device().get_info<info::device::max_work_group_size>();
uint32_t GroupCount = 0;
Plugin->call<UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
Handle, MaxWorkGroupSize, /* DynamicSharedMemorySize */ 0, &GroupCount);
return GroupCount;
get_info<info::kernel_device_specific::work_group_size>(Device);
const sycl::range<3> WorkGroupSize{MaxWorkGroupSize, 1, 1};
return ext_oneapi_get_info<
syclex::info::kernel_queue_specific::max_num_work_group_sync>(
Queue, WorkGroupSize, /* DynamicLocalMemorySize */ 0);
}

} // namespace detail
Expand Down
26 changes: 23 additions & 3 deletions sycl/source/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,16 +106,36 @@ kernel::get_info<info::kernel_device_specific::max_sub_group_size>(
const device &, const sycl::range<3> &) const;

template <typename Param>
typename Param::return_type
kernel::ext_oneapi_get_info(const queue &Queue) const {
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
kernel::ext_oneapi_get_info(queue Queue) const {
return impl->ext_oneapi_get_info<Param>(Queue);
}

template <typename Param>
typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
kernel::ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
size_t DynamicLocalMemorySize) const {
return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
DynamicLocalMemorySize);
}

template __SYCL_EXPORT typename ext::oneapi::experimental::info::
kernel_queue_specific::max_num_work_group_sync::return_type
kernel::ext_oneapi_get_info<
ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync>(const queue &Queue) const;
max_num_work_group_sync>(queue Queue) const;

#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT) \
template __SYCL_EXPORT ReturnT \
kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>( \
queue, const range<3> &, size_t) const;
// Not including "ext_oneapi_kernel_queue_specific_traits.def" because not all
// kernel_queue_specific queries require the above-defined get_info interface.
// clang-format off
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t)
__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t)
// clang-format on
#undef __SYCL_PARAM_TRAITS_SPEC

kernel::kernel(std::shared_ptr<detail::kernel_impl> Impl) : impl(Impl) {}

Expand Down
Loading

0 comments on commit 81aacfa

Please sign in to comment.