[SYCL] Implement max_num_work_groups from the launch queries extension (

#14333) This PR implements the `max_num_work_groups ` query from the `sycl_ext_oneapi_launch_queries` extension. Additionally, this PR introduces changes that overload `ext_oneapi_get_info` for another kernel-queue-specific query - `max_num_work_group_sync` to take extra parameters for local work-group size and dynamic local memory size (in bytes) in order to allow users to pass those runtime resource limiting factors to the query, so they are taken into account in the final group count suggestion.
intel · Sep 11, 2024 · 81aacfa · 81aacfa
1 parent 729d6f6
commit 81aacfa
Show file tree

Hide file tree

Showing 16 changed files with 424 additions and 37 deletions.
diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake
@@ -117,13 +117,13 @@ if(SYCL_UR_USE_FETCH_CONTENT)
  endfunction()
 
  set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
- # commit 8c9dd7e464a99ebbfb238ac2dabefc3ac77baea5
- # Merge: a99dbcee 3abe18cf
- # Author: Piotr Balcer <piotr.balcer@intel.com>
- # Date: Fri Sep 6 17:21:17 2024 +0200
- # Merge pull request #1820 from pbalcer/static-linking
- # Add support for static linking of the L0 adapter
- set(UNIFIED_RUNTIME_TAG 8c9dd7e464a99ebbfb238ac2dabefc3ac77baea5)
+ # commit eb63d1a21729f6928bb6cccc5f92856b0690aca6
+ # Merge: e26bba51 45a781f4
+ # Author: Omar Ahmed <omar.ahmed@codeplay.com>
+ # Date: Tue Sep 10 12:08:57 2024 +0100
+ # Merge pull request #1796 from GeorgeWeb/georgi/ur_kernel_max_active_wgs
+ # [CUDA] Implement urKernelSuggestMaxCooperativeGroupCountExp for Cuda
+ set(UNIFIED_RUNTIME_TAG eb63d1a21729f6928bb6cccc5f92856b0690aca6)
 
  set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
  # Due to the use of dependentloadflag and no installer for UMF and hwloc we need

diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc
@@ -204,9 +204,11 @@ otherwise it is 0.
 |Returns the maximum number of work-groups, when the kernel is submitted to the
 specified queue with the specified work-group size and the specified amount of
 dynamic work-group local memory (in bytes), accounting for any kernel
-properties or features. If the kernel can be submitted to the specified queue
-without an error, the minimum value returned by this query is 1, otherwise it
-is 0.
+properties or features. If the specified work-group size is 0, which is
+invalid, then the implementation will throw a synchronous exception with the
+`errc::invalid` error code. If the kernel can be submitted to the specified
+queue without an error, the minimum value returned by this query is 1,
+otherwise it is 0.
 
 |===
 

diff --git a/sycl/include/sycl/detail/info_desc_helpers.hpp b/sycl/include/sycl/detail/info_desc_helpers.hpp
@@ -31,6 +31,8 @@ template <typename T> struct is_queue_info_desc : std::false_type {};
 template <typename T> struct is_kernel_info_desc : std::false_type {};
 template <typename T>
 struct is_kernel_device_specific_info_desc : std::false_type {};
+template <typename T>
+struct is_kernel_queue_specific_info_desc : std::false_type {};
 template <typename T> struct is_event_info_desc : std::false_type {};
 template <typename T> struct is_event_profiling_info_desc : std::false_type {};
 // Normally we would just use std::enable_if to limit valid get_info template
@@ -134,6 +136,16 @@ struct IsKernelInfo<info::kernel_device_specific::ext_codeplay_num_regs>
 #include <sycl/info/ext_intel_device_traits.def>
 #include <sycl/info/ext_oneapi_device_traits.def>
 #undef __SYCL_PARAM_TRAITS_SPEC
+
+#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT, PiCode) \
+ template <> \
+ struct is_##DescType##_info_desc<Namespace::info::DescType::Desc> \
+ : std::true_type { \
+ using return_type = Namespace::info::DescType::Desc::return_type; \
+ };
+#include <sycl/info/ext_oneapi_kernel_queue_specific_traits.def>
+#undef __SYCL_PARAM_TRAITS_SPEC
+
 #define __SYCL_PARAM_TRAITS_SPEC(DescType, Desc, ReturnT, PiCode) \
  template <> \
  struct is_backend_info_desc<info::DescType::Desc> : std::true_type { \

diff --git a/sycl/include/sycl/ext/oneapi/experimental/root_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/root_group.hpp
@@ -24,13 +24,8 @@ namespace sycl {
 inline namespace _V1 {
 namespace ext::oneapi::experimental {
 
-namespace info::kernel_queue_specific {
-// TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension once
-// #7598 is merged.
-struct max_num_work_group_sync {
- using return_type = size_t;
-};
-} // namespace info::kernel_queue_specific
+// See 'sycl/info/kernel_device_specific_traits.def' for the kernel
+// device-specific properties that relate to 'root_group'.
 
 template <int Dimensions> class root_group {
 public:

diff --git a/sycl/include/sycl/info/ext_oneapi_kernel_queue_specific_traits.def b/sycl/include/sycl/info/ext_oneapi_kernel_queue_specific_traits.def
@@ -0,0 +1,4 @@
+// TODO: Revisit 'max_num_work_group_sync' and align it with the
+// 'sycl_ext_oneapi_forward_progress' extension once #7598 is merged.
+__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t,)
+__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t,)
diff --git a/sycl/include/sycl/info/info_desc.hpp b/sycl/include/sycl/info/info_desc.hpp
@@ -247,6 +247,8 @@ struct work_item_progress_capabilities;
 #include <sycl/info/ext_codeplay_device_traits.def>
 #include <sycl/info/ext_intel_device_traits.def>
 #include <sycl/info/ext_oneapi_device_traits.def>
+#include <sycl/info/ext_oneapi_kernel_queue_specific_traits.def>
+
 #undef __SYCL_PARAM_TRAITS_SPEC
 #undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC
 } // namespace _V1

diff --git a/sycl/include/sycl/kernel.hpp b/sycl/include/sycl/kernel.hpp
@@ -159,9 +159,29 @@ class __SYCL_EXPORT kernel : public detail::OwnerLessBase<kernel> {
  get_info(const device &Device, const range<3> &WGSize) const;
 
  // TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension
- // once #7598 is merged.
+ // once #7598 is merged. (regarding the 'max_num_work_group_sync' query)
+
+ /// Query queue/launch-specific information from a kernel using the
+ /// info::kernel_queue_specific descriptor for a specific Queue.
+ ///
+ /// \param Queue is a valid SYCL queue.
+ /// \return depends on information being queried.
+ template <typename Param>
+ typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+ ext_oneapi_get_info(queue Queue) const;
+
+ /// Query queue/launch-specific information from a kernel using the
+ /// info::kernel_queue_specific descriptor for a specific Queue and values.
+ /// max_num_work_groups is the only valid descriptor for this function.
+ ///
+ /// \param Queue is a valid SYCL queue.
+ /// \param WorkGroupSize is the work-group size the number of work-groups is
+ /// requested for.
+ /// \return depends on information being queried.
  template <typename Param>
- typename Param::return_type ext_oneapi_get_info(const queue &q) const;
+ typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+ ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
+ size_t DynamicLocalMemorySize) const;
 
 private:
  /// Constructs a SYCL kernel object from a valid kernel_impl instance.

diff --git a/sycl/source/detail/kernel_impl.cpp b/sycl/source/detail/kernel_impl.cpp
@@ -106,6 +106,38 @@ void kernel_impl::checkIfValidForNumArgsInfoQuery() const {
  "interoperability function or to query a device built-in kernel");
 }
 
+bool kernel_impl::exceedsOccupancyResourceLimits(
+ const device &Device, const range<3> &WorkGroupSize,
+ size_t DynamicLocalMemorySize) const {
+ // Respect occupancy limits for WorkGroupSize and DynamicLocalMemorySize.
+ // Generally, exceeding hardware resource limits will yield in an error when
+ // the kernel is launched.
+ const size_t MaxWorkGroupSize =
+ get_info<info::kernel_device_specific::work_group_size>(Device);
+ const size_t MaxLocalMemorySizeInBytes =
+ Device.get_info<info::device::local_mem_size>();
+
+ if (WorkGroupSize.size() > MaxWorkGroupSize)
+ return true;
+
+ if (DynamicLocalMemorySize > MaxLocalMemorySizeInBytes)
+ return true;
+
+ // It will be impossible to launch a kernel for Cuda when the hardware limit
+ // for the 32-bit registers page file size is exceeded.
+ if (Device.get_backend() == backend::ext_oneapi_cuda) {
+ const uint32_t RegsPerWorkItem =
+ get_info<info::kernel_device_specific::ext_codeplay_num_regs>(Device);
+ const uint32_t MaxRegsPerWorkGroup =
+ Device.get_info<ext::codeplay::experimental::info::device::
+ max_registers_per_work_group>();
+ if ((MaxWorkGroupSize * RegsPerWorkItem) > MaxRegsPerWorkGroup)
+ return true;
+ }
+
+ return false;
+}
+
 template <>
 typename info::platform::version::return_type
 kernel_impl::get_backend_info<info::platform::version>() const {

diff --git a/sycl/source/detail/kernel_impl.hpp b/sycl/source/detail/kernel_impl.hpp
@@ -114,8 +114,26 @@ class kernel_impl {
  typename Param::return_type get_info(const device &Device,
  const range<3> &WGSize) const;
 
+ /// Query queue/launch-specific information from a kernel using the
+ /// info::kernel_queue_specific descriptor for a specific Queue.
+ ///
+ /// \param Queue is a valid SYCL queue.
+ /// \return depends on information being queried.
+ template <typename Param>
+ typename Param::return_type ext_oneapi_get_info(queue Queue) const;
+
+ /// Query queue/launch-specific information from a kernel using the
+ /// info::kernel_queue_specific descriptor for a specific Queue and values.
+ /// max_num_work_groups is the only valid descriptor for this function.
+ ///
+ /// \param Queue is a valid SYCL queue.
+ /// \param WorkGroupSize is the work-group size the number of work-groups is
+ /// requested for.
+ /// \return depends on information being queried.
  template <typename Param>
- typename Param::return_type ext_oneapi_get_info(const queue &q) const;
+ typename Param::return_type
+ ext_oneapi_get_info(queue Queue, const range<3> &MaxWorkGroupSize,
+ size_t DynamicLocalMemorySize) const;
 
  /// Get a constant reference to a raw kernel object.
  ///
@@ -171,6 +189,12 @@ class kernel_impl {
 
  bool isBuiltInKernel(const device &Device) const;
  void checkIfValidForNumArgsInfoQuery() const;
+
+ /// Check if the occupancy limits are exceeded for the given kernel launch
+ /// configuration.
+ bool exceedsOccupancyResourceLimits(const device &Device,
+ const range<3> &WorkGroupSize,
+ size_t DynamicLocalMemorySize) const;
 };
 
 template <typename Param>
@@ -217,20 +241,66 @@ kernel_impl::get_info(const device &Device,
  getPlugin());
 }
 
+namespace syclex = ext::oneapi::experimental;
+
 template <>
-inline typename ext::oneapi::experimental::info::kernel_queue_specific::
- max_num_work_group_sync::return_type
+inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
+ return_type
  kernel_impl::ext_oneapi_get_info<
- ext::oneapi::experimental::info::kernel_queue_specific::
- max_num_work_group_sync>(const queue &Queue) const {
+ syclex::info::kernel_queue_specific::max_num_work_groups>(
+ queue Queue, const range<3> &WorkGroupSize,
+ size_t DynamicLocalMemorySize) const {
+ if (WorkGroupSize.size() == 0)
+ throw exception(sycl::make_error_code(errc::invalid),
+ "The launch work-group size cannot be zero.");
+
  const auto &Plugin = getPlugin();
  const auto &Handle = getHandleRef();
+ auto Device = Queue.get_device();
+
+ uint32_t GroupCount{0};
+ if (auto Result = Plugin->call_nocheck<
+ UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
+ Handle, WorkGroupSize.size(), DynamicLocalMemorySize, &GroupCount);
+ Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
+ // The feature is supported. Check for other errors and throw if any.
+ Plugin->checkUrResult(Result);
+ return GroupCount;
+ }
+
+ // Fallback. If the backend API is unsupported, this query will return either
+ // 0 or 1 based on the kernel resource usage and the user-requested resources.
+ return exceedsOccupancyResourceLimits(Device, WorkGroupSize,
+ DynamicLocalMemorySize)
+ ? 0
+ : 1;
+}
+
+template <>
+inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
+ return_type
+ kernel_impl::ext_oneapi_get_info<
+ syclex::info::kernel_queue_specific::max_num_work_group_sync>(
+ queue Queue, const range<3> &WorkGroupSize,
+ size_t DynamicLocalMemorySize) const {
+ return ext_oneapi_get_info<
+ syclex::info::kernel_queue_specific::max_num_work_groups>(
+ Queue, WorkGroupSize, DynamicLocalMemorySize);
+}
+
+template <>
+inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
+ return_type
+ kernel_impl::ext_oneapi_get_info<
+ syclex::info::kernel_queue_specific::max_num_work_group_sync>(
+ queue Queue) const {
+ auto Device = Queue.get_device();
  const auto MaxWorkGroupSize =
- Queue.get_device().get_info<info::device::max_work_group_size>();
- uint32_t GroupCount = 0;
- Plugin->call<UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
- Handle, MaxWorkGroupSize, /* DynamicSharedMemorySize */ 0, &GroupCount);
- return GroupCount;
+ get_info<info::kernel_device_specific::work_group_size>(Device);
+ const sycl::range<3> WorkGroupSize{MaxWorkGroupSize, 1, 1};
+ return ext_oneapi_get_info<
+ syclex::info::kernel_queue_specific::max_num_work_group_sync>(
+  Queue, WorkGroupSize, /* DynamicLocalMemorySize */ 0);
 }
 
 } // namespace detail

diff --git a/sycl/source/kernel.cpp b/sycl/source/kernel.cpp
@@ -106,16 +106,36 @@ kernel::get_info<info::kernel_device_specific::max_sub_group_size>(
  const device &, const sycl::range<3> &) const;
 
 template <typename Param>
-typename Param::return_type
-kernel::ext_oneapi_get_info(const queue &Queue) const {
+typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+kernel::ext_oneapi_get_info(queue Queue) const {
  return impl->ext_oneapi_get_info<Param>(Queue);
 }
 
+template <typename Param>
+typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+kernel::ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
+ size_t DynamicLocalMemorySize) const {
+ return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
+ DynamicLocalMemorySize);
+}
+
 template __SYCL_EXPORT typename ext::oneapi::experimental::info::
  kernel_queue_specific::max_num_work_group_sync::return_type
  kernel::ext_oneapi_get_info<
  ext::oneapi::experimental::info::kernel_queue_specific::
- max_num_work_group_sync>(const queue &Queue) const;
+ max_num_work_group_sync>(queue Queue) const;
+
+#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT) \
+ template __SYCL_EXPORT ReturnT \
+ kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>( \
+ queue, const range<3> &, size_t) const;
+// Not including "ext_oneapi_kernel_queue_specific_traits.def" because not all
+// kernel_queue_specific queries require the above-defined get_info interface.
+// clang-format off
+__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t)
+__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t)
+// clang-format on
+#undef __SYCL_PARAM_TRAITS_SPEC
 
 kernel::kernel(std::shared_ptr<detail::kernel_impl> Impl) : impl(Impl) {}