From 81aacfa9af9b99fb6658e4b906c509968da18e43 Mon Sep 17 00:00:00 2001
From: Georgi Mirazchiyski <georgi.mirazchiyski@codeplay.com>
Date: Wed, 11 Sep 2024 14:02:24 +0100
Subject: [PATCH] [SYCL] Implement max_num_work_groups from the launch queries
 extension (#14333)

This PR implements the `max_num_work_groups ` query from the
`sycl_ext_oneapi_launch_queries` extension.

Additionally, this PR introduces changes that overload
`ext_oneapi_get_info` for another kernel-queue-specific query -
`max_num_work_group_sync` to take extra parameters for local work-group
size and dynamic local memory size (in bytes) in order to allow users to
pass those runtime resource limiting factors to the query, so they are
taken into account in the final group count suggestion.
---
 sycl/cmake/modules/FetchUnifiedRuntime.cmake  |  14 +-
 .../sycl_ext_oneapi_launch_queries.asciidoc   |   8 +-
 .../include/sycl/detail/info_desc_helpers.hpp |  12 +
 .../ext/oneapi/experimental/root_group.hpp    |   9 +-
 ...xt_oneapi_kernel_queue_specific_traits.def |   4 +
 sycl/include/sycl/info/info_desc.hpp          |   2 +
 sycl/include/sycl/kernel.hpp                  |  24 +-
 sycl/source/detail/kernel_impl.cpp            |  32 +++
 sycl/source/detail/kernel_impl.hpp            |  90 +++++++-
 sycl/source/kernel.cpp                        |  26 ++-
 .../launch_queries/max_num_work_groups.cpp    | 215 ++++++++++++++++++
 sycl/test-e2e/GroupAlgorithm/root_group.cpp   |  15 +-
 sycl/test/abi/sycl_symbols_linux.dump         |   4 +-
 sycl/test/abi/sycl_symbols_windows.dump       |   4 +-
 sycl/test/include_deps/sycl_accessor.hpp.cpp  |   1 +
 .../include_deps/sycl_detail_core.hpp.cpp     |   1 +
 16 files changed, 424 insertions(+), 37 deletions(-)
 create mode 100644 sycl/include/sycl/info/ext_oneapi_kernel_queue_specific_traits.def
 create mode 100644 sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp

diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake
index 956c33bec68df..e569981589ac4 100644
--- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake
+++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake
@@ -117,13 +117,13 @@ if(SYCL_UR_USE_FETCH_CONTENT)
   endfunction()
 
   set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
-  # commit 8c9dd7e464a99ebbfb238ac2dabefc3ac77baea5
-  # Merge: a99dbcee 3abe18cf
-  # Author: Piotr Balcer <piotr.balcer@intel.com>
-  # Date:   Fri Sep 6 17:21:17 2024 +0200
-  #   Merge pull request #1820 from pbalcer/static-linking
-  #   Add support for static linking of the L0 adapter
-  set(UNIFIED_RUNTIME_TAG 8c9dd7e464a99ebbfb238ac2dabefc3ac77baea5)
+  # commit eb63d1a21729f6928bb6cccc5f92856b0690aca6
+  # Merge: e26bba51 45a781f4
+  # Author: Omar Ahmed <omar.ahmed@codeplay.com>
+  # Date:   Tue Sep 10 12:08:57 2024 +0100
+  #   Merge pull request #1796 from GeorgeWeb/georgi/ur_kernel_max_active_wgs
+  #   [CUDA] Implement urKernelSuggestMaxCooperativeGroupCountExp for Cuda
+  set(UNIFIED_RUNTIME_TAG eb63d1a21729f6928bb6cccc5f92856b0690aca6)
 
   set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
   # Due to the use of dependentloadflag and no installer for UMF and hwloc we need
diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc
index 8221000502642..ee52d75b8fd21 100644
--- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc
+++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_launch_queries.asciidoc
@@ -204,9 +204,11 @@ otherwise it is 0.
 |Returns the maximum number of work-groups, when the kernel is submitted to the
 specified queue with the specified work-group size and the specified amount of
 dynamic work-group local memory (in bytes), accounting for any kernel
-properties or features. If the kernel can be submitted to the specified queue
-without an error, the minimum value returned by this query is 1, otherwise it
-is 0.
+properties or features. If the specified work-group size is 0, which is
+invalid, then the implementation will throw a synchronous exception with the
+`errc::invalid` error code. If the kernel can be submitted to the specified
+queue without an error, the minimum value returned by this query is 1,
+otherwise it is 0.
 
 |===
 
diff --git a/sycl/include/sycl/detail/info_desc_helpers.hpp b/sycl/include/sycl/detail/info_desc_helpers.hpp
index e8bc8f76c83db..d3b4bfd977139 100644
--- a/sycl/include/sycl/detail/info_desc_helpers.hpp
+++ b/sycl/include/sycl/detail/info_desc_helpers.hpp
@@ -31,6 +31,8 @@ template <typename T> struct is_queue_info_desc : std::false_type {};
 template <typename T> struct is_kernel_info_desc : std::false_type {};
 template <typename T>
 struct is_kernel_device_specific_info_desc : std::false_type {};
+template <typename T>
+struct is_kernel_queue_specific_info_desc : std::false_type {};
 template <typename T> struct is_event_info_desc : std::false_type {};
 template <typename T> struct is_event_profiling_info_desc : std::false_type {};
 // Normally we would just use std::enable_if to limit valid get_info template
@@ -134,6 +136,16 @@ struct IsKernelInfo<info::kernel_device_specific::ext_codeplay_num_regs>
 #include <sycl/info/ext_intel_device_traits.def>
 #include <sycl/info/ext_oneapi_device_traits.def>
 #undef __SYCL_PARAM_TRAITS_SPEC
+
+#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT, PiCode)   \
+  template <>                                                                  \
+  struct is_##DescType##_info_desc<Namespace::info::DescType::Desc>            \
+      : std::true_type {                                                       \
+    using return_type = Namespace::info::DescType::Desc::return_type;          \
+  };
+#include <sycl/info/ext_oneapi_kernel_queue_specific_traits.def>
+#undef __SYCL_PARAM_TRAITS_SPEC
+
 #define __SYCL_PARAM_TRAITS_SPEC(DescType, Desc, ReturnT, PiCode)              \
   template <>                                                                  \
   struct is_backend_info_desc<info::DescType::Desc> : std::true_type {         \
diff --git a/sycl/include/sycl/ext/oneapi/experimental/root_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/root_group.hpp
index 558396bb6f9c8..b8c90683bbaaf 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/root_group.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/root_group.hpp
@@ -24,13 +24,8 @@ namespace sycl {
 inline namespace _V1 {
 namespace ext::oneapi::experimental {
 
-namespace info::kernel_queue_specific {
-// TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension once
-// #7598 is merged.
-struct max_num_work_group_sync {
-  using return_type = size_t;
-};
-} // namespace info::kernel_queue_specific
+// See 'sycl/info/kernel_device_specific_traits.def' for the kernel
+// device-specific properties that relate to 'root_group'.
 
 template <int Dimensions> class root_group {
 public:
diff --git a/sycl/include/sycl/info/ext_oneapi_kernel_queue_specific_traits.def b/sycl/include/sycl/info/ext_oneapi_kernel_queue_specific_traits.def
new file mode 100644
index 0000000000000..0ec11af0bb6b1
--- /dev/null
+++ b/sycl/include/sycl/info/ext_oneapi_kernel_queue_specific_traits.def
@@ -0,0 +1,4 @@
+// TODO: Revisit 'max_num_work_group_sync' and align it with the
+// 'sycl_ext_oneapi_forward_progress' extension once #7598 is merged.
+__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t,)
+__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t,)
diff --git a/sycl/include/sycl/info/info_desc.hpp b/sycl/include/sycl/info/info_desc.hpp
index 8f6a86e5f60bb..b84f98f350d0d 100644
--- a/sycl/include/sycl/info/info_desc.hpp
+++ b/sycl/include/sycl/info/info_desc.hpp
@@ -247,6 +247,8 @@ struct work_item_progress_capabilities;
 #include <sycl/info/ext_codeplay_device_traits.def>
 #include <sycl/info/ext_intel_device_traits.def>
 #include <sycl/info/ext_oneapi_device_traits.def>
+#include <sycl/info/ext_oneapi_kernel_queue_specific_traits.def>
+
 #undef __SYCL_PARAM_TRAITS_SPEC
 #undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC
 } // namespace _V1
diff --git a/sycl/include/sycl/kernel.hpp b/sycl/include/sycl/kernel.hpp
index 40db1d8461dde..dac7f619d745e 100644
--- a/sycl/include/sycl/kernel.hpp
+++ b/sycl/include/sycl/kernel.hpp
@@ -159,9 +159,29 @@ class __SYCL_EXPORT kernel : public detail::OwnerLessBase<kernel> {
       get_info(const device &Device, const range<3> &WGSize) const;
 
   // TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension
-  // once #7598 is merged.
+  // once #7598 is merged. (regarding the 'max_num_work_group_sync' query)
+
+  /// Query queue/launch-specific information from a kernel using the
+  /// info::kernel_queue_specific descriptor for a specific Queue.
+  ///
+  /// \param Queue is a valid SYCL queue.
+  /// \return depends on information being queried.
+  template <typename Param>
+  typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+  ext_oneapi_get_info(queue Queue) const;
+
+  /// Query queue/launch-specific information from a kernel using the
+  /// info::kernel_queue_specific descriptor for a specific Queue and values.
+  /// max_num_work_groups is the only valid descriptor for this function.
+  ///
+  /// \param Queue is a valid SYCL queue.
+  /// \param WorkGroupSize is the work-group size the number of work-groups is
+  /// requested for.
+  /// \return depends on information being queried.
   template <typename Param>
-  typename Param::return_type ext_oneapi_get_info(const queue &q) const;
+  typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+  ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
+                      size_t DynamicLocalMemorySize) const;
 
 private:
   /// Constructs a SYCL kernel object from a valid kernel_impl instance.
diff --git a/sycl/source/detail/kernel_impl.cpp b/sycl/source/detail/kernel_impl.cpp
index faf3695c04e94..50af09831f207 100644
--- a/sycl/source/detail/kernel_impl.cpp
+++ b/sycl/source/detail/kernel_impl.cpp
@@ -106,6 +106,38 @@ void kernel_impl::checkIfValidForNumArgsInfoQuery() const {
       "interoperability function or to query a device built-in kernel");
 }
 
+bool kernel_impl::exceedsOccupancyResourceLimits(
+    const device &Device, const range<3> &WorkGroupSize,
+    size_t DynamicLocalMemorySize) const {
+  // Respect occupancy limits for WorkGroupSize and DynamicLocalMemorySize.
+  // Generally, exceeding hardware resource limits will yield in an error when
+  // the kernel is launched.
+  const size_t MaxWorkGroupSize =
+      get_info<info::kernel_device_specific::work_group_size>(Device);
+  const size_t MaxLocalMemorySizeInBytes =
+      Device.get_info<info::device::local_mem_size>();
+
+  if (WorkGroupSize.size() > MaxWorkGroupSize)
+    return true;
+
+  if (DynamicLocalMemorySize > MaxLocalMemorySizeInBytes)
+    return true;
+
+  // It will be impossible to launch a kernel for Cuda when the hardware limit
+  // for the 32-bit registers page file size is exceeded.
+  if (Device.get_backend() == backend::ext_oneapi_cuda) {
+    const uint32_t RegsPerWorkItem =
+        get_info<info::kernel_device_specific::ext_codeplay_num_regs>(Device);
+    const uint32_t MaxRegsPerWorkGroup =
+        Device.get_info<ext::codeplay::experimental::info::device::
+                            max_registers_per_work_group>();
+    if ((MaxWorkGroupSize * RegsPerWorkItem) > MaxRegsPerWorkGroup)
+      return true;
+  }
+
+  return false;
+}
+
 template <>
 typename info::platform::version::return_type
 kernel_impl::get_backend_info<info::platform::version>() const {
diff --git a/sycl/source/detail/kernel_impl.hpp b/sycl/source/detail/kernel_impl.hpp
index ab2950e26a856..040b5cbccf965 100644
--- a/sycl/source/detail/kernel_impl.hpp
+++ b/sycl/source/detail/kernel_impl.hpp
@@ -114,8 +114,26 @@ class kernel_impl {
   typename Param::return_type get_info(const device &Device,
                                        const range<3> &WGSize) const;
 
+  /// Query queue/launch-specific information from a kernel using the
+  /// info::kernel_queue_specific descriptor for a specific Queue.
+  ///
+  /// \param Queue is a valid SYCL queue.
+  /// \return depends on information being queried.
+  template <typename Param>
+  typename Param::return_type ext_oneapi_get_info(queue Queue) const;
+
+  /// Query queue/launch-specific information from a kernel using the
+  /// info::kernel_queue_specific descriptor for a specific Queue and values.
+  /// max_num_work_groups is the only valid descriptor for this function.
+  ///
+  /// \param Queue is a valid SYCL queue.
+  /// \param WorkGroupSize is the work-group size the number of work-groups is
+  /// requested for.
+  /// \return depends on information being queried.
   template <typename Param>
-  typename Param::return_type ext_oneapi_get_info(const queue &q) const;
+  typename Param::return_type
+  ext_oneapi_get_info(queue Queue, const range<3> &MaxWorkGroupSize,
+                      size_t DynamicLocalMemorySize) const;
 
   /// Get a constant reference to a raw kernel object.
   ///
@@ -171,6 +189,12 @@ class kernel_impl {
 
   bool isBuiltInKernel(const device &Device) const;
   void checkIfValidForNumArgsInfoQuery() const;
+
+  /// Check if the occupancy limits are exceeded for the given kernel launch
+  /// configuration.
+  bool exceedsOccupancyResourceLimits(const device &Device,
+                                      const range<3> &WorkGroupSize,
+                                      size_t DynamicLocalMemorySize) const;
 };
 
 template <typename Param>
@@ -217,20 +241,66 @@ kernel_impl::get_info(const device &Device,
       getPlugin());
 }
 
+namespace syclex = ext::oneapi::experimental;
+
 template <>
-inline typename ext::oneapi::experimental::info::kernel_queue_specific::
-    max_num_work_group_sync::return_type
+inline typename syclex::info::kernel_queue_specific::max_num_work_groups::
+    return_type
     kernel_impl::ext_oneapi_get_info<
-        ext::oneapi::experimental::info::kernel_queue_specific::
-            max_num_work_group_sync>(const queue &Queue) const {
+        syclex::info::kernel_queue_specific::max_num_work_groups>(
+        queue Queue, const range<3> &WorkGroupSize,
+        size_t DynamicLocalMemorySize) const {
+  if (WorkGroupSize.size() == 0)
+    throw exception(sycl::make_error_code(errc::invalid),
+                    "The launch work-group size cannot be zero.");
+
   const auto &Plugin = getPlugin();
   const auto &Handle = getHandleRef();
+  auto Device = Queue.get_device();
+
+  uint32_t GroupCount{0};
+  if (auto Result = Plugin->call_nocheck<
+                    UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
+          Handle, WorkGroupSize.size(), DynamicLocalMemorySize, &GroupCount);
+      Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE) {
+    // The feature is supported. Check for other errors and throw if any.
+    Plugin->checkUrResult(Result);
+    return GroupCount;
+  }
+
+  // Fallback. If the backend API is unsupported, this query will return either
+  // 0 or 1 based on the kernel resource usage and the user-requested resources.
+  return exceedsOccupancyResourceLimits(Device, WorkGroupSize,
+                                        DynamicLocalMemorySize)
+             ? 0
+             : 1;
+}
+
+template <>
+inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
+    return_type
+    kernel_impl::ext_oneapi_get_info<
+        syclex::info::kernel_queue_specific::max_num_work_group_sync>(
+        queue Queue, const range<3> &WorkGroupSize,
+        size_t DynamicLocalMemorySize) const {
+  return ext_oneapi_get_info<
+      syclex::info::kernel_queue_specific::max_num_work_groups>(
+      Queue, WorkGroupSize, DynamicLocalMemorySize);
+}
+
+template <>
+inline typename syclex::info::kernel_queue_specific::max_num_work_group_sync::
+    return_type
+    kernel_impl::ext_oneapi_get_info<
+        syclex::info::kernel_queue_specific::max_num_work_group_sync>(
+        queue Queue) const {
+  auto Device = Queue.get_device();
   const auto MaxWorkGroupSize =
-      Queue.get_device().get_info<info::device::max_work_group_size>();
-  uint32_t GroupCount = 0;
-  Plugin->call<UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
-      Handle, MaxWorkGroupSize, /* DynamicSharedMemorySize */ 0, &GroupCount);
-  return GroupCount;
+      get_info<info::kernel_device_specific::work_group_size>(Device);
+  const sycl::range<3> WorkGroupSize{MaxWorkGroupSize, 1, 1};
+  return ext_oneapi_get_info<
+      syclex::info::kernel_queue_specific::max_num_work_group_sync>(
+      Queue, WorkGroupSize, /* DynamicLocalMemorySize */ 0);
 }
 
 } // namespace detail
diff --git a/sycl/source/kernel.cpp b/sycl/source/kernel.cpp
index f4ec76bcf9e7d..a4aae60bece08 100644
--- a/sycl/source/kernel.cpp
+++ b/sycl/source/kernel.cpp
@@ -106,16 +106,36 @@ kernel::get_info<info::kernel_device_specific::max_sub_group_size>(
     const device &, const sycl::range<3> &) const;
 
 template <typename Param>
-typename Param::return_type
-kernel::ext_oneapi_get_info(const queue &Queue) const {
+typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+kernel::ext_oneapi_get_info(queue Queue) const {
   return impl->ext_oneapi_get_info<Param>(Queue);
 }
 
+template <typename Param>
+typename detail::is_kernel_queue_specific_info_desc<Param>::return_type
+kernel::ext_oneapi_get_info(queue Queue, const range<3> &WorkGroupSize,
+                            size_t DynamicLocalMemorySize) const {
+  return impl->ext_oneapi_get_info<Param>(Queue, WorkGroupSize,
+                                          DynamicLocalMemorySize);
+}
+
 template __SYCL_EXPORT typename ext::oneapi::experimental::info::
     kernel_queue_specific::max_num_work_group_sync::return_type
     kernel::ext_oneapi_get_info<
         ext::oneapi::experimental::info::kernel_queue_specific::
-            max_num_work_group_sync>(const queue &Queue) const;
+            max_num_work_group_sync>(queue Queue) const;
+
+#define __SYCL_PARAM_TRAITS_SPEC(Namespace, DescType, Desc, ReturnT)           \
+  template __SYCL_EXPORT ReturnT                                               \
+  kernel::ext_oneapi_get_info<Namespace::info::DescType::Desc>(                \
+      queue, const range<3> &, size_t) const;
+// Not including "ext_oneapi_kernel_queue_specific_traits.def" because not all
+// kernel_queue_specific queries require the above-defined get_info interface.
+// clang-format off
+__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_group_sync, size_t)
+__SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, kernel_queue_specific, max_num_work_groups, size_t)
+// clang-format on
+#undef __SYCL_PARAM_TRAITS_SPEC
 
 kernel::kernel(std::shared_ptr<detail::kernel_impl> Impl) : impl(Impl) {}
 
diff --git a/sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp b/sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp
new file mode 100644
index 0000000000000..7b76327d015b1
--- /dev/null
+++ b/sycl/test-e2e/Basic/launch_queries/max_num_work_groups.cpp
@@ -0,0 +1,215 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+#include <sycl/detail/core.hpp>
+
+#include <cassert>
+#include <cstdint>
+
+namespace syclex = sycl::ext::oneapi::experimental;
+using namespace sycl::info::device;
+using namespace sycl::info::kernel_device_specific;
+
+using value_type = int64_t;
+
+namespace kernels {
+
+template <class T, size_t Dim>
+using sycl_global_accessor =
+    sycl::accessor<T, Dim, sycl::access::mode::read_write,
+                   sycl::access::target::global_buffer>;
+
+class TestKernel {
+public:
+  static constexpr bool HasLocalMemory{false};
+
+  TestKernel(sycl_global_accessor<value_type, 1> acc) : acc_{acc} {}
+
+  void operator()(sycl::nd_item<1> item) const {
+    const auto gtid = item.get_global_linear_id();
+    acc_[gtid] = gtid + 42;
+  }
+
+private:
+  sycl_global_accessor<value_type, 1> acc_;
+};
+
+class TestLocalMemoryKernel {
+public:
+  static constexpr bool HasLocalMemory{true};
+
+  TestLocalMemoryKernel(sycl_global_accessor<value_type, 1> acc,
+                        sycl::local_accessor<value_type, 1> loc_acc)
+      : acc_{acc}, loc_acc_{loc_acc} {}
+
+  void operator()(sycl::nd_item<1> item) const {
+    const auto ltid = item.get_local_linear_id();
+    const auto gtid = item.get_global_linear_id();
+    if (ltid < loc_acc_.size()) {
+      loc_acc_[ltid] = ltid + 42;
+      item.barrier(sycl::access::fence_space::local_space);
+      acc_[gtid] = loc_acc_[ltid];
+    } else {
+      acc_[gtid] = 0;
+    }
+  }
+
+private:
+  sycl_global_accessor<value_type, 1> acc_;
+  sycl::local_accessor<value_type, 1> loc_acc_;
+};
+
+} // namespace kernels
+
+namespace {
+
+template <class KernelName>
+int test_max_num_work_groups(sycl::queue &q, const sycl::device &dev) {
+  const auto ctx = q.get_context();
+  auto bundle = sycl::get_kernel_bundle<sycl::bundle_state::executable>(ctx);
+  auto kernel = bundle.template get_kernel<KernelName>();
+
+  const size_t maxWorkGroupSize =
+      kernel.template get_info<work_group_size>(dev);
+  const size_t NumWorkItems = maxWorkGroupSize * maxWorkGroupSize;
+
+  size_t workGroupSize = 32;
+  size_t localMemorySizeInBytes{0};
+  if constexpr (KernelName::HasLocalMemory) {
+    localMemorySizeInBytes = workGroupSize * sizeof(value_type);
+  }
+
+  sycl::buffer<value_type, 1> buf{sycl::range<1>{NumWorkItems}};
+
+  // Tests
+
+  // ==================== //
+  // Test 1 - return type //
+  // ==================== //
+  sycl::range<3> workGroupRange{workGroupSize, 1, 1};
+  auto maxWGs = kernel.template ext_oneapi_get_info<
+      syclex::info::kernel_queue_specific::max_num_work_groups>(
+      q, workGroupRange, localMemorySizeInBytes);
+
+  // Test the return type is as specified in the extension document.
+  static_assert(std::is_same_v<std::remove_cv_t<decltype(maxWGs)>, size_t>,
+                "max_num_work_groups query must return size_t");
+
+  // ===================== //
+  // Test 2 - return value //
+  // ===================== //
+  // We must have at least one active group if we are below resource limits.
+  assert(maxWGs > 0 && "max_num_work_groups query failed");
+  if (maxWGs == 0)
+    return 1;
+
+  // Run the kernel
+  auto launch_range = sycl::nd_range<1>{sycl::range<1>{NumWorkItems},
+                                        sycl::range<1>{workGroupSize}};
+  q.submit([&](sycl::handler &cgh) {
+     auto acc = buf.get_access<sycl::access::mode::read_write>(cgh);
+     if constexpr (KernelName::HasLocalMemory) {
+       sycl::local_accessor<value_type, 1> loc_acc{
+           sycl::range<1>{workGroupSize}, cgh};
+       cgh.parallel_for(launch_range, KernelName{acc, loc_acc});
+     } else {
+       cgh.parallel_for(launch_range, KernelName{acc});
+     }
+   }).wait();
+  assert(sycl::host_accessor{buf}[0] == 42);
+
+  // ========================== //
+  // Test 3 - use max resources //
+  // ========================== //
+  // A little over the maximum work-group size for the purpose of exceeding.
+  workGroupSize = maxWorkGroupSize;
+  workGroupRange[0] = workGroupSize;
+  size_t localSize =
+      (dev.get_info<sycl::info::device::local_mem_size>() / sizeof(value_type));
+  if constexpr (KernelName::HasLocalMemory) {
+    localMemorySizeInBytes = localSize * sizeof(value_type);
+  }
+  maxWGs = kernel.template ext_oneapi_get_info<
+      syclex::info::kernel_queue_specific::max_num_work_groups>(
+      q, workGroupRange, localMemorySizeInBytes);
+
+  assert(maxWGs > 0 && "max_num_work_groups query failed");
+  if (maxWGs == 0)
+    return 1;
+
+  launch_range = sycl::nd_range<1>{sycl::range<1>{NumWorkItems},
+                                   sycl::range<1>{workGroupSize}};
+
+  q.submit([&](sycl::handler &cgh) {
+     auto acc = buf.get_access<sycl::access::mode::read_write>(cgh);
+     if constexpr (KernelName::HasLocalMemory) {
+       sycl::local_accessor<value_type, 1> loc_acc{sycl::range<1>{localSize},
+                                                   cgh};
+       cgh.parallel_for(launch_range, KernelName{acc, loc_acc});
+     } else {
+       cgh.parallel_for(launch_range, KernelName{acc});
+     }
+   }).wait();
+  assert(sycl::host_accessor{buf}[0] == 42);
+
+  // =============================== //
+  // Test 4 - exceed resource limits //
+  // =============================== //
+  workGroupSize = maxWorkGroupSize + 32;
+  workGroupRange[0] = workGroupSize;
+  maxWGs = kernel.template ext_oneapi_get_info<
+      syclex::info::kernel_queue_specific::max_num_work_groups>(
+      q, workGroupRange, localMemorySizeInBytes);
+  // It cannot be possible to launch a kernel successfully with a configuration
+  // that exceeds the available resources as in the above defined workGroupSize.
+  // workGroupSize is larger than maxWorkGroupSize, hence maxWGs must equal 0.
+  if (dev.get_backend() == sycl::backend::ext_oneapi_cuda) {
+    assert(maxWGs == 0 && "max_num_work_groups query failed");
+    if (maxWGs > 0)
+      return 1;
+  }
+
+  // As we ensured that the 'max_num_work_groups' query correctly
+  // returns 0 possible work-groups, test that the kernel launch will fail.
+  // A configuration that defines a work-group size larger than the maximum
+  // possible should result in failure.
+  try {
+    launch_range = sycl::nd_range<1>{sycl::range<1>{NumWorkItems},
+                                     sycl::range<1>{workGroupSize}};
+
+    q.submit([&](sycl::handler &cgh) {
+       auto acc = buf.get_access<sycl::access::mode::read_write>(cgh);
+       if constexpr (KernelName::HasLocalMemory) {
+         sycl::local_accessor<value_type, 1> loc_acc{sycl::range<1>{localSize},
+                                                     cgh};
+         cgh.parallel_for(launch_range, KernelName{acc, loc_acc});
+       } else {
+         cgh.parallel_for(launch_range, KernelName{acc});
+       }
+     }).wait();
+  } catch (const sycl::exception &e) {
+    // 'nd_range' error is the expected outcome from the above launch config.
+    if (e.code() == sycl::make_error_code(sycl::errc::nd_range)) {
+      return 0;
+    }
+    std::cerr << e.code() << ":\t";
+    std::cerr << e.what() << std::endl;
+    return 1;
+  }
+
+  return 0;
+}
+
+} // namespace
+
+int main() {
+  sycl::queue q{};
+  sycl::device dev = q.get_device();
+
+  using namespace kernels;
+
+  int ret{0};
+  ret &= test_max_num_work_groups<TestKernel>(q, dev);
+  ret &= test_max_num_work_groups<TestLocalMemoryKernel>(q, dev);
+  return ret;
+}
diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
index d8393f35c6253..92e5d69ffcab4 100644
--- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp
+++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
@@ -30,12 +30,21 @@ void testQueriesAndProperties() {
   const auto maxWGs = kernel.ext_oneapi_get_info<
       sycl::ext::oneapi::experimental::info::kernel_queue_specific::
           max_num_work_group_sync>(q);
+  const auto wgRange = sycl::range{WorkGroupSize, 1, 1};
+  const auto maxWGsWithLimits = kernel.ext_oneapi_get_info<
+      sycl::ext::oneapi::experimental::info::kernel_queue_specific::
+          max_num_work_group_sync>(q, wgRange, wgRange.size() * sizeof(int));
   const auto props = sycl::ext::oneapi::experimental::properties{
       sycl::ext::oneapi::experimental::use_root_sync};
   q.single_task<class QueryKernel>(props, []() {});
-  static_assert(std::is_same_v<std::remove_cv<decltype(maxWGs)>::type, size_t>,
-                "max_num_work_group_sync query must return size_t");
-  assert(maxWGs >= 1 && "max_num_work_group_sync query failed");
+
+  static auto check_max_num_work_group_sync = [](auto Result) {
+    static_assert(std::is_same_v<std::remove_cv_t<decltype(Result)>, size_t>,
+                  "max_num_work_group_sync query must return size_t");
+    assert(Result >= 1 && "max_num_work_group_sync query failed");
+  };
+  check_max_num_work_group_sync(maxWGs);
+  check_max_num_work_group_sync(maxWGsWithLimits);
 }
 
 void testRootGroup() {
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index 4c73f43ed6ba2..ec6ec2096403f 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -3915,8 +3915,10 @@ _ZNK4sycl3_V16kernel16get_backend_infoINS0_4info6device15backend_versionEEENS0_6
 _ZNK4sycl3_V16kernel16get_backend_infoINS0_4info6device7versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv
 _ZNK4sycl3_V16kernel16get_backend_infoINS0_4info8platform7versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv
 _ZNK4sycl3_V16kernel17get_kernel_bundleEv
-_ZNK4sycl3_V16kernel19ext_oneapi_get_infoINS0_3ext6oneapi12experimental4info21kernel_queue_specific23max_num_work_group_syncEEENT_11return_typeERKNS0_5queueE
 _ZNK4sycl3_V16kernel3getEv
+_ZNK4sycl3_V16kernel19ext_oneapi_get_infoINS0_3ext6oneapi12experimental4info21kernel_queue_specific23max_num_work_group_syncEEENS0_6detail34is_kernel_queue_specific_info_descIT_E11return_typeENS0_5queueE
+_ZNK4sycl3_V16kernel19ext_oneapi_get_infoINS0_3ext6oneapi12experimental4info21kernel_queue_specific23max_num_work_group_syncEEENS0_6detail34is_kernel_queue_specific_info_descIT_E11return_typeENS0_5queueERKNS0_5rangeILi3EEEm
+_ZNK4sycl3_V16kernel19ext_oneapi_get_infoINS0_3ext6oneapi12experimental4info21kernel_queue_specific19max_num_work_groupsEEENS0_6detail34is_kernel_queue_specific_info_descIT_E11return_typeENS0_5queueERKNS0_5rangeILi3EEEm
 _ZNK4sycl3_V16kernel8get_infoINS0_4info22kernel_device_specific15work_group_sizeEEENS0_6detail35is_kernel_device_specific_info_descIT_E11return_typeERKNS0_6deviceE
 _ZNK4sycl3_V16kernel8get_infoINS0_4info22kernel_device_specific16global_work_sizeEEENS0_6detail35is_kernel_device_specific_info_descIT_E11return_typeERKNS0_6deviceE
 _ZNK4sycl3_V16kernel8get_infoINS0_4info22kernel_device_specific16private_mem_sizeEEENS0_6detail35is_kernel_device_specific_info_descIT_E11return_typeERKNS0_6deviceE
diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump
index e2c3643c557be..55ce460c64559 100644
--- a/sycl/test/abi/sycl_symbols_windows.dump
+++ b/sycl/test/abi/sycl_symbols_windows.dump
@@ -13,7 +13,9 @@
 ??$create_sub_devices@$0BAIH@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@AEBV?$vector@_KV?$allocator@_K@std@@@4@@Z
 ??$create_sub_devices@$0BAII@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@W4partition_affinity_domain@info@12@@Z
 ??$create_sub_devices@$0BAIJ@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@XZ
-??$ext_oneapi_get_info@Umax_num_work_group_sync@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KAEBVqueue@12@@Z
+??$ext_oneapi_get_info@Umax_num_work_group_sync@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KVqueue@12@@Z
+??$ext_oneapi_get_info@Umax_num_work_group_sync@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KVqueue@12@AEBV?$range@$02@12@_K@Z
+??$ext_oneapi_get_info@Umax_num_work_groups@kernel_queue_specific@info@experimental@oneapi@ext@_V1@sycl@@@kernel@_V1@sycl@@QEBA_KVqueue@12@AEBV?$range@$02@12@_K@Z
 ??$get_backend_info@Ubackend_version@device@info@_V1@sycl@@@context@_V1@sycl@@QEBA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@XZ
 ??$get_backend_info@Ubackend_version@device@info@_V1@sycl@@@device@_V1@sycl@@QEBA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@XZ
 ??$get_backend_info@Ubackend_version@device@info@_V1@sycl@@@event@_V1@sycl@@QEBA?AV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@XZ
diff --git a/sycl/test/include_deps/sycl_accessor.hpp.cpp b/sycl/test/include_deps/sycl_accessor.hpp.cpp
index 36e9a5ad3961e..fd2628dfb6e1c 100644
--- a/sycl/test/include_deps/sycl_accessor.hpp.cpp
+++ b/sycl/test/include_deps/sycl_accessor.hpp.cpp
@@ -110,6 +110,7 @@
 // CHECK-NEXT: info/ext_codeplay_device_traits.def
 // CHECK-NEXT: info/ext_intel_device_traits.def
 // CHECK-NEXT: info/ext_oneapi_device_traits.def
+// CHECK-NEXT: info/ext_oneapi_kernel_queue_specific_traits.def
 // CHECK-NEXT: info/sycl_backend_traits.def
 // CHECK-NEXT: platform.hpp
 // CHECK-NEXT: detail/string_view.hpp
diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp
index 6cb1be75681ee..33dc01b8a4e74 100644
--- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp
+++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp
@@ -111,6 +111,7 @@
 // CHECK-NEXT: info/ext_codeplay_device_traits.def
 // CHECK-NEXT: info/ext_intel_device_traits.def
 // CHECK-NEXT: info/ext_oneapi_device_traits.def
+// CHECK-NEXT: info/ext_oneapi_kernel_queue_specific_traits.def
 // CHECK-NEXT: info/sycl_backend_traits.def
 // CHECK-NEXT: platform.hpp
 // CHECK-NEXT: detail/string_view.hpp