From bcca15a54555e364e60f9b464a4eabac4adb3d2d Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser@codeplay.com>
Date: Tue, 20 Aug 2024 17:43:33 +0100
Subject: [PATCH] Add two new properties to ur_kernel_group_info_t

These two properties allow the program to specify a maximum work-group
size in various ways.

They are intended to be targeted from languages such as SYCL (see
https://github.com/intel/llvm/pull/14518).

This PR implements them for CUDA and Native CPU. It should also be
possible to support them for HIP in the same fashion. Other adapters
using SPIR-V and/or Level Zero would require further changes to both of
those specifications.
---
 include/ur_api.h                              |  6 +++-
 include/ur_print.hpp                          | 32 +++++++++++++++++
 scripts/core/kernel.yml                       |  8 +++++
 source/adapters/cuda/enqueue.cpp              | 11 ++++++
 source/adapters/cuda/kernel.cpp               | 24 +++++++++++++
 source/adapters/cuda/kernel.hpp               | 14 ++++++++
 source/adapters/cuda/program.cpp              | 22 +++++++-----
 source/adapters/cuda/program.hpp              |  6 +++-
 source/adapters/hip/kernel.cpp                |  4 +++
 source/adapters/level_zero/kernel.cpp         |  4 +++
 source/adapters/level_zero/v2/kernel.cpp      |  4 +++
 source/adapters/native_cpu/enqueue.cpp        | 19 +++++++---
 source/adapters/native_cpu/kernel.cpp         | 25 ++++++++++---
 source/adapters/native_cpu/kernel.hpp         | 25 ++++++++-----
 source/adapters/native_cpu/program.cpp        | 35 ++++++++++++-------
 source/adapters/native_cpu/program.hpp        |  7 ++--
 source/adapters/opencl/kernel.cpp             |  4 +++
 source/loader/layers/validation/ur_valddi.cpp |  3 +-
 source/loader/ur_libapi.cpp                   |  2 +-
 source/ur/ur.hpp                              |  4 +++
 source/ur_api.cpp                             |  2 +-
 .../kernel/kernel_adapter_native_cpu.match    |  9 +++++
 .../kernel/urKernelGetGroupInfo.cpp           | 20 +++++++++--
 23 files changed, 243 insertions(+), 47 deletions(-)

diff --git a/include/ur_api.h b/include/ur_api.h
index a707d40a3f..98468fb9d8 100644
--- a/include/ur_api.h
+++ b/include/ur_api.h
@@ -4854,6 +4854,10 @@ typedef enum ur_kernel_group_info_t {
     UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE = 4, ///< [size_t] Return preferred multiple of Work Group size for launch
     UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE = 5,                   ///< [size_t] Return minimum amount of private memory in bytes used by each
                                                                  ///< work item in the Kernel
+    UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE = 6,        ///< [size_t[3]] Return the maximum Work Group size guaranteed by the
+                                                                 ///< source code, or (0, 0, 0) if unspecified
+    UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE = 7, ///< [size_t] Return the maximum linearized Work Group size (X * Y * Z)
+                                                                 ///< guaranteed by the source code, or 0 if unspecified
     /// @cond
     UR_KERNEL_GROUP_INFO_FORCE_UINT32 = 0x7fffffff
     /// @endcond
@@ -4957,7 +4961,7 @@ urKernelGetInfo(
 ///         + `NULL == hKernel`
 ///         + `NULL == hDevice`
 ///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
-///         + `::UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE < propName`
+///         + `::UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE < propName`
 UR_APIEXPORT ur_result_t UR_APICALL
 urKernelGetGroupInfo(
     ur_kernel_handle_t hKernel,      ///< [in] handle of the Kernel object
diff --git a/include/ur_print.hpp b/include/ur_print.hpp
index 681e8e814d..6e84ce97a5 100644
--- a/include/ur_print.hpp
+++ b/include/ur_print.hpp
@@ -7994,6 +7994,12 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_kernel_group_info_t va
     case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE:
         os << "UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE";
         break;
+    case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE:
+        os << "UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE";
+        break;
+    case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE:
+        os << "UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE";
+        break;
     default:
         os << "unknown enumerator";
         break;
@@ -8086,6 +8092,32 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_kernel_grou
 
         os << ")";
     } break;
+    case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: {
+
+        const size_t *tptr = (const size_t *)ptr;
+        os << "{";
+        size_t nelems = size / sizeof(size_t);
+        for (size_t i = 0; i < nelems; ++i) {
+            if (i != 0) {
+                os << ", ";
+            }
+
+            os << tptr[i];
+        }
+        os << "}";
+    } break;
+    case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: {
+        const size_t *tptr = (const size_t *)ptr;
+        if (sizeof(size_t) > size) {
+            os << "invalid size (is: " << size << ", expected: >=" << sizeof(size_t) << ")";
+            return UR_RESULT_ERROR_INVALID_SIZE;
+        }
+        os << (const void *)(tptr) << " (";
+
+        os << *tptr;
+
+        os << ")";
+    } break;
     default:
         os << "unknown enumerator";
         return UR_RESULT_ERROR_INVALID_ENUMERATION;
diff --git a/scripts/core/kernel.yml b/scripts/core/kernel.yml
index 8585cffe6f..1ddcd83934 100644
--- a/scripts/core/kernel.yml
+++ b/scripts/core/kernel.yml
@@ -144,6 +144,14 @@ etors:
       desc: "[size_t] Return preferred multiple of Work Group size for launch"
     - name: PRIVATE_MEM_SIZE
       desc: "[size_t] Return minimum amount of private memory in bytes used by each work item in the Kernel"
+    - name: COMPILE_MAX_WORK_GROUP_SIZE
+      desc: |
+            [size_t[3]] Return the maximum Work Group size guaranteed by the
+            source code, or (0, 0, 0) if unspecified
+    - name: COMPILE_MAX_LINEAR_WORK_GROUP_SIZE
+      desc: |
+            [size_t] Return the maximum linearized Work Group size (X * Y * Z)
+            guaranteed by the source code, or 0 if unspecified
 --- #--------------------------------------------------------------------------
 type: enum
 desc: "Get Kernel SubGroup information"
diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 1c074025a9..d1d7b8afee 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -203,6 +203,7 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
     // Set the active context here as guessLocalWorkSize needs an active context
     ScopedContext Active(Device);
     {
+      size_t *MaxThreadsPerBlock = Kernel->MaxThreadsPerBlock;
       size_t *ReqdThreadsPerBlock = Kernel->ReqdThreadsPerBlock;
       MaxWorkGroupSize = Device->getMaxWorkGroupSize();
 
@@ -212,6 +213,10 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
               LocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim])
             return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
 
+          if (MaxThreadsPerBlock[Dim] != 0 &&
+              LocalWorkSize[Dim] > MaxThreadsPerBlock[Dim])
+            return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+
           if (LocalWorkSize[Dim] > Device->getMaxWorkItemSizes(Dim))
             return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
           // Checks that local work sizes are a divisor of the global work sizes
@@ -235,6 +240,12 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
           KernelLocalWorkGroupSize *= LocalWorkSize[Dim];
         }
 
+        if (size_t MaxLinearThreadsPerBlock = Kernel->MaxLinearThreadsPerBlock;
+            MaxLinearThreadsPerBlock &&
+            MaxLinearThreadsPerBlock < KernelLocalWorkGroupSize) {
+          return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+        }
+
         if (hasExceededMaxRegistersPerBlock(Device, Kernel,
                                             KernelLocalWorkGroupSize)) {
           return UR_RESULT_ERROR_OUT_OF_RESOURCES;
diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp
index 2061893744..04904528c4 100644
--- a/source/adapters/cuda/kernel.cpp
+++ b/source/adapters/cuda/kernel.cpp
@@ -124,6 +124,30 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
         &Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, hKernel->get()));
     return ReturnValue(uint64_t(Bytes));
   }
+  case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: {
+    size_t MaxGroupSize[3] = {0, 0, 0};
+    const auto &MaxWGSizeMDMap =
+        hKernel->getProgram()->KernelMaxWorkGroupSizeMD;
+    const auto MaxWGSizeMD = MaxWGSizeMDMap.find(hKernel->getName());
+    if (MaxWGSizeMD != MaxWGSizeMDMap.end()) {
+      const auto MaxWGSize = MaxWGSizeMD->second;
+      MaxGroupSize[0] = std::get<0>(MaxWGSize);
+      MaxGroupSize[1] = std::get<1>(MaxWGSize);
+      MaxGroupSize[2] = std::get<2>(MaxWGSize);
+    }
+    return ReturnValue(MaxGroupSize, 3);
+  }
+  case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: {
+    size_t MaxLinearGroupSize = 0;
+    const auto &MaxLinearWGSizeMDMap =
+        hKernel->getProgram()->KernelMaxLinearWorkGroupSizeMD;
+    const auto MaxLinearWGSizeMD =
+        MaxLinearWGSizeMDMap.find(hKernel->getName());
+    if (MaxLinearWGSizeMD != MaxLinearWGSizeMDMap.end()) {
+      MaxLinearGroupSize = MaxLinearWGSizeMD->second;
+    }
+    return ReturnValue(MaxLinearGroupSize);
+  }
   default:
     break;
   }
diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp
index c6761d8525..7ad20a4f0e 100644
--- a/source/adapters/cuda/kernel.hpp
+++ b/source/adapters/cuda/kernel.hpp
@@ -46,6 +46,8 @@ struct ur_kernel_handle_t_ {
 
   static constexpr uint32_t ReqdThreadsPerBlockDimensions = 3u;
   size_t ReqdThreadsPerBlock[ReqdThreadsPerBlockDimensions];
+  size_t MaxThreadsPerBlock[ReqdThreadsPerBlockDimensions];
+  size_t MaxLinearThreadsPerBlock{0};
   int RegsPerThread{0};
 
   /// Structure that holds the arguments to the kernel.
@@ -169,6 +171,18 @@ struct ur_kernel_handle_t_ {
         sizeof(ReqdThreadsPerBlock), ReqdThreadsPerBlock, nullptr);
     (void)RetError;
     assert(RetError == UR_RESULT_SUCCESS);
+    /// Note: this code assumes that there is only one device per context
+    RetError = urKernelGetGroupInfo(
+        this, Program->getDevice(),
+        UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE,
+        sizeof(MaxThreadsPerBlock), MaxThreadsPerBlock, nullptr);
+    assert(RetError == UR_RESULT_SUCCESS);
+    /// Note: this code assumes that there is only one device per context
+    RetError = urKernelGetGroupInfo(
+        this, Program->getDevice(),
+        UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE,
+        sizeof(MaxLinearThreadsPerBlock), &MaxLinearThreadsPerBlock, nullptr);
+    assert(RetError == UR_RESULT_SUCCESS);
     UR_CHECK_ERROR(
         cuFuncGetAttribute(&RegsPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS, Func));
   }
diff --git a/source/adapters/cuda/program.cpp b/source/adapters/cuda/program.cpp
index 98757d710e..9b14a0a4eb 100644
--- a/source/adapters/cuda/program.cpp
+++ b/source/adapters/cuda/program.cpp
@@ -54,9 +54,10 @@ ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata,
 
     auto [Prefix, Tag] = splitMetadataName(MetadataElementName);
 
-    if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) {
-      // If metadata is reqd_work_group_size, record it for the corresponding
-      // kernel name.
+    if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE ||
+        Tag == __SYCL_UR_PROGRAM_METADATA_TAG_MAX_WORK_GROUP_SIZE) {
+      // If metadata is reqd_work_group_size/max_work_group_size, record it for
+      // the corresponding kernel name.
       size_t MDElemsSize = MetadataElement.size - sizeof(std::uint64_t);
 
       // Expect between 1 and 3 32-bit integer values.
@@ -69,11 +70,13 @@ ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata,
           reinterpret_cast<const char *>(MetadataElement.value.pData) +
           sizeof(std::uint64_t);
       // Read values and pad with 1's for values not present.
-      std::uint32_t ReqdWorkGroupElements[] = {1, 1, 1};
-      std::memcpy(ReqdWorkGroupElements, ValuePtr, MDElemsSize);
-      KernelReqdWorkGroupSizeMD[Prefix] =
-          std::make_tuple(ReqdWorkGroupElements[0], ReqdWorkGroupElements[1],
-                          ReqdWorkGroupElements[2]);
+      std::array<uint32_t, 3> WorkGroupElements = {1, 1, 1};
+      std::memcpy(WorkGroupElements.data(), ValuePtr, MDElemsSize);
+      (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE
+           ? KernelReqdWorkGroupSizeMD
+           : KernelMaxWorkGroupSizeMD)[Prefix] =
+          std::make_tuple(WorkGroupElements[0], WorkGroupElements[1],
+                          WorkGroupElements[2]);
     } else if (Tag == __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING) {
       const char *MetadataValPtr =
           reinterpret_cast<const char *>(MetadataElement.value.pData) +
@@ -81,6 +84,9 @@ ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata,
       const char *MetadataValPtrEnd =
           MetadataValPtr + MetadataElement.size - sizeof(std::uint64_t);
       GlobalIDMD[Prefix] = std::string{MetadataValPtr, MetadataValPtrEnd};
+    } else if (Tag ==
+               __SYCL_UR_PROGRAM_METADATA_TAG_MAX_LINEAR_WORK_GROUP_SIZE) {
+      KernelMaxLinearWorkGroupSizeMD[Prefix] = MetadataElement.value.data64;
     }
   }
   return UR_RESULT_SUCCESS;
diff --git a/source/adapters/cuda/program.hpp b/source/adapters/cuda/program.hpp
index 5d41374d34..10998cae2c 100644
--- a/source/adapters/cuda/program.hpp
+++ b/source/adapters/cuda/program.hpp
@@ -36,6 +36,9 @@ struct ur_program_handle_t_ {
   std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
       KernelReqdWorkGroupSizeMD;
   std::unordered_map<std::string, std::string> GlobalIDMD;
+  std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
+      KernelMaxWorkGroupSizeMD;
+  std::unordered_map<std::string, uint64_t> KernelMaxLinearWorkGroupSizeMD;
 
   constexpr static size_t MaxLogSize = 8192u;
 
@@ -45,7 +48,8 @@ struct ur_program_handle_t_ {
 
   ur_program_handle_t_(ur_context_handle_t Context, ur_device_handle_t Device)
       : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1},
-        Context{Context}, Device{Device}, KernelReqdWorkGroupSizeMD{} {
+        Context{Context}, Device{Device}, KernelReqdWorkGroupSizeMD{},
+        KernelMaxWorkGroupSizeMD{}, KernelMaxLinearWorkGroupSizeMD{} {
     urContextRetain(Context);
     urDeviceRetain(Device);
   }
diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp
index aa46843963..cdf0364ae2 100644
--- a/source/adapters/hip/kernel.cpp
+++ b/source/adapters/hip/kernel.cpp
@@ -127,6 +127,10 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
         &Bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, hKernel->get()));
     return ReturnValue(uint64_t(Bytes));
   }
+  case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE:
+  case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE:
+    // FIXME: could be added
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
   default:
     break;
   }
diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp
index 8e627f3ade..4cf3c9c267 100644
--- a/source/adapters/level_zero/kernel.cpp
+++ b/source/adapters/level_zero/kernel.cpp
@@ -840,6 +840,10 @@ ur_result_t urKernelGetGroupInfo(
   case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: {
     return ReturnValue(uint32_t{Kernel->ZeKernelProperties->privateMemSize});
   }
+  case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE:
+  case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE:
+    // No corresponding enumeration in Level Zero
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
   default: {
     logger::error(
         "Unknown ParamName in urKernelGetGroupInfo: ParamName={}(0x{})",
diff --git a/source/adapters/level_zero/v2/kernel.cpp b/source/adapters/level_zero/v2/kernel.cpp
index 8bfad2d2ad..fc3cb4a120 100644
--- a/source/adapters/level_zero/v2/kernel.cpp
+++ b/source/adapters/level_zero/v2/kernel.cpp
@@ -417,6 +417,10 @@ ur_result_t urKernelGetGroupInfo(
     auto props = hKernel->getProperties(hDevice);
     return returnValue(uint32_t{props.privateMemSize});
   }
+  case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE:
+  case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE:
+    // No corresponding enumeration in Level Zero
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
   default: {
     logger::error(
         "Unknown ParamName in urKernelGetGroupInfo: ParamName={}(0x{})",
diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp
index b5d4713e2f..33d8c35c36 100644
--- a/source/adapters/native_cpu/enqueue.cpp
+++ b/source/adapters/native_cpu/enqueue.cpp
@@ -81,11 +81,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     DIE_NO_IMPLEMENTATION;
   }
 
-  // Check reqd_work_group_size
-  if (hKernel->hasReqdWGSize() && pLocalWorkSize != nullptr) {
-    const auto &Reqd = hKernel->getReqdWGSize();
+  // Check reqd_work_group_size and other kernel constraints
+  if (pLocalWorkSize != nullptr) {
+    uint64_t TotalNumWIs = 1;
     for (uint32_t Dim = 0; Dim < workDim; Dim++) {
-      if (pLocalWorkSize[Dim] != Reqd[Dim]) {
+      TotalNumWIs *= pLocalWorkSize[Dim];
+      if (auto Reqd = hKernel->getReqdWGSize();
+          Reqd && pLocalWorkSize[Dim] != Reqd.value()[Dim]) {
+        return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+      }
+      if (auto MaxWG = hKernel->getMaxWGSize();
+          MaxWG && pLocalWorkSize[Dim] > MaxWG.value()[Dim]) {
+        return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+      }
+    }
+    if (auto MaxLinearWG = hKernel->getMaxLinearWGSize()) {
+      if (TotalNumWIs > MaxLinearWG) {
         return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
       }
     }
diff --git a/source/adapters/native_cpu/kernel.cpp b/source/adapters/native_cpu/kernel.cpp
index 23a65eb03b..9363f92b7b 100644
--- a/source/adapters/native_cpu/kernel.cpp
+++ b/source/adapters/native_cpu/kernel.cpp
@@ -31,14 +31,25 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName,
   ur_kernel_handle_t_ *kernel;
 
   // Set reqd_work_group_size for kernel if needed
+  std::optional<native_cpu::WGSize_t> ReqdWG;
   const auto &ReqdMap = hProgram->KernelReqdWorkGroupSizeMD;
-  auto ReqdIt = ReqdMap.find(pKernelName);
-  if (ReqdIt != ReqdMap.end()) {
-    kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f, ReqdIt->second);
-  } else {
-    kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f);
+  if (auto ReqdIt = ReqdMap.find(pKernelName); ReqdIt != ReqdMap.end()) {
+    ReqdWG = ReqdIt->second;
   }
 
+  std::optional<native_cpu::WGSize_t> MaxWG;
+  const auto &MaxMap = hProgram->KernelMaxWorkGroupSizeMD;
+  if (auto MaxIt = MaxMap.find(pKernelName); MaxIt != MaxMap.end()) {
+    MaxWG = MaxIt->second;
+  }
+  std::optional<uint64_t> MaxLinearWG;
+  const auto &MaxLinMap = hProgram->KernelMaxLinearWorkGroupSizeMD;
+  if (auto MaxLIt = MaxLinMap.find(pKernelName); MaxLIt != MaxLinMap.end()) {
+    MaxLinearWG = MaxLIt->second;
+  }
+  kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f, ReqdWG, MaxWG,
+                                   MaxLinearWG);
+
   *phKernel = kernel;
 
   return UR_RESULT_SUCCESS;
@@ -148,6 +159,10 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
     int bytes = 0;
     return returnValue(static_cast<uint64_t>(bytes));
   }
+  case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE:
+  case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE:
+    // FIXME: could be added
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
 
   default:
     break;
diff --git a/source/adapters/native_cpu/kernel.hpp b/source/adapters/native_cpu/kernel.hpp
index b5728fa8b2..084a0ee695 100644
--- a/source/adapters/native_cpu/kernel.hpp
+++ b/source/adapters/native_cpu/kernel.hpp
@@ -41,15 +41,14 @@ struct ur_kernel_handle_t_ : RefCounted {
 
   ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
                       nativecpu_task_t subhandler)
-      : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)},
-        HasReqdWGSize(false) {}
+      : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)} {}
 
   ur_kernel_handle_t_(const ur_kernel_handle_t_ &other)
       : hProgram(other.hProgram), _name(other._name),
         _subhandler(other._subhandler), _args(other._args),
         _localArgInfo(other._localArgInfo), _localMemPool(other._localMemPool),
         _localMemPoolSize(other._localMemPoolSize),
-        HasReqdWGSize(other.HasReqdWGSize), ReqdWGSize(other.ReqdWGSize) {
+        ReqdWGSize(other.ReqdWGSize), MaxWGSize(other.MaxWGSize), MaxLinearWGSize(other.MaxLinearWGSize) {
     incrementReferenceCount();
   }
 
@@ -60,9 +59,12 @@ struct ur_kernel_handle_t_ : RefCounted {
   }
   ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
                       nativecpu_task_t subhandler,
-                      const native_cpu::ReqdWGSize_t &ReqdWGSize)
+                      std::optional<native_cpu::WGSize_t> ReqdWGSize,
+                      std::optional<native_cpu::WGSize_t> MaxWGSize,
+                      std::optional<uint64_t> MaxLinearWGSize)
       : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)},
-        HasReqdWGSize(true), ReqdWGSize(ReqdWGSize) {}
+        ReqdWGSize(ReqdWGSize), MaxWGSize(MaxWGSize),
+        MaxLinearWGSize(MaxLinearWGSize) {}
 
   ur_program_handle_t hProgram;
   std::string _name;
@@ -70,9 +72,13 @@ struct ur_kernel_handle_t_ : RefCounted {
   std::vector<native_cpu::NativeCPUArgDesc> _args;
   std::vector<local_arg_info_t> _localArgInfo;
 
-  bool hasReqdWGSize() const { return HasReqdWGSize; }
+  std::optional<native_cpu::WGSize_t> getReqdWGSize() const {
+    return ReqdWGSize;
+  }
+
+  std::optional<native_cpu::WGSize_t> getMaxWGSize() const { return MaxWGSize; }
 
-  const native_cpu::ReqdWGSize_t &getReqdWGSize() const { return ReqdWGSize; }
+  std::optional<uint64_t> getMaxLinearWGSize() const { return MaxLinearWGSize; }
 
   void updateMemPool(size_t numParallelThreads) {
     // compute requested size.
@@ -103,6 +109,7 @@ struct ur_kernel_handle_t_ : RefCounted {
 private:
   char *_localMemPool = nullptr;
   size_t _localMemPoolSize = 0;
-  bool HasReqdWGSize;
-  native_cpu::ReqdWGSize_t ReqdWGSize;
+  std::optional<native_cpu::WGSize_t> ReqdWGSize = std::nullopt;
+  std::optional<native_cpu::WGSize_t> MaxWGSize = std::nullopt;
+  std::optional<uint64_t> MaxLinearWGSize = std::nullopt;
 };
diff --git a/source/adapters/native_cpu/program.cpp b/source/adapters/native_cpu/program.cpp
index 77edd83bce..460660e2cd 100644
--- a/source/adapters/native_cpu/program.cpp
+++ b/source/adapters/native_cpu/program.cpp
@@ -29,8 +29,9 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
   DIE_NO_IMPLEMENTATION
 }
 
-static ur_result_t getReqdWGSize(const ur_program_metadata_t &MetadataElement,
-                                 native_cpu::ReqdWGSize_t &res) {
+static ur_result_t
+deserializeWGMetadata(const ur_program_metadata_t &MetadataElement,
+                      native_cpu::WGSize_t &res, std::uint32_t DefaultVal) {
   size_t MDElemsSize = MetadataElement.size - sizeof(std::uint64_t);
 
   // Expect between 1 and 3 32-bit integer values.
@@ -43,12 +44,12 @@ static ur_result_t getReqdWGSize(const ur_program_metadata_t &MetadataElement,
   const char *ValuePtr =
       reinterpret_cast<const char *>(MetadataElement.value.pData) +
       sizeof(std::uint64_t);
-  // Read values and pad with 1's for values not present.
-  std::uint32_t ReqdWorkGroupElements[] = {1, 1, 1};
-  std::memcpy(ReqdWorkGroupElements, ValuePtr, MDElemsSize);
-  std::get<0>(res) = ReqdWorkGroupElements[0];
-  std::get<1>(res) = ReqdWorkGroupElements[1];
-  std::get<2>(res) = ReqdWorkGroupElements[2];
+  // Read values and pad with a default value for missing elements.
+  std::uint32_t WorkGroupElements[] = {DefaultVal, DefaultVal, DefaultVal};
+  std::memcpy(WorkGroupElements, ValuePtr, MDElemsSize);
+  std::get<0>(res) = WorkGroupElements[0];
+  std::get<1>(res) = WorkGroupElements[1];
+  std::get<2>(res) = WorkGroupElements[2];
   return UR_RESULT_SUCCESS;
 }
 
@@ -71,13 +72,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
       const auto &mdNode = pProperties->pMetadatas[i];
       std::string mdName(mdNode.pName);
       auto [Prefix, Tag] = splitMetadataName(mdName);
-      if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) {
-        native_cpu::ReqdWGSize_t reqdWGSize;
-        auto res = getReqdWGSize(mdNode, reqdWGSize);
+      if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE ||
+          Tag == __SYCL_UR_PROGRAM_METADATA_TAG_MAX_WORK_GROUP_SIZE) {
+        bool isReqd =
+            Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE;
+        native_cpu::WGSize_t wgSizeProp;
+        auto res = deserializeWGMetadata(
+            mdNode, wgSizeProp,
+            isReqd ? 1 : std::numeric_limits<std::uint32_t>::max());
         if (res != UR_RESULT_SUCCESS) {
           return res;
         }
-        hProgram->KernelReqdWorkGroupSizeMD[Prefix] = std::move(reqdWGSize);
+        (isReqd ? hProgram->KernelReqdWorkGroupSizeMD
+                : hProgram->KernelMaxWorkGroupSizeMD)[Prefix] =
+            std::move(wgSizeProp);
+      } else if (Tag ==
+                 __SYCL_UR_PROGRAM_METADATA_TAG_MAX_LINEAR_WORK_GROUP_SIZE) {
+        hProgram->KernelMaxLinearWorkGroupSizeMD[Prefix] = mdNode.value.data64;
       }
     }
   }
diff --git a/source/adapters/native_cpu/program.hpp b/source/adapters/native_cpu/program.hpp
index e85749a7b2..d58412751e 100644
--- a/source/adapters/native_cpu/program.hpp
+++ b/source/adapters/native_cpu/program.hpp
@@ -18,7 +18,7 @@
 #include <map>
 
 namespace native_cpu {
-using ReqdWGSize_t = std::array<uint32_t, 3>;
+using WGSize_t = std::array<uint32_t, 3>;
 }
 
 struct ur_program_handle_t_ : RefCounted {
@@ -36,8 +36,11 @@ struct ur_program_handle_t_ : RefCounted {
   };
 
   std::map<const char *, const unsigned char *, _compare> _kernels;
-  std::unordered_map<std::string, native_cpu::ReqdWGSize_t>
+  std::unordered_map<std::string, native_cpu::WGSize_t>
       KernelReqdWorkGroupSizeMD;
+  std::unordered_map<std::string, native_cpu::WGSize_t>
+      KernelMaxWorkGroupSizeMD;
+  std::unordered_map<std::string, uint64_t> KernelMaxLinearWorkGroupSizeMD;
 };
 
 // The nativecpu_entry struct is also defined as LLVM-IR in the
diff --git a/source/adapters/opencl/kernel.cpp b/source/adapters/opencl/kernel.cpp
index 9735abefbf..074348c622 100644
--- a/source/adapters/opencl/kernel.cpp
+++ b/source/adapters/opencl/kernel.cpp
@@ -130,6 +130,10 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
       return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
     }
   }
+  if (propName == UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE ||
+      propName == UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE) {
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+  }
   CL_RETURN_ON_FAILURE(clGetKernelWorkGroupInfo(
       cl_adapter::cast<cl_kernel>(hKernel),
       cl_adapter::cast<cl_device_id>(hDevice),
diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp
index e41623b15c..fb705dfc20 100644
--- a/source/loader/layers/validation/ur_valddi.cpp
+++ b/source/loader/layers/validation/ur_valddi.cpp
@@ -3504,7 +3504,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelGetGroupInfo(
             return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
         }
 
-        if (UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE < propName) {
+        if (UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE <
+            propName) {
             return UR_RESULT_ERROR_INVALID_ENUMERATION;
         }
     }
diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp
index 05b0c71995..3ccc51133b 100644
--- a/source/loader/ur_libapi.cpp
+++ b/source/loader/ur_libapi.cpp
@@ -3764,7 +3764,7 @@ ur_result_t UR_APICALL urKernelGetInfo(
 ///         + `NULL == hKernel`
 ///         + `NULL == hDevice`
 ///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
-///         + `::UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE < propName`
+///         + `::UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE < propName`
 ur_result_t UR_APICALL urKernelGetGroupInfo(
     ur_kernel_handle_t hKernel, ///< [in] handle of the Kernel object
     ur_device_handle_t hDevice, ///< [in] handle of the Device object
diff --git a/source/ur/ur.hpp b/source/ur/ur.hpp
index e8a1ed56b9..0639a9d9be 100644
--- a/source/ur/ur.hpp
+++ b/source/ur/ur.hpp
@@ -53,6 +53,10 @@ const ur_command_t UR_EXT_COMMAND_TYPE_USER =
 #define __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE                    \
   "@reqd_work_group_size"
 #define __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING "@global_id_mapping"
+#define __SYCL_UR_PROGRAM_METADATA_TAG_MAX_WORK_GROUP_SIZE                     \
+  "@max_work_group_size"
+#define __SYCL_UR_PROGRAM_METADATA_TAG_MAX_LINEAR_WORK_GROUP_SIZE              \
+  "@max_linear_work_group_size"
 #define __SYCL_UR_PROGRAM_METADATA_TAG_NEED_FINALIZATION "Requires finalization"
 
 // Terminates the process with a catastrophic error message.
diff --git a/source/ur_api.cpp b/source/ur_api.cpp
index 11b9dea7e9..3e024ede0f 100644
--- a/source/ur_api.cpp
+++ b/source/ur_api.cpp
@@ -3211,7 +3211,7 @@ ur_result_t UR_APICALL urKernelGetInfo(
 ///         + `NULL == hKernel`
 ///         + `NULL == hDevice`
 ///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
-///         + `::UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE < propName`
+///         + `::UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE < propName`
 ur_result_t UR_APICALL urKernelGetGroupInfo(
     ur_kernel_handle_t hKernel, ///< [in] handle of the Kernel object
     ur_device_handle_t hDevice, ///< [in] handle of the Device object
diff --git a/test/conformance/kernel/kernel_adapter_native_cpu.match b/test/conformance/kernel/kernel_adapter_native_cpu.match
index 6e5db6f70f..4cf052500e 100644
--- a/test/conformance/kernel/kernel_adapter_native_cpu.match
+++ b/test/conformance/kernel/kernel_adapter_native_cpu.match
@@ -12,25 +12,34 @@ urKernelGetGroupInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_K
 urKernelGetGroupInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE
 urKernelGetGroupInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
 urKernelGetGroupInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE
+urKernelGetGroupInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE
+urKernelGetGroupInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE
 urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE
 urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE
 urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
 urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE
 urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
 urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE
+urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE
+urKernelGetGroupInfoTest.InvalidNullHandleKernel/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE
 urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE
 urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE
 urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
 urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE
 urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
 urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE
+urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE
+urKernelGetGroupInfoTest.InvalidNullHandleDevice/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE
 urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_GLOBAL_WORK_SIZE
 urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_WORK_GROUP_SIZE
 urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE
 urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE
 urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE
 urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE
+urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE
+urKernelGetGroupInfoTest.InvalidEnumeration/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE
 urKernelGetGroupInfoSingleTest.CompileWorkGroupSizeEmpty/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}
+urKernelGetGroupInfoSingleTest.CompileMaxWorkGroupSizeEmpty/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}
 urKernelGetGroupInfoWgSizeTest.CompileWorkGroupSize/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}
 urKernelGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_INFO_FUNCTION_NAME
 urKernelGetInfoTest.Success/SYCL_NATIVE_CPU___SYCL_Native_CPU__{{.*}}__UR_KERNEL_INFO_NUM_ARGS
diff --git a/test/conformance/kernel/urKernelGetGroupInfo.cpp b/test/conformance/kernel/urKernelGetGroupInfo.cpp
index b91001a07f..fcb3c39410 100644
--- a/test/conformance/kernel/urKernelGetGroupInfo.cpp
+++ b/test/conformance/kernel/urKernelGetGroupInfo.cpp
@@ -15,7 +15,9 @@ UUR_TEST_SUITE_P(
                       UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE,
                       UR_KERNEL_GROUP_INFO_LOCAL_MEM_SIZE,
                       UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
-                      UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE),
+                      UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE,
+                      UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE,
+                      UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE),
     uur::deviceTestWithParamPrinter<ur_kernel_group_info_t>);
 
 struct urKernelGetGroupInfoSingleTest : uur::urKernelTest {
@@ -85,7 +87,7 @@ TEST_P(urKernelGetGroupInfoWgSizeTest, CompileWorkGroupSize) {
 }
 
 TEST_P(urKernelGetGroupInfoSingleTest, CompileWorkGroupSizeEmpty) {
-    // Returns 0 by default when there is no sepecific information
+    // Returns 0 by default when there is no specific information
     std::array<size_t, 3> read_dims{1, 1, 1};
     std::array<size_t, 3> zero{0, 0, 0};
     ASSERT_SUCCESS(urKernelGetGroupInfo(
@@ -93,3 +95,17 @@ TEST_P(urKernelGetGroupInfoSingleTest, CompileWorkGroupSizeEmpty) {
         sizeof(read_dims), read_dims.data(), nullptr));
     ASSERT_EQ(read_dims, zero);
 }
+
+TEST_P(urKernelGetGroupInfoSingleTest, CompileMaxWorkGroupSizeEmpty) {
+    // On success, reports {0, 0, 0} when there is no specific information
+    std::array<size_t, 3> read_dims{1, 1, 1}; // non-zero so a write shows
+    std::array<size_t, 3> zero{0, 0, 0};
+    auto result = urKernelGetGroupInfo(
+        kernel, device, UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE,
+        sizeof(read_dims), read_dims.data(), nullptr);
+    if (result == UR_RESULT_SUCCESS) {
+        ASSERT_EQ(read_dims, zero);
+    } else { // otherwise only UNSUPPORTED_ENUMERATION is accepted
+        ASSERT_EQ(result, UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION);
+    }
+}