Add two new properties to ur_kernel_group_info_t

These two properties let a program query the maximum work-group size a kernel guarantees in its source code, either per dimension (X, Y, Z) or as a linearized total (X * Y * Z).
oneapi-src · Aug 22, 2024 · b595cbf · b595cbf
1 parent 40a790f
commit b595cbf
Show file tree

Hide file tree

Showing 23 changed files with 242 additions and 47 deletions.
diff --git a/include/ur_api.h b/include/ur_api.h
@@ -4854,6 +4854,10 @@ typedef enum ur_kernel_group_info_t {
     UR_KERNEL_GROUP_INFO_PREFERRED_WORK_GROUP_SIZE_MULTIPLE = 4, ///< [size_t] Return preferred multiple of Work Group size for launch
     UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE = 5,                   ///< [size_t] Return minimum amount of private memory in bytes used by each
                                                                  ///< work item in the Kernel
+    UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE = 6,        ///< [size_t[3]] Return the maximum Work Group size guaranteed by the
+                                                                 ///< source code, or (0, 0, 0) if unspecified
+    UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE = 7, ///< [size_t] Return the maximum linearized Work Group size (X * Y * Z)
+                                                                 ///< guaranteed by the source code, or 0 if unspecified
     /// @cond
     UR_KERNEL_GROUP_INFO_FORCE_UINT32 = 0x7fffffff
     /// @endcond
@@ -4957,7 +4961,7 @@ urKernelGetInfo(
 ///         + `NULL == hKernel`
 ///         + `NULL == hDevice`
 ///     - ::UR_RESULT_ERROR_INVALID_ENUMERATION
-///         + `::UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE < propName`
+///         + `::UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE < propName`
 UR_APIEXPORT ur_result_t UR_APICALL
 urKernelGetGroupInfo(
     ur_kernel_handle_t hKernel,      ///< [in] handle of the Kernel object

diff --git a/include/ur_print.hpp b/include/ur_print.hpp
@@ -7994,6 +7994,12 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_kernel_group_info_t va
     case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE:
         os << "UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE";
         break;
+    case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE:
+        os << "UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE";
+        break;
+    case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE:
+        os << "UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE";
+        break;
     default:
         os << "unknown enumerator";
         break;
@@ -8086,6 +8092,32 @@ inline ur_result_t printTagged(std::ostream &os, const void *ptr, ur_kernel_grou
 
         os << ")";
     } break;
+    case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: {
+
+        const size_t *tptr = (const size_t *)ptr;
+        os << "{";
+        size_t nelems = size / sizeof(size_t);
+        for (size_t i = 0; i < nelems; ++i) {
+            if (i != 0) {
+                os << ", ";
+            }
+
+            os << tptr[i];
+        }
+        os << "}";
+    } break;
+    case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: {
+        const size_t *tptr = (const size_t *)ptr;
+        if (sizeof(size_t) > size) {
+            os << "invalid size (is: " << size << ", expected: >=" << sizeof(size_t) << ")";
+            return UR_RESULT_ERROR_INVALID_SIZE;
+        }
+        os << (const void *)(tptr) << " (";
+
+        os << *tptr;
+
+        os << ")";
+    } break;
     default:
         os << "unknown enumerator";
         return UR_RESULT_ERROR_INVALID_ENUMERATION;

diff --git a/scripts/core/kernel.yml b/scripts/core/kernel.yml
@@ -144,6 +144,14 @@ etors:
       desc: "[size_t] Return preferred multiple of Work Group size for launch"
     - name: PRIVATE_MEM_SIZE
       desc: "[size_t] Return minimum amount of private memory in bytes used by each work item in the Kernel"
+    - name: COMPILE_MAX_WORK_GROUP_SIZE
+      desc: |
+            [size_t[3]] Return the maximum Work Group size guaranteed by the
+            source code, or (0, 0, 0) if unspecified
+    - name: COMPILE_MAX_LINEAR_WORK_GROUP_SIZE
+      desc: |
+            [size_t] Return the maximum linearized Work Group size (X * Y * Z)
+            guaranteed by the source code, or 0 if unspecified
 --- #--------------------------------------------------------------------------
 type: enum
 desc: "Get Kernel SubGroup information"

diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
@@ -203,6 +203,7 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
     // Set the active context here as guessLocalWorkSize needs an active context
     ScopedContext Active(Device);
     {
+      size_t *MaxThreadsPerBlock = Kernel->MaxThreadsPerBlock;
       size_t *ReqdThreadsPerBlock = Kernel->ReqdThreadsPerBlock;
       MaxWorkGroupSize = Device->getMaxWorkGroupSize();
 
@@ -212,6 +213,10 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
               LocalWorkSize[Dim] != ReqdThreadsPerBlock[Dim])
             return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
 
+          if (MaxThreadsPerBlock[Dim] != 0 &&
+              LocalWorkSize[Dim] > MaxThreadsPerBlock[Dim])
+            return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+
           if (LocalWorkSize[Dim] > Device->getMaxWorkItemSizes(Dim))
             return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
           // Checks that local work sizes are a divisor of the global work sizes
@@ -235,6 +240,12 @@ setKernelParams([[maybe_unused]] const ur_context_handle_t Context,
           KernelLocalWorkGroupSize *= LocalWorkSize[Dim];
         }
 
+        if (size_t MaxLinearThreadsPerBlock = Kernel->MaxLinearThreadsPerBlock;
+            MaxLinearThreadsPerBlock &&
+            MaxLinearThreadsPerBlock < KernelLocalWorkGroupSize) {
+          return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+        }
+
         if (hasExceededMaxRegistersPerBlock(Device, Kernel,
                                             KernelLocalWorkGroupSize)) {
           return UR_RESULT_ERROR_OUT_OF_RESOURCES;

diff --git a/source/adapters/cuda/kernel.cpp b/source/adapters/cuda/kernel.cpp
@@ -124,6 +124,30 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
         &Bytes, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, hKernel->get()));
     return ReturnValue(uint64_t(Bytes));
   }
+  case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE: {
+    size_t MaxGroupSize[3] = {0, 0, 0};
+    const auto &MaxWGSizeMDMap =
+        hKernel->getProgram()->KernelMaxWorkGroupSizeMD;
+    const auto MaxWGSizeMD = MaxWGSizeMDMap.find(hKernel->getName());
+    if (MaxWGSizeMD != MaxWGSizeMDMap.end()) {
+      const auto MaxWGSize = MaxWGSizeMD->second;
+      MaxGroupSize[0] = std::get<0>(MaxWGSize);
+      MaxGroupSize[1] = std::get<1>(MaxWGSize);
+      MaxGroupSize[2] = std::get<2>(MaxWGSize);
+    }
+    return ReturnValue(MaxGroupSize, 3);
+  }
+  case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE: {
+    size_t MaxLinearGroupSize = 0;
+    const auto &MaxLinearWGSizeMDMap =
+        hKernel->getProgram()->KernelMaxLinearWorkGroupSizeMD;
+    const auto MaxLinearWGSizeMD =
+        MaxLinearWGSizeMDMap.find(hKernel->getName());
+    if (MaxLinearWGSizeMD != MaxLinearWGSizeMDMap.end()) {
+      MaxLinearGroupSize = MaxLinearWGSizeMD->second;
+    }
+    return ReturnValue(MaxLinearGroupSize);
+  }
   default:
     break;
   }

diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp
@@ -46,6 +46,8 @@ struct ur_kernel_handle_t_ {
 
   static constexpr uint32_t ReqdThreadsPerBlockDimensions = 3u;
   size_t ReqdThreadsPerBlock[ReqdThreadsPerBlockDimensions];
+  size_t MaxThreadsPerBlock[ReqdThreadsPerBlockDimensions];
+  size_t MaxLinearThreadsPerBlock{0};
   int RegsPerThread{0};
 
   /// Structure that holds the arguments to the kernel.
@@ -169,6 +171,18 @@ struct ur_kernel_handle_t_ {
         sizeof(ReqdThreadsPerBlock), ReqdThreadsPerBlock, nullptr);
     (void)RetError;
     assert(RetError == UR_RESULT_SUCCESS);
+    /// Note: this code assumes that there is only one device per context
+    RetError = urKernelGetGroupInfo(
+        this, Program->getDevice(),
+        UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE,
+        sizeof(MaxThreadsPerBlock), MaxThreadsPerBlock, nullptr);
+    assert(RetError == UR_RESULT_SUCCESS);
+    /// Note: this code assumes that there is only one device per context
+    RetError = urKernelGetGroupInfo(
+        this, Program->getDevice(),
+        UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE,
+        sizeof(MaxLinearThreadsPerBlock), &MaxLinearThreadsPerBlock, nullptr);
+    assert(RetError == UR_RESULT_SUCCESS);
     UR_CHECK_ERROR(
         cuFuncGetAttribute(&RegsPerThread, CU_FUNC_ATTRIBUTE_NUM_REGS, Func));
   }

diff --git a/source/adapters/cuda/program.cpp b/source/adapters/cuda/program.cpp
@@ -54,9 +54,10 @@ ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata,
 
     auto [Prefix, Tag] = splitMetadataName(MetadataElementName);
 
-    if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE) {
-      // If metadata is reqd_work_group_size, record it for the corresponding
-      // kernel name.
+    if (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE ||
+        Tag == __SYCL_UR_PROGRAM_METADATA_TAG_MAX_WORK_GROUP_SIZE) {
+      // If metadata is reqd_work_group_size/max_work_group_size, record it for
+      // the corresponding kernel name.
       size_t MDElemsSize = MetadataElement.size - sizeof(std::uint64_t);
 
       // Expect between 1 and 3 32-bit integer values.
@@ -69,18 +70,23 @@ ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata,
           reinterpret_cast<const char *>(MetadataElement.value.pData) +
           sizeof(std::uint64_t);
       // Read values and pad with 1's for values not present.
-      std::uint32_t ReqdWorkGroupElements[] = {1, 1, 1};
-      std::memcpy(ReqdWorkGroupElements, ValuePtr, MDElemsSize);
-      KernelReqdWorkGroupSizeMD[Prefix] =
-          std::make_tuple(ReqdWorkGroupElements[0], ReqdWorkGroupElements[1],
-                          ReqdWorkGroupElements[2]);
+      std::array<uint32_t, 3> WorkGroupElements = {1, 1, 1};
+      std::memcpy(WorkGroupElements.data(), ValuePtr, MDElemsSize);
+      (Tag == __SYCL_UR_PROGRAM_METADATA_TAG_REQD_WORK_GROUP_SIZE
+           ? KernelReqdWorkGroupSizeMD
+           : KernelMaxWorkGroupSizeMD)[Prefix] =
+          std::make_tuple(WorkGroupElements[0], WorkGroupElements[1],
+                          WorkGroupElements[2]);
     } else if (Tag == __SYCL_UR_PROGRAM_METADATA_GLOBAL_ID_MAPPING) {
       const char *MetadataValPtr =
           reinterpret_cast<const char *>(MetadataElement.value.pData) +
           sizeof(std::uint64_t);
       const char *MetadataValPtrEnd =
           MetadataValPtr + MetadataElement.size - sizeof(std::uint64_t);
       GlobalIDMD[Prefix] = std::string{MetadataValPtr, MetadataValPtrEnd};
+    } else if (Tag ==
+               __SYCL_UR_PROGRAM_METADATA_TAG_MAX_LINEAR_WORK_GROUP_SIZE) {
+      KernelMaxLinearWorkGroupSizeMD[Prefix] = MetadataElement.value.data64;
     }
   }
   return UR_RESULT_SUCCESS;

diff --git a/source/adapters/cuda/program.hpp b/source/adapters/cuda/program.hpp
@@ -36,6 +36,9 @@ struct ur_program_handle_t_ {
   std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
       KernelReqdWorkGroupSizeMD;
   std::unordered_map<std::string, std::string> GlobalIDMD;
+  std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
+      KernelMaxWorkGroupSizeMD;
+  std::unordered_map<std::string, uint64_t> KernelMaxLinearWorkGroupSizeMD;
 
   constexpr static size_t MaxLogSize = 8192u;
 
@@ -45,7 +48,8 @@ struct ur_program_handle_t_ {
 
   ur_program_handle_t_(ur_context_handle_t Context, ur_device_handle_t Device)
       : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1},
-        Context{Context}, Device{Device}, KernelReqdWorkGroupSizeMD{} {
+        Context{Context}, Device{Device}, KernelReqdWorkGroupSizeMD{},
+        KernelMaxWorkGroupSizeMD{}, KernelMaxLinearWorkGroupSizeMD{} {
     urContextRetain(Context);
     urDeviceRetain(Device);
   }

diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp
@@ -127,6 +127,10 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
         &Bytes, HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, hKernel->get()));
     return ReturnValue(uint64_t(Bytes));
   }
+  case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE:
+  case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE:
+    // FIXME: could be added
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
   default:
     break;
   }

diff --git a/source/adapters/level_zero/kernel.cpp b/source/adapters/level_zero/kernel.cpp
@@ -838,6 +838,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo(
   case UR_KERNEL_GROUP_INFO_PRIVATE_MEM_SIZE: {
     return ReturnValue(uint32_t{Kernel->ZeKernelProperties->privateMemSize});
   }
+  case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE:
+  case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE:
+    // No corresponding enumeration in Level Zero
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
   default: {
     logger::error(
         "Unknown ParamName in urKernelGetGroupInfo: ParamName={}(0x{})",

diff --git a/source/adapters/native_cpu/enqueue.cpp b/source/adapters/native_cpu/enqueue.cpp
@@ -81,11 +81,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
     DIE_NO_IMPLEMENTATION;
   }
 
-  // Check reqd_work_group_size
-  if (hKernel->hasReqdWGSize() && pLocalWorkSize != nullptr) {
-    const auto &Reqd = hKernel->getReqdWGSize();
+  // Check reqd_work_group_size and other kernel constraints
+  if (pLocalWorkSize != nullptr) {
+    uint64_t TotalNumWIs = 1;
     for (uint32_t Dim = 0; Dim < workDim; Dim++) {
-      if (pLocalWorkSize[Dim] != Reqd[Dim]) {
+      TotalNumWIs *= pLocalWorkSize[Dim];
+      if (auto Reqd = hKernel->getReqdWGSize();
+          Reqd && pLocalWorkSize[Dim] != Reqd.value()[Dim]) {
+        return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+      }
+      if (auto MaxWG = hKernel->getMaxWGSize();
+          MaxWG && pLocalWorkSize[Dim] > MaxWG.value()[Dim]) {
+        return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
+      }
+    }
+    if (auto MaxLinearWG = hKernel->getMaxLinearWGSize()) {
+      if (TotalNumWIs > MaxLinearWG) {
         return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
       }
     }

diff --git a/source/adapters/native_cpu/kernel.cpp b/source/adapters/native_cpu/kernel.cpp
@@ -31,14 +31,25 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName,
   ur_kernel_handle_t_ *kernel;
 
   // Set reqd_work_group_size for kernel if needed
+  std::optional<native_cpu::WGSize_t> ReqdWG;
   const auto &ReqdMap = hProgram->KernelReqdWorkGroupSizeMD;
-  auto ReqdIt = ReqdMap.find(pKernelName);
-  if (ReqdIt != ReqdMap.end()) {
-    kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f, ReqdIt->second);
-  } else {
-    kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f);
+  if (auto ReqdIt = ReqdMap.find(pKernelName); ReqdIt != ReqdMap.end()) {
+    ReqdWG = ReqdIt->second;
   }
 
+  std::optional<native_cpu::WGSize_t> MaxWG;
+  const auto &MaxMap = hProgram->KernelMaxWorkGroupSizeMD;
+  if (auto MaxIt = MaxMap.find(pKernelName); MaxIt != MaxMap.end()) {
+    MaxWG = MaxIt->second; // store the limit in MaxWG; ReqdWG stays untouched
+  }
+  std::optional<uint64_t> MaxLinearWG;
+  const auto &MaxLinMap = hProgram->KernelMaxLinearWorkGroupSizeMD;
+  if (auto MaxLIt = MaxLinMap.find(pKernelName); MaxLIt != MaxLinMap.end()) {
+    MaxLinearWG = MaxLIt->second;
+  }
+  kernel = new ur_kernel_handle_t_(hProgram, pKernelName, *f, ReqdWG, MaxWG,
+                                   MaxLinearWG);
+
   *phKernel = kernel;
 
   return UR_RESULT_SUCCESS;
@@ -148,6 +159,10 @@ urKernelGetGroupInfo(ur_kernel_handle_t hKernel, ur_device_handle_t hDevice,
     int bytes = 0;
     return returnValue(static_cast<uint64_t>(bytes));
   }
+  case UR_KERNEL_GROUP_INFO_COMPILE_MAX_WORK_GROUP_SIZE:
+  case UR_KERNEL_GROUP_INFO_COMPILE_MAX_LINEAR_WORK_GROUP_SIZE:
+    // FIXME: could be added
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
 
   default:
     break;

diff --git a/source/adapters/native_cpu/kernel.hpp b/source/adapters/native_cpu/kernel.hpp
@@ -41,15 +41,14 @@ struct ur_kernel_handle_t_ : RefCounted {
 
   ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
                       nativecpu_task_t subhandler)
-      : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)},
-        HasReqdWGSize(false) {}
+      : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)} {}
 
   ur_kernel_handle_t_(const ur_kernel_handle_t_ &other)
       : hProgram(other.hProgram), _name(other._name),
         _subhandler(other._subhandler), _args(other._args),
         _localArgInfo(other._localArgInfo), _localMemPool(other._localMemPool),
         _localMemPoolSize(other._localMemPoolSize),
-        HasReqdWGSize(other.HasReqdWGSize), ReqdWGSize(other.ReqdWGSize) {
+        ReqdWGSize(other.ReqdWGSize) {
     incrementReferenceCount();
   }
 
@@ -60,19 +59,26 @@ struct ur_kernel_handle_t_ : RefCounted {
   }
   ur_kernel_handle_t_(ur_program_handle_t hProgram, const char *name,
                       nativecpu_task_t subhandler,
-                      const native_cpu::ReqdWGSize_t &ReqdWGSize)
+                      std::optional<native_cpu::WGSize_t> ReqdWGSize,
+                      std::optional<native_cpu::WGSize_t> MaxWGSize,
+                      std::optional<uint64_t> MaxLinearWGSize)
       : hProgram(hProgram), _name{name}, _subhandler{std::move(subhandler)},
-        HasReqdWGSize(true), ReqdWGSize(ReqdWGSize) {}
+        ReqdWGSize(ReqdWGSize), MaxWGSize(MaxWGSize),
+        MaxLinearWGSize(MaxLinearWGSize) {}
 
   ur_program_handle_t hProgram;
   std::string _name;
   nativecpu_task_t _subhandler;
   std::vector<native_cpu::NativeCPUArgDesc> _args;
   std::vector<local_arg_info_t> _localArgInfo;
 
-  bool hasReqdWGSize() const { return HasReqdWGSize; }
+  std::optional<native_cpu::WGSize_t> getReqdWGSize() const {
+    return ReqdWGSize;
+  }
+
+  std::optional<native_cpu::WGSize_t> getMaxWGSize() const { return MaxWGSize; }
 
-  const native_cpu::ReqdWGSize_t &getReqdWGSize() const { return ReqdWGSize; }
+  std::optional<uint64_t> getMaxLinearWGSize() const { return MaxLinearWGSize; }
 
   void updateMemPool(size_t numParallelThreads) {
     // compute requested size.
@@ -103,6 +109,7 @@ struct ur_kernel_handle_t_ : RefCounted {
 private:
   char *_localMemPool = nullptr;
   size_t _localMemPoolSize = 0;
-  bool HasReqdWGSize;
-  native_cpu::ReqdWGSize_t ReqdWGSize;
+  std::optional<native_cpu::WGSize_t> ReqdWGSize = std::nullopt;
+  std::optional<native_cpu::WGSize_t> MaxWGSize = std::nullopt;
+  std::optional<uint64_t> MaxLinearWGSize = std::nullopt;
 };