[SYCL][CUDA] Improve kernel launch error handling for out-of-registers #12604

Merged
Changes from 11 commits
5 changes: 3 additions & 2 deletions sycl/plugins/unified_runtime/CMakeLists.txt
@@ -3,10 +3,11 @@

# Options to override the default behaviour of the FetchContent to include UR
# source code.
# TODO: Resolve this tag override after related UR changes are merged.
kbenzie marked this conversation as resolved.
set(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO
"" CACHE STRING "Override the Unified Runtime FetchContent repository")
"https://github.com/GeorgeWeb/unified-runtime.git" CACHE STRING "Override the Unified Runtime FetchContent repository")
set(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_TAG
"" CACHE STRING "Override the Unified Runtime FetchContent tag")
"eec7aa47fb7ddf9f659984ae1fd6d67eb3252f3d" CACHE STRING "Override the Unified Runtime FetchContent tag")

# Options to disable use of FetchContent to include Unified Runtime source code
# to improve developer workflow.
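For reference, these cache variables are how a SYCL build is pointed at a Unified Runtime fork at configure time. A minimal sketch of the configure invocation, using the repo and tag from the hunk above (the `-S llvm -B build` paths are hypothetical and depend on your checkout layout):

```shell
# Hypothetical configure step: override the UR FetchContent source so the
# build fetches the fork/tag carrying the matching UR-side change.
cmake -S llvm -B build \
  -DSYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO="https://github.com/GeorgeWeb/unified-runtime.git" \
  -DSYCL_PI_UR_OVERRIDE_FETCH_CONTENT_TAG="eec7aa47fb7ddf9f659984ae1fd6d67eb3252f3d"
```

Because these are `CACHE STRING` defaults, a user can still override them on the command line; the PR only changes the default until the UR change merges, as the TODO notes.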
100 changes: 57 additions & 43 deletions sycl/source/detail/error_handling/error_handling.cpp
@@ -20,6 +20,60 @@ namespace sycl {
inline namespace _V1 {
namespace detail::enqueue_kernel_launch {

void handleOutOfResources(const device_impl &DeviceImpl, pi_kernel Kernel,
const NDRDescT &NDRDesc) {
sycl::platform Platform = DeviceImpl.get_platform();
sycl::backend Backend = Platform.get_backend();
if (Backend == sycl::backend::ext_oneapi_cuda) {
    // PI_ERROR_OUT_OF_RESOURCES is returned when the number of registers
    // required for the launch config exceeds the maximum number of registers
    // per block (PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP).
    // That is, the launch fails if local_work_size[0] * ... *
    // local_work_size[work_dim - 1] multiplied by
    // PI_KERNEL_GROUP_INFO_NUM_REGS is greater than the value of
    // PI_KERNEL_MAX_NUM_REGISTERS_PER_BLOCK. See Table 15: Technical
    // Specifications per Compute Capability, for limitations.
const size_t TotalNumberOfWIs =
NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];

const uint32_t MaxRegistersPerBlock =
DeviceImpl.get_info<ext::codeplay::experimental::info::device::
max_registers_per_work_group>();

const PluginPtr &Plugin = DeviceImpl.getPlugin();
sycl::detail::pi::PiDevice Device = DeviceImpl.getHandleRef();

uint32_t NumRegisters = 0;
Plugin->call<PiApiKind::piKernelGetGroupInfo>(
Kernel, Device, PI_KERNEL_GROUP_INFO_NUM_REGS, sizeof(NumRegisters),
&NumRegisters, nullptr);

const bool HasExceededAvailableRegisters =
TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock;

if (HasExceededAvailableRegisters) {
std::string message(
"Exceeded the number of registers available on the hardware.\n");
throw sycl::exception(
sycl::make_error_code(sycl::errc::nd_range),
// Additional information which can be helpful to the user.
message.append(
"\tThe number of registers per work-group cannot exceed " +
std::to_string(MaxRegistersPerBlock) +
" for this kernel on this device.\n"
"\tThe kernel uses " +
std::to_string(NumRegisters) +
" registers per work-item for a total of " +
std::to_string(TotalNumberOfWIs) +
" work-items per work-group.\n"));
}
}
// Fallback
constexpr pi_result Error = PI_ERROR_OUT_OF_RESOURCES;
throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
"PI backend failed. PI backend returns:" +
codeToString(Error));
}

void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
const NDRDescT &NDRDesc) {
sycl::platform Platform = DeviceImpl.get_platform();
@@ -30,7 +84,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
bool IsOpenCLV1x = false; // Backend is OpenCL 1.x
bool IsOpenCLVGE20 = false; // Backend is Greater or Equal to OpenCL 2.0
bool IsLevelZero = false; // Backend is any OneAPI Level 0 version
bool IsCuda = false; // Backend is CUDA
auto Backend = Platform.get_backend();
if (Backend == sycl::backend::opencl) {
std::string VersionString =
@@ -41,8 +94,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
(VersionString.find("2.") == 0) || (VersionString.find("3.") == 0);
} else if (Backend == sycl::backend::ext_oneapi_level_zero) {
IsLevelZero = true;
} else if (Backend == sycl::backend::ext_oneapi_cuda) {
IsCuda = true;
}

const PluginPtr &Plugin = DeviceImpl.getPlugin();
@@ -243,46 +294,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
// else unknown. fallback (below)
}
}
} else if (IsCuda) {
// CUDA:
// PI_ERROR_INVALID_WORK_GROUP_SIZE is returned when the kernel registers
// required for the launch config exceeds the maximum number of registers
// per block (PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP).
// This is if local_work_size[0] * ... * local_work_size[work_dim - 1]
// multiplied by PI_KERNEL_GROUP_INFO_NUM_REGS is greater than the value
// of PI_KERNEL_MAX_NUM_REGISTERS_PER_BLOCK. See Table 15: Technical
// Specifications per Compute Capability, for limitations.
const size_t TotalNumberOfWIs =
NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];

uint32_t NumRegisters = 0;
Plugin->call<PiApiKind::piKernelGetGroupInfo>(
Kernel, Device, PI_KERNEL_GROUP_INFO_NUM_REGS, sizeof(NumRegisters),
&NumRegisters, nullptr);

uint32_t MaxRegistersPerBlock =
DeviceImpl.get_info<ext::codeplay::experimental::info::device::
max_registers_per_work_group>();

const bool HasExceededAvailableRegisters =
TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock;

if (HasExceededAvailableRegisters) {
std::string message(
"Exceeded the number of registers available on the hardware.\n");
throw sycl::nd_range_error(
// Additional information which can be helpful to the user.
message.append(
"\tThe number registers per work-group cannot exceed " +
std::to_string(MaxRegistersPerBlock) +
" for this kernel on this device.\n"
"\tThe kernel uses " +
std::to_string(NumRegisters) +
" registers per work-item for a total of " +
std::to_string(TotalNumberOfWIs) +
" work-items per work-group.\n"),
PI_ERROR_INVALID_WORK_GROUP_SIZE);
}
} else {
// TODO: Decide what checks (if any) we need for the other backends
}
@@ -352,6 +363,9 @@ void handleErrorOrWarning(pi_result Error, const device_impl &DeviceImpl,
assert(Error != PI_SUCCESS &&
"Success is expected to be handled on caller side");
switch (Error) {
case PI_ERROR_OUT_OF_RESOURCES:
return handleOutOfResources(DeviceImpl, Kernel, NDRDesc);

case PI_ERROR_INVALID_WORK_GROUP_SIZE:
return handleInvalidWorkGroupSize(DeviceImpl, Kernel, NDRDesc);

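The arithmetic the new `handleOutOfResources` performs can be sketched standalone. The helper name and the sample numbers below are hypothetical; in the real handler the values come from `piKernelGetGroupInfo(PI_KERNEL_GROUP_INFO_NUM_REGS)` and the Codeplay `max_registers_per_work_group` device query:

```cpp
#include <cstddef>
#include <cstdint>

// Sketch of the out-of-registers check: a launch config is rejected when
// registers-per-work-item multiplied by the work-group size exceeds the
// device's per-block register budget.
inline bool exceedsRegisterBudget(const std::size_t (&LocalSize)[3],
                                  std::uint32_t NumRegisters,
                                  std::uint32_t MaxRegistersPerBlock) {
  const std::size_t TotalNumberOfWIs =
      LocalSize[0] * LocalSize[1] * LocalSize[2];
  return TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock;
}
```

For example, a 1024-work-item group whose kernel uses 80 registers per work-item needs 81920 registers, which exceeds a 65536-per-block budget, while a 256-work-item group at the same register count fits.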
@@ -26,7 +26,7 @@
class kernel_vadd_and_sum;

int main() {
sycl::queue q;
sycl::queue q{};
sycl::device dev = q.get_device();
size_t local_size = dev.get_info<sycl::info::device::max_work_group_size>();
if (local_size < 1024u) {
@@ -80,6 +80,8 @@ int main() {
// compute vector add
const auto vadd = values1 + values2 + values3 + values4;

// NB: 64 registers used to do the vector addition.

// compute total vector elements sum
auto sum = elem_t(0);
for (int j = 0; j < VEC_DIM; j++) {
@@ -92,11 +94,13 @@ int main() {
output[i] = vadd;
output[i] += sum;
});
}).wait();
} catch (sycl::exception &e) {
}).wait_and_throw();
} catch (const sycl::exception &e) {
using std::string_view_literals::operator""sv;
auto Msg = "Exceeded the number of registers available on the hardware."sv;
if (std::string(e.what()).find(Msg) != std::string::npos) {
auto Errc = sycl::make_error_code(sycl::errc::nd_range);
Contributor:
@GeorgeWeb, I was looking at the PR and got confused by it. Could you please clarify what the key change is here? The PR description says that we used to display a confusing error to users, but the check for the exception message wasn't changed by this PR.

What was the confusing part then? I see that in #12363 we had a bug report containing an invalid message, but this test was introduced in #9106 a year before that issue was submitted. What am I missing here?

Contributor Author:
In summary, the wrong part of the message, as per the PR's description, was just the plugin error-code description: PI_ERROR_INVALID_WORK_GROUP_SIZE should have been PI_ERROR_OUT_OF_LAUNCH_RESOURCES. The reason I added the errc::nd_range error-code check was just for more verbosity; it wasn't of importance here.


The real issue of "reporting a completely wrong message" in the OP's report (#12363) was due to a mistake on this line, https://github.com/intel/llvm/pull/9106/files#diff-7525901710934f7bdb2ad36238c4b67163f112d3bd233db7af0b0078b5b01e80R3263, which was fixed by this UR CUDA change: oneapi-src/unified-runtime#1299

if (e.code() == Errc &&
std::string_view{e.what()}.find(Msg) != std::string_view::npos) {
return 0;
}
}
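The updated test accepts the exception only when both the error code and the diagnostic substring match. That matching logic can be sketched in isolation; the helper name is hypothetical, and a `std::errc` value stands in for `sycl::make_error_code(sycl::errc::nd_range)` so the sketch compiles without SYCL:

```cpp
#include <string_view>
#include <system_error>

// Hypothetical helper mirroring the test's check: treat an exception as the
// out-of-registers diagnostic only when both its error code and its message
// substring line up.
inline bool isOutOfRegistersError(const std::error_code &Code,
                                  std::string_view What,
                                  const std::error_code &Expected) {
  constexpr std::string_view Msg =
      "Exceeded the number of registers available on the hardware.";
  return Code == Expected && What.find(Msg) != std::string_view::npos;
}
```

Checking the code as well as the message makes the test robust against unrelated exceptions that happen to mention registers, which is the extra verbosity the author describes above.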