intel · sommerlukas · Jun 3, 2024 · Feb 2, 2024 · Feb 5, 2024 · Feb 5, 2024
@@ -100,13 +100,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
   endfunction()
 
   set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
-  # commit 9f783837089c970a22cda08f768aa3dbed38f0d3
-  # Merge: c015f892 b9442104
+  # commit 5083f4f96557672b7b6a55ea53347896d40549d7
+  # Merge: a97eed15 4c3f9abe
   # Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
-  # Date:   Fri May 31 10:20:23 2024 +0100
-  #     Merge pull request #1533 from AllanZyne/sanitizer-buffer
-  #     [DeviceSanitizer] Support detecting out-of-bounds errors on sycl::buffer
-  set(UNIFIED_RUNTIME_TAG 9f783837089c970a22cda08f768aa3dbed38f0d3)
+  # Date:   Fri May 31 17:20:01 2024 +0100
+  #     Merge pull request #1397 from GeorgeWeb/georgi/check-allocation-error-on-event-from-native-handle
+  #     [CUDA][HIP] Catch and report bad_alloc errors for event object creation
+  set(UNIFIED_RUNTIME_TAG 5083f4f96557672b7b6a55ea53347896d40549d7)
 
   fetch_adapter_source(level_zero
     ${UNIFIED_RUNTIME_REPO}

@@ -20,6 +20,60 @@ namespace sycl {
 inline namespace _V1 {
 namespace detail::enqueue_kernel_launch {
 
+void handleOutOfResources(const device_impl &DeviceImpl, pi_kernel Kernel,
+                          const NDRDescT &NDRDesc) {
+  sycl::platform Platform = DeviceImpl.get_platform();
+  sycl::backend Backend = Platform.get_backend();
+  if (Backend == sycl::backend::ext_oneapi_cuda) {
+    // PI_ERROR_OUT_OF_RESOURCES is returned when the kernel registers
+    // required for the launch config exceeds the maximum number of registers
+    // per block (PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP).
+    // This is if local_work_size[0] * ... * local_work_size[work_dim - 1]
+    // multiplied by PI_KERNEL_GROUP_INFO_NUM_REGS is greater than the value
+    // of PI_KERNEL_MAX_NUM_REGISTERS_PER_BLOCK. See Table 15: Technical
+    // Specifications per Compute Capability, for limitations.
+    const size_t TotalNumberOfWIs =
+        NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];
+
+    const uint32_t MaxRegistersPerBlock =
+        DeviceImpl.get_info<ext::codeplay::experimental::info::device::
+                                max_registers_per_work_group>();
+
+    const PluginPtr &Plugin = DeviceImpl.getPlugin();
+    sycl::detail::pi::PiDevice Device = DeviceImpl.getHandleRef();
+
+    uint32_t NumRegisters = 0;
+    Plugin->call<PiApiKind::piKernelGetGroupInfo>(
+        Kernel, Device, PI_KERNEL_GROUP_INFO_NUM_REGS, sizeof(NumRegisters),
+        &NumRegisters, nullptr);
+
+    const bool HasExceededAvailableRegisters =
+        TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock;
+
+    if (HasExceededAvailableRegisters) {
+      std::string message(
+          "Exceeded the number of registers available on the hardware.\n");
+      throw sycl::exception(
+          sycl::make_error_code(sycl::errc::nd_range),
+          // Additional information which can be helpful to the user.
+          message.append(
+              "\tThe number registers per work-group cannot exceed " +
+              std::to_string(MaxRegistersPerBlock) +
+              " for this kernel on this device.\n"
+              "\tThe kernel uses " +
+              std::to_string(NumRegisters) +
+              " registers per work-item for a total of " +
+              std::to_string(TotalNumberOfWIs) +
+              " work-items per work-group.\n"));
+    }
+  }
+  // Fallback
+  constexpr pi_result Error = PI_ERROR_OUT_OF_RESOURCES;
+  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
+                        "PI backend failed. PI backend returns:" +
+                            codeToString(Error));
+}
+
 void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
                                 const NDRDescT &NDRDesc) {
   sycl::platform Platform = DeviceImpl.get_platform();
@@ -30,7 +84,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
   bool IsOpenCLV1x = false;   // Backend is OpenCL 1.x
   bool IsOpenCLVGE20 = false; // Backend is Greater or Equal to OpenCL 2.0
   bool IsLevelZero = false;   // Backend is any OneAPI Level 0 version
-  bool IsCuda = false;        // Backend is CUDA
   auto Backend = Platform.get_backend();
   if (Backend == sycl::backend::opencl) {
     std::string VersionString =
@@ -41,8 +94,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
         (VersionString.find("2.") == 0) || (VersionString.find("3.") == 0);
   } else if (Backend == sycl::backend::ext_oneapi_level_zero) {
     IsLevelZero = true;
-  } else if (Backend == sycl::backend::ext_oneapi_cuda) {
-    IsCuda = true;
   }
 
   const PluginPtr &Plugin = DeviceImpl.getPlugin();
@@ -243,46 +294,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
           // else unknown.  fallback (below)
         }
       }
-    } else if (IsCuda) {
-      // CUDA:
-      // PI_ERROR_INVALID_WORK_GROUP_SIZE is returned when the kernel registers
-      // required for the launch config exceeds the maximum number of registers
-      // per block (PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP).
-      // This is if local_work_size[0] * ... * local_work_size[work_dim - 1]
-      // multiplied by PI_KERNEL_GROUP_INFO_NUM_REGS is greater than the value
-      // of PI_KERNEL_MAX_NUM_REGISTERS_PER_BLOCK. See Table 15: Technical
-      // Specifications per Compute Capability, for limitations.
-      const size_t TotalNumberOfWIs =
-          NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];
-
-      uint32_t NumRegisters = 0;
-      Plugin->call<PiApiKind::piKernelGetGroupInfo>(
-          Kernel, Device, PI_KERNEL_GROUP_INFO_NUM_REGS, sizeof(NumRegisters),
-          &NumRegisters, nullptr);
-
-      uint32_t MaxRegistersPerBlock =
-          DeviceImpl.get_info<ext::codeplay::experimental::info::device::
-                                  max_registers_per_work_group>();
-
-      const bool HasExceededAvailableRegisters =
-          TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock;
-
-      if (HasExceededAvailableRegisters) {
-        std::string message(
-            "Exceeded the number of registers available on the hardware.\n");
-        throw sycl::nd_range_error(
-            // Additional information which can be helpful to the user.
-            message.append(
-                "\tThe number registers per work-group cannot exceed " +
-                std::to_string(MaxRegistersPerBlock) +
-                " for this kernel on this device.\n"
-                "\tThe kernel uses " +
-                std::to_string(NumRegisters) +
-                " registers per work-item for a total of " +
-                std::to_string(TotalNumberOfWIs) +
-                " work-items per work-group.\n"),
-            PI_ERROR_INVALID_WORK_GROUP_SIZE);
-      }
     } else {
       // TODO: Decide what checks (if any) we need for the other backends
     }
@@ -352,6 +363,9 @@ void handleErrorOrWarning(pi_result Error, const device_impl &DeviceImpl,
   assert(Error != PI_SUCCESS &&
          "Success is expected to be handled on caller side");
   switch (Error) {
+  case PI_ERROR_OUT_OF_RESOURCES:
+    return handleOutOfResources(DeviceImpl, Kernel, NDRDesc);
+
   case PI_ERROR_INVALID_WORK_GROUP_SIZE:
     return handleInvalidWorkGroupSize(DeviceImpl, Kernel, NDRDesc);
 

@@ -26,7 +26,7 @@
 class kernel_vadd_and_sum;
 
 int main() {
-  sycl::queue q;
+  sycl::queue q{};
   sycl::device dev = q.get_device();
   size_t local_size = dev.get_info<sycl::info::device::max_work_group_size>();
   if (local_size < 1024u) {
@@ -80,6 +80,8 @@ int main() {
              // compute vector add
              const auto vadd = values1 + values2 + values3 + values4;
 
+             // NB: 64 registers used to do the vector addition.
+
              // compute total vector elements sum
              auto sum = elem_t(0);
              for (int j = 0; j < VEC_DIM; j++) {
@@ -92,11 +94,13 @@ int main() {
              output[i] = vadd;
              output[i] += sum;
            });
-     }).wait();
-  } catch (sycl::exception &e) {
+     }).wait_and_throw();
+  } catch (const sycl::exception &e) {
     using std::string_view_literals::operator""sv;
     auto Msg = "Exceeded the number of registers available on the hardware."sv;
-    if (std::string(e.what()).find(Msg) != std::string::npos) {
+    auto Errc = sycl::make_error_code(sycl::errc::nd_range);
+    if (e.code() == Errc &&
+        std::string_view{e.what()}.find(Msg) != std::string_view::npos) {
       return 0;
     }
   }