[SYCL][CUDA] Improve kernel launch error handling for out-of-registers (

#12604) This PR improves the handling of errors by specializing `PI_ERROR_OUT_OF_RESOURCES`. Previously, in the CUDA backend we handled the out of resources launch error (for exceeded registers) as invalid work group size error. Now pairing the new specialized handling with the UR adapter change oneapi-src/unified-runtime#1318 to return the correct error code, we no longer output a misleading error message to users. Also, added a fallback message for the generic out of resources error codes returned from APIs (e.g. for kernel launch). Fixes issue: oneapi-src/unified-runtime#1308
intel · Jun 3, 2024 · 9f1cee5 · 9f1cee5
1 parent 030a937
commit 9f1cee5
Show file tree

Hide file tree

Showing 3 changed files with 71 additions and 53 deletions.
diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -100,13 +100,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
  endfunction()
 
  set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
- # commit 9f783837089c970a22cda08f768aa3dbed38f0d3
- # Merge: c015f892 b9442104
+ # commit 5083f4f96557672b7b6a55ea53347896d40549d7
+ # Merge: a97eed15 4c3f9abe
  # Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
- # Date: Fri May 31 10:20:23 2024 +0100
- # Merge pull request #1533 from AllanZyne/sanitizer-buffer
- # [DeviceSanitizer] Support detecting out-of-bounds errors on sycl::buffer
- set(UNIFIED_RUNTIME_TAG 9f783837089c970a22cda08f768aa3dbed38f0d3)
+ # Date: Fri May 31 17:20:01 2024 +0100
+ # Merge pull request #1397 from GeorgeWeb/georgi/check-allocation-error-on-event-from-native-handle
+ # [CUDA][HIP] Catch and report bad_alloc errors for event object creation
+ set(UNIFIED_RUNTIME_TAG 5083f4f96557672b7b6a55ea53347896d40549d7)
 
  fetch_adapter_source(level_zero
  ${UNIFIED_RUNTIME_REPO}

diff --git a/sycl/source/detail/error_handling/error_handling.cpp b/sycl/source/detail/error_handling/error_handling.cpp
@@ -20,6 +20,60 @@ namespace sycl {
 inline namespace _V1 {
 namespace detail::enqueue_kernel_launch {
 
+void handleOutOfResources(const device_impl &DeviceImpl, pi_kernel Kernel,
+ const NDRDescT &NDRDesc) {
+ sycl::platform Platform = DeviceImpl.get_platform();
+ sycl::backend Backend = Platform.get_backend();
+ if (Backend == sycl::backend::ext_oneapi_cuda) {
+ // PI_ERROR_OUT_OF_RESOURCES is returned when the kernel registers
+ // required for the launch config exceeds the maximum number of registers
+ // per block (PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP).
+ // This is if local_work_size[0] * ... * local_work_size[work_dim - 1]
+ // multiplied by PI_KERNEL_GROUP_INFO_NUM_REGS is greater than the value
+ // of PI_KERNEL_MAX_NUM_REGISTERS_PER_BLOCK. See Table 15: Technical
+ // Specifications per Compute Capability, for limitations.
+ const size_t TotalNumberOfWIs =
+ NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];
+
+ const uint32_t MaxRegistersPerBlock =
+ DeviceImpl.get_info<ext::codeplay::experimental::info::device::
+ max_registers_per_work_group>();
+
+ const PluginPtr &Plugin = DeviceImpl.getPlugin();
+ sycl::detail::pi::PiDevice Device = DeviceImpl.getHandleRef();
+
+ uint32_t NumRegisters = 0;
+ Plugin->call<PiApiKind::piKernelGetGroupInfo>(
+ Kernel, Device, PI_KERNEL_GROUP_INFO_NUM_REGS, sizeof(NumRegisters),
+ &NumRegisters, nullptr);
+
+ const bool HasExceededAvailableRegisters =
+ TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock;
+
+ if (HasExceededAvailableRegisters) {
+ std::string message(
+ "Exceeded the number of registers available on the hardware.\n");
+ throw sycl::exception(
+ sycl::make_error_code(sycl::errc::nd_range),
+ // Additional information which can be helpful to the user.
+ message.append(
+ "\tThe number registers per work-group cannot exceed " +
+ std::to_string(MaxRegistersPerBlock) +
+ " for this kernel on this device.\n"
+ "\tThe kernel uses " +
+ std::to_string(NumRegisters) +
+ " registers per work-item for a total of " +
+ std::to_string(TotalNumberOfWIs) +
+ " work-items per work-group.\n"));
+ }
+ }
+ // Fallback
+ constexpr pi_result Error = PI_ERROR_OUT_OF_RESOURCES;
+ throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
+ "PI backend failed. PI backend returns:" +
+ codeToString(Error));
+}
+
 void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
  const NDRDescT &NDRDesc) {
  sycl::platform Platform = DeviceImpl.get_platform();
@@ -30,7 +84,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
  bool IsOpenCLV1x = false; // Backend is OpenCL 1.x
  bool IsOpenCLVGE20 = false; // Backend is Greater or Equal to OpenCL 2.0
  bool IsLevelZero = false; // Backend is any OneAPI Level 0 version
- bool IsCuda = false; // Backend is CUDA
  auto Backend = Platform.get_backend();
  if (Backend == sycl::backend::opencl) {
  std::string VersionString =
@@ -41,8 +94,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
  (VersionString.find("2.") == 0) || (VersionString.find("3.") == 0);
  } else if (Backend == sycl::backend::ext_oneapi_level_zero) {
  IsLevelZero = true;
- } else if (Backend == sycl::backend::ext_oneapi_cuda) {
- IsCuda = true;
  }
 
  const PluginPtr &Plugin = DeviceImpl.getPlugin();
@@ -243,46 +294,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
  // else unknown. fallback (below)
  }
  }
- } else if (IsCuda) {
- // CUDA:
- // PI_ERROR_INVALID_WORK_GROUP_SIZE is returned when the kernel registers
- // required for the launch config exceeds the maximum number of registers
- // per block (PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP).
- // This is if local_work_size[0] * ... * local_work_size[work_dim - 1]
- // multiplied by PI_KERNEL_GROUP_INFO_NUM_REGS is greater than the value
- // of PI_KERNEL_MAX_NUM_REGISTERS_PER_BLOCK. See Table 15: Technical
- // Specifications per Compute Capability, for limitations.
- const size_t TotalNumberOfWIs =
- NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];
-
- uint32_t NumRegisters = 0;
- Plugin->call<PiApiKind::piKernelGetGroupInfo>(
- Kernel, Device, PI_KERNEL_GROUP_INFO_NUM_REGS, sizeof(NumRegisters),
- &NumRegisters, nullptr);
-
- uint32_t MaxRegistersPerBlock =
- DeviceImpl.get_info<ext::codeplay::experimental::info::device::
- max_registers_per_work_group>();
-
- const bool HasExceededAvailableRegisters =
- TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock;
-
- if (HasExceededAvailableRegisters) {
- std::string message(
- "Exceeded the number of registers available on the hardware.\n");
- throw sycl::nd_range_error(
- // Additional information which can be helpful to the user.
- message.append(
- "\tThe number registers per work-group cannot exceed " +
- std::to_string(MaxRegistersPerBlock) +
- " for this kernel on this device.\n"
- "\tThe kernel uses " +
- std::to_string(NumRegisters) +
- " registers per work-item for a total of " +
- std::to_string(TotalNumberOfWIs) +
- " work-items per work-group.\n"),
- PI_ERROR_INVALID_WORK_GROUP_SIZE);
- }
  } else {
  // TODO: Decide what checks (if any) we need for the other backends
  }
@@ -352,6 +363,9 @@ void handleErrorOrWarning(pi_result Error, const device_impl &DeviceImpl,
  assert(Error != PI_SUCCESS &&
  "Success is expected to be handled on caller side");
  switch (Error) {
+ case PI_ERROR_OUT_OF_RESOURCES:
+ return handleOutOfResources(DeviceImpl, Kernel, NDRDesc);
+
  case PI_ERROR_INVALID_WORK_GROUP_SIZE:
  return handleInvalidWorkGroupSize(DeviceImpl, Kernel, NDRDesc);
 

diff --git a/...test-e2e/OptionalKernelFeatures/throw-exception-for-out-of-registers-on-kernel-launch.cpp b/...test-e2e/OptionalKernelFeatures/throw-exception-for-out-of-registers-on-kernel-launch.cpp
@@ -26,7 +26,7 @@
 class kernel_vadd_and_sum;
 
 int main() {
- sycl::queue q;
+ sycl::queue q{};
  sycl::device dev = q.get_device();
  size_t local_size = dev.get_info<sycl::info::device::max_work_group_size>();
  if (local_size < 1024u) {
@@ -80,6 +80,8 @@ int main() {
  // compute vector add
  const auto vadd = values1 + values2 + values3 + values4;
 
+ // NB: 64 registers used to do the vector addition.
+
  // compute total vector elements sum
  auto sum = elem_t(0);
  for (int j = 0; j < VEC_DIM; j++) {
@@ -92,11 +94,13 @@ int main() {
  output[i] = vadd;
  output[i] += sum;
  });
- }).wait();
- } catch (sycl::exception &e) {
+ }).wait_and_throw();
+ } catch (const sycl::exception &e) {
  using std::string_view_literals::operator""sv;
  auto Msg = "Exceeded the number of registers available on the hardware."sv;
- if (std::string(e.what()).find(Msg) != std::string::npos) {
+ auto Errc = sycl::make_error_code(sycl::errc::nd_range);
+ if (e.code() == Errc &&
+ std::string_view{e.what()}.find(Msg) != std::string_view::npos) {
  return 0;
  }
  }