diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index a081bdb010d81..536358cb36af2 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -100,13 +100,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) endfunction() set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit 9f783837089c970a22cda08f768aa3dbed38f0d3 - # Merge: c015f892 b9442104 + # commit 5083f4f96557672b7b6a55ea53347896d40549d7 + # Merge: a97eed15 4c3f9abe # Author: Kenneth Benzie (Benie) - # Date: Fri May 31 10:20:23 2024 +0100 - # Merge pull request #1533 from AllanZyne/sanitizer-buffer - # [DeviceSanitizer] Support detecting out-of-bounds errors on sycl::buffer - set(UNIFIED_RUNTIME_TAG 9f783837089c970a22cda08f768aa3dbed38f0d3) + # Date: Fri May 31 17:20:01 2024 +0100 + # Merge pull request #1397 from GeorgeWeb/georgi/check-allocation-error-on-event-from-native-handle + # [CUDA][HIP] Catch and report bad_alloc errors for event object creation + set(UNIFIED_RUNTIME_TAG 5083f4f96557672b7b6a55ea53347896d40549d7) fetch_adapter_source(level_zero ${UNIFIED_RUNTIME_REPO} diff --git a/sycl/source/detail/error_handling/error_handling.cpp b/sycl/source/detail/error_handling/error_handling.cpp index f7ab280c6fa7f..6f7cacd89fb91 100644 --- a/sycl/source/detail/error_handling/error_handling.cpp +++ b/sycl/source/detail/error_handling/error_handling.cpp @@ -20,6 +20,60 @@ namespace sycl { inline namespace _V1 { namespace detail::enqueue_kernel_launch { +void handleOutOfResources(const device_impl &DeviceImpl, pi_kernel Kernel, + const NDRDescT &NDRDesc) { + sycl::platform Platform = DeviceImpl.get_platform(); + sycl::backend Backend = Platform.get_backend(); + if (Backend == sycl::backend::ext_oneapi_cuda) { + // PI_ERROR_OUT_OF_RESOURCES is returned when the kernel registers + // required for the launch config exceeds the maximum number of registers + // per block (PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP). + // This is if local_work_size[0] * ... * local_work_size[work_dim - 1] + // multiplied by PI_KERNEL_GROUP_INFO_NUM_REGS is greater than the value + // of PI_KERNEL_MAX_NUM_REGISTERS_PER_BLOCK. See Table 15: Technical + // Specifications per Compute Capability, for limitations. + const size_t TotalNumberOfWIs = + NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2]; + + const uint32_t MaxRegistersPerBlock = + DeviceImpl.get_info(); + + const PluginPtr &Plugin = DeviceImpl.getPlugin(); + sycl::detail::pi::PiDevice Device = DeviceImpl.getHandleRef(); + + uint32_t NumRegisters = 0; + Plugin->call( + Kernel, Device, PI_KERNEL_GROUP_INFO_NUM_REGS, sizeof(NumRegisters), + &NumRegisters, nullptr); + + const bool HasExceededAvailableRegisters = + TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock; + + if (HasExceededAvailableRegisters) { + std::string message( + "Exceeded the number of registers available on the hardware.\n"); + throw sycl::exception( + sycl::make_error_code(sycl::errc::nd_range), + // Additional information which can be helpful to the user. + message.append( + "\tThe number registers per work-group cannot exceed " + + std::to_string(MaxRegistersPerBlock) + + " for this kernel on this device.\n" + "\tThe kernel uses " + + std::to_string(NumRegisters) + + " registers per work-item for a total of " + + std::to_string(TotalNumberOfWIs) + + " work-items per work-group.\n")); + } + } + // Fallback + constexpr pi_result Error = PI_ERROR_OUT_OF_RESOURCES; + throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), + "PI backend failed. PI backend returns:" + + codeToString(Error)); +} + void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel, const NDRDescT &NDRDesc) { sycl::platform Platform = DeviceImpl.get_platform(); @@ -30,7 +84,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel, bool IsOpenCLV1x = false; // Backend is OpenCL 1.x bool IsOpenCLVGE20 = false; // Backend is Greater or Equal to OpenCL 2.0 bool IsLevelZero = false; // Backend is any OneAPI Level 0 version - bool IsCuda = false; // Backend is CUDA auto Backend = Platform.get_backend(); if (Backend == sycl::backend::opencl) { std::string VersionString = @@ -41,8 +94,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel, (VersionString.find("2.") == 0) || (VersionString.find("3.") == 0); } else if (Backend == sycl::backend::ext_oneapi_level_zero) { IsLevelZero = true; - } else if (Backend == sycl::backend::ext_oneapi_cuda) { - IsCuda = true; } const PluginPtr &Plugin = DeviceImpl.getPlugin(); @@ -243,46 +294,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel, // else unknown. fallback (below) } } - } else if (IsCuda) { - // CUDA: - // PI_ERROR_INVALID_WORK_GROUP_SIZE is returned when the kernel registers - // required for the launch config exceeds the maximum number of registers - // per block (PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP). - // This is if local_work_size[0] * ... * local_work_size[work_dim - 1] - // multiplied by PI_KERNEL_GROUP_INFO_NUM_REGS is greater than the value - // of PI_KERNEL_MAX_NUM_REGISTERS_PER_BLOCK. See Table 15: Technical - // Specifications per Compute Capability, for limitations. - const size_t TotalNumberOfWIs = - NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2]; - - uint32_t NumRegisters = 0; - Plugin->call( - Kernel, Device, PI_KERNEL_GROUP_INFO_NUM_REGS, sizeof(NumRegisters), - &NumRegisters, nullptr); - - uint32_t MaxRegistersPerBlock = - DeviceImpl.get_info(); - - const bool HasExceededAvailableRegisters = - TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock; - - if (HasExceededAvailableRegisters) { - std::string message( - "Exceeded the number of registers available on the hardware.\n"); - throw sycl::nd_range_error( - // Additional information which can be helpful to the user. - message.append( - "\tThe number registers per work-group cannot exceed " + - std::to_string(MaxRegistersPerBlock) + - " for this kernel on this device.\n" - "\tThe kernel uses " + - std::to_string(NumRegisters) + - " registers per work-item for a total of " + - std::to_string(TotalNumberOfWIs) + - " work-items per work-group.\n"), - PI_ERROR_INVALID_WORK_GROUP_SIZE); - } } else { // TODO: Decide what checks (if any) we need for the other backends } @@ -352,6 +363,9 @@ void handleErrorOrWarning(pi_result Error, const device_impl &DeviceImpl, assert(Error != PI_SUCCESS && "Success is expected to be handled on caller side"); switch (Error) { + case PI_ERROR_OUT_OF_RESOURCES: + return handleOutOfResources(DeviceImpl, Kernel, NDRDesc); + case PI_ERROR_INVALID_WORK_GROUP_SIZE: return handleInvalidWorkGroupSize(DeviceImpl, Kernel, NDRDesc); diff --git a/sycl/test-e2e/OptionalKernelFeatures/throw-exception-for-out-of-registers-on-kernel-launch.cpp b/sycl/test-e2e/OptionalKernelFeatures/throw-exception-for-out-of-registers-on-kernel-launch.cpp index 6a806756c2a85..a701b7960c232 100644 --- a/sycl/test-e2e/OptionalKernelFeatures/throw-exception-for-out-of-registers-on-kernel-launch.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/throw-exception-for-out-of-registers-on-kernel-launch.cpp @@ -26,7 +26,7 @@ class kernel_vadd_and_sum; int main() { - sycl::queue q; + sycl::queue q{}; sycl::device dev = q.get_device(); size_t local_size = dev.get_info(); if (local_size < 1024u) { @@ -80,6 +80,8 @@ int main() { // compute vector add const auto vadd = values1 + values2 + values3 + values4; + // NB: 64 registers used to do the vector addition. + // compute total vector elements sum auto sum = elem_t(0); for (int j = 0; j < VEC_DIM; j++) { @@ -92,11 +94,13 @@ int main() { output[i] = vadd; output[i] += sum; }); - }).wait(); - } catch (sycl::exception &e) { + }).wait_and_throw(); + } catch (const sycl::exception &e) { using std::string_view_literals::operator""sv; auto Msg = "Exceeded the number of registers available on the hardware."sv; - if (std::string(e.what()).find(Msg) != std::string::npos) { + auto Errc = sycl::make_error_code(sycl::errc::nd_range); + if (e.code() == Errc && + std::string_view{e.what()}.find(Msg) != std::string_view::npos) { return 0; } }