Skip to content

Commit

Permalink
[SYCL][CUDA] Improve kernel launch error handling for out-of-registers (
Browse files Browse the repository at this point in the history
#12604)

This PR improves the handling of errors by specializing
`PI_ERROR_OUT_OF_RESOURCES`.

Previously, in the CUDA backend we handled the out of resources launch
error (for exceeded registers) as invalid work group size error. Now
pairing the new specialized handling with the UR adapter change
oneapi-src/unified-runtime#1318 to return the
correct error code, we no longer output a misleading error message to
users.
Also, added a fallback message for the generic out of resources error
codes returned from APIs (e.g. for kernel launch).

Fixes issue: oneapi-src/unified-runtime#1308
  • Loading branch information
GeorgeWeb committed Jun 3, 2024
1 parent 030a937 commit 9f1cee5
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 53 deletions.
12 changes: 6 additions & 6 deletions sycl/plugins/unified_runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -100,13 +100,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
endfunction()

set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
# commit 9f783837089c970a22cda08f768aa3dbed38f0d3
# Merge: c015f892 b9442104
# commit 5083f4f96557672b7b6a55ea53347896d40549d7
# Merge: a97eed15 4c3f9abe
# Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
# Date: Fri May 31 10:20:23 2024 +0100
# Merge pull request #1533 from AllanZyne/sanitizer-buffer
# [DeviceSanitizer] Support detecting out-of-bounds errors on sycl::buffer
set(UNIFIED_RUNTIME_TAG 9f783837089c970a22cda08f768aa3dbed38f0d3)
# Date: Fri May 31 17:20:01 2024 +0100
# Merge pull request #1397 from GeorgeWeb/georgi/check-allocation-error-on-event-from-native-handle
# [CUDA][HIP] Catch and report bad_alloc errors for event object creation
set(UNIFIED_RUNTIME_TAG 5083f4f96557672b7b6a55ea53347896d40549d7)

fetch_adapter_source(level_zero
${UNIFIED_RUNTIME_REPO}
Expand Down
100 changes: 57 additions & 43 deletions sycl/source/detail/error_handling/error_handling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,60 @@ namespace sycl {
inline namespace _V1 {
namespace detail::enqueue_kernel_launch {

void handleOutOfResources(const device_impl &DeviceImpl, pi_kernel Kernel,
const NDRDescT &NDRDesc) {
sycl::platform Platform = DeviceImpl.get_platform();
sycl::backend Backend = Platform.get_backend();
if (Backend == sycl::backend::ext_oneapi_cuda) {
// PI_ERROR_OUT_OF_RESOURCES is returned when the kernel registers
// required for the launch config exceeds the maximum number of registers
// per block (PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP).
// This is if local_work_size[0] * ... * local_work_size[work_dim - 1]
// multiplied by PI_KERNEL_GROUP_INFO_NUM_REGS is greater than the value
// of PI_KERNEL_MAX_NUM_REGISTERS_PER_BLOCK. See Table 15: Technical
// Specifications per Compute Capability, for limitations.
const size_t TotalNumberOfWIs =
NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];

const uint32_t MaxRegistersPerBlock =
DeviceImpl.get_info<ext::codeplay::experimental::info::device::
max_registers_per_work_group>();

const PluginPtr &Plugin = DeviceImpl.getPlugin();
sycl::detail::pi::PiDevice Device = DeviceImpl.getHandleRef();

uint32_t NumRegisters = 0;
Plugin->call<PiApiKind::piKernelGetGroupInfo>(
Kernel, Device, PI_KERNEL_GROUP_INFO_NUM_REGS, sizeof(NumRegisters),
&NumRegisters, nullptr);

const bool HasExceededAvailableRegisters =
TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock;

if (HasExceededAvailableRegisters) {
std::string message(
"Exceeded the number of registers available on the hardware.\n");
throw sycl::exception(
sycl::make_error_code(sycl::errc::nd_range),
// Additional information which can be helpful to the user.
message.append(
"\tThe number registers per work-group cannot exceed " +
std::to_string(MaxRegistersPerBlock) +
" for this kernel on this device.\n"
"\tThe kernel uses " +
std::to_string(NumRegisters) +
" registers per work-item for a total of " +
std::to_string(TotalNumberOfWIs) +
" work-items per work-group.\n"));
}
}
// Fallback
constexpr pi_result Error = PI_ERROR_OUT_OF_RESOURCES;
throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
"PI backend failed. PI backend returns:" +
codeToString(Error));
}

void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
const NDRDescT &NDRDesc) {
sycl::platform Platform = DeviceImpl.get_platform();
Expand All @@ -30,7 +84,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
bool IsOpenCLV1x = false; // Backend is OpenCL 1.x
bool IsOpenCLVGE20 = false; // Backend is Greater or Equal to OpenCL 2.0
bool IsLevelZero = false; // Backend is any OneAPI Level 0 version
bool IsCuda = false; // Backend is CUDA
auto Backend = Platform.get_backend();
if (Backend == sycl::backend::opencl) {
std::string VersionString =
Expand All @@ -41,8 +94,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
(VersionString.find("2.") == 0) || (VersionString.find("3.") == 0);
} else if (Backend == sycl::backend::ext_oneapi_level_zero) {
IsLevelZero = true;
} else if (Backend == sycl::backend::ext_oneapi_cuda) {
IsCuda = true;
}

const PluginPtr &Plugin = DeviceImpl.getPlugin();
Expand Down Expand Up @@ -243,46 +294,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
// else unknown. fallback (below)
}
}
} else if (IsCuda) {
// CUDA:
// PI_ERROR_INVALID_WORK_GROUP_SIZE is returned when the kernel registers
// required for the launch config exceeds the maximum number of registers
// per block (PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP).
// This is if local_work_size[0] * ... * local_work_size[work_dim - 1]
// multiplied by PI_KERNEL_GROUP_INFO_NUM_REGS is greater than the value
// of PI_KERNEL_MAX_NUM_REGISTERS_PER_BLOCK. See Table 15: Technical
// Specifications per Compute Capability, for limitations.
const size_t TotalNumberOfWIs =
NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];

uint32_t NumRegisters = 0;
Plugin->call<PiApiKind::piKernelGetGroupInfo>(
Kernel, Device, PI_KERNEL_GROUP_INFO_NUM_REGS, sizeof(NumRegisters),
&NumRegisters, nullptr);

uint32_t MaxRegistersPerBlock =
DeviceImpl.get_info<ext::codeplay::experimental::info::device::
max_registers_per_work_group>();

const bool HasExceededAvailableRegisters =
TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock;

if (HasExceededAvailableRegisters) {
std::string message(
"Exceeded the number of registers available on the hardware.\n");
throw sycl::nd_range_error(
// Additional information which can be helpful to the user.
message.append(
"\tThe number registers per work-group cannot exceed " +
std::to_string(MaxRegistersPerBlock) +
" for this kernel on this device.\n"
"\tThe kernel uses " +
std::to_string(NumRegisters) +
" registers per work-item for a total of " +
std::to_string(TotalNumberOfWIs) +
" work-items per work-group.\n"),
PI_ERROR_INVALID_WORK_GROUP_SIZE);
}
} else {
// TODO: Decide what checks (if any) we need for the other backends
}
Expand Down Expand Up @@ -352,6 +363,9 @@ void handleErrorOrWarning(pi_result Error, const device_impl &DeviceImpl,
assert(Error != PI_SUCCESS &&
"Success is expected to be handled on caller side");
switch (Error) {
case PI_ERROR_OUT_OF_RESOURCES:
return handleOutOfResources(DeviceImpl, Kernel, NDRDesc);

case PI_ERROR_INVALID_WORK_GROUP_SIZE:
return handleInvalidWorkGroupSize(DeviceImpl, Kernel, NDRDesc);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
class kernel_vadd_and_sum;

int main() {
sycl::queue q;
sycl::queue q{};
sycl::device dev = q.get_device();
size_t local_size = dev.get_info<sycl::info::device::max_work_group_size>();
if (local_size < 1024u) {
Expand Down Expand Up @@ -80,6 +80,8 @@ int main() {
// compute vector add
const auto vadd = values1 + values2 + values3 + values4;

// NB: 64 registers used to do the vector addition.

// compute total vector elements sum
auto sum = elem_t(0);
for (int j = 0; j < VEC_DIM; j++) {
Expand All @@ -92,11 +94,13 @@ int main() {
output[i] = vadd;
output[i] += sum;
});
}).wait();
} catch (sycl::exception &e) {
}).wait_and_throw();
} catch (const sycl::exception &e) {
using std::string_view_literals::operator""sv;
auto Msg = "Exceeded the number of registers available on the hardware."sv;
if (std::string(e.what()).find(Msg) != std::string::npos) {
auto Errc = sycl::make_error_code(sycl::errc::nd_range);
if (e.code() == Errc &&
std::string_view{e.what()}.find(Msg) != std::string_view::npos) {
return 0;
}
}
Expand Down

0 comments on commit 9f1cee5

Please sign in to comment.