Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL][CUDA] Improve kernel launch error handling for out-of-registers #12604

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions sycl/plugins/unified_runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -100,13 +100,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
endfunction()

set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
# commit 9f783837089c970a22cda08f768aa3dbed38f0d3
# Merge: c015f892 b9442104
# commit 5083f4f96557672b7b6a55ea53347896d40549d7
# Merge: a97eed15 4c3f9abe
# Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
# Date: Fri May 31 10:20:23 2024 +0100
# Merge pull request #1533 from AllanZyne/sanitizer-buffer
# [DeviceSanitizer] Support detecting out-of-bounds errors on sycl::buffer
set(UNIFIED_RUNTIME_TAG 9f783837089c970a22cda08f768aa3dbed38f0d3)
# Date: Fri May 31 17:20:01 2024 +0100
# Merge pull request #1397 from GeorgeWeb/georgi/check-allocation-error-on-event-from-native-handle
# [CUDA][HIP] Catch and report bad_alloc errors for event object creation
set(UNIFIED_RUNTIME_TAG 5083f4f96557672b7b6a55ea53347896d40549d7)

fetch_adapter_source(level_zero
${UNIFIED_RUNTIME_REPO}
Expand Down
100 changes: 57 additions & 43 deletions sycl/source/detail/error_handling/error_handling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,60 @@ namespace sycl {
inline namespace _V1 {
namespace detail::enqueue_kernel_launch {

/// Diagnoses a PI_ERROR_OUT_OF_RESOURCES returned from a kernel launch.
///
/// On the CUDA backend this error commonly means the launch configuration
/// requires more registers per work-group than the device allows; in that
/// case a detailed sycl::errc::nd_range exception is thrown telling the user
/// the register budget and the kernel's per-work-item register usage.
/// For every other case (non-CUDA backends, or CUDA launches whose register
/// usage fits the limit) a generic runtime-error exception carrying the PI
/// error code is thrown instead. This function never returns normally.
///
/// \param DeviceImpl device the launch was attempted on.
/// \param Kernel     the PI kernel whose launch failed.
/// \param NDRDesc    ND-range descriptor of the failed launch.
void handleOutOfResources(const device_impl &DeviceImpl, pi_kernel Kernel,
                          const NDRDescT &NDRDesc) {
  sycl::platform Platform = DeviceImpl.get_platform();
  sycl::backend Backend = Platform.get_backend();
  if (Backend == sycl::backend::ext_oneapi_cuda) {
    // PI_ERROR_OUT_OF_RESOURCES is returned when the kernel registers
    // required for the launch config exceeds the maximum number of registers
    // per block (PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP).
    // This is if local_work_size[0] * ... * local_work_size[work_dim - 1]
    // multiplied by PI_KERNEL_GROUP_INFO_NUM_REGS is greater than the value
    // of PI_KERNEL_MAX_NUM_REGISTERS_PER_BLOCK. See Table 15: Technical
    // Specifications per Compute Capability, for limitations.
    const size_t TotalNumberOfWIs =
        NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];

    const uint32_t MaxRegistersPerBlock =
        DeviceImpl.get_info<ext::codeplay::experimental::info::device::
                                max_registers_per_work_group>();

    const PluginPtr &Plugin = DeviceImpl.getPlugin();
    sycl::detail::pi::PiDevice Device = DeviceImpl.getHandleRef();

    // Query how many registers a single work-item of this kernel consumes.
    uint32_t NumRegisters = 0;
    Plugin->call<PiApiKind::piKernelGetGroupInfo>(
        Kernel, Device, PI_KERNEL_GROUP_INFO_NUM_REGS, sizeof(NumRegisters),
        &NumRegisters, nullptr);

    const bool HasExceededAvailableRegisters =
        TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock;

    if (HasExceededAvailableRegisters) {
      std::string message(
          "Exceeded the number of registers available on the hardware.\n");
      throw sycl::exception(
          sycl::make_error_code(sycl::errc::nd_range),
          // Additional information which can be helpful to the user.
          message.append(
              "\tThe number of registers per work-group cannot exceed " +
              std::to_string(MaxRegistersPerBlock) +
              " for this kernel on this device.\n"
              "\tThe kernel uses " +
              std::to_string(NumRegisters) +
              " registers per work-item for a total of " +
              std::to_string(TotalNumberOfWIs) +
              " work-items per work-group.\n"));
    }
  }
  // Fallback: re-raise the PI error as a generic SYCL runtime exception.
  constexpr pi_result Error = PI_ERROR_OUT_OF_RESOURCES;
  throw sycl::exception(sycl::make_error_code(sycl::errc::runtime),
                        "PI backend failed. PI backend returns: " +
                            codeToString(Error));
}

void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
const NDRDescT &NDRDesc) {
sycl::platform Platform = DeviceImpl.get_platform();
Expand All @@ -30,7 +84,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
bool IsOpenCLV1x = false; // Backend is OpenCL 1.x
bool IsOpenCLVGE20 = false; // Backend is Greater or Equal to OpenCL 2.0
bool IsLevelZero = false; // Backend is any OneAPI Level 0 version
bool IsCuda = false; // Backend is CUDA
auto Backend = Platform.get_backend();
if (Backend == sycl::backend::opencl) {
std::string VersionString =
Expand All @@ -41,8 +94,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
(VersionString.find("2.") == 0) || (VersionString.find("3.") == 0);
} else if (Backend == sycl::backend::ext_oneapi_level_zero) {
IsLevelZero = true;
} else if (Backend == sycl::backend::ext_oneapi_cuda) {
IsCuda = true;
}

const PluginPtr &Plugin = DeviceImpl.getPlugin();
Expand Down Expand Up @@ -243,46 +294,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
// else unknown. fallback (below)
}
}
} else if (IsCuda) {
// CUDA:
// PI_ERROR_INVALID_WORK_GROUP_SIZE is returned when the kernel registers
// required for the launch config exceeds the maximum number of registers
// per block (PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP).
// This is if local_work_size[0] * ... * local_work_size[work_dim - 1]
// multiplied by PI_KERNEL_GROUP_INFO_NUM_REGS is greater than the value
// of PI_KERNEL_MAX_NUM_REGISTERS_PER_BLOCK. See Table 15: Technical
// Specifications per Compute Capability, for limitations.
const size_t TotalNumberOfWIs =
NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];

uint32_t NumRegisters = 0;
Plugin->call<PiApiKind::piKernelGetGroupInfo>(
Kernel, Device, PI_KERNEL_GROUP_INFO_NUM_REGS, sizeof(NumRegisters),
&NumRegisters, nullptr);

uint32_t MaxRegistersPerBlock =
DeviceImpl.get_info<ext::codeplay::experimental::info::device::
max_registers_per_work_group>();

const bool HasExceededAvailableRegisters =
TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock;

if (HasExceededAvailableRegisters) {
std::string message(
"Exceeded the number of registers available on the hardware.\n");
throw sycl::nd_range_error(
// Additional information which can be helpful to the user.
message.append(
"\tThe number registers per work-group cannot exceed " +
std::to_string(MaxRegistersPerBlock) +
" for this kernel on this device.\n"
"\tThe kernel uses " +
std::to_string(NumRegisters) +
" registers per work-item for a total of " +
std::to_string(TotalNumberOfWIs) +
" work-items per work-group.\n"),
PI_ERROR_INVALID_WORK_GROUP_SIZE);
}
} else {
// TODO: Decide what checks (if any) we need for the other backends
}
Expand Down Expand Up @@ -352,6 +363,9 @@ void handleErrorOrWarning(pi_result Error, const device_impl &DeviceImpl,
assert(Error != PI_SUCCESS &&
"Success is expected to be handled on caller side");
switch (Error) {
case PI_ERROR_OUT_OF_RESOURCES:
return handleOutOfResources(DeviceImpl, Kernel, NDRDesc);

case PI_ERROR_INVALID_WORK_GROUP_SIZE:
return handleInvalidWorkGroupSize(DeviceImpl, Kernel, NDRDesc);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
class kernel_vadd_and_sum;

int main() {
sycl::queue q;
sycl::queue q{};
sycl::device dev = q.get_device();
size_t local_size = dev.get_info<sycl::info::device::max_work_group_size>();
if (local_size < 1024u) {
Expand Down Expand Up @@ -80,6 +80,8 @@ int main() {
// compute vector add
const auto vadd = values1 + values2 + values3 + values4;

// NB: 64 registers used to do the vector addition.

// compute total vector elements sum
auto sum = elem_t(0);
for (int j = 0; j < VEC_DIM; j++) {
Expand All @@ -92,11 +94,13 @@ int main() {
output[i] = vadd;
output[i] += sum;
});
}).wait();
} catch (sycl::exception &e) {
}).wait_and_throw();
} catch (const sycl::exception &e) {
using std::string_view_literals::operator""sv;
auto Msg = "Exceeded the number of registers available on the hardware."sv;
if (std::string(e.what()).find(Msg) != std::string::npos) {
auto Errc = sycl::make_error_code(sycl::errc::nd_range);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@GeorgeWeb, I was looking at the PR and got confused by it. Could you please clarify what the key change here is? The PR description says that we used to display a confusing error to users, but the check for the exception message wasn't changed by the PR.

What was the confusing part then? I see that in #12363 we had a bug report which contains invalid message, but this test had been introduced a year ago before that issue was submitted in #9106. What am I missing here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In summary, the wrong part of the message, as per the PR's description, was just the plugin error-code description — PI_ERROR_INVALID_WORK_GROUP_SIZE should have been PI_ERROR_OUT_OF_LAUNCH_RESOURCES. The reason I added the errc::nd_range error-code check was just for more verbosity; it wasn't of importance here.


The real issue about "reporting a completely wrong message" in the OP's report (#12363) was due to a mistake on this line https://github.com/intel/llvm/pull/9106/files#diff-7525901710934f7bdb2ad36238c4b67163f112d3bd233db7af0b0078b5b01e80R3263, which was fixed by this UR CUDA change: oneapi-src/unified-runtime#1299

if (e.code() == Errc &&
std::string_view{e.what()}.find(Msg) != std::string_view::npos) {
return 0;
}
}
Expand Down
Loading