Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL][CUDA] Improve kernel launch error handling for out-of-registers #12604

Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions sycl/plugins/unified_runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@

# Options to override the default behaviour of the FetchContent to include UR
# source code.
# TODO: Resolve this tag override after related UR changes are merged.
kbenzie marked this conversation as resolved.
Show resolved Hide resolved
set(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO
"" CACHE STRING "Override the Unified Runtime FetchContent repository")
"https://github.com/GeorgeWeb/unified-runtime.git" CACHE STRING "Override the Unified Runtime FetchContent repository")
set(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_TAG
"" CACHE STRING "Override the Unified Runtime FetchContent tag")
"eec7aa47fb7ddf9f659984ae1fd6d67eb3252f3d" CACHE STRING "Override the Unified Runtime FetchContent tag")

# Options to disable use of FetchContent to include Unified Runtime source code
# to improve developer workflow.
Expand Down
99 changes: 56 additions & 43 deletions sycl/source/detail/error_handling/error_handling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,59 @@ namespace sycl {
inline namespace _V1 {
namespace detail::enqueue_kernel_launch {

void handleOutOfResources(const device_impl &DeviceImpl, pi_kernel Kernel,
const NDRDescT &NDRDesc) {
sycl::platform Platform = DeviceImpl.get_platform();
sycl::backend Backend = Platform.get_backend();
if (Backend == sycl::backend::ext_oneapi_cuda) {
// PI_ERROR_OUT_OF_RESOURCES is returned when the kernel registers
// required for the launch config exceeds the maximum number of registers
// per block (PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP).
// This is if local_work_size[0] * ... * local_work_size[work_dim - 1]
// multiplied by PI_KERNEL_GROUP_INFO_NUM_REGS is greater than the value
// of PI_KERNEL_MAX_NUM_REGISTERS_PER_BLOCK. See Table 15: Technical
// Specifications per Compute Capability, for limitations.
const size_t TotalNumberOfWIs =
NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];

const uint32_t MaxRegistersPerBlock =
DeviceImpl.get_info<ext::codeplay::experimental::info::device::
max_registers_per_work_group>();

const PluginPtr &Plugin = DeviceImpl.getPlugin();
sycl::detail::pi::PiDevice Device = DeviceImpl.getHandleRef();

uint32_t NumRegisters = 0;
Plugin->call<PiApiKind::piKernelGetGroupInfo>(
Kernel, Device, PI_KERNEL_GROUP_INFO_NUM_REGS, sizeof(NumRegisters),
&NumRegisters, nullptr);

const bool HasExceededAvailableRegisters =
TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock;

if (HasExceededAvailableRegisters) {
std::string message(
"Exceeded the number of registers available on the hardware.\n");
throw sycl::nd_range_error(
steffenlarsen marked this conversation as resolved.
Show resolved Hide resolved
// Additional information which can be helpful to the user.
message.append(
"\tThe number registers per work-group cannot exceed " +
std::to_string(MaxRegistersPerBlock) +
" for this kernel on this device.\n"
"\tThe kernel uses " +
std::to_string(NumRegisters) +
" registers per work-item for a total of " +
std::to_string(TotalNumberOfWIs) +
" work-items per work-group.\n"),
PI_ERROR_OUT_OF_RESOURCES);
}
}
// Fallback
constexpr pi_result Error = PI_ERROR_OUT_OF_RESOURCES;
throw runtime_error(
steffenlarsen marked this conversation as resolved.
Show resolved Hide resolved
"PI backend failed. PI backend returns: " + codeToString(Error), Error);
}

void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
const NDRDescT &NDRDesc) {
sycl::platform Platform = DeviceImpl.get_platform();
Expand All @@ -30,7 +83,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
bool IsOpenCLV1x = false; // Backend is OpenCL 1.x
bool IsOpenCLVGE20 = false; // Backend is Greater or Equal to OpenCL 2.0
bool IsLevelZero = false; // Backend is any OneAPI Level 0 version
bool IsCuda = false; // Backend is CUDA
auto Backend = Platform.get_backend();
if (Backend == sycl::backend::opencl) {
std::string VersionString =
Expand All @@ -41,8 +93,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
(VersionString.find("2.") == 0) || (VersionString.find("3.") == 0);
} else if (Backend == sycl::backend::ext_oneapi_level_zero) {
IsLevelZero = true;
} else if (Backend == sycl::backend::ext_oneapi_cuda) {
IsCuda = true;
}

const PluginPtr &Plugin = DeviceImpl.getPlugin();
Expand Down Expand Up @@ -243,46 +293,6 @@ void handleInvalidWorkGroupSize(const device_impl &DeviceImpl, pi_kernel Kernel,
// else unknown. fallback (below)
}
}
} else if (IsCuda) {
// CUDA:
// PI_ERROR_INVALID_WORK_GROUP_SIZE is returned when the kernel registers
// required for the launch config exceeds the maximum number of registers
// per block (PI_EXT_CODEPLAY_DEVICE_INFO_MAX_REGISTERS_PER_WORK_GROUP).
// This is if local_work_size[0] * ... * local_work_size[work_dim - 1]
// multiplied by PI_KERNEL_GROUP_INFO_NUM_REGS is greater than the value
// of PI_KERNEL_MAX_NUM_REGISTERS_PER_BLOCK. See Table 15: Technical
// Specifications per Compute Capability, for limitations.
const size_t TotalNumberOfWIs =
NDRDesc.LocalSize[0] * NDRDesc.LocalSize[1] * NDRDesc.LocalSize[2];

uint32_t NumRegisters = 0;
Plugin->call<PiApiKind::piKernelGetGroupInfo>(
Kernel, Device, PI_KERNEL_GROUP_INFO_NUM_REGS, sizeof(NumRegisters),
&NumRegisters, nullptr);

uint32_t MaxRegistersPerBlock =
DeviceImpl.get_info<ext::codeplay::experimental::info::device::
max_registers_per_work_group>();

const bool HasExceededAvailableRegisters =
TotalNumberOfWIs * NumRegisters > MaxRegistersPerBlock;

if (HasExceededAvailableRegisters) {
std::string message(
"Exceeded the number of registers available on the hardware.\n");
throw sycl::nd_range_error(
// Additional information which can be helpful to the user.
message.append(
"\tThe number registers per work-group cannot exceed " +
std::to_string(MaxRegistersPerBlock) +
" for this kernel on this device.\n"
"\tThe kernel uses " +
std::to_string(NumRegisters) +
" registers per work-item for a total of " +
std::to_string(TotalNumberOfWIs) +
" work-items per work-group.\n"),
PI_ERROR_INVALID_WORK_GROUP_SIZE);
}
} else {
// TODO: Decide what checks (if any) we need for the other backends
}
Expand Down Expand Up @@ -352,6 +362,9 @@ void handleErrorOrWarning(pi_result Error, const device_impl &DeviceImpl,
assert(Error != PI_SUCCESS &&
"Success is expected to be handled on caller side");
switch (Error) {
case PI_ERROR_OUT_OF_RESOURCES:
return handleOutOfResources(DeviceImpl, Kernel, NDRDesc);

case PI_ERROR_INVALID_WORK_GROUP_SIZE:
return handleInvalidWorkGroupSize(DeviceImpl, Kernel, NDRDesc);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

#include "sycl/info/info_desc.hpp"
GeorgeWeb marked this conversation as resolved.
Show resolved Hide resolved
#include <numeric>
#include <string_view>
#include <type_traits>
Expand All @@ -25,7 +26,7 @@
class kernel_vadd_and_sum;

int main() {
sycl::queue q;
sycl::queue q{};
sycl::device dev = q.get_device();
size_t local_size = dev.get_info<sycl::info::device::max_work_group_size>();
if (local_size < 1024u) {
Expand Down Expand Up @@ -79,6 +80,8 @@ int main() {
// compute vector add
const auto vadd = values1 + values2 + values3 + values4;

// NB: 64 registers used to do the vector addition.

// compute total vector elements sum
auto sum = elem_t(0);
for (int j = 0; j < VEC_DIM; j++) {
Expand All @@ -91,11 +94,13 @@ int main() {
output[i] = vadd;
output[i] += sum;
});
}).wait();
} catch (sycl::exception &e) {
}).wait_and_throw();
} catch (const sycl::exception &e) {
using std::string_view_literals::operator""sv;
auto Msg = "Exceeded the number of registers available on the hardware."sv;
if (std::string(e.what()).find(Msg) != std::string::npos) {
int Errc = PI_ERROR_OUT_OF_RESOURCES;
if (e.get_cl_code() == Errc &&
std::string_view{e.what()}.find(Msg) != std::string_view::npos) {
return 0;
}
}
Expand Down
Loading