Skip to content

Commit

Permalink
[SYCL] Take into account UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY error c…
Browse files Browse the repository at this point in the history
…ode in Program Manager (#15335)

Currently if program manager faces one of the errors -
UR_RESULT_ERROR_OUT_OF_RESOURCES or UR_RESULT_ERROR_OUT_OF_HOST_MEMORY -
during the program building/linking then it will clear the cache and
make another attempt.
This PR adds the following changes:
* Additionally take into account UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY
error which is also possible in addition to aforementioned error codes.
* Parameterize the existing unit test by error code (which allows to
avoid excessive code duplication) and add
UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY value to testing.
  • Loading branch information
againull committed Sep 9, 2024
1 parent 114236f commit fbb1fb0
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 127 deletions.
3 changes: 2 additions & 1 deletion sycl/source/detail/kernel_program_cache.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,8 @@ class KernelProgramCache {
BuildResult->Error.Code = detail::get_ur_error(Ex);
if (Ex.code() == errc::memory_allocation ||
BuildResult->Error.Code == UR_RESULT_ERROR_OUT_OF_RESOURCES ||
BuildResult->Error.Code == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY) {
BuildResult->Error.Code == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY ||
BuildResult->Error.Code == UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) {
reset();
BuildResult->updateAndNotify(BuildState::BS_Initial);
continue;
Expand Down
6 changes: 4 additions & 2 deletions sycl/source/detail/program_manager/program_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1605,7 +1605,8 @@ ProgramManager::ProgramPtr ProgramManager::build(
};
ur_result_t Error = doLink();
if (Error == UR_RESULT_ERROR_OUT_OF_RESOURCES ||
Error == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY) {
Error == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY ||
Error == UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) {
Context->getKernelProgramCache().reset();
Error = doLink();
}
Expand Down Expand Up @@ -2427,7 +2428,8 @@ ProgramManager::link(const device_image_plain &DeviceImage,
};
ur_result_t Error = doLink();
if (Error == UR_RESULT_ERROR_OUT_OF_RESOURCES ||
Error == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY) {
Error == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY ||
Error == UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) {
ContextImpl->getKernelProgramCache().reset();
Error = doLink();
}
Expand Down
140 changes: 16 additions & 124 deletions sycl/unittests/kernel-and-program/OutOfResources.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,28 +35,24 @@ static sycl::unittest::UrImageArray<2> ImgArray{Img};

static int nProgramCreate = 0;
static volatile bool outOfResourcesToggle = false;
static volatile bool outOfHostMemoryToggle = false;
static volatile ur_result_t ErrorCode = UR_RESULT_SUCCESS;

static ur_result_t redefinedProgramCreateWithIL(void *) {
++nProgramCreate;
if (outOfResourcesToggle) {
outOfResourcesToggle = false;
return UR_RESULT_ERROR_OUT_OF_RESOURCES;
return ErrorCode;
}
return UR_RESULT_SUCCESS;
}

static ur_result_t redefinedProgramCreateWithILOutOfHostMemory(void *) {
++nProgramCreate;
if (outOfHostMemoryToggle) {
outOfHostMemoryToggle = false;
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
}
return UR_RESULT_SUCCESS;
}
// Parameterized test fixture
class OutOfResourcesTestSuite : public ::testing::TestWithParam<ur_result_t> {};

TEST(OutOfResourcesTest, urProgramCreate) {
TEST_P(OutOfResourcesTestSuite, urProgramCreate) {
nProgramCreate = 0;
sycl::unittest::UrMock<> Mock;
ErrorCode = GetParam();
mock::getCallbacks().set_before_callback("urProgramCreateWithIL",
&redefinedProgramCreateWithIL);

Expand Down Expand Up @@ -116,92 +112,21 @@ TEST(OutOfResourcesTest, urProgramCreate) {
}
}

TEST(OutOfHostMemoryTest, urProgramCreate) {
// Reset to zero.
nProgramCreate = 0;

sycl::unittest::UrMock<> Mock;
mock::getCallbacks().set_before_callback(
"urProgramCreateWithIL", &redefinedProgramCreateWithILOutOfHostMemory);

sycl::platform Plt{sycl::platform()};
sycl::context Ctx{Plt};
auto CtxImpl = detail::getSyclObjImpl(Ctx);
queue q(Ctx, default_selector_v);

int runningTotal = 0;
// Cache is empty, so one urProgramCreateWithIL call.
q.single_task<class OutOfResourcesKernel1>([] {});
EXPECT_EQ(nProgramCreate, runningTotal += 1);

// Now, we make the next urProgramCreateWithIL call fail with
// UR_RESULT_ERROR_OUT_OF_HOST_MEMORY. The caching mechanism should catch
// this, clear the cache, and retry the urProgramCreateWithIL.
outOfHostMemoryToggle = true;
q.single_task<class OutOfResourcesKernel2>([] {});
EXPECT_FALSE(outOfHostMemoryToggle);
EXPECT_EQ(nProgramCreate, runningTotal += 2);
{
detail::KernelProgramCache::ProgramCache &Cache =
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
EXPECT_EQ(Cache.size(), 1U) << "Expected 1 program in the cache";
}

// The next urProgramCreateWithIL call will fail with
// UR_RESULT_ERROR_OUT_OF_HOST_MEMORY. But OutOfResourcesKernel2 is in the
// cache, so we expect no new urProgramCreateWithIL calls.
outOfHostMemoryToggle = true;
q.single_task<class OutOfResourcesKernel2>([] {});
EXPECT_TRUE(outOfHostMemoryToggle);
EXPECT_EQ(nProgramCreate, runningTotal);

// OutOfResourcesKernel1 is not in the cache, so we have to
// build it. From what we set before, this call will fail,
// the cache will clear out, and will try again.
q.single_task<class OutOfResourcesKernel1>([] {});
EXPECT_FALSE(outOfHostMemoryToggle);
EXPECT_EQ(nProgramCreate, runningTotal += 2);
{
detail::KernelProgramCache::ProgramCache &Cache =
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
EXPECT_EQ(Cache.size(), 1U) << "Expected 1 program in the cache";
}

// Finally, OutOfResourcesKernel1 will be in the cache, but
// OutOfResourceKenel2 will not, so one more urProgramCreateWithIL.
// Toggle is not set, so this should succeed.
q.single_task<class OutOfResourcesKernel1>([] {});
q.single_task<class OutOfResourcesKernel2>([] {});
EXPECT_EQ(nProgramCreate, runningTotal += 1);
{
detail::KernelProgramCache::ProgramCache &Cache =
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
EXPECT_EQ(Cache.size(), 2U) << "Expected 2 program in the cache";
}
}

static int nProgramLink = 0;

static ur_result_t redefinedProgramLink(void *) {
++nProgramLink;
if (outOfResourcesToggle) {
outOfResourcesToggle = false;
return UR_RESULT_ERROR_OUT_OF_RESOURCES;
}
return UR_RESULT_SUCCESS;
}

static ur_result_t redefinedProgramLinkOutOfHostMemory(void *) {
++nProgramLink;
if (outOfHostMemoryToggle) {
outOfHostMemoryToggle = false;
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
return ErrorCode;
}
return UR_RESULT_SUCCESS;
}

TEST(OutOfResourcesTest, urProgramLink) {
TEST_P(OutOfResourcesTestSuite, urProgramLink) {
nProgramLink = 0;
sycl::unittest::UrMock<> Mock;
ErrorCode = GetParam();
mock::getCallbacks().set_before_callback("urProgramLinkExp",
&redefinedProgramLink);

Expand Down Expand Up @@ -236,41 +161,8 @@ TEST(OutOfResourcesTest, urProgramLink) {
}
}

TEST(OutOfHostMemoryTest, urProgramLink) {
// Reset to zero.
nProgramLink = 0;

sycl::unittest::UrMock<> Mock;
mock::getCallbacks().set_before_callback(
"urProgramLinkExp", &redefinedProgramLinkOutOfHostMemory);

sycl::platform Plt{sycl::platform()};
sycl::context Ctx{Plt};
auto CtxImpl = detail::getSyclObjImpl(Ctx);
queue q(Ctx, default_selector_v);
// Put some programs in the cache
q.single_task<class OutOfResourcesKernel1>([] {});
q.single_task<class OutOfResourcesKernel2>([] {});
{
detail::KernelProgramCache::ProgramCache &Cache =
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
EXPECT_EQ(Cache.size(), 2U) << "Expect 2 programs in the cache";
}

auto b1 = sycl::get_kernel_bundle<OutOfResourcesKernel1,
sycl::bundle_state::object>(Ctx);
auto b2 = sycl::get_kernel_bundle<OutOfResourcesKernel2,
sycl::bundle_state::object>(Ctx);
outOfHostMemoryToggle = true;
EXPECT_EQ(nProgramLink, 0);
auto b3 = sycl::link({b1, b2});
EXPECT_FALSE(outOfHostMemoryToggle);
// one restart due to out of resources, one link per each of b1 and b2.
EXPECT_EQ(nProgramLink, 3);
// no programs should be in the cache due to out of resources.
{
detail::KernelProgramCache::ProgramCache &Cache =
CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
EXPECT_EQ(Cache.size(), 0u) << "Expect no programs in the cache";
}
}
INSTANTIATE_TEST_SUITE_P(
OutOfResourcesParameterizedRun, OutOfResourcesTestSuite,
::testing::Values(UR_RESULT_ERROR_OUT_OF_RESOURCES,
UR_RESULT_ERROR_OUT_OF_HOST_MEMORY,
UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY));

0 comments on commit fbb1fb0

Please sign in to comment.