From fbb1fb0010ec128c93148461c951dccf4c01b5b7 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Mon, 9 Sep 2024 16:33:23 -0700 Subject: [PATCH] [SYCL] Take into account UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY error code in Program Manager (#15335) Currently if program manager faces one of the errors - UR_RESULT_ERROR_OUT_OF_RESOURCES or UR_RESULT_ERROR_OUT_OF_HOST_MEMORY - during the program building/linking then it will clear the cache and make another attempt. This PR adds the following changes: * Additionally take into account UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY error which is also possible in addition to aforementioned error codes. * Parameterize the existing unit test by error code (which allows to avoid excessive code duplication) and add UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY value to testing. --- sycl/source/detail/kernel_program_cache.hpp | 3 +- .../program_manager/program_manager.cpp | 6 +- .../kernel-and-program/OutOfResources.cpp | 140 ++---------------- 3 files changed, 22 insertions(+), 127 deletions(-) diff --git a/sycl/source/detail/kernel_program_cache.hpp b/sycl/source/detail/kernel_program_cache.hpp index bc800b034179d..44dfd84751afd 100644 --- a/sycl/source/detail/kernel_program_cache.hpp +++ b/sycl/source/detail/kernel_program_cache.hpp @@ -329,7 +329,8 @@ class KernelProgramCache { BuildResult->Error.Code = detail::get_ur_error(Ex); if (Ex.code() == errc::memory_allocation || BuildResult->Error.Code == UR_RESULT_ERROR_OUT_OF_RESOURCES || - BuildResult->Error.Code == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY) { + BuildResult->Error.Code == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY || + BuildResult->Error.Code == UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) { reset(); BuildResult->updateAndNotify(BuildState::BS_Initial); continue; diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index fb30de4695499..99f3c5204dc74 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -1605,7 +1605,8 @@ ProgramManager::ProgramPtr ProgramManager::build( }; ur_result_t Error = doLink(); if (Error == UR_RESULT_ERROR_OUT_OF_RESOURCES || - Error == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY) { + Error == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY || + Error == UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) { Context->getKernelProgramCache().reset(); Error = doLink(); } @@ -2427,7 +2428,8 @@ ProgramManager::link(const device_image_plain &DeviceImage, }; ur_result_t Error = doLink(); if (Error == UR_RESULT_ERROR_OUT_OF_RESOURCES || - Error == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY) { + Error == UR_RESULT_ERROR_OUT_OF_HOST_MEMORY || + Error == UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY) { ContextImpl->getKernelProgramCache().reset(); Error = doLink(); } diff --git a/sycl/unittests/kernel-and-program/OutOfResources.cpp b/sycl/unittests/kernel-and-program/OutOfResources.cpp index c249e6599ec5a..b0b6e877ebe77 100644 --- a/sycl/unittests/kernel-and-program/OutOfResources.cpp +++ b/sycl/unittests/kernel-and-program/OutOfResources.cpp @@ -35,28 +35,24 @@ static sycl::unittest::UrImageArray<2> ImgArray{Img}; static int nProgramCreate = 0; static volatile bool outOfResourcesToggle = false; -static volatile bool outOfHostMemoryToggle = false; +static volatile ur_result_t ErrorCode = UR_RESULT_SUCCESS; static ur_result_t redefinedProgramCreateWithIL(void *) { ++nProgramCreate; if (outOfResourcesToggle) { outOfResourcesToggle = false; - return UR_RESULT_ERROR_OUT_OF_RESOURCES; + return ErrorCode; } return UR_RESULT_SUCCESS; } -static ur_result_t redefinedProgramCreateWithILOutOfHostMemory(void *) { - ++nProgramCreate; - if (outOfHostMemoryToggle) { - outOfHostMemoryToggle = false; - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } - return UR_RESULT_SUCCESS; -} +// Parameterized test fixture +class OutOfResourcesTestSuite : public ::testing::TestWithParam {}; -TEST(OutOfResourcesTest, urProgramCreate) { +TEST_P(OutOfResourcesTestSuite, urProgramCreate) { + nProgramCreate = 0; sycl::unittest::UrMock<> Mock; + ErrorCode = GetParam(); mock::getCallbacks().set_before_callback("urProgramCreateWithIL", &redefinedProgramCreateWithIL); @@ -116,92 +112,21 @@ TEST(OutOfResourcesTest, urProgramCreate) { } } -TEST(OutOfHostMemoryTest, urProgramCreate) { - // Reset to zero. - nProgramCreate = 0; - - sycl::unittest::UrMock<> Mock; - mock::getCallbacks().set_before_callback( - "urProgramCreateWithIL", &redefinedProgramCreateWithILOutOfHostMemory); - - sycl::platform Plt{sycl::platform()}; - sycl::context Ctx{Plt}; - auto CtxImpl = detail::getSyclObjImpl(Ctx); - queue q(Ctx, default_selector_v); - - int runningTotal = 0; - // Cache is empty, so one urProgramCreateWithIL call. - q.single_task([] {}); - EXPECT_EQ(nProgramCreate, runningTotal += 1); - - // Now, we make the next urProgramCreateWithIL call fail with - // UR_RESULT_ERROR_OUT_OF_HOST_MEMORY. The caching mechanism should catch - // this, clear the cache, and retry the urProgramCreateWithIL. - outOfHostMemoryToggle = true; - q.single_task([] {}); - EXPECT_FALSE(outOfHostMemoryToggle); - EXPECT_EQ(nProgramCreate, runningTotal += 2); - { - detail::KernelProgramCache::ProgramCache &Cache = - CtxImpl->getKernelProgramCache().acquireCachedPrograms().get(); - EXPECT_EQ(Cache.size(), 1U) << "Expected 1 program in the cache"; - } - - // The next urProgramCreateWithIL call will fail with - // UR_RESULT_ERROR_OUT_OF_HOST_MEMORY. But OutOfResourcesKernel2 is in the - // cache, so we expect no new urProgramCreateWithIL calls. - outOfHostMemoryToggle = true; - q.single_task([] {}); - EXPECT_TRUE(outOfHostMemoryToggle); - EXPECT_EQ(nProgramCreate, runningTotal); - - // OutOfResourcesKernel1 is not in the cache, so we have to - // build it. From what we set before, this call will fail, - // the cache will clear out, and will try again. - q.single_task([] {}); - EXPECT_FALSE(outOfHostMemoryToggle); - EXPECT_EQ(nProgramCreate, runningTotal += 2); - { - detail::KernelProgramCache::ProgramCache &Cache = - CtxImpl->getKernelProgramCache().acquireCachedPrograms().get(); - EXPECT_EQ(Cache.size(), 1U) << "Expected 1 program in the cache"; - } - - // Finally, OutOfResourcesKernel1 will be in the cache, but - // OutOfResourceKenel2 will not, so one more urProgramCreateWithIL. - // Toggle is not set, so this should succeed. - q.single_task([] {}); - q.single_task([] {}); - EXPECT_EQ(nProgramCreate, runningTotal += 1); - { - detail::KernelProgramCache::ProgramCache &Cache = - CtxImpl->getKernelProgramCache().acquireCachedPrograms().get(); - EXPECT_EQ(Cache.size(), 2U) << "Expected 2 program in the cache"; - } -} - static int nProgramLink = 0; static ur_result_t redefinedProgramLink(void *) { ++nProgramLink; if (outOfResourcesToggle) { outOfResourcesToggle = false; - return UR_RESULT_ERROR_OUT_OF_RESOURCES; - } - return UR_RESULT_SUCCESS; -} - -static ur_result_t redefinedProgramLinkOutOfHostMemory(void *) { - ++nProgramLink; - if (outOfHostMemoryToggle) { - outOfHostMemoryToggle = false; - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + return ErrorCode; } return UR_RESULT_SUCCESS; } -TEST(OutOfResourcesTest, urProgramLink) { +TEST_P(OutOfResourcesTestSuite, urProgramLink) { + nProgramLink = 0; sycl::unittest::UrMock<> Mock; + ErrorCode = GetParam(); mock::getCallbacks().set_before_callback("urProgramLinkExp", &redefinedProgramLink); @@ -236,41 +161,8 @@ TEST(OutOfResourcesTest, urProgramLink) { } } -TEST(OutOfHostMemoryTest, urProgramLink) { - // Reset to zero. - nProgramLink = 0; - - sycl::unittest::UrMock<> Mock; - mock::getCallbacks().set_before_callback( - "urProgramLinkExp", &redefinedProgramLinkOutOfHostMemory); - - sycl::platform Plt{sycl::platform()}; - sycl::context Ctx{Plt}; - auto CtxImpl = detail::getSyclObjImpl(Ctx); - queue q(Ctx, default_selector_v); - // Put some programs in the cache - q.single_task([] {}); - q.single_task([] {}); - { - detail::KernelProgramCache::ProgramCache &Cache = - CtxImpl->getKernelProgramCache().acquireCachedPrograms().get(); - EXPECT_EQ(Cache.size(), 2U) << "Expect 2 programs in the cache"; - } - - auto b1 = sycl::get_kernel_bundle(Ctx); - auto b2 = sycl::get_kernel_bundle(Ctx); - outOfHostMemoryToggle = true; - EXPECT_EQ(nProgramLink, 0); - auto b3 = sycl::link({b1, b2}); - EXPECT_FALSE(outOfHostMemoryToggle); - // one restart due to out of resources, one link per each of b1 and b2. - EXPECT_EQ(nProgramLink, 3); - // no programs should be in the cache due to out of resources. - { - detail::KernelProgramCache::ProgramCache &Cache = - CtxImpl->getKernelProgramCache().acquireCachedPrograms().get(); - EXPECT_EQ(Cache.size(), 0u) << "Expect no programs in the cache"; - } -} +INSTANTIATE_TEST_SUITE_P( + OutOfResourcesParameterizedRun, OutOfResourcesTestSuite, + ::testing::Values(UR_RESULT_ERROR_OUT_OF_RESOURCES, + UR_RESULT_ERROR_OUT_OF_HOST_MEMORY, + UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY));