From b5ea5954558e23524d13d038369a183e0f8d7bbc Mon Sep 17 00:00:00 2001 From: "Spruit, Neil R" Date: Thu, 11 Jan 2024 11:55:25 -0800 Subject: [PATCH] [L0] Only Override max allocation limits given env - Change the default from always allowing > 4GB allocations to requiring the user to request > 4GB allocation support when the max allocation allowed on that system is less than 4GB. - This ensures performance is maintained on systems that don't handle > 4GB allocations natively and avoids breaking Ahead of Time (AOT) binaries that were built without > 4GB resource support. - By setting UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1, the L0 Adapter will force the modules to be built with stateless or > 4GB support and will allow for the allocations to exceed the max single allocation size limit for that device. Signed-off-by: Spruit, Neil R --- source/adapters/level_zero/device.cpp | 20 ++++++++------------ source/adapters/level_zero/device.hpp | 2 +- source/adapters/level_zero/program.cpp | 4 ++-- source/adapters/level_zero/usm.cpp | 2 +- 4 files changed, 12 insertions(+), 16 deletions(-) diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index abdfd2e541..68a0fcfe69 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -268,9 +268,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(uint32_t{64}); } case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: - // if not optimized for 32-bit access, return total memory size. - // otherwise, return only maximum allocatable size. - if (Device->useOptimized32bitAccess() == 0) { + // if the user wishes to allocate large allocations on a system that usually + // does not allow that allocation size, then we return the max global mem + // size as the limit. 
+ if (Device->useRelaxedAllocationLimits() == 1) { return ReturnValue(uint64_t{calculateGlobalMemSize(Device)}); } else { return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize}); @@ -923,20 +924,15 @@ ur_device_handle_t_::useImmediateCommandLists() { } } -int32_t ur_device_handle_t_::useOptimized32bitAccess() { - static const int32_t Optimize32bitAccessMode = [this] { - // If device is Intel(R) Data Center GPU Max, - // use default provided by L0 driver. - // TODO: Use IP versioning to select based on range of devices - if (this->isPVC()) - return -1; - const char *UrRet = std::getenv("UR_L0_USE_OPTIMIZED_32BIT_ACCESS"); +int32_t ur_device_handle_t_::useRelaxedAllocationLimits() { + static const int32_t EnableRelaxedAllocationLimits = [this] { + const char *UrRet = std::getenv("UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS"); if (!UrRet) return 0; return std::atoi(UrRet); }(); - return Optimize32bitAccessMode; + return EnableRelaxedAllocationLimits; } ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp index 3b91b70058..f644f01226 100644 --- a/source/adapters/level_zero/device.hpp +++ b/source/adapters/level_zero/device.hpp @@ -160,7 +160,7 @@ struct ur_device_handle_t_ : _ur_object { // provide support for only one, like for Intel(R) // Data Center GPU Max, for which L0 driver only // supports stateless. 
- int32_t useOptimized32bitAccess(); + int32_t useRelaxedAllocationLimits(); bool isSubDevice() { return RootDevice != nullptr; } diff --git a/source/adapters/level_zero/program.cpp b/source/adapters/level_zero/program.cpp index f118a5b9dd..9b78a22960 100644 --- a/source/adapters/level_zero/program.cpp +++ b/source/adapters/level_zero/program.cpp @@ -161,7 +161,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp( ZeBuildOptions += pOptions; } - if (phDevices[0]->useOptimized32bitAccess() == 0) { + if (phDevices[0]->useRelaxedAllocationLimits() == 1) { ZeBuildOptions += " -ze-opt-greater-than-4GB-buffer-required"; } @@ -256,7 +256,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile( // ze-opt-greater-than-4GB-buffer-required to disable // stateful optimizations and be able to use larger than // 4GB allocations on these kernels. - if (Context->Devices[0]->useOptimized32bitAccess() == 0) { + if (Context->Devices[0]->useRelaxedAllocationLimits() == 1) { Program->BuildFlags += " -ze-opt-greater-than-4GB-buffer-required"; } } diff --git a/source/adapters/level_zero/usm.cpp b/source/adapters/level_zero/usm.cpp index e4a00249a2..50c1593e99 100644 --- a/source/adapters/level_zero/usm.cpp +++ b/source/adapters/level_zero/usm.cpp @@ -178,7 +178,7 @@ static ur_result_t USMDeviceAllocImpl(void **ResultPtr, ZeDesc.flags = 0; ZeDesc.ordinal = 0; - if (Device->useOptimized32bitAccess() == 0 && + if (Device->useRelaxedAllocationLimits() == 1 && (Size > Device->ZeDeviceProperties->maxMemAllocSize)) { // Tell Level-Zero to accept Size > maxMemAllocSize if // large allocations are used.