From 0ebaca6ed1f988f0fc2d8138e1534323a78f9f65 Mon Sep 17 00:00:00 2001 From: "Spruit, Neil R" Date: Thu, 11 Jan 2024 11:55:25 -0800 Subject: [PATCH] [L0] Only Override max allocation limits given env - Change the defaults from always allowing > 4GB allocations to making the user have to request > 4GB allocation support given the max allocation allowed on that system is less than 4GB. - This ensures performance is maintained on systems that don't handle > 4GB allocations natively and avoids breaking Ahead of Time (AOT) binaries that were built without > 4GB resource support. - By setting UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS=1, the L0 Adapter will force the modules to be built with stateless or > 4GB support and will allow for the allocations to exceed the max single allocation size limit for that device. Signed-off-by: Spruit, Neil R --- source/adapters/level_zero/device.cpp | 26 +++++++++++--------------- source/adapters/level_zero/device.hpp | 2 +- source/adapters/level_zero/program.cpp | 4 ++-- source/adapters/level_zero/usm.cpp | 4 ++-- 4 files changed, 16 insertions(+), 20 deletions(-) diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index abdfd2e541..0b8e12c67a 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -10,6 +10,7 @@ #include "device.hpp" #include "ur_level_zero.hpp" +#include "ur_util.hpp" #include #include #include @@ -268,9 +269,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(uint32_t{64}); } case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: - // if not optimized for 32-bit access, return total memory size. - // otherwise, return only maximum allocatable size. - if (Device->useOptimized32bitAccess() == 0) { + // if the user wishes to allocate large allocations on a system that usually + // does not allow that allocation size, then we return the max global mem + // size as the limit. 
+ if (Device->useRelaxedAllocationLimits()) { return ReturnValue(uint64_t{calculateGlobalMemSize(Device)}); } else { return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize}); @@ -923,20 +925,14 @@ ur_device_handle_t_::useImmediateCommandLists() { } } -int32_t ur_device_handle_t_::useOptimized32bitAccess() { - static const int32_t Optimize32bitAccessMode = [this] { - // If device is Intel(R) Data Center GPU Max, - // use default provided by L0 driver. - // TODO: Use IP versioning to select based on range of devices - if (this->isPVC()) - return -1; - const char *UrRet = std::getenv("UR_L0_USE_OPTIMIZED_32BIT_ACCESS"); - if (!UrRet) - return 0; - return std::atoi(UrRet); +bool ur_device_handle_t_::useRelaxedAllocationLimits() { + static const bool EnableRelaxedAllocationLimits = [] { + auto UrRet = ur_getenv("UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS"); + const bool RetVal = UrRet ? std::stoi(*UrRet) : 0; + return RetVal; }(); - return Optimize32bitAccessMode; + return EnableRelaxedAllocationLimits; } ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp index 3b91b70058..94480336c5 100644 --- a/source/adapters/level_zero/device.hpp +++ b/source/adapters/level_zero/device.hpp @@ -160,7 +160,7 @@ struct ur_device_handle_t_ : _ur_object { // provide support for only one, like for Intel(R) // Data Center GPU Max, for which L0 driver only // supports stateless. 
- int32_t useOptimized32bitAccess(); + bool useRelaxedAllocationLimits(); bool isSubDevice() { return RootDevice != nullptr; } diff --git a/source/adapters/level_zero/program.cpp b/source/adapters/level_zero/program.cpp index f118a5b9dd..bb2d964422 100644 --- a/source/adapters/level_zero/program.cpp +++ b/source/adapters/level_zero/program.cpp @@ -161,7 +161,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp( ZeBuildOptions += pOptions; } - if (phDevices[0]->useOptimized32bitAccess() == 0) { + if (phDevices[0]->useRelaxedAllocationLimits()) { ZeBuildOptions += " -ze-opt-greater-than-4GB-buffer-required"; } @@ -256,7 +256,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile( // ze-opt-greater-than-4GB-buffer-required to disable // stateful optimizations and be able to use larger than // 4GB allocations on these kernels. - if (Context->Devices[0]->useOptimized32bitAccess() == 0) { + if (Context->Devices[0]->useRelaxedAllocationLimits()) { Program->BuildFlags += " -ze-opt-greater-than-4GB-buffer-required"; } } diff --git a/source/adapters/level_zero/usm.cpp b/source/adapters/level_zero/usm.cpp index e4a00249a2..11245b5760 100644 --- a/source/adapters/level_zero/usm.cpp +++ b/source/adapters/level_zero/usm.cpp @@ -178,11 +178,11 @@ static ur_result_t USMDeviceAllocImpl(void **ResultPtr, ZeDesc.flags = 0; ZeDesc.ordinal = 0; - if (Device->useOptimized32bitAccess() == 0 && + ZeStruct RelaxedDesc; + if (Device->useRelaxedAllocationLimits() && (Size > Device->ZeDeviceProperties->maxMemAllocSize)) { // Tell Level-Zero to accept Size > maxMemAllocSize if // large allocations are used. - ZeStruct RelaxedDesc; RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; ZeDesc.pNext = &RelaxedDesc; }