diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp index ec6a294c21..acc7c755f4 100644 --- a/source/adapters/level_zero/device.cpp +++ b/source/adapters/level_zero/device.cpp @@ -267,7 +267,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo( return ReturnValue(uint32_t{64}); } case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE: - return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize}); + // if not optimized for 32-bit access, return total memory size. + // otherwise, return only maximum allocatable size. + if (Device->useOptimized32bitAccess() == 0) { + return ReturnValue(uint64_t{calculateGlobalMemSize(Device)}); + } else { + return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize}); + } case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: { // Support to read physicalSize depends on kernel, // so fallback into reading totalSize if physicalSize @@ -911,6 +917,22 @@ ur_device_handle_t_::useImmediateCommandLists() { } } +int32_t ur_device_handle_t_::useOptimized32bitAccess() { + static const int32_t Optimize32bitAccessMode = [this] { + // If device is Intel(R) Data Center GPU Max, + // use default provided by L0 driver. + // TODO: Use IP versioning to select based on range of devices + if (this->isPVC()) + return -1; + const char *UrRet = std::getenv("UR_L0_USE_OPTIMIZED_32BIT_ACCESS"); + if (!UrRet) + return 0; + return std::atoi(UrRet); + }(); + + return Optimize32bitAccessMode; +} + ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal, int SubSubDeviceIndex) { // Maintain various device properties cache. diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp index bdae64beba..5f34efab44 100644 --- a/source/adapters/level_zero/device.hpp +++ b/source/adapters/level_zero/device.hpp @@ -145,6 +145,22 @@ struct ur_device_handle_t_ : _ur_object { // Returns whether immediate command lists are used on this device. ImmCmdlistMode ImmCommandListUsed{}; + // Returns whether large allocations are being used + // or not to have a consistent behavior throughout + // the adapter between the creation of large allocations + // and the compilation of kernels into stateful and + // stateless modes. + // With stateful mode, kernels are compiled with + // pointer-arithmetic optimizations for optimized + // access of allocations smaller than 4GB. + // In stateless mode, such optimizations are not + // applied. + // Even if a GPU supports both modes, L0 driver may + // provide support for only one, like for Intel(R) + // Data Center GPU Max, for which L0 driver only + // supports stateless. + int32_t useOptimized32bitAccess(); + bool isSubDevice() { return RootDevice != nullptr; } // Is this a Data Center GPU Max series (aka PVC)? diff --git a/source/adapters/level_zero/program.cpp b/source/adapters/level_zero/program.cpp index 92a3c87aea..f118a5b9dd 100644 --- a/source/adapters/level_zero/program.cpp +++ b/source/adapters/level_zero/program.cpp @@ -148,9 +148,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp( ZeModuleDesc.format = (hProgram->State == ur_program_handle_t_::IL) ? ZE_MODULE_FORMAT_IL_SPIRV : ZE_MODULE_FORMAT_NATIVE; + ZeModuleDesc.inputSize = hProgram->CodeLength; ZeModuleDesc.pInputModule = hProgram->Code.get(); - ZeModuleDesc.pBuildFlags = pOptions; + + // if large allocations are selected, then pass + // ze-opt-greater-than-4GB-buffer-required to disable + // stateful optimizations and be able to use larger than + // 4GB allocations on these kernels. + std::string ZeBuildOptions{}; + if (pOptions) { + ZeBuildOptions += pOptions; + } + + if (phDevices[0]->useOptimized32bitAccess() == 0) { + ZeBuildOptions += " -ze-opt-greater-than-4GB-buffer-required"; + } + + ZeModuleDesc.pBuildFlags = ZeBuildOptions.c_str(); ZeModuleDesc.pConstants = Shim.ze(); ze_device_handle_t ZeDevice = phDevices[0]->ZeDevice; @@ -234,8 +249,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile( // This produces better code because the driver can do cross-module // optimizations. Therefore, we just remember the compilation flags, so we // can use them later. - if (Options) + if (Options) { Program->BuildFlags = Options; + + // if large allocations are selected, then pass + // ze-opt-greater-than-4GB-buffer-required to disable + // stateful optimizations and be able to use larger than + // 4GB allocations on these kernels. + if (Context->Devices[0]->useOptimized32bitAccess() == 0) { + Program->BuildFlags += " -ze-opt-greater-than-4GB-buffer-required"; + } + } Program->State = ur_program_handle_t_::Object; return UR_RESULT_SUCCESS; diff --git a/source/adapters/level_zero/usm.cpp b/source/adapters/level_zero/usm.cpp index daec0408fb..c6d98855e7 100644 --- a/source/adapters/level_zero/usm.cpp +++ b/source/adapters/level_zero/usm.cpp @@ -178,9 +178,11 @@ static ur_result_t USMDeviceAllocImpl(void **ResultPtr, ZeDesc.flags = 0; ZeDesc.ordinal = 0; - ZeStruct RelaxedDesc; - if (Size > Device->ZeDeviceProperties->maxMemAllocSize) { - // Tell Level-Zero to accept Size > maxMemAllocSize + if (Device->useOptimized32bitAccess() == 0 && + (Size > Device->ZeDeviceProperties->maxMemAllocSize)) { + // Tell Level-Zero to accept Size > maxMemAllocSize if + // large allocations are used. + ZeStruct RelaxedDesc; RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE; ZeDesc.pNext = &RelaxedDesc; }