From 28590a82e9a4b63612f7319760dae4f0d02c9d3b Mon Sep 17 00:00:00 2001
From: Jaime Arteaga <jaime.a.arteaga.molina@intel.com>
Date: Mon, 20 Nov 2023 21:37:17 -0800
Subject: [PATCH] [UR][L0] Unify use of large allocation in L0 adapter

Intel(R) GPUs have two modes of operation in terms of allocations:
Stateful and stateless mode.

Stateful mode optimizes memory accesses through pointer arithmetic.
This can be done as long as the allocations used by the kernel
are smaller than 4GB.

Stateless disables such pointer-arithmetic optimization to
allow the kernel to use allocations larger than 4GB.

Currently, the L0 adapter dynamically and automatically requests
large allocations from the L0 driver if it detects that an allocation
size is larger than 4GB. This creates a problem if a kernel has been
previously compiled for stateful access. This ultimately means
the adapter mixes stateful and stateless behavior, which is not
a user-friendly experience.

This patch aims at correcting this behavior by defining a default
one. On Intel(R) GPUs previous to Intel(R) Data Center GPU Max, the
default behavior is now stateless, meaning allocations larger than
4GB are allowed by default. Users can opt in to stateful mode by
setting a new environment variable UR_L0_USE_OPTIMIZED_32BIT_ACCESS=1.

Addresses:
https://stackoverflow.com/questions/75621264/sycl-dot-product-code-gives-wrong-results

Signed-off-by: Jaime Arteaga <jaime.a.arteaga.molina@intel.com>
---
 source/adapters/level_zero/device.cpp  | 24 +++++++++++++++++++++-
 source/adapters/level_zero/device.hpp  | 16 +++++++++++++++
 source/adapters/level_zero/program.cpp | 28 ++++++++++++++++++++++++--
 source/adapters/level_zero/usm.cpp     |  8 +++++---
 4 files changed, 70 insertions(+), 6 deletions(-)

diff --git a/source/adapters/level_zero/device.cpp b/source/adapters/level_zero/device.cpp
index ec6a294c21..acc7c755f4 100644
--- a/source/adapters/level_zero/device.cpp
+++ b/source/adapters/level_zero/device.cpp
@@ -267,7 +267,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(
     return ReturnValue(uint32_t{64});
   }
   case UR_DEVICE_INFO_MAX_MEM_ALLOC_SIZE:
-    return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize});
+    // if not optimized for 32-bit access, return total memory size.
+    // otherwise, return only maximum allocatable size.
+    if (Device->useOptimized32bitAccess() == 0) {
+      return ReturnValue(uint64_t{calculateGlobalMemSize(Device)});
+    } else {
+      return ReturnValue(uint64_t{Device->ZeDeviceProperties->maxMemAllocSize});
+    }
   case UR_DEVICE_INFO_GLOBAL_MEM_SIZE: {
     // Support to read physicalSize depends on kernel,
     // so fallback into reading totalSize if physicalSize
@@ -911,6 +917,22 @@ ur_device_handle_t_::useImmediateCommandLists() {
   }
 }
 
+// Returns -1 on PVC (L0 driver default applies); otherwise the integer value
+// of UR_L0_USE_OPTIMIZED_32BIT_ACCESS, or 0 when the variable is unset.
+int32_t ur_device_handle_t_::useOptimized32bitAccess() {
+  // Check PVC per device: a function-local static is shared by all device
+  // objects, so caching this would let the first caller decide for everyone.
+  // TODO: Use IP versioning to select based on range of devices
+  if (isPVC())
+    return -1;
+  // The environment variable is process-wide, so parse it only once.
+  static const int32_t EnvMode = [] {
+    const char *UrRet = std::getenv("UR_L0_USE_OPTIMIZED_32BIT_ACCESS");
+    return UrRet ? std::atoi(UrRet) : 0;
+  }();
+  return EnvMode;
+}
+
 ur_result_t ur_device_handle_t_::initialize(int SubSubDeviceOrdinal,
                                             int SubSubDeviceIndex) {
   // Maintain various device properties cache.
diff --git a/source/adapters/level_zero/device.hpp b/source/adapters/level_zero/device.hpp
index bdae64beba..5f34efab44 100644
--- a/source/adapters/level_zero/device.hpp
+++ b/source/adapters/level_zero/device.hpp
@@ -145,6 +145,22 @@ struct ur_device_handle_t_ : _ur_object {
   // Returns whether immediate command lists are used on this device.
   ImmCmdlistMode ImmCommandListUsed{};
 
+  // Returns whether large allocations are being used
+  // or not to have a consistent behavior throughout
+  // the adapter between the creation of large allocations
+  // and the compilation of kernels into stateful and
+  // stateless modes.
+  // With stateful mode, kernels are compiled with
+  // pointer-arithmetic optimizations for optimized
+  // access of allocations smaller than 4GB.
+  // In stateless mode, such optimizations are not
+  // applied.
+  // Even if a GPU supports both modes, L0 driver may
+  // provide support for only one, like for Intel(R)
+  // Data Center GPU Max, for which L0 driver only
+  // supports stateless.
+  int32_t useOptimized32bitAccess();
+
   bool isSubDevice() { return RootDevice != nullptr; }
 
   // Is this a Data Center GPU Max series (aka PVC)?
diff --git a/source/adapters/level_zero/program.cpp b/source/adapters/level_zero/program.cpp
index 92a3c87aea..f118a5b9dd 100644
--- a/source/adapters/level_zero/program.cpp
+++ b/source/adapters/level_zero/program.cpp
@@ -148,9 +148,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp(
   ZeModuleDesc.format = (hProgram->State == ur_program_handle_t_::IL)
                             ? ZE_MODULE_FORMAT_IL_SPIRV
                             : ZE_MODULE_FORMAT_NATIVE;
+
   ZeModuleDesc.inputSize = hProgram->CodeLength;
   ZeModuleDesc.pInputModule = hProgram->Code.get();
-  ZeModuleDesc.pBuildFlags = pOptions;
+
+  // if large allocations are selected, then pass
+  // ze-opt-greater-than-4GB-buffer-required to disable
+  // stateful optimizations and be able to use larger than
+  // 4GB allocations on these kernels.
+  std::string ZeBuildOptions{};
+  if (pOptions) {
+    ZeBuildOptions += pOptions;
+  }
+
+  if (phDevices[0]->useOptimized32bitAccess() == 0) {
+    ZeBuildOptions += " -ze-opt-greater-than-4GB-buffer-required";
+  }
+
+  ZeModuleDesc.pBuildFlags = ZeBuildOptions.c_str();
   ZeModuleDesc.pConstants = Shim.ze();
 
   ze_device_handle_t ZeDevice = phDevices[0]->ZeDevice;
@@ -234,8 +249,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCompile(
   // This produces better code because the driver can do cross-module
   // optimizations.  Therefore, we just remember the compilation flags, so we
   // can use them later.
-  if (Options)
+  if (Options) {
     Program->BuildFlags = Options;
+
+    // if large allocations are selected, then pass
+    // ze-opt-greater-than-4GB-buffer-required to disable
+    // stateful optimizations and be able to use larger than
+    // 4GB allocations on these kernels.
+    if (Context->Devices[0]->useOptimized32bitAccess() == 0) {
+      Program->BuildFlags += " -ze-opt-greater-than-4GB-buffer-required";
+    }
+  }
   Program->State = ur_program_handle_t_::Object;
 
   return UR_RESULT_SUCCESS;
diff --git a/source/adapters/level_zero/usm.cpp b/source/adapters/level_zero/usm.cpp
index daec0408fb..c6d98855e7 100644
--- a/source/adapters/level_zero/usm.cpp
+++ b/source/adapters/level_zero/usm.cpp
@@ -178,9 +178,11 @@ static ur_result_t USMDeviceAllocImpl(void **ResultPtr,
   ZeDesc.flags = 0;
   ZeDesc.ordinal = 0;
 
-  ZeStruct<ze_relaxed_allocation_limits_exp_desc_t> RelaxedDesc;
-  if (Size > Device->ZeDeviceProperties->maxMemAllocSize) {
-    // Tell Level-Zero to accept Size > maxMemAllocSize
+  // Keep RelaxedDesc at this scope: ZeDesc.pNext points at it past the `if`.
+  ZeStruct<ze_relaxed_allocation_limits_exp_desc_t> RelaxedDesc;
+  if (Device->useOptimized32bitAccess() == 0 &&
+      (Size > Device->ZeDeviceProperties->maxMemAllocSize)) {
+    // Tell Level-Zero to accept Size > maxMemAllocSize (large allocations).
     RelaxedDesc.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE;
     ZeDesc.pNext = &RelaxedDesc;
   }