From 3756fd1b778ae4ab36bd3988bfdf9ba910b779fd Mon Sep 17 00:00:00 2001
From: Vyacheslav Klochkov <vyacheslav.n.klochkov@intel.com>
Date: Mon, 29 Apr 2024 16:44:14 -0500
Subject: [PATCH] [ESIMD] Enable FADD/FSUB for slm_atomic_update (#13535)

Those test cases required newer GPU driver and thus were disabled
previously.

GPU driver on DG2 still does not handle correctly atomic_update for 'float' and 'half' types.
GPU driver on PVC still does not handle correctly slm_atomic_update for 'half' types.

---------
Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
---
 sycl/include/sycl/ext/intel/esimd/memory.hpp  | 20 +++---
 .../Inputs/atomic_update.hpp                  | 43 ++++--------
 .../Inputs/atomic_update_slm.hpp              | 70 +++++++------------
 sycl/test/esimd/memory_properties.cpp         | 17 ++++-
 4 files changed, 66 insertions(+), 84 deletions(-)
diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
index c0961a1e20f03..a44c8cad98c3c 100644
--- a/sycl/include/sycl/ext/intel/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -5823,7 +5823,7 @@ slm_atomic_update_impl(simd<uint32_t, N> offsets, simd<T, N> src0,
   constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
   constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
   constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
-  if constexpr (std::is_same_v<T, double>) {
+  if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
     return __esimd_lsc_xatomic_slm_1<T, IOp, cache_hint::none, cache_hint::none,
                                      AddressScale, ImmOffset, EDS, VS,
                                      Transposed, N>(pred.data(), offsets.data(),
@@ -5867,7 +5867,7 @@ __ESIMD_API simd<T, N> slm_atomic_update_impl(simd<uint32_t, N> offsets,
   constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
   constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
   constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
-  if constexpr (std::is_same_v<T, double>) {
+  if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
     return __esimd_lsc_xatomic_slm_2<T, IOp, cache_hint::none, cache_hint::none,
                                      AddressScale, ImmOffset, EDS, VS,
                                      Transposed, N>(pred.data(), offsets.data(),
@@ -6007,11 +6007,11 @@ template <atomic_op Op, typename T, int N>
 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1, simd<T, N>>
 slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
                   simd_mask<N> mask = 1) {
-  // 2 byte, 8 byte types, non-power of two, and operations wider than
-  // 32 are supported only by LSC.
-  if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
-                !__ESIMD_DNS::isPowerOf2(N, 32)) {
-    // half and short are supported in LSC.
+  // Non-LSC atomic_update supports only 4-byte int vector operations with
+  // 1,2,4,8,16,32 vector length. Non-LSC supports only 'store' for FP types.
+  if constexpr (Op == atomic_op::fmin || Op == atomic_op::fmax ||
+                Op == atomic_op::fadd || Op == atomic_op::fsub ||
+                sizeof(T) != 4 || !__ESIMD_DNS::isPowerOf2(N, 32)) {
     return slm_atomic_update_impl<Op, T, N,
                                   detail::lsc_data_size::default_size>(
         byte_offset, src0, mask);
@@ -6096,9 +6096,9 @@ template <atomic_op Op, typename T, int N>
 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2, simd<T, N>>
 slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
                   simd<T, N> src1, simd_mask<N> mask = 1) {
-  // 2 byte, 8 byte types, non-power of two, and operations wider than
-  // 32 are supported only by LSC.
-  if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
+  // Non-LSC atomic_update supports only 4-byte int vector operations with
+  // 1,2,4,8,16,32 vector length.
+  if constexpr (sizeof(T) != 4 || Op == atomic_op::fcmpxchg ||
                 !__ESIMD_DNS::isPowerOf2(N, 32)) {
     // 2-argument lsc_atomic_update arguments order matches the standard one -
     // expected value first, then new value. But atomic_update uses reverse
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp
index 985f44c0e6954..af1d1262de959 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp
@@ -670,16 +670,13 @@ template <int N, template <class, int> class Op, bool UseMask,
           bool UseLSCFeatures, bool UseAcc>
 bool test_fp_types(queue q, const Config &cfg) {
   bool passed = true;
-  if constexpr (UseLSCFeatures) {
-    if constexpr (std::is_same_v<Op<sycl::half, N>, ImplFmin<sycl::half, N>> ||
-                  std::is_same_v<Op<sycl::half, N>, ImplFmax<sycl::half, N>> ||
-                  std::is_same_v<Op<sycl::half, N>,
-                                 ImplFcmpwr<sycl::half, N>>) {
-      auto dev = q.get_device();
-      if (dev.has(sycl::aspect::fp16)) {
-        passed &= run_test<UseAcc, sycl::half, N, Op, UseMask, UseLSCFeatures>(
-            q, cfg);
-      }
+  // TODO: Enable FADD/FSUB on DG2/PVC when the error in GPU driver is resolved.
+  if constexpr (UseLSCFeatures &&
+                !std::is_same_v<Op<sycl::half, N>, ImplFadd<sycl::half, N>> &&
+                !std::is_same_v<Op<sycl::half, N>, ImplFsub<sycl::half, N>>) {
+    if (q.get_device().has(sycl::aspect::fp16)) {
+      passed &=
+          run_test<UseAcc, sycl::half, N, Op, UseMask, UseLSCFeatures>(q, cfg);
     }
   }
   passed &= run_test<UseAcc, float, N, Op, UseMask, UseLSCFeatures>(q, cfg);
@@ -688,7 +685,6 @@ bool test_fp_types(queue q, const Config &cfg) {
       q.get_device().has(sycl::aspect::fp64)) {
     passed &= run_test<UseAcc, double, N, Op, UseMask, UseLSCFeatures>(q, cfg);
   }
-
 #endif // CMPXCHG_TEST
   return passed;
 }
@@ -703,7 +699,6 @@ bool test_int_types_and_sizes(queue q, const Config &cfg) {
       test_int_types<2, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
   passed &=
       test_int_types<4, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
-
   passed &=
       test_int_types<8, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
   passed &=
@@ -715,13 +710,10 @@ bool test_int_types_and_sizes(queue q, const Config &cfg) {
   if constexpr (UseLSCFeatures) {
     passed &= test_int_types<64, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(
         q, cfg);
-    // non power of two values are supported only in newer driver.
-    // TODO: Enable this when the new driver reaches test infrastructure
-    // (v27556).
-#if 0
-    passed &= test_int_types<12, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
-    passed &= test_int_types<33, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
-#endif
+    passed &= test_int_types<12, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(
+        q, cfg);
+    passed &= test_int_types<33, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(
+        q, cfg);
   }
 
   return passed;
@@ -734,21 +726,14 @@ bool test_fp_types_and_sizes(queue q, const Config &cfg) {
   passed &= test_fp_types<1, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
   passed &= test_fp_types<2, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
   passed &= test_fp_types<4, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
-
   passed &= test_fp_types<8, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
-  // Supported by LSC atomic:
+  passed &= test_fp_types<16, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
+  passed &= test_fp_types<32, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
+
   if constexpr (UseLSCFeatures) {
-    passed &= test_fp_types<16, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
-    passed &= test_fp_types<32, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
     passed &= test_fp_types<64, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
-
-    // non power of two values are supported only in newer driver.
-    // TODO: Enable this when the new driver reaches test infrastructure
-    // (v27556).
-#if 0
     passed &= test_fp_types<12, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
     passed &= test_fp_types<35, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg);
-#endif
   }
   return passed;
 }
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp
index 92f000711e137..93075316f65d7 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp
@@ -593,18 +593,14 @@ template <int N, template <class, int> class Op, bool UseMask,
           TestFeatures Features, bool UseAcc>
 bool test_fp_types(queue q) {
   bool passed = true;
-  if constexpr (Features == TestFeatures::DG2 ||
-                Features == TestFeatures::PVC) {
-    if constexpr (std::is_same_v<Op<sycl::half, N>,
-                                 ImplLSCFmax<sycl::half, N>> ||
-                  std::is_same_v<Op<sycl::half, N>,
-                                 ImplLSCFmin<sycl::half, N>> ||
-                  std::is_same_v<Op<sycl::half, N>,
-                                 ImplLSCFcmpwr<sycl::half, N>>) {
-      auto dev = q.get_device();
-      if (dev.has(sycl::aspect::fp16)) {
-        passed &= run_test<UseAcc, sycl::half, N, Op, UseMask>(q);
-      }
+
+  // TODO: Enable 'half' FADD/FSUB on DG2 when the error in GPU driver is fixed.
+  if constexpr (Features == TestFeatures::PVC ||
+                (Features == TestFeatures::DG2 &&
+                 !std::is_same_v<Op<sycl::half, N>, ImplFadd<sycl::half, N>> &&
+                 !std::is_same_v<Op<sycl::half, N>, ImplFsub<sycl::half, N>>)) {
+    if (q.get_device().has(sycl::aspect::fp16)) {
+      passed &= run_test<UseAcc, sycl::half, N, Op, UseMask>(q);
     }
   }
 
@@ -612,14 +608,9 @@ bool test_fp_types(queue q) {
 
   if constexpr (Features == TestFeatures::DG2 ||
                 Features == TestFeatures::PVC) {
-    // TODO: fmin/fmax/fcmpxchg for double requires a newer GPU driver.
-    if constexpr (!std::is_same_v<Op<double, N>, ImplLSCFmax<double, N>> &&
-                  !std::is_same_v<Op<double, N>, ImplLSCFmin<double, N>> &&
-                  !std::is_same_v<Op<double, N>, ImplLSCFcmpwr<double, N>>) {
-      if (q.get_device().has(sycl::aspect::atomic64) &&
-          q.get_device().has(sycl::aspect::fp64)) {
-        passed &= run_test<UseAcc, double, N, Op, UseMask>(q);
-      }
+    if (q.get_device().has(sycl::aspect::atomic64) &&
+        q.get_device().has(sycl::aspect::fp64)) {
+      passed &= run_test<UseAcc, double, N, Op, UseMask>(q);
     }
   }
   return passed;
@@ -633,7 +624,6 @@ bool test_int_types_and_sizes(queue q) {
   passed &= test_int_types<2, Op, UseMask, Features, UseAcc, SignMask>(q);
   passed &= test_int_types<4, Op, UseMask, Features, UseAcc, SignMask>(q);
   passed &= test_int_types<8, Op, UseMask, Features, UseAcc, SignMask>(q);
-  // TODO: N=16 and N=32 does not pass on Gen12 with mask due to older driver.
   if (UseMask && Features == TestFeatures::Generic &&
       esimd_test::isGPUDriverGE(q, esimd_test::GPUDriverOS::LinuxAndWindows,
                                 "26918", "101.4953", false)) {
@@ -645,13 +635,8 @@ bool test_int_types_and_sizes(queue q) {
   if constexpr (Features == TestFeatures::DG2 ||
                 Features == TestFeatures::PVC) {
     passed &= test_int_types<64, Op, UseMask, Features, UseAcc, SignMask>(q);
-    // non power of two values are supported only in newer driver.
-    // TODO: Enable this when the new driver reaches test infrastructure
-    // (v27556).
-#if 0
     passed &= test_int_types<12, Op, UseMask, Features, UseAcc, SignMask>(q);
     passed &= test_int_types<33, Op, UseMask, Features, UseAcc, SignMask>(q);
-#endif
   }
 
   return passed;
@@ -672,13 +657,8 @@ bool test_fp_types_and_sizes(queue q) {
   if constexpr (Features == TestFeatures::DG2 ||
                 Features == TestFeatures::PVC) {
     passed &= test_fp_types<64, Op, UseMask, Features, UseAcc>(q);
-    // non power of two values are supported only in newer driver.
-    // TODO: Enable this when the new driver reaches test infrastructure
-    // (v27556).
-#if 0
     passed &= test_fp_types<33, Op, UseMask, Features, UseAcc>(q);
     passed &= test_fp_types<65, Op, UseMask, Features, UseAcc>(q);
-#endif
   }
   return passed;
 }
@@ -705,29 +685,33 @@ int test_with_mask(queue q) {
       test_int_types_and_sizes<ImplUMin, UseMask, Features, UseAcc, Unsigned>(
           q);
 
+  // Check load/store operations.
+  passed &= test_int_types_and_sizes<ImplLoad, UseMask, Features, UseAcc>(q);
+  passed &= test_int_types_and_sizes<ImplStore, UseMask, Features, UseAcc>(q);
+  // 'float' 'load' and 'store' do not require DG2/PVC.
+  passed &= test_fp_types_and_sizes<ImplLoad, UseMask, Features, UseAcc>(q);
+  passed &= test_fp_types_and_sizes<ImplStore, UseMask, Features, UseAcc>(q);
+
   if constexpr (Features == TestFeatures::DG2 ||
                 Features == TestFeatures::PVC) {
     passed &=
         test_fp_types_and_sizes<ImplLSCFmax, UseMask, Features, UseAcc>(q);
     passed &=
         test_fp_types_and_sizes<ImplLSCFmin, UseMask, Features, UseAcc>(q);
-
-    // TODO: fadd/fsub are emulated in the newer driver, but do not pass
-    // validation.
-#if 0
+  }
+  // TODO: GPU driver promised to support FADD/FSUB on DG2, but it doesn't.
+  // Report the issue to driver, enable FADD/FSUB for DG2 when it is fixed.
+  if constexpr (Features == TestFeatures::PVC) {
     passed &= test_fp_types_and_sizes<ImplFadd, UseMask, Features, UseAcc>(q);
     passed &= test_fp_types_and_sizes<ImplFsub, UseMask, Features, UseAcc>(q);
-#endif
-
-    // Check load/store operations.
-    passed &= test_int_types_and_sizes<ImplLoad, UseMask, Features, UseAcc>(q);
-    passed &= test_int_types_and_sizes<ImplStore, UseMask, Features, UseAcc>(q);
-    passed &= test_fp_types_and_sizes<ImplStore, UseMask, Features, UseAcc>(q);
   }
 #else
   passed &= test_int_types_and_sizes<ImplCmpxchg, UseMask, Features, UseAcc>(q);
-  passed &=
-      test_fp_types_and_sizes<ImplLSCFcmpwr, UseMask, Features, UseAcc>(q);
+  if constexpr (Features == TestFeatures::DG2 ||
+                Features == TestFeatures::PVC) {
+    passed &=
+        test_fp_types_and_sizes<ImplLSCFcmpwr, UseMask, Features, UseAcc>(q);
+  }
 #endif
   return passed;
 }
diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp
index 0a43e31f62bd8..fc26b81684b7b 100644
--- a/sycl/test/esimd/memory_properties.cpp
+++ b/sycl/test/esimd/memory_properties.cpp
@@ -910,14 +910,14 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
       auto res_slm_atomic_0 =
           slm_atomic_update<atomic_op::add, int16_t>(offsets, add);
     }
-    // Expect DWORD for fmin.
+    // Expect LSC for fmin.
     {
       constexpr int VL = 16;
       simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(float);
       auto pred = simd_mask<VL>(1);
       simd<float, VL> min = simd<float, VL>(1) * sizeof(int);
 
-      // CHECK: call <16 x float> @llvm.genx.dword.atomic.fmin.v16f32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i32 {{[^)]+}}, <16 x i32> {{[^)]+}}, <16 x float> {{[^)]+}}, <16 x float> undef)
+      // CHECK: call <16 x float> @llvm.genx.lsc.xatomic.slm.v16f32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 21, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <16 x i32> {{[^)]+}}, <16 x float> {{[^)]+}}, <16 x float> undef, i32 0, <16 x float> undef)
       auto res_slm_atomic_0 =
           slm_atomic_update<atomic_op::fmin, float>(offsets, min, pred);
     }
@@ -1038,6 +1038,19 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
       auto res_slm_atomic_0 = slm_atomic_update<atomic_op::cmpxchg, int64_t>(
           offsets, swap, compare, pred);
     }
+
+    // Expect LSC for FP types.
+    {
+      constexpr int VL = 16;
+      simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int64_t);
+      auto compare = simd<float, VL>(VL, 1);
+      auto swap = compare * 2;
+      auto pred = simd_mask<VL>(1);
+
+      // CHECK: call <16 x float> @llvm.genx.lsc.xatomic.slm.v16f32.v16i1.v16i32(<16 x i1> {{[^)]+}} i8 23, i8 0, i8 0, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <16 x i32> {{[^)]+}}, <16 x float> {{[^)]+}}, <16 x float> {{[^)]+}}, i32 0, <16 x float> undef)
+      auto res_slm_atomic_0 = slm_atomic_update<atomic_op::fcmpxchg, float>(
+          offsets, swap, compare, pred);
+    }
   }
 
   // Test with local accessor.