[SYCL][ESIMD] Implement slm_gather accepting compile-time properties (i…

…ntel#12456)
reble · Jan 29, 2024 · 5582ce4 · 5582ce4
1 parent f4b4a84
commit 5582ce4
Show file tree

Hide file tree

Showing 8 changed files with 806 additions and 53 deletions.
diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
@@ -556,20 +556,8 @@ template <typename T, int NElts = 1,
 __ESIMD_API __ESIMD_NS::simd<T, N * NElts>
 lsc_slm_gather(__ESIMD_NS::simd<uint32_t, N> offsets,
                __ESIMD_NS::simd_mask<N> pred = 1) {
-  detail::check_lsc_vector_size<NElts>();
-  detail::check_lsc_data_size<T, DS>();
-  constexpr uint16_t _AddressScale = 1;
-  constexpr int _ImmOffset = 0;
-  constexpr lsc_data_size _DS =
-      detail::expand_data_size(detail::finalize_data_size<T, DS>());
-  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
-  constexpr auto _Transposed = detail::lsc_data_order::nontranspose;
-  using MsgT = typename detail::lsc_expand_type<T>::type;
-  __ESIMD_NS::simd<MsgT, N * NElts> Tmp =
-      __esimd_lsc_load_slm<MsgT, cache_hint::none, cache_hint::none,
-                           _AddressScale, _ImmOffset, _DS, _VS, _Transposed, N>(
-          pred.data(), offsets.data());
-  return detail::lsc_format_ret<T>(Tmp);
+  __ESIMD_NS::simd<T, N * NElts> pass_thru;
+  return __ESIMD_DNS::slm_gather_impl<T, NElts, DS>(offsets, pred, pass_thru);
 }
 
 /// SLM gather.
@@ -595,24 +583,7 @@ __ESIMD_API __ESIMD_NS::simd<T, N * NElts>
 lsc_slm_gather(__ESIMD_NS::simd<uint32_t, N> offsets,
                __ESIMD_NS::simd_mask<N> pred,
                __ESIMD_NS::simd<T, N * NElts> pass_thru) {
-  detail::check_lsc_vector_size<NElts>();
-  detail::check_lsc_data_size<T, DS>();
-  constexpr uint16_t _AddressScale = 1;
-  constexpr int _ImmOffset = 0;
-  constexpr lsc_data_size _DS =
-      detail::expand_data_size(detail::finalize_data_size<T, DS>());
-  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
-  constexpr detail::lsc_data_order _Transposed =
-      detail::lsc_data_order::nontranspose;
-  using MsgT = typename detail::lsc_expand_type<T>::type;
-  __ESIMD_NS::simd<MsgT, N * NElts> PassThruExpanded =
-      detail::lsc_format_input<MsgT>(pass_thru);
-  __ESIMD_NS::simd<MsgT, N * NElts> Result =
-      __esimd_lsc_load_merge_slm<MsgT, cache_hint::none, cache_hint::none,
-                                 _AddressScale, _ImmOffset, _DS, _VS,
-                                 _Transposed, N>(pred.data(), offsets.data(),
-                                                 PassThruExpanded.data());
-  return detail::lsc_format_ret<T>(Result);
+  return __ESIMD_DNS::slm_gather_impl<T, NElts, DS>(offsets, pred, pass_thru);
 }
 
 /// Transposed SLM gather with 1 channel.

diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp
@@ -468,6 +468,244 @@ template <typename T, TestFeatures Features> bool testUSM(queue Q) {
   return Passed;
 }
 
+template <typename T, uint16_t N, uint16_t VS, bool UseMask, bool UsePassThru,
+          bool UseProperties, typename PropertiesT>
+bool testSLM(queue Q, uint32_t MaskStride, PropertiesT) {
+
+  static_assert(VS > 0 && N % VS == 0,
+                "Incorrect VS parameter. N must be divisible by VS.");
+  constexpr int NOffsets = N / VS;
+  static_assert(!UsePassThru || UseMask,
+                "PassThru cannot be used without using mask");
+
+  constexpr uint32_t Groups = 8;
+  constexpr uint32_t Threads = 16;
+
+  std::cout << "Running slm_gather case: T=" << esimd_test::type_name<T>()
+            << ", N=" << N << ", VS=" << VS << ", MaskStride=" << MaskStride
+            << ", Groups=" << Groups << ", Threads=" << Threads
+            << ", use_mask=" << UseMask << ", use_pass_thru=" << UsePassThru
+            << ", use_properties=" << UseProperties << std::endl;
+
+  constexpr uint16_t Size = Groups * Threads * N;
+  using Tuint = esimd_test::uint_type_t<sizeof(T)>;
+
+  sycl::range<1> GlobalRange{Groups};
+  sycl::range<1> LocalRange{Threads};
+  sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
+
+  T *Out = sycl::malloc_shared<T>(Size, Q);
+  std::memset(Out, 0, Size * sizeof(T));
+
+  T *In = sycl::malloc_shared<T>(Size * 2, Q);
+  for (int I = 0; I < Size; I++)
+    In[I] = esimd_test::getRandomValue<T>();
+
+  try {
+    Q.parallel_for(Range, [=](sycl::nd_item<1> NDI) SYCL_ESIMD_KERNEL {
+       uint16_t GlobalID = NDI.get_global_id(0);
+       uint16_t LocalID = NDI.get_local_id(0);
+       uint32_t GlobalElemOffset = GlobalID * N;
+       uint32_t LocalElemOffset = LocalID * N;
+
+       // Allocate a bit more to safely initialize it with 4-element chunks.
+       constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T);
+       slm_init<SLMSize>();
+
+       if (LocalID == 0) {
+         for (int I = 0; I < Threads * N; I += 8) {
+           simd<T, 8> InVec(In + GlobalElemOffset + I);
+           simd<uint32_t, 8> offsets(I * sizeof(T), sizeof(T));
+           slm_scatter<T>(offsets, InVec);
+         }
+       }
+       barrier();
+
+       PropertiesT Props{};
+
+       simd<uint32_t, NOffsets> ByteOffsets(LocalElemOffset * sizeof(T),
+                                            VS * sizeof(T));
+       simd_view ByteOffsetsView = ByteOffsets.template select<NOffsets, 1>();
+
+       simd_mask<NOffsets> Pred;
+       for (int I = 0; I < NOffsets; I++)
+         Pred[I] = (I % MaskStride == 0) ? 1 : 0;
+
+       using Tuint = esimd_test::uint_type_t<sizeof(T)>;
+       simd<Tuint, N> PassThruInt(GlobalElemOffset, 1);
+       simd<T, N> PassThru = PassThruInt.template bit_cast_view<T>();
+       auto PassThruView = PassThru.template select<N, 1>(0);
+
+       simd<T, N> Vals;
+       if constexpr (VS > 1) { // VS > 1 requires specifying <T, N, VS>
+         if constexpr (UsePassThru) {
+           if constexpr (UseProperties) {
+             if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
+               Vals = slm_gather<T, N, VS>(ByteOffsets, Pred, PassThru, Props);
+             else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
+               Vals =
+                   slm_gather<T, N, VS>(ByteOffsets, Pred, PassThruView, Props);
+             else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
+               Vals =
+                   slm_gather<T, N, VS>(ByteOffsetsView, Pred, PassThru, Props);
+             else // ByteOffset - view, PassThru - view
+               Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred, PassThruView,
+                                           Props);
+           } else {                 // UseProperties is false
+             if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
+               Vals = slm_gather<T, N, VS>(ByteOffsets, Pred, PassThru);
+             else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
+               Vals = slm_gather<T, N, VS>(ByteOffsets, Pred, PassThruView);
+             else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
+               Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred, PassThru);
+             else // ByteOffset - view, PassThru - view
+               Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred, PassThruView);
+           }
+         } else if constexpr (UseMask) { // UsePassThru is false
+           if constexpr (UseProperties) {
+             if (GlobalID % 2 == 0) // ByteOffset - simd
+               Vals = slm_gather<T, N, VS>(ByteOffsets, Pred, Props);
+             else // ByteOffset - simd_view
+               Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred, Props);
+           } else {                 // UseProperties is false
+             if (GlobalID % 2 == 0) // ByteOffset - simd
+               Vals = slm_gather<T, N, VS>(ByteOffsets, Pred);
+             else // ByteOffset - simd_view
+               Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred);
+           }
+         } else { // UseMask is false, UsePassThru is false
+           if constexpr (UseProperties) {
+             if (GlobalID % 2 == 0) // ByteOffset - simd
+               Vals = slm_gather<T, N, VS>(ByteOffsets, Props);
+             else // ByteOffset - simd_view
+               Vals = slm_gather<T, N, VS>(ByteOffsetsView, Props);
+           } else {                 // UseProperties is false
+             if (GlobalID % 2 == 0) // ByteOffset - simd
+               Vals = slm_gather<T, N, VS>(ByteOffsets);
+             else // ByteOffset - simd_view
+               Vals = slm_gather<T, N, VS>(ByteOffsetsView);
+           }
+         }
+       } else {
+         // if (VS == 1) then <T, N, VS> can often be omitted - test it here.
+         // The variants accepting simd_view for 'PassThru' operand though
+         // still require <T, N> to be specified explicitly to help
+         // C++ FE do simd to simd_view matching.
+         if constexpr (UsePassThru) {
+           if constexpr (UseProperties) {
+             if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
+               Vals = slm_gather<T>(ByteOffsets, Pred, PassThru, Props);
+             else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
+               Vals = slm_gather<T, N>(ByteOffsets, Pred, PassThruView, Props);
+             else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
+               Vals = slm_gather<T, N>(ByteOffsetsView, Pred, PassThru, Props);
+             else // ByteOffset - view, PassThru - view
+               Vals =
+                   slm_gather<T, N>(ByteOffsetsView, Pred, PassThruView, Props);
+           } else {                 // UseProperties is false
+             if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
+               Vals = slm_gather<T>(ByteOffsets, Pred, PassThru);
+             else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
+               Vals = slm_gather<T, N>(ByteOffsets, Pred, PassThruView);
+             else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
+               Vals = slm_gather<T, N>(ByteOffsetsView, Pred, PassThru);
+             else // ByteOffset - view, PassThru - view
+               Vals = slm_gather<T, N>(ByteOffsetsView, Pred, PassThruView);
+           }
+         } else if constexpr (UseMask) { // UsePassThru is false
+           if constexpr (UseProperties) {
+             if (GlobalID % 2 == 0) // ByteOffset - simd
+               Vals = slm_gather<T>(ByteOffsets, Pred, Props);
+             else // ByteOffset - simd_view
+               Vals = slm_gather<T, N>(ByteOffsetsView, Pred, Props);
+           } else {                 // UseProperties is false
+             if (GlobalID % 2 == 0) // ByteOffset - simd
+               Vals = slm_gather<T>(ByteOffsets, Pred);
+             else // ByteOffset - simd_view
+               Vals = slm_gather<T, N>(ByteOffsetsView, Pred);
+           }
+         } else { // UsePassThru is false, UseMask is false
+           if constexpr (UseProperties) {
+             if (GlobalID % 2 == 0) // ByteOffset - simd
+               Vals = slm_gather<T>(ByteOffsets, Props);
+             else // ByteOffset - simd_view
+               Vals = slm_gather<T, N>(ByteOffsetsView, Props);
+           } else {
+             if (GlobalID % 2 == 0) // ByteOffset - simd
+               Vals = slm_gather<T>(ByteOffsets);
+             else // ByteOffset - simd_view
+               Vals = slm_gather<T, N>(ByteOffsetsView);
+           }
+         }
+       } // end if (VS == 1)
+       Vals.copy_to(Out + GlobalElemOffset);
+     }).wait();
+  } catch (sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    sycl::free(In, Q);
+    sycl::free(Out, Q);
+    return false;
+  }
+
+  bool Passed = verify(In, Out, N, Size, VS, MaskStride, UseMask, UsePassThru);
+  if (!Passed)
+    std::cout << "Case FAILED" << std::endl;
+
+  sycl::free(In, Q);
+  sycl::free(Out, Q);
+  return Passed;
+}
+
+template <typename T, TestFeatures Features> bool testSLM(queue Q) {
+  constexpr bool UseMask = true;
+  constexpr bool UsePassThru = true;
+  constexpr bool UseProperties = true;
+
+  properties AlignElemProps{alignment<sizeof(T)>};
+
+  bool Passed = true;
+  Passed &= testSLM<T, 1, 1, !UseMask, !UsePassThru, !UseProperties>(
+      Q, 2, AlignElemProps);
+  Passed &= testSLM<T, 2, 1, UseMask, !UsePassThru, !UseProperties>(
+      Q, 2, AlignElemProps);
+  Passed &= testSLM<T, 4, 1, UseMask, !UsePassThru, !UseProperties>(
+      Q, 2, AlignElemProps);
+  Passed &= testSLM<T, 8, 1, UseMask, !UsePassThru, UseProperties>(
+      Q, 3, AlignElemProps);
+  // UsePassThru requires either DG2/PVC or support of llvm.masked.gather LLVM
+  // IR.
+#ifdef __ESIMD_GATHER_SCATTER_LLVM_IR
+  Passed &= testSLM<T, 16, 1, UseMask, UsePassThru, UseProperties>(
+      Q, 2, AlignElemProps);
+  Passed &= testSLM<T, 32, 1, UseMask, UsePassThru, !UseProperties>(
+      Q, 3, AlignElemProps);
+#endif
+
+  // TODO: test non-power-of-2 N
+  // Such cases were promised to be supported, but in fact they fail.
+  // Create some test cases here after the issue in GPU driver is resolved.
+
+  if constexpr (Features == TestFeatures::PVC ||
+                Features == TestFeatures::DG2) {
+
+    // Check VS > 1. GPU supports only dwords and qwords in this mode.
+    if constexpr (sizeof(T) >= 4) {
+      // TODO: This test case causes flaky fail. Enable it after the issue
+      // in GPU driver is fixed.
+      // Passed &= testUSM<T, 16, 2, UseMask, !UsePassThru, UseProperties>(
+      //    Q, 3, AlignElemProps);
+
+      Passed &= testSLM<T, 32, 2, !UseMask, !UsePassThru, UseProperties>(
+          Q, 3, AlignElemProps);
+      Passed &= testSLM<T, 32, 2, UseMask, !UsePassThru, UseProperties>(
+          Q, 3, AlignElemProps);
+      Passed &= testSLM<T, 32, 2, UseMask, UsePassThru, UseProperties>(
+          Q, 3, AlignElemProps);
+    }
+  }
+  return Passed;
+}
+
 template <typename T, TestFeatures Features> bool testACC(queue Q) {
   constexpr bool UseMask = true;
   constexpr bool UsePassThru = true;
@@ -478,12 +716,10 @@ template <typename T, TestFeatures Features> bool testACC(queue Q) {
   bool Passed = true;
   Passed &= testACC<T, 1, 1, !UseMask, !UsePassThru, !UseProperties>(
       Q, 2, AlignElemProps);
-#ifdef __ESIMD_FORCE_STATELESS_MEM
   Passed &= testACC<T, 2, 1, UseMask, !UsePassThru, !UseProperties>(
       Q, 2, AlignElemProps);
   Passed &= testACC<T, 4, 1, UseMask, !UsePassThru, !UseProperties>(
       Q, 2, AlignElemProps);
-#endif // __ESIMD_FORCE_STATELESS_MEM
   Passed &= testACC<T, 8, 1, UseMask, !UsePassThru, !UseProperties>(
       Q, 3, AlignElemProps);
   Passed &= testACC<T, 16, 1, UseMask, !UsePassThru, UseProperties>(

diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_gather.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_gather.cpp
@@ -0,0 +1,34 @@
+//==------- slm_gather.cpp - DPC++ ESIMD on-device test --------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Use per-kernel compilation to have more information about failing cases.
+// RUN: %{build} -fsycl-device-code-split=per_kernel -D__ESIMD_GATHER_SCATTER_LLVM_IR -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies esimd::slm_gather() functions accepting  optional
+// compile-time esimd::properties. The slm_gather() calls in this test do not
+// use VS > 1 (number of loads per offset) to not impose using DG2/PVC features.
+
+#include "Inputs/gather.hpp"
+
+int main() {
+  auto Q = queue{gpu_selector_v};
+  esimd_test::printTestLabel(Q);
+
+  constexpr auto TestFeatures = TestFeatures::Generic;
+  bool Passed = true;
+
+  Passed &= testSLM<int8_t, TestFeatures>(Q);
+  Passed &= testSLM<int16_t, TestFeatures>(Q);
+  if (Q.get_device().has(sycl::aspect::fp16))
+    Passed &= testSLM<sycl::half, TestFeatures>(Q);
+  Passed &= testSLM<uint32_t, TestFeatures>(Q);
+  Passed &= testSLM<float, TestFeatures>(Q);
+  Passed &= testSLM<ext::intel::experimental::esimd::tfloat32, TestFeatures>(Q);
+  std::cout << (Passed ? "Passed\n" : "FAILED\n");
+  return Passed ? 0 : 1;
+}
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_gather_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_gather_dg2_pvc.cpp
@@ -0,0 +1,38 @@
+//==------- slm_gather_dg2_pvc.cpp - DPC++ ESIMD on-device test ------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-dg2 || gpu-intel-pvc
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies esimd::slm_gather() functions accepting  optional
+// compile-time esimd::properties. The slm_gather() calls in this test use
+// VS > 1 (number of loads per offset) and require DG2 or PVC to run.
+
+#include "Inputs/gather.hpp"
+
+int main() {
+  auto Q = queue{gpu_selector_v};
+  esimd_test::printTestLabel(Q);
+
+  // DG2 and PVC support same gather() configurations. If some gather call
+  // has corresponding instructions in PVC and does not have it in DG2, then
+  // GPU RT emulates it for DG2.
+  constexpr auto TestFeatures = TestFeatures::DG2;
+  bool Passed = true;
+
+  Passed &= testSLM<int8_t, TestFeatures>(Q);
+  Passed &= testSLM<int16_t, TestFeatures>(Q);
+  if (Q.get_device().has(sycl::aspect::fp16))
+    Passed &= testSLM<sycl::half, TestFeatures>(Q);
+  Passed &= testSLM<uint32_t, TestFeatures>(Q);
+  Passed &= testSLM<float, TestFeatures>(Q);
+  Passed &= testSLM<ext::intel::experimental::esimd::tfloat32, TestFeatures>(Q);
+
+  std::cout << (Passed ? "Passed\n" : "FAILED\n");
+  return Passed ? 0 : 1;
+}