Skip to content

Commit

Permalink
[SYCL][ESIMD] Implement slm_gather accepting compile-time properties (i…
Browse files Browse the repository at this point in the history
  • Loading branch information
fineg74 authored Jan 29, 2024
1 parent f4b4a84 commit 5582ce4
Show file tree
Hide file tree
Showing 8 changed files with 806 additions and 53 deletions.
393 changes: 375 additions & 18 deletions sycl/include/sycl/ext/intel/esimd/memory.hpp

Large diffs are not rendered by default.

35 changes: 3 additions & 32 deletions sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -556,20 +556,8 @@ template <typename T, int NElts = 1,
__ESIMD_API __ESIMD_NS::simd<T, N * NElts>
lsc_slm_gather(__ESIMD_NS::simd<uint32_t, N> offsets,
__ESIMD_NS::simd_mask<N> pred = 1) {
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS =
detail::expand_data_size(detail::finalize_data_size<T, DS>());
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr auto _Transposed = detail::lsc_data_order::nontranspose;
using MsgT = typename detail::lsc_expand_type<T>::type;
__ESIMD_NS::simd<MsgT, N * NElts> Tmp =
__esimd_lsc_load_slm<MsgT, cache_hint::none, cache_hint::none,
_AddressScale, _ImmOffset, _DS, _VS, _Transposed, N>(
pred.data(), offsets.data());
return detail::lsc_format_ret<T>(Tmp);
__ESIMD_NS::simd<T, N * NElts> pass_thru;
return __ESIMD_DNS::slm_gather_impl<T, NElts, DS>(offsets, pred, pass_thru);
}

/// SLM gather.
Expand All @@ -595,24 +583,7 @@ __ESIMD_API __ESIMD_NS::simd<T, N * NElts>
lsc_slm_gather(__ESIMD_NS::simd<uint32_t, N> offsets,
__ESIMD_NS::simd_mask<N> pred,
__ESIMD_NS::simd<T, N * NElts> pass_thru) {
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS =
detail::expand_data_size(detail::finalize_data_size<T, DS>());
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::nontranspose;
using MsgT = typename detail::lsc_expand_type<T>::type;
__ESIMD_NS::simd<MsgT, N * NElts> PassThruExpanded =
detail::lsc_format_input<MsgT>(pass_thru);
__ESIMD_NS::simd<MsgT, N * NElts> Result =
__esimd_lsc_load_merge_slm<MsgT, cache_hint::none, cache_hint::none,
_AddressScale, _ImmOffset, _DS, _VS,
_Transposed, N>(pred.data(), offsets.data(),
PassThruExpanded.data());
return detail::lsc_format_ret<T>(Result);
return __ESIMD_DNS::slm_gather_impl<T, NElts, DS>(offsets, pred, pass_thru);
}

/// Transposed SLM gather with 1 channel.
Expand Down
240 changes: 238 additions & 2 deletions sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -468,6 +468,244 @@ template <typename T, TestFeatures Features> bool testUSM(queue Q) {
return Passed;
}

template <typename T, uint16_t N, uint16_t VS, bool UseMask, bool UsePassThru,
bool UseProperties, typename PropertiesT>
bool testSLM(queue Q, uint32_t MaskStride, PropertiesT) {

static_assert(VS > 0 && N % VS == 0,
"Incorrect VS parameter. N must be divisible by VS.");
constexpr int NOffsets = N / VS;
static_assert(!UsePassThru || UseMask,
"PassThru cannot be used without using mask");

constexpr uint32_t Groups = 8;
constexpr uint32_t Threads = 16;

std::cout << "Running slm_gather case: T=" << esimd_test::type_name<T>()
<< ", N=" << N << ", VS=" << VS << ", MaskStride=" << MaskStride
<< ", Groups=" << Groups << ", Threads=" << Threads
<< ", use_mask=" << UseMask << ", use_pass_thru=" << UsePassThru
<< ", use_properties=" << UseProperties << std::endl;

constexpr uint16_t Size = Groups * Threads * N;
using Tuint = esimd_test::uint_type_t<sizeof(T)>;

sycl::range<1> GlobalRange{Groups};
sycl::range<1> LocalRange{Threads};
sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};

T *Out = sycl::malloc_shared<T>(Size, Q);
std::memset(Out, 0, Size * sizeof(T));

T *In = sycl::malloc_shared<T>(Size * 2, Q);
for (int I = 0; I < Size; I++)
In[I] = esimd_test::getRandomValue<T>();

try {
Q.parallel_for(Range, [=](sycl::nd_item<1> NDI) SYCL_ESIMD_KERNEL {
uint16_t GlobalID = NDI.get_global_id(0);
uint16_t LocalID = NDI.get_local_id(0);
uint32_t GlobalElemOffset = GlobalID * N;
uint32_t LocalElemOffset = LocalID * N;

// Allocate a bit more to safely initialize it with 4-element chunks.
constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T);
slm_init<SLMSize>();

if (LocalID == 0) {
for (int I = 0; I < Threads * N; I += 8) {
simd<T, 8> InVec(In + GlobalElemOffset + I);
simd<uint32_t, 8> offsets(I * sizeof(T), sizeof(T));
slm_scatter<T>(offsets, InVec);
}
}
barrier();

PropertiesT Props{};

simd<uint32_t, NOffsets> ByteOffsets(LocalElemOffset * sizeof(T),
VS * sizeof(T));
simd_view ByteOffsetsView = ByteOffsets.template select<NOffsets, 1>();

simd_mask<NOffsets> Pred;
for (int I = 0; I < NOffsets; I++)
Pred[I] = (I % MaskStride == 0) ? 1 : 0;

using Tuint = esimd_test::uint_type_t<sizeof(T)>;
simd<Tuint, N> PassThruInt(GlobalElemOffset, 1);
simd<T, N> PassThru = PassThruInt.template bit_cast_view<T>();
auto PassThruView = PassThru.template select<N, 1>(0);

simd<T, N> Vals;
if constexpr (VS > 1) { // VS > 1 requires specifying <T, N, VS>
if constexpr (UsePassThru) {
if constexpr (UseProperties) {
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
Vals = slm_gather<T, N, VS>(ByteOffsets, Pred, PassThru, Props);
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
Vals =
slm_gather<T, N, VS>(ByteOffsets, Pred, PassThruView, Props);
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
Vals =
slm_gather<T, N, VS>(ByteOffsetsView, Pred, PassThru, Props);
else // ByteOffset - view, PassThru - view
Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred, PassThruView,
Props);
} else { // UseProperties is false
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
Vals = slm_gather<T, N, VS>(ByteOffsets, Pred, PassThru);
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
Vals = slm_gather<T, N, VS>(ByteOffsets, Pred, PassThruView);
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred, PassThru);
else // ByteOffset - view, PassThru - view
Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred, PassThruView);
}
} else if constexpr (UseMask) { // UsePassThru is false
if constexpr (UseProperties) {
if (GlobalID % 2 == 0) // ByteOffset - simd
Vals = slm_gather<T, N, VS>(ByteOffsets, Pred, Props);
else // ByteOffset - simd_view
Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred, Props);
} else { // UseProperties is false
if (GlobalID % 2 == 0) // ByteOffset - simd
Vals = slm_gather<T, N, VS>(ByteOffsets, Pred);
else // ByteOffset - simd_view
Vals = slm_gather<T, N, VS>(ByteOffsetsView, Pred);
}
} else { // UseMask is false, UsePassThru is false
if constexpr (UseProperties) {
if (GlobalID % 2 == 0) // ByteOffset - simd
Vals = slm_gather<T, N, VS>(ByteOffsets, Props);
else // ByteOffset - simd_view
Vals = slm_gather<T, N, VS>(ByteOffsetsView, Props);
} else { // UseProperties is false
if (GlobalID % 2 == 0) // ByteOffset - simd
Vals = slm_gather<T, N, VS>(ByteOffsets);
else // ByteOffset - simd_view
Vals = slm_gather<T, N, VS>(ByteOffsetsView);
}
}
} else {
// if (VS == 1) then <T, N, VS> can often be omitted - test it here.
// The variants accepting simd_view for 'PassThru' operand though
// still require <T, N> to be specified explicitly to help
// C++ FE do simd to simd_view matching.
if constexpr (UsePassThru) {
if constexpr (UseProperties) {
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
Vals = slm_gather<T>(ByteOffsets, Pred, PassThru, Props);
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
Vals = slm_gather<T, N>(ByteOffsets, Pred, PassThruView, Props);
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
Vals = slm_gather<T, N>(ByteOffsetsView, Pred, PassThru, Props);
else // ByteOffset - view, PassThru - view
Vals =
slm_gather<T, N>(ByteOffsetsView, Pred, PassThruView, Props);
} else { // UseProperties is false
if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd
Vals = slm_gather<T>(ByteOffsets, Pred, PassThru);
else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view
Vals = slm_gather<T, N>(ByteOffsets, Pred, PassThruView);
else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd
Vals = slm_gather<T, N>(ByteOffsetsView, Pred, PassThru);
else // ByteOffset - view, PassThru - view
Vals = slm_gather<T, N>(ByteOffsetsView, Pred, PassThruView);
}
} else if constexpr (UseMask) { // UsePassThru is false
if constexpr (UseProperties) {
if (GlobalID % 2 == 0) // ByteOffset - simd
Vals = slm_gather<T>(ByteOffsets, Pred, Props);
else // ByteOffset - simd_view
Vals = slm_gather<T, N>(ByteOffsetsView, Pred, Props);
} else { // UseProperties is false
if (GlobalID % 2 == 0) // ByteOffset - simd
Vals = slm_gather<T>(ByteOffsets, Pred);
else // ByteOffset - simd_view
Vals = slm_gather<T, N>(ByteOffsetsView, Pred);
}
} else { // UsePassThru is false, UseMask is false
if constexpr (UseProperties) {
if (GlobalID % 2 == 0) // ByteOffset - simd
Vals = slm_gather<T>(ByteOffsets, Props);
else // ByteOffset - simd_view
Vals = slm_gather<T, N>(ByteOffsetsView, Props);
} else {
if (GlobalID % 2 == 0) // ByteOffset - simd
Vals = slm_gather<T>(ByteOffsets);
else // ByteOffset - simd_view
Vals = slm_gather<T, N>(ByteOffsetsView);
}
}
} // end if (VS == 1)
Vals.copy_to(Out + GlobalElemOffset);
}).wait();
} catch (sycl::exception const &e) {
std::cout << "SYCL exception caught: " << e.what() << '\n';
sycl::free(In, Q);
sycl::free(Out, Q);
return false;
}

bool Passed = verify(In, Out, N, Size, VS, MaskStride, UseMask, UsePassThru);
if (!Passed)
std::cout << "Case FAILED" << std::endl;

sycl::free(In, Q);
sycl::free(Out, Q);
return Passed;
}

template <typename T, TestFeatures Features> bool testSLM(queue Q) {
constexpr bool UseMask = true;
constexpr bool UsePassThru = true;
constexpr bool UseProperties = true;

properties AlignElemProps{alignment<sizeof(T)>};

bool Passed = true;
Passed &= testSLM<T, 1, 1, !UseMask, !UsePassThru, !UseProperties>(
Q, 2, AlignElemProps);
Passed &= testSLM<T, 2, 1, UseMask, !UsePassThru, !UseProperties>(
Q, 2, AlignElemProps);
Passed &= testSLM<T, 4, 1, UseMask, !UsePassThru, !UseProperties>(
Q, 2, AlignElemProps);
Passed &= testSLM<T, 8, 1, UseMask, !UsePassThru, UseProperties>(
Q, 3, AlignElemProps);
// UsePassThru requires either DG2/PVC or support of llvm.masked.gather LLVM
// IR.
#ifdef __ESIMD_GATHER_SCATTER_LLVM_IR
Passed &= testSLM<T, 16, 1, UseMask, UsePassThru, UseProperties>(
Q, 2, AlignElemProps);
Passed &= testSLM<T, 32, 1, UseMask, UsePassThru, !UseProperties>(
Q, 3, AlignElemProps);
#endif

// TODO: test non-power-of-2 N
// Such cases were promised to be supported, but in fact they fail.
// Create some test cases here after the issue in GPU driver is resolved.

if constexpr (Features == TestFeatures::PVC ||
Features == TestFeatures::DG2) {

// Check VS > 1. GPU supports only dwords and qwords in this mode.
if constexpr (sizeof(T) >= 4) {
// TODO: This test case causes flaky fail. Enable it after the issue
// in GPU driver is fixed.
// Passed &= testUSM<T, 16, 2, UseMask, !UsePassThru, UseProperties>(
// Q, 3, AlignElemProps);

Passed &= testSLM<T, 32, 2, !UseMask, !UsePassThru, UseProperties>(
Q, 3, AlignElemProps);
Passed &= testSLM<T, 32, 2, UseMask, !UsePassThru, UseProperties>(
Q, 3, AlignElemProps);
Passed &= testSLM<T, 32, 2, UseMask, UsePassThru, UseProperties>(
Q, 3, AlignElemProps);
}
}
return Passed;
}

template <typename T, TestFeatures Features> bool testACC(queue Q) {
constexpr bool UseMask = true;
constexpr bool UsePassThru = true;
Expand All @@ -478,12 +716,10 @@ template <typename T, TestFeatures Features> bool testACC(queue Q) {
bool Passed = true;
Passed &= testACC<T, 1, 1, !UseMask, !UsePassThru, !UseProperties>(
Q, 2, AlignElemProps);
#ifdef __ESIMD_FORCE_STATELESS_MEM
Passed &= testACC<T, 2, 1, UseMask, !UsePassThru, !UseProperties>(
Q, 2, AlignElemProps);
Passed &= testACC<T, 4, 1, UseMask, !UsePassThru, !UseProperties>(
Q, 2, AlignElemProps);
#endif // __ESIMD_FORCE_STATELESS_MEM
Passed &= testACC<T, 8, 1, UseMask, !UsePassThru, !UseProperties>(
Q, 3, AlignElemProps);
Passed &= testACC<T, 16, 1, UseMask, !UsePassThru, UseProperties>(
Expand Down
34 changes: 34 additions & 0 deletions sycl/test-e2e/ESIMD/unified_memory_api/slm_gather.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
//==------- slm_gather.cpp - DPC++ ESIMD on-device test --------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Use per-kernel compilation to have more information about failing cases.
// RUN: %{build} -fsycl-device-code-split=per_kernel -D__ESIMD_GATHER_SCATTER_LLVM_IR -o %t.out
// RUN: %{run} %t.out

// The test verifies esimd::slm_gather() functions accepting optional
// compile-time esimd::properties. The slm_gather() calls in this test do not
// use VS > 1 (number of loads per offset) to not impose using DG2/PVC features.

#include "Inputs/gather.hpp"

int main() {
auto Q = queue{gpu_selector_v};
esimd_test::printTestLabel(Q);

constexpr auto TestFeatures = TestFeatures::Generic;
bool Passed = true;

Passed &= testSLM<int8_t, TestFeatures>(Q);
Passed &= testSLM<int16_t, TestFeatures>(Q);
if (Q.get_device().has(sycl::aspect::fp16))
Passed &= testSLM<sycl::half, TestFeatures>(Q);
Passed &= testSLM<uint32_t, TestFeatures>(Q);
Passed &= testSLM<float, TestFeatures>(Q);
Passed &= testSLM<ext::intel::experimental::esimd::tfloat32, TestFeatures>(Q);
std::cout << (Passed ? "Passed\n" : "FAILED\n");
return Passed ? 0 : 1;
}
38 changes: 38 additions & 0 deletions sycl/test-e2e/ESIMD/unified_memory_api/slm_gather_dg2_pvc.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
//==------- slm_gather_dg2_pvc.cpp - DPC++ ESIMD on-device test ------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: gpu-intel-dg2 || gpu-intel-pvc
// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
// RUN: %{run} %t.out

// The test verifies esimd::slm_gather() functions accepting optional
// compile-time esimd::properties. The slm_gather() calls in this test use
// VS > 1 (number of loads per offset) and require DG2 or PVC to run.

#include "Inputs/gather.hpp"

int main() {
auto Q = queue{gpu_selector_v};
esimd_test::printTestLabel(Q);

// DG2 and PVC support same gather() configurations. If some gather call
// has corresponding instructions in PVC and does not have it in DG2, then
// GPU RT emulates it for DG2.
constexpr auto TestFeatures = TestFeatures::DG2;
bool Passed = true;

Passed &= testSLM<int8_t, TestFeatures>(Q);
Passed &= testSLM<int16_t, TestFeatures>(Q);
if (Q.get_device().has(sycl::aspect::fp16))
Passed &= testSLM<sycl::half, TestFeatures>(Q);
Passed &= testSLM<uint32_t, TestFeatures>(Q);
Passed &= testSLM<float, TestFeatures>(Q);
Passed &= testSLM<ext::intel::experimental::esimd::tfloat32, TestFeatures>(Q);

std::cout << (Passed ? "Passed\n" : "FAILED\n");
return Passed ? 0 : 1;
}
Loading

0 comments on commit 5582ce4

Please sign in to comment.