Skip to content

Commit

Permalink
[SYCL][ESIMD] Implement unified memory API - block_store(usm, ...) (#11641)

Browse files Browse the repository at this point in the history

This change adds the groundwork for adding overloads of the block_store
APIs accepting compile time properties (L1,L2 cache hints, alignment).
We have 8 overloads total, with various combinations of offset, predicate and simd_view.

---------

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
  • Loading branch information
sarnex authored Oct 30, 2023
1 parent 0b5757b commit d38206c
Show file tree
Hide file tree
Showing 9 changed files with 684 additions and 130 deletions.
33 changes: 33 additions & 0 deletions sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,39 @@ __esimd_lsc_load_stateless(__ESIMD_DNS::simd_mask_storage_t<N> pred,
}
#endif // __SYCL_DEVICE_ONLY__

/// USM pointer scatter.
/// Supported platforms: DG2, PVC
///
/// Scatters elements to specific address.
///
/// @tparam Ty is element type.
/// @tparam L1H is L1 cache hint.
/// @tparam L2H is L2 cache hint.
/// @tparam AddressScale is the address scale.
/// @tparam ImmOffset is the immediate offset added to each address.
/// @tparam DS is the data size.
/// @tparam VS is the number of elements to store per address.
/// @tparam Transposed indicates if the data is transposed during the transfer.
/// @tparam N is the SIMD size of operation (the number of addresses to access)
/// @param pred is the store predicate; lanes whose predicate is 0 are not
///        written.
/// @param addrs is the store addresses.
/// @param vals is values to store.
template <typename Ty, __ESIMD_NS::cache_hint L1H, __ESIMD_NS::cache_hint L2H,
          uint16_t AddressScale, int ImmOffset, __ESIMD_DNS::lsc_data_size DS,
          __ESIMD_DNS::lsc_vector_size VS,
          __ESIMD_DNS::lsc_data_order _Transposed, int N>
__ESIMD_INTRIN void __esimd_lsc_store_stateless(
    __ESIMD_DNS::simd_mask_storage_t<N> pred,
    __ESIMD_DNS::vector_type_t<uintptr_t, N> addrs,
    __ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_DNS::to_int<VS>()> vals)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else // __SYCL_DEVICE_ONLY__
{
  // The LSC stateless-store intrinsic is device-only; calling it on host is
  // an error.
  __ESIMD_UNSUPPORTED_ON_HOST;
}
#endif // __SYCL_DEVICE_ONLY__

// \brief Raw sends.
//
// @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT).
Expand Down
337 changes: 336 additions & 1 deletion sycl/include/sycl/ext/intel/esimd/memory.hpp

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -282,39 +282,6 @@ __ESIMD_INTRIN void __esimd_lsc_store_bti(
}
#endif // __SYCL_DEVICE_ONLY__

/// USM pointer scatter.
/// Supported platforms: DG2, PVC
///
/// Scatters elements to specific address.
///
/// @tparam Ty is element type.
/// @tparam L1H is L1 cache hint.
/// @tparam L3H is L3 cache hint.
/// @tparam AddressScale is the address scale.
/// @tparam ImmOffset is the immediate offset added to each address.
/// @tparam DS is the data size.
/// @tparam VS is the number of elements to store per address.
/// @tparam Transposed indicates if the data is transposed during the transfer.
/// @tparam N is the SIMD size of operation (the number of addresses to access)
/// @param pred is the store predicate; lanes whose predicate is 0 are not
///        written.
/// @param addrs is the store addresses.
/// @param vals is values to store.
template <typename Ty, __ESIMD_ENS::cache_hint L1H, __ESIMD_ENS::cache_hint L3H,
          uint16_t AddressScale, int ImmOffset, __ESIMD_ENS::lsc_data_size DS,
          __ESIMD_EDNS::lsc_vector_size VS,
          __ESIMD_EDNS::lsc_data_order _Transposed, int N>
__ESIMD_INTRIN void __esimd_lsc_store_stateless(
    __ESIMD_DNS::simd_mask_storage_t<N> pred,
    __ESIMD_DNS::vector_type_t<uintptr_t, N> addrs,
    __ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()> vals)
#ifdef __SYCL_DEVICE_ONLY__
    ;
#else // __SYCL_DEVICE_ONLY__
{
  // The LSC stateless-store intrinsic is device-only; calling it on host is
  // an error.
  __ESIMD_UNSUPPORTED_ON_HOST;
}
#endif // __SYCL_DEVICE_ONLY__

/// 2D USM pointer block load.
/// Supported platforms: PVC
///
Expand Down
60 changes: 3 additions & 57 deletions sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1765,7 +1765,7 @@ lsc_scatter(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
///
/// @tparam T is element type.
/// @tparam NElts is the number of elements to store per address.
/// @tparam DS is the data size.
/// @tparam DS is the data size (unused/obsolete).
/// @tparam L1H is L1 cache hint.
/// @tparam L3H is L3 cache hint.
/// @param p is the base pointer.
Expand All @@ -1781,62 +1781,8 @@ template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
__ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v<FlagsT>>
lsc_block_store(T *p, __ESIMD_NS::simd<T, NElts> vals,
__ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) {
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
constexpr auto Alignment =
FlagsT::template alignment<__ESIMD_DNS::__raw_t<T>>;
static_assert(
(Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) ||
(Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4),
"Incorrect alignment for the data type");

// Prepare template arguments for the call of intrinsic.
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
static_assert(_DS == lsc_data_size::u16 || _DS == lsc_data_size::u8 ||
_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Conversion data types are not supported");
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd<uintptr_t, N> Addrs = reinterpret_cast<uintptr_t>(p);

constexpr int SmallIntFactor32Bit =
(_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0,
"Number of elements is not supported by Transposed store");

constexpr bool Use64BitData =
Alignment >= __ESIMD_DNS::OperandSize::QWORD &&
(sizeof(T) == 8 ||
(DS == lsc_data_size::default_size && NElts / SmallIntFactor32Bit > 64 &&
(NElts * sizeof(T)) % 8 == 0));
constexpr int SmallIntFactor64Bit =
(_DS == lsc_data_size::u16)
? 4
: (_DS == lsc_data_size::u8 ? 8
: (_DS == lsc_data_size::u32 ? 2 : 1));
constexpr int SmallIntFactor =
Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit;
constexpr int FactoredNElts = NElts / SmallIntFactor;
constexpr lsc_data_size ActualDS = Use64BitData
? __ESIMD_ENS::lsc_data_size::u64
: __ESIMD_ENS::lsc_data_size::u32;

detail::check_lsc_vector_size<FactoredNElts>();
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<FactoredNElts>();

using StoreType = __ESIMD_DNS::__raw_t<
std::conditional_t<SmallIntFactor == 1, T,
std::conditional_t<Use64BitData, uint64_t, uint32_t>>>;

__esimd_lsc_store_stateless<StoreType, L1H, L3H, _AddressScale, _ImmOffset,
ActualDS, _VS, _Transposed, N>(
pred.data(), Addrs.data(),
sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreType, FactoredNElts>>(
vals.data()));
return __ESIMD_DNS::block_store_impl<T, NElts, L1H, L3H>(p, vals, pred,
flags);
}

/// A variation of lsc_block_store without predicate parameter to simplify
Expand Down
40 changes: 1 addition & 39 deletions sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_load.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,49 +13,11 @@
#include <iostream>

#include "../../esimd_test_utils.hpp"
#include "common.hpp"

using namespace sycl;
using namespace sycl::ext::intel::esimd;

// Extracts the cache hint bound to the property key `Key` from the
// compile-time properties list; yields cache_hint::none when the key is
// not present.
template <typename Key, typename PropertiesT>
constexpr cache_hint getCacheHint(PropertiesT) {
  if constexpr (!PropertiesT::template has_property<Key>()) {
    return cache_hint::none;
  } else {
    constexpr auto Prop = PropertiesT::template get_property<Key>();
    return Prop.hint;
  }
}

// Returns the alignment requested via the esimd::alignment property, falling
// back to `DefaultAlignment` when the properties list carries none.
template <typename PropertiesT>
constexpr size_t getAlignment(PropertiesT, size_t DefaultAlignment) {
  using AlignKey = sycl::ext::intel::esimd::alignment_key;
  if constexpr (!PropertiesT::template has_property<AlignKey>()) {
    return DefaultAlignment;
  } else {
    constexpr auto AlignProp =
        PropertiesT::template get_property<AlignKey>();
    return AlignProp.value;
  }
}

// Computes the alignment to use for a block load/store of N elements of type
// T, validating any explicitly requested alignment against the minimum the
// target requires. Cache hints or masking imply the PVC code path, which has
// stricter alignment requirements (8 bytes for transfers over 256 bytes,
// otherwise 4); without them the natural element alignment suffices.
template <typename T, uint16_t N, bool UseMask, typename PropertiesT>
constexpr size_t getAlignment(PropertiesT Props) {
  constexpr cache_hint L1Hint =
      getCacheHint<sycl::ext::intel::esimd::cache_hint_L1_key>(Props);
  constexpr cache_hint L2Hint =
      getCacheHint<sycl::ext::intel::esimd::cache_hint_L2_key>(Props);
  constexpr bool RequiresPVC =
      UseMask || L1Hint != cache_hint::none || L2Hint != cache_hint::none;

  constexpr size_t RequiredAlignment =
      !RequiresPVC ? sizeof(T) : ((N * sizeof(T) > 256) ? 8 : 4);
  // An explicit alignment property may raise, but never lower, the minimum.
  constexpr size_t RequestedAlignment = getAlignment(Props, RequiredAlignment);
  static_assert(RequestedAlignment >= RequiredAlignment, "Too small alignment");
  return RequestedAlignment;
}

// Returns true iff verification is passed.
template <typename T>
bool verify(const T *In, const T *Out, size_t Size, int N,
Expand Down
178 changes: 178 additions & 0 deletions sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_store.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
//==------- block_store.hpp - DPC++ ESIMD on-device test ----------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===-------------------------------------------------------------------===//

#include "common.hpp"

using namespace sycl;
using namespace sycl::ext::intel::esimd;

// Tests block_store() to USM memory. Each work-item owns N consecutive
// elements; it stores them, loads them back, increments and re-stores them
// through the different block_store() overloads (with/without a byte offset,
// simd vs. simd_view argument, with/without compile-time properties and
// predicate), then the host verifies the final memory contents.
//
// @tparam T element type.
// @tparam N SIMD size (elements per work-item).
// @tparam UseMask when true, exercise the predicated overloads.
// @tparam UseProperties when true, pass the compile-time properties argument.
// @param Q the queue to run on.
// @param Groups number of work-groups.
// @param Threads work-items per group.
// @param StoreProperties compile-time properties to pass to block_store().
// @return true iff verification passed.
template <typename T, uint16_t N, bool UseMask, bool UseProperties,
          typename StorePropertiesT>
bool testUSM(queue Q, uint32_t Groups, uint32_t Threads,
             StorePropertiesT StoreProperties) {

  // uint32_t: Groups * Threads * N can exceed 65535 for large configurations,
  // which would silently truncate in a uint16_t.
  uint32_t Size = Groups * Threads * N;
  using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t<sizeof(T)>;

  std::cout << "USM case: T=" << esimd_test::type_name<T>() << ",N=" << N
            << ",UseMask=" << UseMask << ",UseProperties=" << UseProperties
            << std::endl;

  sycl::range<1> GlobalRange{Groups};
  sycl::range<1> LocalRange{Threads};
  sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
  constexpr size_t Alignment = getAlignment<T, N, UseMask>(StoreProperties);
  T *Out = sycl::aligned_alloc_shared<T>(Alignment, Size, Q);
  // Pre-fill the buffer with a random sentinel so masked-off (unwritten)
  // elements can be verified to stay untouched.
  T Out_val = esimd_test::getRandomValue<T>();
  for (int i = 0; i < Size; i++)
    Out[i] = Out_val;

  try {
    Q.submit([&](handler &cgh) {
      cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL {
        uint16_t GlobalID = ndi.get_global_id(0);
        uint32_t ElemOff = GlobalID * N;
        // TODO: these 2 lines work-around the problem with scalar
        // conversions to bfloat16. It could be just: "simd<T, N>
        // PassThru(ElemOffset, 1);"
        simd<uint32_t, N> PassThruInt(ElemOff, 1);
        simd<T, N> Vals = PassThruInt;
        if constexpr (UseMask) {
          // Alternate the predicate between work-items so that both the
          // enabled- and disabled-mask store paths are exercised.
          // (Previously "% 1", which is always 0 and made every masked
          // store a no-op, verifying nothing.)
          simd_mask<1> Mask = (GlobalID + 1) % 2;
          block_store(Out + ElemOff, Vals, Mask, StorePropertiesT{});
          Vals = block_load<T, N>(Out + ElemOff);
          Vals += 1;
          block_store(Out, ElemOff * sizeof(T), Vals, Mask,
                      StorePropertiesT{});
          Vals = block_load<T, N>(Out + ElemOff);
          Vals += 2;
          auto View = Vals.template select<N, 1>();
          block_store<T, N>(Out, ElemOff * sizeof(T), View, Mask,
                            StorePropertiesT{});
          Vals = block_load<T, N>(Out + ElemOff);
          Vals += 3;
          View = Vals.template select<N, 1>();
          block_store<T, N>(Out + ElemOff, View, Mask, StorePropertiesT{});
        } else {
          if constexpr (UseProperties)
            block_store(Out + ElemOff, Vals, StorePropertiesT{});

          else
            block_store(Out + ElemOff, Vals);

          Vals = block_load<T, N>(Out + ElemOff);
          Vals += 1;
          if constexpr (UseProperties)
            block_store(Out, ElemOff * sizeof(T), Vals, StorePropertiesT{});
          else
            block_store(Out, ElemOff * sizeof(T), Vals);

          Vals = block_load<T, N>(Out + ElemOff);
          Vals += 2;
          auto View = Vals.template select<N, 1>();
          if constexpr (UseProperties)
            block_store<T, N>(Out, ElemOff * sizeof(T), View,
                              StorePropertiesT{});
          else
            block_store<T, N>(Out, ElemOff * sizeof(T), View);

          Vals = block_load<T, N>(Out + ElemOff);
          Vals += 3;
          View = Vals.template select<N, 1>();
          if constexpr (UseProperties)
            block_store<T, N>(Out + ElemOff, View, StorePropertiesT{});
          else
            block_store<T, N>(Out + ElemOff, View);
        }
      });
    }).wait();
  } catch (sycl::exception const &e) {
    std::cout << "SYCL exception caught: " << e.what() << '\n';
    sycl::free(Out, Q);
    return false;
  }

  bool Passed = true;

  for (int i = 0; i < Size; i++) {
    // Must mirror the kernel's mask expression: (GlobalID + 1) % 2, where
    // GlobalID == i / N. (Previously "% 1" — always false.)
    bool IsMaskSet = (i / N + 1) % 2;
    Tuint Expected = sycl::bit_cast<Tuint>(Out_val);
    if (!UseMask || IsMaskSet)
      // Each element accumulates +1, +2, +3 on top of its initial value i.
      Expected = sycl::bit_cast<Tuint>((T)(i + 6));
    // Compare bit patterns so NaNs and denormals verify exactly.
    Tuint Computed = sycl::bit_cast<Tuint>(Out[i]);
    if (Computed != Expected) {
      Passed = false;
      std::cout << "Out[" << i << "] = " << std::to_string(Computed) << " vs "
                << std::to_string(Expected) << std::endl;
    }
  }

  sycl::free(Out, Q);

  return Passed;
}

// Runs testUSM over a representative set of SIMD sizes for element type T.
// When TestPVCFeatures is true, also runs the variants that require PVC
// hardware (cache hints, predicated stores).
//
// @tparam T element type under test.
// @tparam TestPVCFeatures enable the PVC-only test cases.
// @param Q the queue to run on.
// @return true iff every sub-test passed.
template <typename T, bool TestPVCFeatures> bool test_block_store(queue Q) {
  constexpr bool CheckMask = true;
  constexpr bool CheckProperties = true;
  // Requires only the natural element alignment -> runs on Gen12 as well.
  properties AlignOnlyProps{alignment<sizeof(T)>};

  bool Passed = true;

  // Test block_store() that is available on Gen12 and PVC.
  Passed &= testUSM<T, 1, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  Passed &= testUSM<T, 2, !CheckMask, CheckProperties>(Q, 1, 4, AlignOnlyProps);
  Passed &= testUSM<T, 3, !CheckMask, CheckProperties>(Q, 2, 8, AlignOnlyProps);
  Passed &= testUSM<T, 4, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  Passed &= testUSM<T, 8, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  Passed &=
      testUSM<T, 16, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  Passed &=
      testUSM<T, 32, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  // Intentionally check non-power-of-2 simd size - it must work.
  Passed &=
      testUSM<T, 33, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  // TODO: Enable after failure fixed
  // Passed &=
  //    testUSM<T, 67, !CheckMask, CheckProperties>(Q, 1, 4, AlignOnlyProps);
  // Intentionally check big simd size - it must work.
  Passed &=
      testUSM<T, 128, !CheckMask, CheckProperties>(Q, 2, 4, AlignOnlyProps);
  Passed &=
      testUSM<T, 256, !CheckMask, CheckProperties>(Q, 1, 4, AlignOnlyProps);

  // Test block_store() without passing compile-time properties argument.
  Passed &=
      testUSM<T, 16, !CheckMask, !CheckProperties>(Q, 2, 4, AlignOnlyProps);
  Passed &=
      testUSM<T, 32, !CheckMask, !CheckProperties>(Q, 2, 4, AlignOnlyProps);

  if constexpr (TestPVCFeatures) {
    // Using cache hints adds the requirement to run tests on PVC.
    // Also, PVC variant currently requires power-of-two elements and
    // the number of bytes stored per call must not exceed 512.
    properties PVCProps{cache_hint_L1<cache_hint::write_back>,
                        cache_hint_L2<cache_hint::write_back>, alignment<16>};

    if constexpr (sizeof(T) >= 4) // only d/q words are supported now
      Passed &= testUSM<T, 1, !CheckMask, CheckProperties>(Q, 2, 4, PVCProps);
    if constexpr (sizeof(T) >= 2) // only d/q words are supported now
      Passed &= testUSM<T, 2, !CheckMask, CheckProperties>(Q, 5, 5, PVCProps);
    Passed &= testUSM<T, 4, !CheckMask, CheckProperties>(Q, 5, 5, PVCProps);
    Passed &= testUSM<T, 8, !CheckMask, CheckProperties>(Q, 5, 5, PVCProps);
    Passed &= testUSM<T, 16, CheckMask, CheckProperties>(Q, 5, 5, PVCProps);
    Passed &= testUSM<T, 32, !CheckMask, CheckProperties>(Q, 2, 4, PVCProps);
    Passed &= testUSM<T, 64, !CheckMask, CheckProperties>(Q, 7, 1, PVCProps);
    // Guard against exceeding the 512-byte per-call limit for large T.
    if constexpr (128 * sizeof(T) <= 512)
      Passed &= testUSM<T, 128, CheckMask, CheckProperties>(Q, 1, 4, PVCProps);
    if constexpr (256 * sizeof(T) <= 512)
      Passed &= testUSM<T, 256, CheckMask, CheckProperties>(Q, 1, 4, PVCProps);
  } // TestPVCFeatures

  return Passed;
}
Loading

0 comments on commit d38206c

Please sign in to comment.