Skip to content

Commit

Permalink
[SYCL][ESIMD] Emulate handling of 64 bit data by accessor version of …
Browse files Browse the repository at this point in the history
…gather/scatter (#12602)
  • Loading branch information
fineg74 authored Feb 13, 2024
1 parent 05740cc commit ffeeae2
Show file tree
Hide file tree
Showing 10 changed files with 183 additions and 90 deletions.
223 changes: 143 additions & 80 deletions sycl/include/sycl/ext/intel/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2699,32 +2699,40 @@ ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
uint32_t glob_offset, simd_mask<N> mask) {

static_assert(sizeof(T) <= 4 && detail::isPowerOf2(N, 32),
"Unexpected type or vector length");
constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
// TODO (performance) use hardware-supported scale once BE supports it
constexpr int16_t scale = 0;
const auto si = __ESIMD_NS::get_surface_index(acc);

if constexpr (sizeof(T) < 4) {
using Tint = std::conditional_t<std::is_integral_v<T>, T,
detail::uint_type_t<sizeof(T)>>;
using Treal = __raw_t<T>;
simd<Tint, N> vals_int = bitcast<Tint, Treal, N>(std::move(vals).data());
using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
int32_t, uint32_t>;
const simd<PromoT, N> promo_vals = convert<PromoT>(std::move(vals_int));
__esimd_scatter_scaled<PromoT, N, decltype(si), TypeSizeLog2, scale>(
mask.data(), si, glob_offset, offsets.data(), promo_vals.data());
static_assert(detail::isPowerOf2(N, 32), "Unexpected vector length");
if constexpr (sizeof(T) == 8) {
scatter_impl<uint32_t, N>(
acc, vals.template bit_cast_view<uint32_t>().template select<N, 2>(0),
offsets, glob_offset, mask);
scatter_impl<uint32_t, N>(
acc, vals.template bit_cast_view<uint32_t>().template select<N, 2>(1),
offsets, glob_offset + sizeof(uint32_t), mask);
} else {
using Treal = __raw_t<T>;
if constexpr (!std::is_same_v<Treal, T>) {
simd<Treal, N> Values = vals.template bit_cast_view<Treal>();
__esimd_scatter_scaled<Treal, N, decltype(si), TypeSizeLog2, scale>(
mask.data(), si, glob_offset, offsets.data(), Values.data());
constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
// TODO (performance) use hardware-supported scale once BE supports it
constexpr int16_t scale = 0;
const auto si = __ESIMD_NS::get_surface_index(acc);

if constexpr (sizeof(T) < 4) {
using Tint = std::conditional_t<std::is_integral_v<T>, T,
detail::uint_type_t<sizeof(T)>>;
using Treal = __raw_t<T>;
simd<Tint, N> vals_int = bitcast<Tint, Treal, N>(std::move(vals).data());
using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
int32_t, uint32_t>;
const simd<PromoT, N> promo_vals = convert<PromoT>(std::move(vals_int));
__esimd_scatter_scaled<PromoT, N, decltype(si), TypeSizeLog2, scale>(
mask.data(), si, glob_offset, offsets.data(), promo_vals.data());
} else {
__esimd_scatter_scaled<T, N, decltype(si), TypeSizeLog2, scale>(
mask.data(), si, glob_offset, offsets.data(), vals.data());
using Treal = __raw_t<T>;
if constexpr (!std::is_same_v<Treal, T>) {
simd<Treal, N> Values = vals.template bit_cast_view<Treal>();
__esimd_scatter_scaled<Treal, N, decltype(si), TypeSizeLog2, scale>(
mask.data(), si, glob_offset, offsets.data(), Values.data());
} else {
__esimd_scatter_scaled<T, N, decltype(si), TypeSizeLog2, scale>(
mask.data(), si, glob_offset, offsets.data(), vals.data());
}
}
}
}
Expand All @@ -2736,42 +2744,50 @@ __ESIMD_API std::enable_if_t<
simd<T, N>>
gather_impl(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset,
simd_mask<N> mask) {
static_assert(sizeof(T) <= 4 && detail::isPowerOf2(N, 32),
"Unexpected type or vector length");

constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
// TODO (performance) use hardware-supported scale once BE supports it
constexpr uint32_t scale = 0;
const auto si = get_surface_index(acc);

if constexpr (sizeof(T) < 4) {
using Tint = std::conditional_t<std::is_integral_v<T>, T,
detail::uint_type_t<sizeof(T)>>;
using Treal = __raw_t<T>;
static_assert(std::is_integral<Tint>::value,
"only integral 1- & 2-byte types are supported");
using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
int32_t, uint32_t>;
simd<PromoT, N> promo_vals =
__esimd_gather_masked_scaled2<PromoT, N, decltype(si), TypeSizeLog2,
scale>(si, glob_offset, offsets.data(),
mask.data());
auto Res = convert<Tint>(promo_vals);

if constexpr (!std::is_same_v<Tint, T>) {
return detail::bitcast<Treal, Tint, N>(Res.data());
} else {
return Res;
}
static_assert(detail::isPowerOf2(N, 32), "Unexpected vector length");

if constexpr (sizeof(T) == 8) {
simd<T, N> Res;
Res.template bit_cast_view<uint32_t>().template select<N, 2>(0) =
gather_impl<uint32_t, N>(acc, offsets, glob_offset, mask);
Res.template bit_cast_view<uint32_t>().template select<N, 2>(1) =
gather_impl<uint32_t, N>(acc, offsets, glob_offset + sizeof(uint32_t),
mask);
return Res;
} else {
using Treal = __raw_t<T>;
simd<Treal, N> Res = __esimd_gather_masked_scaled2<Treal, N, decltype(si),
TypeSizeLog2, scale>(
si, glob_offset, offsets.data(), mask.data());
if constexpr (!std::is_same_v<Treal, T>) {
return Res.template bit_cast_view<T>();
constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
// TODO (performance) use hardware-supported scale once BE supports it
constexpr uint32_t scale = 0;
const auto si = get_surface_index(acc);
if constexpr (sizeof(T) < 4) {
using Tint = std::conditional_t<std::is_integral_v<T>, T,
detail::uint_type_t<sizeof(T)>>;

static_assert(std::is_integral<Tint>::value,
"only integral 1- & 2-byte types are supported");
using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
int32_t, uint32_t>;
simd<PromoT, N> promo_vals =
__esimd_gather_masked_scaled2<PromoT, N, decltype(si), TypeSizeLog2,
scale>(si, glob_offset, offsets.data(),
mask.data());
auto Res = convert<Tint>(promo_vals);

if constexpr (!std::is_same_v<Tint, T>) {
return detail::bitcast<Treal, Tint, N>(Res.data());
} else {
return Res;
}
} else {
return Res;
simd<Treal, N> Res = __esimd_gather_masked_scaled2<Treal, N, decltype(si),
TypeSizeLog2, scale>(
si, glob_offset, offsets.data(), mask.data());
if constexpr (!std::is_same_v<Treal, T>) {
return Res.template bit_cast_view<T>();
} else {
return Res;
}
}
}
}
Expand Down Expand Up @@ -2927,7 +2943,7 @@ __ESIMD_API
return gather<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset),
byte_offsets, mask);
#else
if constexpr (sizeof(T) > 4 || !(detail::isPowerOf2(N, 32))) {
if constexpr (!detail::isPowerOf2(N, 32)) {
// Requires DG2 or PVC.
simd<T, N> PassThru; // Intentionally undefined
byte_offsets += glob_offset;
Expand Down Expand Up @@ -3136,7 +3152,7 @@ gather(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
"hint is cache_level::L2 now.");

if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
VS > 1 || sizeof(T) > 4 || !(detail::isPowerOf2(N, 32))) {
VS > 1 || !(detail::isPowerOf2(N, 32))) {
simd<T, N> PassThru; // Intentionally undefined
return detail::gather_impl<T, N, VS, L1Hint, L2Hint,
detail::lsc_data_size::default_size>(
Expand Down Expand Up @@ -3344,13 +3360,13 @@ gather(AccessorT acc, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
///
///
template <typename T, int N, typename AccessorTy>
__ESIMD_API std::enable_if_t<
(sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
detail::is_device_accessor_with_v<AccessorTy,
detail::accessor_mode_cap::can_write>>
scatter(AccessorTy acc, simd<detail::DeviceAccessorOffsetT, N> offsets,
simd<T, N> vals, detail::DeviceAccessorOffsetT glob_offset = 0,
simd_mask<N> mask = 1) {
__ESIMD_API
std::enable_if_t<(detail::isPowerOf2(N, 32)) &&
detail::is_device_accessor_with_v<
AccessorTy, detail::accessor_mode_cap::can_write>>
scatter(AccessorTy acc, simd<detail::DeviceAccessorOffsetT, N> offsets,
simd<T, N> vals, detail::DeviceAccessorOffsetT glob_offset = 0,
simd_mask<N> mask = 1) {
#ifdef __ESIMD_FORCE_STATELESS_MEM
scatter<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset), offsets,
vals, mask);
Expand All @@ -3362,7 +3378,7 @@ scatter(AccessorTy acc, simd<detail::DeviceAccessorOffsetT, N> offsets,
#ifdef __ESIMD_FORCE_STATELESS_MEM
template <typename T, int N, typename AccessorTy, typename Toffset>
__ESIMD_API std::enable_if_t<
(sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
(detail::isPowerOf2(N, 32)) &&
detail::is_device_accessor_with_v<AccessorTy,
detail::accessor_mode_cap::can_write> &&
std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>>
Expand Down Expand Up @@ -3902,9 +3918,27 @@ slm_gather(simd<uint32_t, N / VS> byte_offsets, simd_mask<N / VS> mask,
detail::lsc_data_size::default_size>(
byte_offsets, mask, pass_thru);
} else {
using MsgT = detail::__raw_t<T>;
return __esimd_slm_gather_ld<MsgT, N, Alignment>(
byte_offsets.data(), mask.data(), pass_thru.data());
if constexpr (sizeof(T) == 8) {
simd<T, N> Res;
Res.template bit_cast_view<uint32_t>().template select<N, 2>(0) =
__esimd_slm_gather_ld<uint32_t, N, Alignment>(
byte_offsets.data(), mask.data(),
(pass_thru.template bit_cast_view<uint32_t>()
.template select<N, 2>(0))
.data());
simd<uint32_t, N / VS> Offset = byte_offsets + sizeof(uint32_t);
Res.template bit_cast_view<uint32_t>().template select<N, 2>(1) =
__esimd_slm_gather_ld<uint32_t, N, sizeof(uint32_t)>(
Offset.data(), mask.data(),
(pass_thru.template bit_cast_view<uint32_t>()
.template select<N, 2>(1))
.data());
return Res;
} else {
using MsgT = detail::__raw_t<T>;
return __esimd_slm_gather_ld<MsgT, N, Alignment>(
byte_offsets.data(), mask.data(), pass_thru.data());
}
}
}

Expand Down Expand Up @@ -3943,16 +3977,30 @@ slm_gather(simd<uint32_t, N / VS> byte_offsets, simd_mask<N / VS> mask,
static_assert(Alignment >= sizeof(T),
"slm_gather() requires at least element-size alignment");

if constexpr (VS > 1 || (!(detail::isPowerOf2(N, 32) && sizeof(T) <= 4) &&
if constexpr (VS > 1 || (!detail::isPowerOf2(N, 32) &&
!detail::isMaskedGatherScatterLLVMAvailable())) {
simd<T, N> PassThru; // Intentionally undefined
return detail::slm_gather_impl<T, VS, detail::lsc_data_size::default_size>(
byte_offsets, mask, PassThru);
} else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
using MsgT = detail::__raw_t<T>;
simd<MsgT, N> PassThru; // it is intentionally undefined
return __esimd_slm_gather_ld<MsgT, N, Alignment>(
byte_offsets.data(), mask.data(), PassThru.data());
if constexpr (sizeof(T) == 8) {
simd<T, N> Res;
simd<uint32_t, N> PassThru; // it is intentionally undefined

Res.template bit_cast_view<uint32_t>().template select<N, 2>(0) =
__esimd_slm_gather_ld<uint32_t, N, Alignment>(
byte_offsets.data(), mask.data(), PassThru.data());
simd<uint32_t, N / VS> Offset = byte_offsets + sizeof(uint32_t);
Res.template bit_cast_view<uint32_t>().template select<N, 2>(1) =
__esimd_slm_gather_ld<uint32_t, N, sizeof(uint32_t)>(
Offset.data(), mask.data(), PassThru.data());
return Res;
} else {
using MsgT = detail::__raw_t<T>;
simd<MsgT, N> PassThru; // it is intentionally undefined
return __esimd_slm_gather_ld<MsgT, N, Alignment>(
byte_offsets.data(), mask.data(), PassThru.data());
}
} else {
detail::LocalAccessorMarker acc;
return detail::gather_impl<T, N>(acc, byte_offsets, 0, mask);
Expand Down Expand Up @@ -4236,15 +4284,30 @@ slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
"slm_scatter() requires at least element-size alignment");

// Use LSC lowering if VS > 1.
if constexpr (VS > 1 || (!(detail::isPowerOf2(N, 32) && sizeof(T) <= 4) &&
if constexpr (VS > 1 || (!detail::isPowerOf2(N, 32) &&
!detail::isMaskedGatherScatterLLVMAvailable())) {
__ESIMD_DNS::slm_scatter_impl<T, VS, detail::lsc_data_size::default_size>(
byte_offsets, vals, mask);
} else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
using MsgT = detail::__raw_t<T>;
__esimd_slm_scatter_st<MsgT, N, Alignment>(
sycl::bit_cast<__ESIMD_DNS::vector_type_t<MsgT, N>>(vals.data()),
byte_offsets.data(), mask.data());
if constexpr (sizeof(T) == 8) {
__esimd_slm_scatter_st<uint32_t, N, Alignment>(
vals.template bit_cast_view<uint32_t>()
.template select<N, 2>(0)
.data(),
byte_offsets.data(), mask.data());
simd<uint32_t, N / VS> Offset = byte_offsets + sizeof(uint32_t);
__esimd_slm_scatter_st<uint32_t, N, sizeof(uint32_t)>(
vals.template bit_cast_view<uint32_t>()
.template select<N, 2>(1)
.data(),
Offset.data(), mask.data());

} else {
using MsgT = detail::__raw_t<T>;
__esimd_slm_scatter_st<MsgT, N, Alignment>(
sycl::bit_cast<__ESIMD_DNS::vector_type_t<MsgT, N>>(vals.data()),
byte_offsets.data(), mask.data());
}
} else {
detail::LocalAccessorMarker acc;
detail::scatter_impl<T, N>(acc, vals, byte_offsets, 0, mask);
Expand Down
8 changes: 8 additions & 0 deletions sycl/test-e2e/ESIMD/api/slm_gather_scatter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,5 +127,13 @@ int main(void) {
passed &= test<half, 32>(q);
}

passed &= test<int64_t, 16>(q);
passed &= test<int64_t, 32>(q);

if (dev.has(sycl::aspect::fp64)) {
passed &= test<double, 16>(q);
passed &= test<double, 32>(q);
}

return passed ? 0 : 1;
}
9 changes: 9 additions & 0 deletions sycl/test-e2e/ESIMD/api/slm_gather_scatter_heavy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,15 @@ int main() {
passed &= test_vl1<half, 7>(q);
passed &= test<half, 16, 2>(q);
}
if (dev.has(sycl::aspect::fp64)) {
passed &= test<double, 8, 2>(q);
passed &= test<double, 16, 5>(q);
passed &= test<double, 32, 3>(q);
}

passed &= test<int64_t, 8, 2>(q);
passed &= test<int64_t, 16, 5>(q);
passed &= test<int64_t, 32, 3>(q);

std::cout << (!passed ? "TEST FAILED\n" : "TEST Passed\n");
return passed ? 0 : 1;
Expand Down
8 changes: 4 additions & 4 deletions sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -509,13 +509,13 @@ bool testSLM(queue Q, uint32_t MaskStride, PropertiesT) {
uint32_t LocalElemOffset = LocalID * N;

// Allocate a bit more to safely initialize it with 8-element chunks.
constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T);
constexpr uint32_t SLMSize = (Threads * N) * sizeof(T);
slm_init<SLMSize>();

if (LocalID == 0) {
for (int I = 0; I < Threads * N; I += 8) {
simd<T, 8> InVec(In + GlobalElemOffset + I);
simd<uint32_t, 8> offsets(I * sizeof(T), sizeof(T));
for (int I = 0; I < Threads * N; I++) {
simd<T, 1> InVec(In + GlobalElemOffset + I);
simd<uint32_t, 1> offsets(I * sizeof(T), sizeof(T));
slm_scatter<T>(offsets, InVec);
}
}
Expand Down
2 changes: 0 additions & 2 deletions sycl/test-e2e/ESIMD/unified_memory_api/gather_acc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,9 @@ int main() {
Passed &= testACC<uint32_t, TestFeatures>(Q);
Passed &= testACC<float, TestFeatures>(Q);
Passed &= testACC<ext::intel::experimental::esimd::tfloat32, TestFeatures>(Q);
#ifdef __ESIMD_FORCE_STATELESS_MEM
Passed &= testACC<int64_t, TestFeatures>(Q);
if (Q.get_device().has(sycl::aspect::fp64))
Passed &= testACC<double, TestFeatures>(Q);
#endif // __ESIMD_FORCE_STATELESS_MEM
std::cout << (Passed ? "Passed\n" : "FAILED\n");
return Passed ? 0 : 1;
}
2 changes: 0 additions & 2 deletions sycl/test-e2e/ESIMD/unified_memory_api/gather_acc_dg2_pvc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,9 @@ int main() {
Passed &= testACC<uint32_t, TestFeatures>(Q);
Passed &= testACC<float, TestFeatures>(Q);
Passed &= testACC<ext::intel::experimental::esimd::tfloat32, TestFeatures>(Q);
#ifdef __ESIMD_FORCE_STATELESS_MEM
Passed &= testACC<int64_t, TestFeatures>(Q);
if (Q.get_device().has(sycl::aspect::fp64))
Passed &= testACC<double, TestFeatures>(Q);
#endif // __ESIMD_FORCE_STATELESS_MEM
std::cout << (Passed ? "Passed\n" : "FAILED\n");
return Passed ? 0 : 1;
}
3 changes: 3 additions & 0 deletions sycl/test-e2e/ESIMD/unified_memory_api/slm_gather.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ int main() {
Passed &= testSLM<uint32_t, TestFeatures>(Q);
Passed &= testSLM<float, TestFeatures>(Q);
Passed &= testSLM<ext::intel::experimental::esimd::tfloat32, TestFeatures>(Q);
Passed &= testSLM<int64_t, TestFeatures>(Q);
if (Q.get_device().has(sycl::aspect::fp64))
Passed &= testSLM<double, TestFeatures>(Q);
std::cout << (Passed ? "Passed\n" : "FAILED\n");
return Passed ? 0 : 1;
}
3 changes: 3 additions & 0 deletions sycl/test-e2e/ESIMD/unified_memory_api/slm_gather_dg2_pvc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ int main() {
Passed &= testSLM<uint32_t, TestFeatures>(Q);
Passed &= testSLM<float, TestFeatures>(Q);
Passed &= testSLM<ext::intel::experimental::esimd::tfloat32, TestFeatures>(Q);
Passed &= testSLM<int64_t, TestFeatures>(Q);
if (Q.get_device().has(sycl::aspect::fp64))
Passed &= testSLM<double, TestFeatures>(Q);

std::cout << (Passed ? "Passed\n" : "FAILED\n");
return Passed ? 0 : 1;
Expand Down
Loading

0 comments on commit ffeeae2

Please sign in to comment.