Skip to content

Commit

Permalink
[SYCL][ESIMD] atomic_update with data size less than 4 bytes should use LSC atomics (#13340)
Browse files Browse the repository at this point in the history

SVM doesn't support less than 4 bytes on Gen12; we either get an error
or the wrong answer.

---------

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
  • Loading branch information
sarnex committed Apr 11, 2024
1 parent 05644a4 commit 5332773
Show file tree
Hide file tree
Showing 2 changed files with 142 additions and 16 deletions.
32 changes: 18 additions & 14 deletions sycl/include/sycl/ext/intel/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5862,8 +5862,8 @@ __ESIMD_API simd<T, N> slm_atomic_update_impl(simd<uint32_t, N> offsets,
template <atomic_op Op, typename T, int N>
__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0, simd<T, N>>
slm_atomic_update(simd<uint32_t, N> byte_offset, simd_mask<N> mask = 1) {
// 2 byte, 8 byte types, non-power of two, and operations wider than 32 are
// supported only by LSC.
// 2 byte, 8 byte types, non-power of two, and operations wider than
// 32 are supported only by LSC.
if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
return slm_atomic_update_impl<Op, T, N,
Expand Down Expand Up @@ -5942,8 +5942,8 @@ template <atomic_op Op, typename T, int N>
__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1, simd<T, N>>
slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
simd_mask<N> mask = 1) {
// 2 byte, 8 byte types, non-power of two, and operations wider than 32 are
// supported only by LSC.
// 2 byte, 8 byte types, non-power of two, and operations wider than
// 32 are supported only by LSC.
if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
// half and short are supported in LSC.
Expand Down Expand Up @@ -6031,8 +6031,8 @@ template <atomic_op Op, typename T, int N>
__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2, simd<T, N>>
slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
simd<T, N> src1, simd_mask<N> mask = 1) {
// 2 byte, 8 byte types, non-power of two, and operations wider than 32 are
// supported only by LSC.
// 2 byte, 8 byte types, non-power of two, and operations wider than
// 32 are supported only by LSC.
if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
// 2-argument lsc_atomic_update arguments order matches the standard one -
Expand Down Expand Up @@ -6417,7 +6417,7 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd_mask<N> mask,
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");

if constexpr (detail::has_cache_hints<PropertyListT>() ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
!__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
return detail::atomic_update_impl<
Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>(
p, byte_offset, mask);
Expand Down Expand Up @@ -6640,7 +6640,7 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
if constexpr (detail::has_cache_hints<PropertyListT>() ||
(Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
(Op == atomic_op::fadd) || (Op == atomic_op::fsub) ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
!__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
return detail::atomic_update_impl<
Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>(
p, byte_offset, src0, mask);
Expand Down Expand Up @@ -6888,9 +6888,11 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");

// Use LSC atomic when cache hints are present, FP atomics is used,
// non-power of two length is used, or operation width greater than 32.
// non-power of two length is used, or operation width greater than 32, or the
// data size is less than 4 bytes.
if constexpr (detail::has_cache_hints<PropertyListT>() ||
Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) {
Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) ||
sizeof(T) < 4) {
// 2-argument lsc_atomic_update arguments order matches the standard one -
// expected value first, then new value. But atomic_update uses reverse
// order, hence the src1/src0 swap.
Expand Down Expand Up @@ -7116,7 +7118,7 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd_mask<N> mask,
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");

if constexpr (detail::has_cache_hints<PropertyListT>() ||
!detail::isPowerOf2(N, 32)) {
!detail::isPowerOf2(N, 32) || sizeof(T) < 4) {
return detail::atomic_update_impl<
Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
acc, byte_offset, mask);
Expand Down Expand Up @@ -7384,7 +7386,7 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
if constexpr (detail::has_cache_hints<PropertyListT>() ||
Op == atomic_op::fmin || Op == atomic_op::fmax ||
Op == atomic_op::fadd || Op == atomic_op::fsub ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
!__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
return detail::atomic_update_impl<
Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
acc, byte_offset, src0, mask);
Expand Down Expand Up @@ -7681,9 +7683,11 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
// Use LSC atomic when cache hints are present, FP atomics is used,
// non-power of two length is used, or operation width greater than 32.
// non-power of two length is used, operation width greater than 32, or the
// data size is less than 4 bytes.
if constexpr (detail::has_cache_hints<PropertyListT>() ||
Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) {
Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) ||
sizeof(T) < 4) {
// 2-argument lsc_atomic_update arguments order matches the standard one -
// expected value first, then new value. But atomic_update uses reverse
// order, hence the src1/src0 swap.
Expand Down
126 changes: 124 additions & 2 deletions sycl/test/esimd/memory_properties.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,17 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
atomic_update<atomic_op::inc, int, VL>(ptr, offsets, pred);
}

// Try with int16_t to check that LSC atomic is generated
// The result is later casted to int16, not captured here.
// CHECK: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> undef, <8 x i32> undef, i32 0, <8 x i32> undef)
{
int16_t *ptr = 0;
constexpr int VL = 8;
simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
auto atomic_res =
atomic_update<atomic_op::inc, int16_t, VL>(ptr, offsets);
}

// Accessor

// CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 8, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 {{[^)]+}}, <4 x i32> undef)
Expand Down Expand Up @@ -377,6 +388,19 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
auto atomic_res_acc =
atomic_update<atomic_op::inc, int, VL>(acc, offsets, pred);
}
// Try with int16_t to check that LSC atomic is generated
// The result is later casted to int16, not captured here.
// CHECK-STATEFUL: call <8 x i32> @llvm.genx.lsc.xatomic.bti.v8i32.v8i1.v8i32(<8 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <8 x i32> {{[^)]+}}, <8 x i32> undef, <8 x i32> undef, i32 {{[^)]+}}, <8 x i32> undef)
// CHECK-STATELESS: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> undef, <8 x i32> undef, i32 0, <8 x i32> undef)
{
using AccType =
sycl::accessor<int16_t, 1, sycl::access::mode::read_write>;
AccType *acc = nullptr;
constexpr int VL = 8;
simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
auto atomic_res =
atomic_update<atomic_op::inc, int16_t, VL>(*acc, offsets);
}
}

// Test atomic update with one operand.
Expand Down Expand Up @@ -432,6 +456,18 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
auto res_atomic_8 =
atomic_update<atomic_op::add, int>(ptr, offsets, add, pred);

// Try with int16_t to check that LSC atomic is generated
// The result is later casted to int16, not captured here.
// CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32>{{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
{
int16_t *ptr = 0;
constexpr int VL = 4;
simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
auto add = simd<int16_t, VL>(5);
auto atomic_res =
atomic_update<atomic_op::add, int16_t, VL>(ptr, offsets, add);
}

// Accessors

// CHECK-STATEFUL-COUNT-14: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 {{[^)]+}}, <4 x i32> undef)
Expand Down Expand Up @@ -483,6 +519,21 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
// CHECK-STATELESS: call <4 x i32> @llvm.genx.svm.atomic.sub.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef)
auto res_atomic_17 =
atomic_update<atomic_op::sub, int>(acc, offsets, add, pred);

// Try with int16_t to check that LSC atomic is generated
// The result is later casted to int16, not captured here.
// CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 {{[^)]+}}, <4 x i32> undef)
// CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
{
using AccType =
sycl::accessor<int16_t, 1, sycl::access::mode::read_write>;
AccType *acc = nullptr;
constexpr int VL = 4;
simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
auto add = simd<int16_t, VL>(5);
auto atomic_res =
atomic_update<atomic_op::add, int16_t, VL>(*acc, offsets, add);
}
}

// Test atomic update with two operands.
Expand Down Expand Up @@ -626,6 +677,19 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
auto res_atomic_100 = atomic_update<atomic_op::cmpxchg, int, VL>(
ptr, offsets, swap, compare, pred);

// Try with int16_t to check that LSC atomic is generated
// The result is later casted to int16, not captured here.
// CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
{
int16_t *ptr = 0;
constexpr int VL = 4;
simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
simd<int16_t, VL> swap = simd<int16_t, VL>(1) * sizeof(int);
auto compare = swap * 2;
auto atomic_res = atomic_update<atomic_op::cmpxchg, int16_t, VL>(
ptr, offsets, swap, compare);
}

// Accessors

// CHECK-STATEFUL-COUNT-30: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> undef)
Expand Down Expand Up @@ -751,6 +815,22 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
// CHECK-STATELESS: call <4 x i32> @llvm.genx.svm.atomic.cmpxchg.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef)
auto res_atomic_33 = atomic_update<atomic_op::cmpxchg, int, VL>(
acc, offsets, swap, compare, pred);

// Try with int16_t to check that LSC atomic is generated
// The result is later casted to int16, not captured here.
// CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> undef)
// CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
{
using AccType =
sycl::accessor<int16_t, 1, sycl::access::mode::read_write>;
AccType *acc = nullptr;
constexpr int VL = 4;
simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
simd<int16_t, VL> swap = simd<int16_t, VL>(1) * sizeof(int);
auto compare = swap * 2;
auto atomic_res = atomic_update<atomic_op::cmpxchg, int16_t, VL>(
*acc, offsets, compare, swap);
}
}

// Test slm_atomic_update without operands.
Expand Down Expand Up @@ -824,12 +904,11 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
{
constexpr int VL = 16;
simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
auto pred = simd_mask<VL>(1);
simd<int16_t, VL> add = simd<int16_t, VL>(1) * sizeof(int);

// CHECK: call <16 x i32> @llvm.genx.lsc.xatomic.slm.v16i32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <16 x i32> {{[^)]+}}, <16 x i32> {{[^)]+}}, <16 x i32> undef, i32 0, <16 x i32> undef)
auto res_slm_atomic_0 =
slm_atomic_update<atomic_op::add, int16_t>(offsets, add, pred);
slm_atomic_update<atomic_op::add, int16_t>(offsets, add);
}
// Expect DWORD for fmin.
{
Expand Down Expand Up @@ -934,6 +1013,19 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(),
compare_view.select<VL, 1>());

// Expect LSC for short.
{
constexpr int VL = 16;
simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
auto compare = simd<int16_t, VL>(VL, 1);
auto swap = compare * 2;

// CHECK: call <16 x i32> @llvm.genx.lsc.xatomic.slm.v16i32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <16 x i32> {{[^)]+}}, <16 x i32> {{[^)]+}}, <16 x i32> {{[^)]+}}, i32 0, <16 x i32> undef)
auto res_slm_atomic_0 =
slm_atomic_update<atomic_op::cmpxchg, int16_t, VL>(offsets, swap,
compare);
}

// Expect LSC for int64_t.
{
constexpr int VL = 16;
Expand Down Expand Up @@ -964,6 +1056,15 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
local_acc, offsets_view.select<VL, 1>(), pred);
auto res_slm_atomic_6 = atomic_update<atomic_op::inc, int, VL>(
local_acc, offsets_view.select<VL, 1>());

// Expect LSC for short.
{
using LocalAccType = sycl::local_accessor<int16_t, 1>;
LocalAccType *local_acc = nullptr;
// CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.slm.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 0, <4 x i32> undef)
auto res_slm_atomic_1 =
atomic_update<atomic_op::inc, int16_t>(*local_acc, offsets);
}
}
// One operand atomic.
{
Expand Down Expand Up @@ -997,6 +1098,16 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
pred);
res_slm_atomic_8 = atomic_update<atomic_op::add, int, VL>(
local_acc, offsets_view.select<VL, 1>(), add_view.select<VL, 1>());

// Expect LSC for short.
{
using LocalAccType = sycl::local_accessor<int16_t, 1>;
LocalAccType *local_acc = nullptr;
simd<int16_t, VL> add = simd<int16_t, VL>(1) * sizeof(int);
// CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.slm.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
auto res_slm_atomic_1 =
atomic_update<atomic_op::add, int16_t>(*local_acc, offsets, add);
}
}
// Two operand atomic.
{
Expand Down Expand Up @@ -1069,6 +1180,17 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
res_slm_atomic_16 = atomic_update<atomic_op::cmpxchg, int, VL>(
local_acc, offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(),
compare_view.select<VL, 1>());

// Expect LSC for short.
{
using LocalAccType = sycl::local_accessor<int16_t, 1>;
LocalAccType *local_acc = nullptr;
auto compare = simd<int16_t, VL>(VL, 1);
auto swap = compare * 2;
// CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.slm.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
auto res_slm_atomic_1 = atomic_update<atomic_op::cmpxchg, int16_t, VL>(
*local_acc, offsets, swap, compare);
}
}
}

Expand Down

0 comments on commit 5332773

Please sign in to comment.