Skip to content

Commit

Permalink
[SYCL][ESIMD] atomic_update with data size less than 4 bytes should use LSC atomics
Browse files Browse the repository at this point in the history

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
  • Loading branch information
sarnex committed Apr 9, 2024
1 parent cb28e09 commit 731ce6f
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -782,14 +782,14 @@ The template parameter `N` is the number of elements being atomically updated.
| `Function` | `Condition` | Required Intel GPU |
|-|-|-|
| `(usm-au0-*)`, `(acc-au0-*)` | !(cache-hints) and (`N` == 1,2,4,8,16,32) and (sizeof(T) >= 4) | Any Intel GPU |
| `(usm-au0-*)`, `(acc-au0-*)` | (cache-hints) or (`N` != 1,2,4,8,16,32) or (sizeof(T) == 2) | DG2 or PVC |
| `(usm-au0-*)`, `(acc-au0-*)` | (cache-hints) or (`N` != 1,2,4,8,16,32) or (sizeof(T) < 4) | DG2 or PVC |
| `(usm-au1-*)`, `(acc-au1-*)`, `(usm-au2-*)`, `(acc-au2-*)` | !(cache-hints) and (`N` == 1,2,4,8,16,32) and (sizeof(T) >= 4) and (`Op` is integral operation) | Any Intel GPU |
| `(usm-au1-*)`, `(acc-au1-*)`, `(usm-au2-*)`, `(acc-au2-*)` | (cache-hints) or (`N` != 1,2,4,8,16,32) or (sizeof(T) == 2) or (`Op` is FP operation) | DG2 or PVC |
| `(usm-au1-*)`, `(acc-au1-*)`, `(usm-au2-*)`, `(acc-au2-*)` | (cache-hints) or (`N` != 1,2,4,8,16,32) or (sizeof(T) < 4) or (`Op` is FP operation) | DG2 or PVC |
|-|-|-|
| `(slm-au0-*)`, `(lacc-au0-*)` | (`N` == 1,2,4,8,16,32) and (sizeof(T) == 4) | Any Intel GPU |
| `(slm-au0-*)`, `(lacc-au0-*)` | (`N` != 1,2,4,8,16,32) or (sizeof(T) == 2) or (sizeof(T) == 8)| DG2 or PVC |
| `(slm-au0-*)`, `(lacc-au0-*)` | (`N` != 1,2,4,8,16,32) or (sizeof(T) < 4) or (sizeof(T) == 8)| DG2 or PVC |
| `(slm-au1-*)`, `(lacc-au1-*)`, `(slm-au2-*)`, `(lacc-au2-*)` | (`N` == 1,2,4,8,16,32) and (sizeof(T) == 4) and (`Op` is integral operation) | Any Intel GPU |
| `(slm-au1-*)`, `(lacc-au1-*)`, `(slm-au2-*)`, `(lacc-au2-*)` | (`N` != 1,2,4,8,16,32) or (sizeof(T) == 2) or (sizeof(T) == 8) or (`Op` is FP operation)| DG2 or PVC |
| `(slm-au1-*)`, `(lacc-au1-*)`, `(slm-au2-*)`, `(lacc-au2-*)` | (`N` != 1,2,4,8,16,32) or (sizeof(T) < 4) or (sizeof(T) == 8) or (`Op` is FP operation)| DG2 or PVC |


## prefetch(...)
Expand Down
38 changes: 21 additions & 17 deletions sycl/include/sycl/ext/intel/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5862,9 +5862,9 @@ __ESIMD_API simd<T, N> slm_atomic_update_impl(simd<uint32_t, N> offsets,
template <atomic_op Op, typename T, int N>
__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0, simd<T, N>>
slm_atomic_update(simd<uint32_t, N> byte_offset, simd_mask<N> mask = 1) {
// 2 byte, 8 byte types, non-power of two, and operations wider than 32 are
// supported only by LSC.
if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
// 1 byte, 2 byte, 8 byte types, non-power of two, and operations wider than
// 32 are supported only by LSC.
if constexpr (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 8 ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
return slm_atomic_update_impl<Op, T, N,
detail::lsc_data_size::default_size>(
Expand Down Expand Up @@ -5942,9 +5942,9 @@ template <atomic_op Op, typename T, int N>
__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1, simd<T, N>>
slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
simd_mask<N> mask = 1) {
// 2 byte, 8 byte types, non-power of two, and operations wider than 32 are
// supported only by LSC.
if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
// 1 byte, 2 byte, 8 byte types, non-power of two, and operations wider than
// 32 are supported only by LSC.
if constexpr (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 8 ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
// half and short are supported in LSC.
return slm_atomic_update_impl<Op, T, N,
Expand Down Expand Up @@ -6031,9 +6031,9 @@ template <atomic_op Op, typename T, int N>
__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2, simd<T, N>>
slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
simd<T, N> src1, simd_mask<N> mask = 1) {
// 2 byte, 8 byte types, non-power of two, and operations wider than 32 are
// supported only by LSC.
if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
// 1 byte, 2 byte, 8 byte types, non-power of two, and operations wider than
// 32 are supported only by LSC.
if constexpr (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 8 ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
// 2-argument lsc_atomic_update arguments order matches the standard one -
// expected value first, then new value. But atomic_update uses reverse
Expand Down Expand Up @@ -6417,7 +6417,7 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd_mask<N> mask,
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");

if constexpr (detail::has_cache_hints<PropertyListT>() ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
!__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
return detail::atomic_update_impl<
Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>(
p, byte_offset, mask);
Expand Down Expand Up @@ -6640,7 +6640,7 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
if constexpr (detail::has_cache_hints<PropertyListT>() ||
(Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
(Op == atomic_op::fadd) || (Op == atomic_op::fsub) ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
!__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
return detail::atomic_update_impl<
Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>(
p, byte_offset, src0, mask);
Expand Down Expand Up @@ -6888,9 +6888,11 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");

// Use LSC atomic when cache hints are present, FP atomics is used,
// non-power of two length is used, or operation width greater than 32.
// non-power of two length is used, operation width greater than 32, or the
// data size is less than 4 bytes.
if constexpr (detail::has_cache_hints<PropertyListT>() ||
Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) {
Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) ||
sizeof(T) < 4) {
// 2-argument lsc_atomic_update arguments order matches the standard one -
// expected value first, then new value. But atomic_update uses reverse
// order, hence the src1/src0 swap.
Expand Down Expand Up @@ -7116,7 +7118,7 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd_mask<N> mask,
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");

if constexpr (detail::has_cache_hints<PropertyListT>() ||
!detail::isPowerOf2(N, 32)) {
!detail::isPowerOf2(N, 32) || sizeof(T) < 4) {
return detail::atomic_update_impl<
Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
acc, byte_offset, mask);
Expand Down Expand Up @@ -7384,7 +7386,7 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
if constexpr (detail::has_cache_hints<PropertyListT>() ||
Op == atomic_op::fmin || Op == atomic_op::fmax ||
Op == atomic_op::fadd || Op == atomic_op::fsub ||
!__ESIMD_DNS::isPowerOf2(N, 32)) {
!__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
return detail::atomic_update_impl<
Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
acc, byte_offset, src0, mask);
Expand Down Expand Up @@ -7681,9 +7683,11 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
// Use LSC atomic when cache hints are present, FP atomics is used,
// non-power of two length is used, or operation width greater than 32.
// non-power of two length is used, operation width greater than 32, or the
// data size is less than 4 bytes.
if constexpr (detail::has_cache_hints<PropertyListT>() ||
Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) {
Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) ||
sizeof(T) < 4) {
// 2-argument lsc_atomic_update arguments order matches the standard one -
// expected value first, then new value. But atomic_update uses reverse
// order, hence the src1/src0 swap.
Expand Down

0 comments on commit 731ce6f

Please sign in to comment.