From 5332773b17efbf10e1b72cd633c1d7e2b4f75125 Mon Sep 17 00:00:00 2001
From: Nick Sarnie
Date: Fri, 12 Apr 2024 01:51:19 +0900
Subject: [PATCH] [SYCL][ESIMD] atomic_update with data size less than 4 bytes
 should use LSC atomics (#13340)

SVM doesn't support less than 4 bytes on Gen12; we either get an error or the
wrong answer.

---------

Signed-off-by: Sarnie, Nick
---
 sycl/include/sycl/ext/intel/esimd/memory.hpp |  32 ++---
 sycl/test/esimd/memory_properties.cpp        | 126 ++++++++++++++++++-
 2 files changed, 142 insertions(+), 16 deletions(-)

diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
index 7ef701b7edc85..188bce9d59b62 100644
--- a/sycl/include/sycl/ext/intel/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -5862,8 +5862,8 @@ __ESIMD_API simd<T, N> slm_atomic_update_impl(simd<uint32_t, N> offsets,
 template <atomic_op Op, typename T, int N>
 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0, simd<T, N>>
 slm_atomic_update(simd<uint32_t, N> byte_offset, simd_mask<N> mask = 1) {
-  // 2 byte, 8 byte types, non-power of two, and operations wider than 32 are
-  // supported only by LSC.
+  // 2 byte, 8 byte types, non-power of two, and operations wider than
+  // 32 are supported only by LSC.
   if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
                 !__ESIMD_DNS::isPowerOf2(N, 32)) {
     return slm_atomic_update_impl<Op, T, N,
@@ ... @@ template <atomic_op Op, typename T, int N>
 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1, simd<T, N>>
 slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
                   simd_mask<N> mask = 1) {
-  // 2 byte, 8 byte types, non-power of two, and operations wider than 32 are
-  // supported only by LSC.
+  // 2 byte, 8 byte types, non-power of two, and operations wider than
+  // 32 are supported only by LSC.
   if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
                 !__ESIMD_DNS::isPowerOf2(N, 32)) {
     // half and short are supported in LSC.
@@ -6031,8 +6031,8 @@ template <atomic_op Op, typename T, int N>
 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2, simd<T, N>>
 slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
                   simd<T, N> src1, simd_mask<N> mask = 1) {
-  // 2 byte, 8 byte types, non-power of two, and operations wider than 32 are
-  // supported only by LSC.
+  // 2 byte, 8 byte types, non-power of two, and operations wider than
+  // 32 are supported only by LSC.
   if constexpr (sizeof(T) == 2 || sizeof(T) == 8 ||
                 !__ESIMD_DNS::isPowerOf2(N, 32)) {
     // 2-argument lsc_atomic_update arguments order matches the standard one -
@@ -6417,7 +6417,7 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd_mask<N> mask,
   static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
 
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
-                !__ESIMD_DNS::isPowerOf2(N, 32)) {
+                !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
     return detail::atomic_update_impl<
         Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>(
         p, byte_offset, mask);
@@ -6640,7 +6640,7 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
                 (Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
                 (Op == atomic_op::fadd) || (Op == atomic_op::fsub) ||
-                !__ESIMD_DNS::isPowerOf2(N, 32)) {
+                !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
     return detail::atomic_update_impl<
         Op, T, N, detail::lsc_data_size::default_size, PropertyListT, Toffset>(
         p, byte_offset, src0, mask);
@@ -6888,9 +6888,11 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
   static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
 
   // Use LSC atomic when cache hints are present, FP atomics is used,
-  // non-power of two length is used, or operation width greater than 32.
+  // non-power of two length is used, the operation width is greater than 32,
+  // or the data size is less than 4 bytes.
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
-                Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) {
+                Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) ||
+                sizeof(T) < 4) {
     // 2-argument lsc_atomic_update arguments order matches the standard one -
     // expected value first, then new value. But atomic_update uses reverse
     // order, hence the src1/src0 swap.
@@ -7116,7 +7118,7 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd_mask<N> mask,
   static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
 
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
-                !detail::isPowerOf2(N, 32)) {
+                !detail::isPowerOf2(N, 32) || sizeof(T) < 4) {
     return detail::atomic_update_impl<
         Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
         acc, byte_offset, mask);
@@ -7384,7 +7386,7 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
                 Op == atomic_op::fmin || Op == atomic_op::fmax ||
                 Op == atomic_op::fadd || Op == atomic_op::fsub ||
-                !__ESIMD_DNS::isPowerOf2(N, 32)) {
+                !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
     return detail::atomic_update_impl<
         Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
         acc, byte_offset, src0, mask);
@@ -7681,9 +7683,11 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
   static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
   static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
   // Use LSC atomic when cache hints are present, FP atomics is used,
-  // non-power of two length is used, or operation width greater than 32.
+  // non-power of two length is used, the operation width is greater than 32,
+  // or the data size is less than 4 bytes.
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
-                Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) {
+                Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) ||
+                sizeof(T) < 4) {
     // 2-argument lsc_atomic_update arguments order matches the standard one -
     // expected value first, then new value. But atomic_update uses reverse
     // order, hence the src1/src0 swap.
diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp
index 2c69d3a69d782..b23697d91922a 100644
--- a/sycl/test/esimd/memory_properties.cpp
+++ b/sycl/test/esimd/memory_properties.cpp
@@ -322,6 +322,17 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
     atomic_update(ptr, offsets, pred);
   }
 
+  // Try with int16_t to check that an LSC atomic is generated.
+  // The result is later cast to int16, not captured here.
+  // CHECK: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> undef, <8 x i32> undef, i32 0, <8 x i32> undef)
+  {
+    int16_t *ptr = 0;
+    constexpr int VL = 8;
+    simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+    auto atomic_res =
+        atomic_update<atomic_op::inc, int16_t, VL>(ptr, offsets);
+  }
+
   // Accessor
   // CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 8, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 {{[^)]+}}, <4 x i32> undef)
@@ -377,6 +388,19 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
   auto atomic_res_acc =
       atomic_update(acc, offsets, pred);
   }
+  // Try with int16_t to check that an LSC atomic is generated.
+  // The result is later cast to int16, not captured here.
+  // CHECK-STATEFUL: call <8 x i32> @llvm.genx.lsc.xatomic.bti.v8i32.v8i1.v8i32(<8 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <8 x i32> {{[^)]+}}, <8 x i32> undef, <8 x i32> undef, i32 {{[^)]+}}, <8 x i32> undef)
+  // CHECK-STATELESS: call <8 x i32> @llvm.genx.lsc.xatomic.stateless.v8i32.v8i1.v8i64(<8 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <8 x i64> {{[^)]+}}, <8 x i32> undef, <8 x i32> undef, i32 0, <8 x i32> undef)
+  {
+    using AccType =
+        sycl::accessor<int16_t, 1, sycl::access::mode::read_write>;
+    AccType *acc = nullptr;
+    constexpr int VL = 8;
+    simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+    auto atomic_res =
+        atomic_update<atomic_op::inc, int16_t, VL>(*acc, offsets);
+  }
 }
 
 // Test atomic update with one operand.
@@ -432,6 +456,18 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
   auto res_atomic_8 =
       atomic_update(ptr, offsets, add, pred);
 
+  // Try with int16_t to check that an LSC atomic is generated.
+  // The result is later cast to int16, not captured here.
+  // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32>{{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
+  {
+    int16_t *ptr = 0;
+    constexpr int VL = 4;
+    simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+    auto add = simd<int16_t, VL>(5);
+    auto atomic_res =
+        atomic_update<atomic_op::add, int16_t, VL>(ptr, offsets, add);
+  }
+
   // Accessors
   // CHECK-STATEFUL-COUNT-14: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 {{[^)]+}}, <4 x i32> undef)
@@ -483,6 +519,21 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
   // CHECK-STATELESS: call <4 x i32> @llvm.genx.svm.atomic.sub.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef)
   auto res_atomic_17 =
       atomic_update(acc, offsets, add, pred);
+
+  // Try with int16_t to check that an LSC atomic is generated.
+  // The result is later cast to int16, not captured here.
+  // CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 {{[^)]+}}, <4 x i32> undef)
+  // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
+  {
+    using AccType =
+        sycl::accessor<int16_t, 1, sycl::access::mode::read_write>;
+    AccType *acc = nullptr;
+    constexpr int VL = 4;
+    simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+    auto add = simd<int16_t, VL>(5);
+    auto atomic_res =
+        atomic_update<atomic_op::add, int16_t, VL>(*acc, offsets, add);
+  }
 }
 
 // Test atomic update with two operands.
@@ -626,6 +677,19 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
   auto res_atomic_100 = atomic_update(
       ptr, offsets, swap, compare, pred);
 
+  // Try with int16_t to check that an LSC atomic is generated.
+  // The result is later cast to int16, not captured here.
+  // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+  {
+    int16_t *ptr = 0;
+    constexpr int VL = 4;
+    simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+    simd<int16_t, VL> swap = simd<int16_t, VL>(1) * sizeof(int);
+    auto compare = swap * 2;
+    auto atomic_res = atomic_update<atomic_op::cmpxchg, int16_t, VL>(
+        ptr, offsets, swap, compare);
+  }
+
   // Accessors
   // CHECK-STATEFUL-COUNT-30: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> undef)
@@ -751,6 +815,22 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
   // CHECK-STATELESS: call <4 x i32> @llvm.genx.svm.atomic.cmpxchg.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef)
   auto res_atomic_33 = atomic_update(
       acc, offsets, swap, compare, pred);
+
+  // Try with int16_t to check that an LSC atomic is generated.
+  // The result is later cast to int16, not captured here.
+  // CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> undef)
+  // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+  {
+    using AccType =
+        sycl::accessor<int16_t, 1, sycl::access::mode::read_write>;
+    AccType *acc = nullptr;
+    constexpr int VL = 4;
+    simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+    simd<int16_t, VL> swap = simd<int16_t, VL>(1) * sizeof(int);
+    auto compare = swap * 2;
+    auto atomic_res = atomic_update<atomic_op::cmpxchg, int16_t, VL>(
+        *acc, offsets, swap, compare);
+  }
 }
 
 // Test slm_atomic_update without operands.
@@ -824,12 +904,11 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
   {
     constexpr int VL = 16;
     simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
-    auto pred = simd_mask<VL>(1);
     simd<int16_t, VL> add = simd<int16_t, VL>(1) * sizeof(int);
 
     // CHECK: call <16 x i32> @llvm.genx.lsc.xatomic.slm.v16i32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <16 x i32> {{[^)]+}}, <16 x i32> {{[^)]+}}, <16 x i32> undef, i32 0, <16 x i32> undef)
     auto res_slm_atomic_0 =
-        slm_atomic_update<atomic_op::add, int16_t, VL>(offsets, add, pred);
+        slm_atomic_update<atomic_op::add, int16_t, VL>(offsets, add);
   }
   // Expect DWORD for fmin.
   {
@@ -934,6 +1013,19 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
       offsets_view.select(), swap_view.select(),
       compare_view.select());
 
+  // Expect LSC for short.
+  {
+    constexpr int VL = 16;
+    simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int16_t);
+    auto compare = simd<int16_t, VL>(VL, 1);
+    auto swap = compare * 2;
+
+    // CHECK: call <16 x i32> @llvm.genx.lsc.xatomic.slm.v16i32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <16 x i32> {{[^)]+}}, <16 x i32> {{[^)]+}}, <16 x i32> {{[^)]+}}, i32 0, <16 x i32> undef)
+    auto res_slm_atomic_0 =
+        slm_atomic_update<atomic_op::cmpxchg, int16_t, VL>(offsets, swap,
+                                                           compare);
+  }
+
   // Expect LSC for int64_t.
   {
     constexpr int VL = 16;
@@ -964,6 +1056,15 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
       local_acc, offsets_view.select(), pred);
   auto res_slm_atomic_6 = atomic_update(
       local_acc, offsets_view.select());
+
+  // Expect LSC for short.
+  {
+    using LocalAccType = sycl::local_accessor<int16_t, 1>;
+    LocalAccType *local_acc = nullptr;
+    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.slm.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 8, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 0, <4 x i32> undef)
+    auto res_slm_atomic_1 =
+        atomic_update<atomic_op::inc, int16_t, VL>(*local_acc, offsets);
+  }
 }
 // One operand atomic.
 {
@@ -997,6 +1098,16 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
       pred);
   res_slm_atomic_8 = atomic_update(
       local_acc, offsets_view.select(), add_view.select());
+
+  // Expect LSC for short.
+  {
+    using LocalAccType = sycl::local_accessor<int16_t, 1>;
+    LocalAccType *local_acc = nullptr;
+    simd<int16_t, VL> add = simd<int16_t, VL>(1) * sizeof(int);
+    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.slm.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 12, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
+    auto res_slm_atomic_1 =
+        atomic_update<atomic_op::add, int16_t, VL>(*local_acc, offsets, add);
+  }
 }
 // Two operand atomic.
 {
@@ -1069,6 +1180,17 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
   res_slm_atomic_16 = atomic_update(
       local_acc, offsets_view.select(), swap_view.select(),
       compare_view.select());
+
+  // Expect LSC for short.
+  {
+    using LocalAccType = sycl::local_accessor<int16_t, 1>;
+    LocalAccType *local_acc = nullptr;
+    auto compare = simd<int16_t, VL>(VL, 1);
+    auto swap = compare * 2;
+    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.slm.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 18, i8 0, i8 0, i16 1, i32 0, i8 6, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    auto res_slm_atomic_1 = atomic_update<atomic_op::cmpxchg, int16_t, VL>(
+        *local_acc, offsets, swap, compare);
+  }
 }
 }
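
Note (not part of the patch): the user-visible pattern the fix addresses is a plain atomic_update on a pointer whose element type is narrower than a DWORD. A minimal sketch follows, assuming a Gen12-class device and the usual ESIMD setup; the queue q, the buffer data, and the vector length VL are made up for illustration.

#include <sycl/ext/intel/esimd.hpp>
#include <sycl/sycl.hpp>

namespace esimd = sycl::ext::intel::esimd;

int main() {
  sycl::queue q;
  constexpr int VL = 8;
  // 2-byte element type, i.e. sizeof(T) < 4: with this patch the calls below
  // lower to LSC atomics; the old Gen12 SVM path errored or gave wrong
  // results.
  uint16_t *data = sycl::malloc_shared<uint16_t>(VL, q);
  for (int i = 0; i < VL; ++i)
    data[i] = 0;

  q.parallel_for(sycl::nd_range<1>(1, 1),
                 [=](sycl::nd_item<1>) SYCL_ESIMD_KERNEL {
                   // Per-lane byte offsets: 0, 2, 4, ...
                   esimd::simd<uint32_t, VL> offsets(0, sizeof(uint16_t));
                   // No-operand form.
                   esimd::atomic_update<esimd::atomic_op::inc, uint16_t, VL>(
                       data, offsets);
                   // Two-operand form. Note the (swap, compare) operand
                   // order, i.e. new value first - the reverse of the
                   // underlying LSC intrinsic, as the comments in memory.hpp
                   // explain.
                   esimd::simd<uint16_t, VL> swap(2);
                   esimd::simd<uint16_t, VL> compare(1);
                   esimd::atomic_update<esimd::atomic_op::cmpxchg, uint16_t,
                                        VL>(data, offsets, swap, compare);
                 })
      .wait();
  sycl::free(data, q);
  return 0;
}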
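
Taken together, the touched overloads now implement one dispatch rule. A compact restatement (illustrative only; it folds in the new sizeof(T) < 4 trigger but omits the FP-op and fcmpxchg triggers that some overloads also check):

// Not the actual implementation. isPowerOf2(N, 32) in the patch means
// "N is a power of two and does not exceed 32".
template <typename T, int N, bool HasCacheHints>
constexpr bool uses_lsc_atomics() {
  constexpr bool NIsPow2LE32 = N > 0 && (N & (N - 1)) == 0 && N <= 32;
  return HasCacheHints     // cache hints exist only on the LSC path
         || !NIsPow2LE32   // widths only LSC can handle
         || sizeof(T) < 4; // new: sub-DWORD data, unsupported by Gen12 SVM
}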