diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
index 4ffd5c418138c..d5e62d1ebb175 100644
--- a/sycl/include/sycl/ext/intel/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -2699,32 +2699,40 @@ ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
 scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
              uint32_t glob_offset, simd_mask<N> mask) {
-  static_assert(sizeof(T) <= 4 && detail::isPowerOf2(N, 32),
-                "Unexpected type or vector length");
-  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
-  // TODO (performance) use hardware-supported scale once BE supports it
-  constexpr int16_t scale = 0;
-  const auto si = __ESIMD_NS::get_surface_index(acc);
-
-  if constexpr (sizeof(T) < 4) {
-    using Tint = std::conditional_t<std::is_integral_v<T>, T,
-                                    detail::uint_type_t<sizeof(T)>>;
-    using Treal = __raw_t<T>;
-    simd<Tint, N> vals_int = bitcast<Tint, Treal, N>(std::move(vals).data());
-    using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
-                                               int32_t, uint32_t>;
-    const simd<PromoT, N> promo_vals = convert<PromoT>(std::move(vals_int));
-    __esimd_scatter_scaled<PromoT, N, decltype(si), TypeSizeLog2, scale>(
-        mask.data(), si, glob_offset, offsets.data(), promo_vals.data());
+  static_assert(detail::isPowerOf2(N, 32), "Unexpected vector length");
+  if constexpr (sizeof(T) == 8) {
+    scatter_impl<uint32_t, N>(
+        acc, vals.template bit_cast_view<uint32_t>().template select<N, 2>(0),
+        offsets, glob_offset, mask);
+    scatter_impl<uint32_t, N>(
+        acc, vals.template bit_cast_view<uint32_t>().template select<N, 2>(1),
+        offsets, glob_offset + sizeof(uint32_t), mask);
   } else {
-    using Treal = __raw_t<T>;
-    if constexpr (!std::is_same_v<Treal, T>) {
-      simd<Treal, N> Values = vals.template bit_cast_view<Treal>();
-      __esimd_scatter_scaled<Treal, N, decltype(si), TypeSizeLog2, scale>(
-          mask.data(), si, glob_offset, offsets.data(), Values.data());
+    constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
+    // TODO (performance) use hardware-supported scale once BE supports it
+    constexpr int16_t scale = 0;
+    const auto si = __ESIMD_NS::get_surface_index(acc);
+
+    if constexpr (sizeof(T) < 4) {
+      using Tint = std::conditional_t<std::is_integral_v<T>, T,
+                                      detail::uint_type_t<sizeof(T)>>;
+      using Treal = __raw_t<T>;
+      simd<Tint, N> vals_int = bitcast<Tint, Treal, N>(std::move(vals).data());
+      using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
+                                                 int32_t, uint32_t>;
+      const simd<PromoT, N> promo_vals = convert<PromoT>(std::move(vals_int));
+      __esimd_scatter_scaled<PromoT, N, decltype(si), TypeSizeLog2, scale>(
+          mask.data(), si, glob_offset, offsets.data(), promo_vals.data());
     } else {
-      __esimd_scatter_scaled<T, N, decltype(si), TypeSizeLog2, scale>(
-          mask.data(), si, glob_offset, offsets.data(), vals.data());
+      using Treal = __raw_t<T>;
+      if constexpr (!std::is_same_v<Treal, T>) {
+        simd<Treal, N> Values = vals.template bit_cast_view<Treal>();
+        __esimd_scatter_scaled<Treal, N, decltype(si), TypeSizeLog2, scale>(
+            mask.data(), si, glob_offset, offsets.data(), Values.data());
+      } else {
+        __esimd_scatter_scaled<T, N, decltype(si), TypeSizeLog2, scale>(
+            mask.data(), si, glob_offset, offsets.data(), vals.data());
+      }
     }
   }
 }
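Note on the scatter_impl hunk above: for 8-byte element types the new branch recurses into two 32-bit scatters — the even lanes of the `bit_cast_view<uint32_t>()` view (the low words on a little-endian target) go to `glob_offset`, the odd lanes (high words) to `glob_offset + sizeof(uint32_t)`. A minimal host-side sketch of why that reproduces a single 64-bit scatter; this is plain C++, not ESIMD, and `scatter32`/`scatter64_emulated` are hypothetical names used only for illustration:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Baseline: a masked 32-bit scatter into a byte buffer.
void scatter32(uint8_t *buf, const std::vector<uint32_t> &vals,
               const std::vector<uint32_t> &byte_offsets,
               const std::vector<bool> &mask) {
  for (size_t i = 0; i < vals.size(); ++i)
    if (mask[i])
      std::memcpy(buf + byte_offsets[i], &vals[i], sizeof(uint32_t));
}

// Emulated 64-bit scatter: low words at off, high words at off + 4,
// mirroring the sizeof(T) == 8 branch added to scatter_impl.
void scatter64_emulated(uint8_t *buf, const std::vector<uint64_t> &vals,
                        const std::vector<uint32_t> &byte_offsets,
                        const std::vector<bool> &mask) {
  std::vector<uint32_t> lo(vals.size()), hi(vals.size());
  std::vector<uint32_t> hi_off(byte_offsets);
  for (size_t i = 0; i < vals.size(); ++i) {
    lo[i] = static_cast<uint32_t>(vals[i]);       // select<N, 2>(0) half
    hi[i] = static_cast<uint32_t>(vals[i] >> 32); // select<N, 2>(1) half
    hi_off[i] += sizeof(uint32_t);                // glob_offset + 4
  }
  scatter32(buf, lo, byte_offsets, mask);
  scatter32(buf, hi, hi_off, mask);
}

int main() {
  // Assumes a little-endian host, as the real lowering does.
  uint8_t buf[64] = {};
  std::vector<uint64_t> vals = {0x1122334455667788ULL, 0xAABBCCDDEEFF0011ULL};
  std::vector<uint32_t> offs = {0, 16};
  scatter64_emulated(buf, vals, offs, {true, true});
  uint64_t back;
  std::memcpy(&back, buf + 16, sizeof(back));
  assert(back == 0xAABBCCDDEEFF0011ULL); // whole 64-bit value lands intact
}
```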
@@ -2736,42 +2744,50 @@ __ESIMD_API std::enable_if_t<
     simd<T, N>>
 gather_impl(AccessorTy acc, simd<uint32_t, N> offsets, uint32_t glob_offset,
             simd_mask<N> mask) {
-  static_assert(sizeof(T) <= 4 && detail::isPowerOf2(N, 32),
-                "Unexpected type or vector length");
-
-  constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
-  // TODO (performance) use hardware-supported scale once BE supports it
-  constexpr uint32_t scale = 0;
-  const auto si = get_surface_index(acc);
-
-  if constexpr (sizeof(T) < 4) {
-    using Tint = std::conditional_t<std::is_integral_v<T>, T,
-                                    detail::uint_type_t<sizeof(T)>>;
-    using Treal = __raw_t<T>;
-    static_assert(std::is_integral<Tint>::value,
-                  "only integral 1- & 2-byte types are supported");
-    using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
-                                               int32_t, uint32_t>;
-    simd<PromoT, N> promo_vals =
-        __esimd_gather_masked_scaled2<PromoT, N, decltype(si), TypeSizeLog2,
-                                      scale>(si, glob_offset, offsets.data(),
-                                             mask.data());
-    auto Res = convert<Tint>(promo_vals);
-
-    if constexpr (!std::is_same_v<Tint, T>) {
-      return detail::bitcast<Treal, Tint, N>(Res.data());
-    } else {
-      return Res;
-    }
+  static_assert(detail::isPowerOf2(N, 32), "Unexpected vector length");
+
+  if constexpr (sizeof(T) == 8) {
+    simd<T, N> Res;
+    Res.template bit_cast_view<uint32_t>().template select<N, 2>(0) =
+        gather_impl<uint32_t, N>(acc, offsets, glob_offset, mask);
+    Res.template bit_cast_view<uint32_t>().template select<N, 2>(1) =
+        gather_impl<uint32_t, N>(acc, offsets, glob_offset + sizeof(uint32_t),
+                                 mask);
+    return Res;
   } else {
     using Treal = __raw_t<T>;
-    simd<Treal, N> Res = __esimd_gather_masked_scaled2<Treal, N, decltype(si),
-                                                       TypeSizeLog2, scale>(
-        si, glob_offset, offsets.data(), mask.data());
-    if constexpr (!std::is_same_v<Treal, T>) {
-      return Res.template bit_cast_view<T>();
+    constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
+    // TODO (performance) use hardware-supported scale once BE supports it
+    constexpr uint32_t scale = 0;
+    const auto si = get_surface_index(acc);
+    if constexpr (sizeof(T) < 4) {
+      using Tint = std::conditional_t<std::is_integral_v<T>, T,
+                                      detail::uint_type_t<sizeof(T)>>;
+
+      static_assert(std::is_integral<Tint>::value,
+                    "only integral 1- & 2-byte types are supported");
+      using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
+                                                 int32_t, uint32_t>;
+      simd<PromoT, N> promo_vals =
+          __esimd_gather_masked_scaled2<PromoT, N, decltype(si), TypeSizeLog2,
+                                        scale>(si, glob_offset, offsets.data(),
+                                               mask.data());
+      auto Res = convert<Tint>(promo_vals);
+
+      if constexpr (!std::is_same_v<Tint, T>) {
+        return detail::bitcast<Treal, Tint, N>(Res.data());
+      } else {
+        return Res;
+      }
     } else {
-      return Res;
+      simd<Treal, N> Res =
+          __esimd_gather_masked_scaled2<Treal, N, decltype(si), TypeSizeLog2,
+                                        scale>(si, glob_offset, offsets.data(),
+                                               mask.data());
+      if constexpr (!std::is_same_v<Treal, T>) {
+        return Res.template bit_cast_view<T>();
+      } else {
+        return Res;
+      }
     }
   }
 }
@@ -2927,7 +2943,7 @@ __ESIMD_API
   return gather<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset),
                       byte_offsets, mask);
 #else
-  if constexpr (sizeof(T) > 4 || !(detail::isPowerOf2(N, 32))) {
+  if constexpr (!detail::isPowerOf2(N, 32)) {
     // Requires DG2 or PVC.
     simd<T, N> PassThru; // Intentionally undefined
     byte_offsets += glob_offset;
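The gather_impl hunk is the mirror image: two 32-bit gathers, the second at `glob_offset + sizeof(uint32_t)`, zipped back into 64-bit lanes through the interleaving `bit_cast_view`. A hedged host-side sketch of the reassembly (`gather32`/`gather64_emulated` are made-up names; assumes a little-endian host, as the real code does):

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Baseline: a 32-bit gather from a byte buffer.
std::vector<uint32_t> gather32(const uint8_t *buf,
                               const std::vector<uint32_t> &byte_offsets) {
  std::vector<uint32_t> out(byte_offsets.size());
  for (size_t i = 0; i < out.size(); ++i)
    std::memcpy(&out[i], buf + byte_offsets[i], sizeof(uint32_t));
  return out;
}

// Emulated 64-bit gather: fetch low and high words separately, then zip
// them into 64-bit lanes, as the sizeof(T) == 8 branch does with
// bit_cast_view<uint32_t>().select<N, 2>(0/1).
std::vector<uint64_t> gather64_emulated(const uint8_t *buf,
                                        std::vector<uint32_t> byte_offsets) {
  auto lo = gather32(buf, byte_offsets);
  for (auto &off : byte_offsets)
    off += sizeof(uint32_t); // glob_offset + 4 for the high half
  auto hi = gather32(buf, byte_offsets);
  std::vector<uint64_t> out(lo.size());
  for (size_t i = 0; i < out.size(); ++i)
    out[i] = (uint64_t(hi[i]) << 32) | lo[i];
  return out;
}

int main() {
  uint64_t src[2] = {42, 0xDEADBEEFCAFEF00DULL};
  auto res = gather64_emulated(reinterpret_cast<uint8_t *>(src), {0, 8});
  assert(res[0] == 42 && res[1] == 0xDEADBEEFCAFEF00DULL);
}
```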
@@ -3136,7 +3152,7 @@ gather(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
                 "hint is cache_level::L2 now.");
 
   if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
-                VS > 1 || sizeof(T) > 4 || !(detail::isPowerOf2(N, 32))) {
+                VS > 1 || !(detail::isPowerOf2(N, 32))) {
     simd<T, N> PassThru; // Intentionally undefined
     return detail::gather_impl<T, N, VS, detail::lsc_data_size::default_size,
                                L1Hint, L2Hint>(
@@ -3344,13 +3360,13 @@ gather(AccessorT acc, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
 ///
 ///
 template <typename T, int N, typename AccessorTy>
-__ESIMD_API std::enable_if_t<
-    (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
-    detail::is_device_accessor_with_v<AccessorTy,
-                                      detail::accessor_mode_cap::can_write>>
-scatter(AccessorTy acc, simd<detail::DeviceAccessorOffsetT, N> offsets,
-        simd<T, N> vals, detail::DeviceAccessorOffsetT glob_offset = 0,
-        simd_mask<N> mask = 1) {
+__ESIMD_API
+    std::enable_if_t<(detail::isPowerOf2(N, 32)) &&
+                     detail::is_device_accessor_with_v<
+                         AccessorTy, detail::accessor_mode_cap::can_write>>
+    scatter(AccessorTy acc, simd<detail::DeviceAccessorOffsetT, N> offsets,
+            simd<T, N> vals, detail::DeviceAccessorOffsetT glob_offset = 0,
+            simd_mask<N> mask = 1) {
 #ifdef __ESIMD_FORCE_STATELESS_MEM
   scatter<T, N>(__ESIMD_DNS::accessorToPointer<T>(acc, glob_offset), offsets,
                 vals, mask);
@@ -3362,7 +3378,7 @@ scatter(AccessorTy acc, simd<detail::DeviceAccessorOffsetT, N> offsets,
 #ifdef __ESIMD_FORCE_STATELESS_MEM
 template <typename T, int N, typename AccessorTy, typename Toffset>
 __ESIMD_API std::enable_if_t<
-    (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) &&
+    (detail::isPowerOf2(N, 32)) &&
     detail::is_device_accessor_with_v<AccessorTy,
                                       detail::accessor_mode_cap::can_write> &&
     std::is_integral_v<Toffset> && !std::is_same_v<Toffset, uint64_t>>
@@ -3902,9 +3918,27 @@ slm_gather(simd<uint32_t, N / VS> byte_offsets, simd_mask<N / VS> mask,
                                    detail::lsc_data_size::default_size>(
         byte_offsets, mask, pass_thru);
   } else {
-    using MsgT = detail::__raw_t<T>;
-    return __esimd_slm_gather_ld<MsgT, N, Alignment>(
-        byte_offsets.data(), mask.data(), pass_thru.data());
+    if constexpr (sizeof(T) == 8) {
+      simd<T, N> Res;
+      Res.template bit_cast_view<uint32_t>().template select<N, 2>(0) =
+          __esimd_slm_gather_ld<uint32_t, N, Alignment>(
+              byte_offsets.data(), mask.data(),
+              (pass_thru.template bit_cast_view<uint32_t>()
+                   .template select<N, 2>(0))
+                  .data());
+      simd<uint32_t, N / VS> Offset = byte_offsets + sizeof(uint32_t);
+      Res.template bit_cast_view<uint32_t>().template select<N, 2>(1) =
+          __esimd_slm_gather_ld<uint32_t, N, sizeof(uint32_t)>(
+              Offset.data(), mask.data(),
+              (pass_thru.template bit_cast_view<uint32_t>()
+                   .template select<N, 2>(1))
+                  .data());
+      return Res;
+    } else {
+      using MsgT = detail::__raw_t<T>;
+      return __esimd_slm_gather_ld<MsgT, N, Alignment>(
+          byte_offsets.data(), mask.data(), pass_thru.data());
+    }
   }
 }
@@ -3943,16 +3977,30 @@ slm_gather(simd<uint32_t, N / VS> byte_offsets, simd_mask<N / VS> mask,
   static_assert(Alignment >= sizeof(T),
                 "slm_gather() requires at least element-size alignment");
 
-  if constexpr (VS > 1 || (!(detail::isPowerOf2(N, 32) && sizeof(T) <= 4) &&
+  if constexpr (VS > 1 || (!detail::isPowerOf2(N, 32) &&
                            !detail::isMaskedGatherScatterLLVMAvailable())) {
     simd<T, N> PassThru; // Intentionally undefined
     return detail::slm_gather_impl<T, VS, detail::lsc_data_size::default_size>(
         byte_offsets, mask, PassThru);
   } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
-    using MsgT = detail::__raw_t<T>;
-    simd<MsgT, N> PassThru; // it is intentionally undefined
-    return __esimd_slm_gather_ld<MsgT, N, Alignment>(
-        byte_offsets.data(), mask.data(), PassThru.data());
+    if constexpr (sizeof(T) == 8) {
+      simd<T, N> Res;
+      simd<uint32_t, N> PassThru; // it is intentionally undefined
+
+      Res.template bit_cast_view<uint32_t>().template select<N, 2>(0) =
+          __esimd_slm_gather_ld<uint32_t, N, Alignment>(
+              byte_offsets.data(), mask.data(), PassThru.data());
+      simd<uint32_t, N / VS> Offset = byte_offsets + sizeof(uint32_t);
+      Res.template bit_cast_view<uint32_t>().template select<N, 2>(1) =
+          __esimd_slm_gather_ld<uint32_t, N, sizeof(uint32_t)>(
+              Offset.data(), mask.data(), PassThru.data());
+      return Res;
+    } else {
+      using MsgT = detail::__raw_t<T>;
+      simd<MsgT, N> PassThru; // it is intentionally undefined
+      return __esimd_slm_gather_ld<MsgT, N, Alignment>(
+          byte_offsets.data(), mask.data(), PassThru.data());
+    }
   } else {
     detail::LocalAccessorMarker acc;
     return detail::gather_impl<T, N>(acc, byte_offsets, 0, mask);
@@ -4236,15 +4284,30 @@ slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
                 "slm_scatter() requires at least element-size alignment");
 
   // Use LSC lowering if VS > 1.
-  if constexpr (VS > 1 || (!(detail::isPowerOf2(N, 32) && sizeof(T) <= 4) &&
+  if constexpr (VS > 1 || (!detail::isPowerOf2(N, 32) &&
                            !detail::isMaskedGatherScatterLLVMAvailable())) {
     __ESIMD_DNS::slm_scatter_impl<T, VS, detail::lsc_data_size::default_size>(
         byte_offsets, vals, mask);
   } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
-    using MsgT = detail::__raw_t<T>;
-    __esimd_slm_scatter_st<MsgT, N, Alignment>(
-        sycl::bit_cast<__ESIMD_DNS::vector_type_t<MsgT, N>>(vals.data()),
-        byte_offsets.data(), mask.data());
+    if constexpr (sizeof(T) == 8) {
+      __esimd_slm_scatter_st<uint32_t, N, Alignment>(
+          vals.template bit_cast_view<uint32_t>()
+              .template select<N, 2>(0)
+              .data(),
+          byte_offsets.data(), mask.data());
+      simd<uint32_t, N / VS> Offset = byte_offsets + sizeof(uint32_t);
+      __esimd_slm_scatter_st<uint32_t, N, sizeof(uint32_t)>(
+          vals.template bit_cast_view<uint32_t>()
+              .template select<N, 2>(1)
+              .data(),
+          Offset.data(), mask.data());
+
+    } else {
+      using MsgT = detail::__raw_t<T>;
+      __esimd_slm_scatter_st<MsgT, N, Alignment>(
+          sycl::bit_cast<__ESIMD_DNS::vector_type_t<MsgT, N>>(vals.data()),
+          byte_offsets.data(), mask.data());
+    }
   } else {
     detail::LocalAccessorMarker acc;
     detail::scatter_impl<T, N>(acc, vals, byte_offsets, 0, mask);
diff --git a/sycl/test-e2e/ESIMD/api/slm_gather_scatter.cpp b/sycl/test-e2e/ESIMD/api/slm_gather_scatter.cpp
index e2824d49a0e6b..bcf4e7ecb8db9 100644
--- a/sycl/test-e2e/ESIMD/api/slm_gather_scatter.cpp
+++ b/sycl/test-e2e/ESIMD/api/slm_gather_scatter.cpp
@@ -127,5 +127,13 @@ int main(void) {
     passed &= test<half, 16, 16>(q);
   }
 
+  passed &= test<int64_t, 16, 16>(q);
+  passed &= test<uint64_t, 32, 32>(q);
+
+  if (dev.has(sycl::aspect::fp64)) {
+    passed &= test<double, 16, 16>(q);
+    passed &= test<double, 32, 32>(q);
+  }
+
   return passed ? 0 : 1;
 }
diff --git a/sycl/test-e2e/ESIMD/api/slm_gather_scatter_heavy.cpp b/sycl/test-e2e/ESIMD/api/slm_gather_scatter_heavy.cpp
index 82e45a2a40648..576838ad6a062 100644
--- a/sycl/test-e2e/ESIMD/api/slm_gather_scatter_heavy.cpp
+++ b/sycl/test-e2e/ESIMD/api/slm_gather_scatter_heavy.cpp
@@ -469,6 +469,15 @@ int main() {
     passed &= test_vl1<half>(q);
     passed &= test<half, 16, 16>(q);
   }
+  if (dev.has(sycl::aspect::fp64)) {
+    passed &= test<double, 8, 8>(q);
+    passed &= test<double, 16, 16>(q);
+    passed &= test<double, 32, 32>(q);
+  }
+
+  passed &= test<int64_t, 8, 8>(q);
+  passed &= test<int64_t, 16, 16>(q);
+  passed &= test<int64_t, 32, 32>(q);
 
   std::cout << (!passed ? "TEST FAILED\n" : "TEST Passed\n");
   return passed ? 0 : 1;
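One detail worth calling out in the SLM hunks above: the first half-access keeps the caller's `Alignment`, but the second is issued with `sizeof(uint32_t)`, because shifting every offset by 4 bytes invalidates any stronger guarantee. A tiny standalone C++17 sketch of that reasoning (names are illustrative, not from the library):

```cpp
#include <cstdint>
#include <numeric>

// After adding sizeof(uint32_t) == 4 to every byte offset, the strongest
// alignment that still provably holds is gcd(original alignment, 4).
constexpr uint32_t half_access_alignment(uint32_t alignment64) {
  return std::gcd(alignment64, 4u);
}

static_assert(half_access_alignment(8) == 4);  // the common 64-bit case
static_assert(half_access_alignment(16) == 4); // over-aligned input, same
```

This is also why the FileCheck expectations added to memory_properties.cpp at the end of this patch look for one intrinsic call with `i32 8` and one with `i32 4`.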
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp
index e34f259c093ec..153707a79f9c4 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp
@@ -509,13 +509,13 @@ bool testSLM(queue Q, uint32_t MaskStride, PropertiesT) {
 
       uint32_t LocalElemOffset = LocalID * N;
       // Allocate a bit more to safely initialize it with 8-element chunks.
-      constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T);
+      constexpr uint32_t SLMSize = (Threads * N) * sizeof(T);
       slm_init<SLMSize>();
 
       if (LocalID == 0) {
-        for (int I = 0; I < Threads * N; I += 8) {
-          simd<T, 8> InVec(In + GlobalElemOffset + I);
-          simd<uint32_t, 8> offsets(I * sizeof(T), sizeof(T));
+        for (int I = 0; I < Threads * N; I++) {
+          simd<T, 1> InVec(In + GlobalElemOffset + I);
+          simd<uint32_t, 1> offsets(I * sizeof(T), sizeof(T));
           slm_scatter(offsets, InVec);
         }
       }
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/gather_acc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/gather_acc.cpp
index de67e4eaa5e4b..1d8d4c3c802fe 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/gather_acc.cpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/gather_acc.cpp
@@ -30,11 +30,9 @@ int main() {
   Passed &= testACC<int16_t>(Q);
   Passed &= testACC<uint32_t>(Q);
   Passed &= testACC<float>(Q);
-#ifdef __ESIMD_FORCE_STATELESS_MEM
   Passed &= testACC<int64_t>(Q);
   if (Q.get_device().has(sycl::aspect::fp64))
     Passed &= testACC<double>(Q);
-#endif // __ESIMD_FORCE_STATELESS_MEM
   std::cout << (Passed ? "Passed\n" : "FAILED\n");
   return Passed ? 0 : 1;
 }
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/gather_acc_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/gather_acc_dg2_pvc.cpp
index 7749678611466..57686554a5934 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/gather_acc_dg2_pvc.cpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/gather_acc_dg2_pvc.cpp
@@ -33,11 +33,9 @@ int main() {
   Passed &= testACC<int16_t>(Q);
   Passed &= testACC<uint32_t>(Q);
   Passed &= testACC<float>(Q);
-#ifdef __ESIMD_FORCE_STATELESS_MEM
   Passed &= testACC<int64_t>(Q);
   if (Q.get_device().has(sycl::aspect::fp64))
     Passed &= testACC<double>(Q);
-#endif // __ESIMD_FORCE_STATELESS_MEM
   std::cout << (Passed ? "Passed\n" : "FAILED\n");
   return Passed ? 0 : 1;
 }
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_gather.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_gather.cpp
index 2a11c981bf069..a1cb70863aad1 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/slm_gather.cpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_gather.cpp
@@ -29,6 +29,9 @@ int main() {
   Passed &= testSLM<int16_t>(Q);
   Passed &= testSLM<uint32_t>(Q);
   Passed &= testSLM<float>(Q);
+  Passed &= testSLM<int64_t>(Q);
+  if (Q.get_device().has(sycl::aspect::fp64))
+    Passed &= testSLM<double>(Q);
   std::cout << (Passed ? "Passed\n" : "FAILED\n");
   return Passed ? 0 : 1;
 }
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_gather_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_gather_dg2_pvc.cpp
index 4ae98cfc1f41c..d501d3aa38422 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/slm_gather_dg2_pvc.cpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_gather_dg2_pvc.cpp
@@ -32,6 +32,9 @@ int main() {
   Passed &= testSLM<int16_t>(Q);
   Passed &= testSLM<uint32_t>(Q);
   Passed &= testSLM<float>(Q);
+  Passed &= testSLM<int64_t>(Q);
+  if (Q.get_device().has(sycl::aspect::fp64))
+    Passed &= testSLM<double>(Q);
 
   std::cout << (Passed ? "Passed\n" : "FAILED\n");
   return Passed ? 0 : 1;
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp
index ffa0a718e7689..9bb7446a8a2b9 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp
@@ -27,6 +27,9 @@ int main() {
   Passed &= testSLM<int16_t>(Q);
   Passed &= testSLM<uint32_t>(Q);
   Passed &= testSLM<float>(Q);
+  Passed &= testSLM<int64_t>(Q);
+  if (Q.get_device().has(sycl::aspect::fp64))
+    Passed &= testSLM<double>(Q);
   std::cout << (Passed ? "Passed\n" : "FAILED\n");
   return Passed ? 0 : 1;
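For context, the pattern these new test instantiations exercise from user code — a round trip of 64-bit elements through SLM — looks roughly like the following. This is an illustrative device-function sketch (the function name, N, and the SLM size are arbitrary), not a copy of the tests:

```cpp
#include <sycl/ext/intel/esimd.hpp>

using namespace sycl::ext::intel::esimd;

template <int N> void slm_roundtrip64() SYCL_ESIMD_FUNCTION {
  constexpr uint32_t SLMSize = N * sizeof(int64_t);
  slm_init<SLMSize>();
  simd<uint32_t, N> byte_offsets(0, sizeof(int64_t)); // 0, 8, 16, ...
  simd<int64_t, N> vals(0, 1);                        // 0, 1, 2, ...
  // Both calls are now legal for 8-byte element types; under the hood each
  // is lowered to two 32-bit SLM accesses as shown in the hunks above.
  slm_scatter(byte_offsets, vals);
  simd<int64_t, N> back = slm_gather<int64_t, N>(byte_offsets);
  (void)back;
}
```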
diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp
index 8305bd9b83b18..783b9b38ac08d 100644
--- a/sycl/test/esimd/memory_properties.cpp
+++ b/sycl/test/esimd/memory_properties.cpp
@@ -1439,9 +1439,17 @@ test_slm_gather_scatter(int byte_offset32) {
   slm_scatter(ioffset_n16_view, slm_view, mask_n16, props_align4);
 
   simd<uint32_t, 10> ioffset_n10(byte_offset32, 8);
-  simd<float, 10> usm_n10;
+  simd<float, 10> slm_n10;
   // Check special case to verify that for cases when N is not power of 2 llvm
   // intrinsic is used
   // CHECK-COUNT-1: call void @llvm.masked.scatter.v10f32.v10p3(<10 x float> {{[^)]+}}, <10 x ptr addrspace(3)> {{[^)]+}}, i32 4, <10 x i1> {{[^)]+}})
-  slm_scatter(ioffset_n10, usm_n10);
+  slm_scatter(ioffset_n10, slm_n10);
+
+  // Check a case to verify emulation for 64-bit data types
+  // CHECK-COUNT-1: call <32 x i32> @llvm.masked.gather.v32i32.v32p3(<32 x ptr addrspace(3)> {{[^)]+}}, i32 8, <32 x i1> {{[^)]+}}, <32 x i32> {{[^)]+}})
+  // CHECK-COUNT-1: call <32 x i32> @llvm.masked.gather.v32i32.v32p3(<32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}, <32 x i32> {{[^)]+}})
+  auto slm_64 = slm_gather<int64_t, 32>(ioffset_n32);
+  // CHECK-COUNT-1: call void @llvm.masked.scatter.v32i32.v32p3(<32 x i32> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 8, <32 x i1> {{[^)]+}})
+  // CHECK-COUNT-1: call void @llvm.masked.scatter.v32i32.v32p3(<32 x i32> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}})
+  slm_scatter(ioffset_n32, slm_64);
 }
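A closing note on the memory_properties checks just added: they pin down the shape of the emulation — a 32-lane 64-bit SLM access must become exactly two 32-lane `i32` intrinsics (with alignments 8 and 4), not one wider intrinsic, since each half keeps its own offset vector. A small standalone sanity sketch of that byte accounting (illustrative only):

```cpp
#include <cstdint>

constexpr int lanes = 32; // N in the slm_gather call checked above
constexpr int halves = sizeof(int64_t) / sizeof(uint32_t); // 2 intrinsics

// Two 32-lane i32 transfers move the same bytes as one 32-lane i64 one.
static_assert(halves * lanes * sizeof(uint32_t) == lanes * sizeof(int64_t));
```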