From bb2f87838ec3921666c1615152fa6d1b27acc715 Mon Sep 17 00:00:00 2001
From: "Fine, Gregory"
Date: Tue, 6 Feb 2024 11:27:06 -0800
Subject: [PATCH] Address PR comments

---
 sycl/include/sycl/ext/intel/esimd/memory.hpp | 42 +++++++++-----------
 sycl/test/esimd/memory_properties.cpp        | 33 +++++++++++----
 2 files changed, 43 insertions(+), 32 deletions(-)

diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
index f57b7b1f66e74..3dafe61e3256a 100644
--- a/sycl/include/sycl/ext/intel/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -2822,10 +2822,9 @@ gather_impl(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
 /// @return is a vector of type T and size N * NElts.
 ///
 template <typename T, int NElts, lsc_data_size DS, int N>
-__ESIMD_API __ESIMD_NS::simd<T, N * NElts>
-slm_gather_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
-                __ESIMD_NS::simd_mask<N> pred,
-                __ESIMD_NS::simd<T, N * NElts> pass_thru) {
+__ESIMD_API simd<T, N * NElts> slm_gather_impl(simd<uint32_t, N> offsets,
+                                               simd_mask<N> pred,
+                                               simd<T, N * NElts> pass_thru) {
   check_lsc_vector_size<NElts>();
   check_lsc_data_size<T, DS>();
   constexpr uint16_t AddressScale = 1;
@@ -2834,9 +2833,8 @@ slm_gather_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
   constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
   constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
   using MsgT = typename lsc_expand_type<T>::type;
-  __ESIMD_NS::simd<MsgT, N * NElts> PassThruExpanded =
-      lsc_format_input<MsgT>(pass_thru);
-  __ESIMD_NS::simd<MsgT, N * NElts> Result =
+  simd<MsgT, N * NElts> PassThruExpanded = lsc_format_input<MsgT>(pass_thru);
+  simd<MsgT, N * NElts> Result =
       __esimd_lsc_load_merge_slm<MsgT, cache_hint::none, cache_hint::none,
                                  AddressScale, ImmOffset, EDS, LSCVS,
                                  Transposed, N>(pred.data(), offsets.data(),
@@ -2859,21 +2857,17 @@ slm_gather_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
 /// @param pred is predicates.
 ///
 template <typename T, int NElts, lsc_data_size DS, int N>
-__ESIMD_API void slm_scatter_impl(__ESIMD_NS::simd<uint32_t, N> offsets,
-                                  __ESIMD_NS::simd<T, N * NElts> vals,
-                                  __ESIMD_NS::simd_mask<N> pred) {
-  detail::check_lsc_vector_size<NElts>();
-  detail::check_lsc_data_size<T, DS>();
+__ESIMD_API void slm_scatter_impl(simd<uint32_t, N> offsets,
+                                  simd<T, N * NElts> vals, simd_mask<N> pred) {
+  check_lsc_vector_size<NElts>();
+  check_lsc_data_size<T, DS>();
   constexpr uint16_t AddressScale = 1;
   constexpr int ImmOffset = 0;
-  constexpr lsc_data_size EDS =
-      detail::expand_data_size(detail::finalize_data_size<T, DS>());
-  constexpr detail::lsc_vector_size LSCVS = detail::to_lsc_vector_size<NElts>();
-  constexpr detail::lsc_data_order Transposed =
-      detail::lsc_data_order::nontranspose;
-  using MsgT = typename detail::lsc_expand_type<T>::type;
-  using CstT = __ESIMD_DNS::uint_type_t<sizeof(T)>;
-  __ESIMD_NS::simd<MsgT, N * NElts> Tmp = vals.template bit_cast_view<CstT>();
+  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
+  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
+  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
+  using MsgT = typename lsc_expand_type<T>::type;
+  simd<MsgT, N * NElts> Tmp = lsc_format_input<MsgT>(vals);
   __esimd_lsc_store_slm<MsgT, cache_hint::none, cache_hint::none, AddressScale,
                         ImmOffset, EDS, LSCVS, Transposed, N>(
       pred.data(), offsets.data(), Tmp.data());
@@ -4181,8 +4175,8 @@ template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {
 /// template <typename T, int N, int VS = 1,
 ///           typename PropertyListT = empty_properties_t>
 /// void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
-///                  simd<T, N> vals, simd_mask<N / VS> mask,
-///                  PropertyListT props = {});  // (slm-sc-1)
+///                  simd<T, N> vals, simd_mask<N / VS> mask,
+///                  PropertyListT props = {}); // (slm-sc-1)
 /// void slm_scatter(simd<uint32_t, N / VS> byte_offsets,
 ///                  simd<T, N> vals, PropertyListT props = {}); // (slm-sc-2)
 ///
@@ -4252,7 +4246,7 @@ slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
 /// @tparam N Number of elements to read.
 /// @tparam VS Vector size. It can also be read as the number of reads per each
 /// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
-/// only on DG2 and PVC.
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
 /// @param byte_offsets the vector of 32-bit offsets in bytes.
 /// For each i, (byte_offsets[i]) must be element size aligned.
 /// @param vals The vector of values to store.
@@ -4283,7 +4277,7 @@ slm_scatter(simd<uint32_t, N / VS> byte_offsets, simd<T, N> vals,
 /// @tparam N Number of elements to read.
 /// @tparam VS Vector size. It can also be read as the number of reads per each
 /// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported
-/// only on DG2 and PVC.
+/// only on DG2 and PVC and only for 4- and 8-byte element vectors.
 /// @param byte_offsets the vector of 32-bit offsets in bytes.
 /// For each i, (byte_offsets[i]) must be element size aligned.
 /// If the alignment property is not passed, then it is assumed that each
diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp
index 1103ed11bc26d..eb629935347db 100644
--- a/sycl/test/esimd/memory_properties.cpp
+++ b/sycl/test/esimd/memory_properties.cpp
@@ -1302,6 +1302,7 @@ test_slm_gather_scatter(int byte_offset32) {
   simd<float, 32> slm;
   simd<float, 32> pass_thru;
   auto pass_thru_view = pass_thru.select<32, 1>();
+  auto slm_view = slm.select<32, 1>();

   // Test SLM gather using this plan:
   // 1) slm_gather(offsets): offsets is simd or simd_view
@@ -1375,41 +1376,57 @@ test_slm_gather_scatter(int byte_offset32) {
                                          props_align4);

   // Test SLM scatter using this plan:
-  // 1) slm_scatter(offsets): offsets is simd or simd_view
-  // 2) slm_scatter(offsets, mask): offsets is simd or simd_view
-  // 4) slm_scatter(...): same as (1), (2) above, but with VS > 1.
+  // 1) slm_scatter(offsets, vals): offsets/vals is simd or simd_view
+  // 2) slm_scatter(offsets, vals, mask): offsets/vals is simd or simd_view
+  // 3) slm_scatter(...): same as (1), (2) above, but with VS > 1.

   // 1) slm_scatter(offsets): offsets is simd or simd_view
-  // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
   slm_scatter(ioffset_n32, slm);
   slm_scatter(ioffset_n32_view, slm);
+  slm_scatter(ioffset_n32, slm_view);
+  slm_scatter(ioffset_n32_view, slm_view);

-  // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
   slm_scatter(ioffset_n32, slm, props_align8);
   slm_scatter(ioffset_n32_view, slm, props_align8);
+  slm_scatter(ioffset_n32, slm_view, props_align8);
+  slm_scatter(ioffset_n32_view, slm_view, props_align8);

   // 2) slm_gather(offsets, mask): offsets is simd or simd_view
-  // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
   slm_scatter(ioffset_n32, slm, mask_n32);
   slm_scatter(ioffset_n32_view, slm, mask_n32);
+  slm_scatter(ioffset_n32, slm_view, mask_n32);
+  slm_scatter(ioffset_n32_view, slm_view, mask_n32);

-  // CHECK-COUNT-2: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
+  // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}})
   slm_scatter(ioffset_n32, slm, mask_n32, props_align8);
   slm_scatter(ioffset_n32_view, slm, mask_n32, props_align8);
+  slm_scatter(ioffset_n32, slm_view, mask_n32, props_align8);
+  slm_scatter(ioffset_n32_view, slm_view, mask_n32, props_align8);

   // 4) slm_gather(...): same as (1), (2), above, but with VS > 1.
-  // CHECK-COUNT-8: call void @llvm.genx.lsc.store.slm.v16i1.v16i32.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, <32 x i32>{{[^)]+}}, i32 0)
+  // CHECK-COUNT-16: call void @llvm.genx.lsc.store.slm.v16i1.v16i32.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, <32 x i32>{{[^)]+}}, i32 0)
   // 4a) check VS > 1. no 'mask' operand first.
   slm_scatter<float, 32, 2>(ioffset_n16, slm);
   slm_scatter<float, 32, 2>(ioffset_n16_view, slm);
+  slm_scatter<float, 32, 2>(ioffset_n16, slm_view);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm_view);

   slm_scatter<float, 32, 2>(ioffset_n16, slm, props_align4);
   slm_scatter<float, 32, 2>(ioffset_n16_view, slm, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16, slm_view, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm_view, props_align4);

   // 4b) check VS > 1. Pass the 'mask' operand this time.
   slm_scatter<float, 32, 2>(ioffset_n16, slm, mask_n16);
   slm_scatter<float, 32, 2>(ioffset_n16_view, slm, mask_n16);
+  slm_scatter<float, 32, 2>(ioffset_n16, slm_view, mask_n16);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm_view, mask_n16);

   slm_scatter<float, 32, 2>(ioffset_n16, slm, mask_n16, props_align4);
   slm_scatter<float, 32, 2>(ioffset_n16_view, slm, mask_n16, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16, slm_view, mask_n16, props_align4);
+  slm_scatter<float, 32, 2>(ioffset_n16_view, slm_view, mask_n16, props_align4);
 }
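-- 
A minimal kernel-side sketch of the slm_scatter forms the test above exercises, placed after the signature separator so it is not part of the applied diff. The function name, the slm_init size, and the offset/value contents are illustrative assumptions, not taken from the patch; the VS > 1 form requires DG2 or PVC and 4- or 8-byte elements.

// Sketch only: shows the simd and simd_view argument combinations that
// the patch's test additions cover. Assumed names and sizes throughout.
#include <sycl/ext/intel/esimd.hpp>

using namespace sycl::ext::intel::esimd;

// Must be reached from a kernel compiled as explicit SIMD (ESIMD);
// shown standalone for brevity.
SYCL_EXTERNAL SYCL_ESIMD_FUNCTION void slm_scatter_sketch() {
  slm_init<4096>(); // illustrative shared-local-memory budget

  simd<uint32_t, 32> byte_offsets(0, sizeof(float)); // 0, 4, 8, ... element-size aligned
  simd<float, 32> vals = 1.0f;
  simd_mask<32> mask = 1;

  // (slm-sc-2): simd offsets and simd values.
  slm_scatter(byte_offsets, vals);

  // (slm-sc-1): masked form; simd_view offsets/values are accepted too,
  // which is what the slm_view cases in the test check.
  slm_scatter(byte_offsets.select<32, 1>(), vals.select<32, 1>(), mask);

  // VS = 2: 16 addresses, two consecutive elements stored per address
  // (DG2/PVC only). T, N and VS must be spelled out explicitly because
  // N / VS is not deducible from the offsets argument alone.
  simd<uint32_t, 16> byte_offsets16(0, 2 * sizeof(float));
  slm_scatter<float, 32, 2>(byte_offsets16, vals, simd_mask<16>(1));
}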