From 179b72118feb31b180c6ffdc19b2a5b33ea82b33 Mon Sep 17 00:00:00 2001 From: Vyacheslav Klochkov Date: Tue, 5 Mar 2024 17:26:34 -0600 Subject: [PATCH] [ESIMD][NFC] Rework the L1/L2 cache hints passing across internal funcs (#12899) This is the first patch in an upcoming series of similar patches. Cache hints: * The cache_hint and cache_level enums were moved to memory_properties.hpp. * Added a check_cache_hints() function accepting PropertyListT instead of L1/L2 template parameters. Restructuring in block_load: * block_load_impl() functions now accept the template parameter PropertyListT instead of L1H, L2H and the alignment FlagsT. * Only block_load_impl() functions call check_cache_hints() now. * Replaced the uses of lsc.load.stateless with lsc.load.merge.stateless. This does not change the GPU code-gen; it is identical. Restructuring in block_store: * block_store_impl() functions now accept the template parameter PropertyListT instead of L1H, L2H and the alignment FlagsT. --------- Signed-off-by: Klochkov, Vyacheslav N --- sycl/include/sycl/ext/intel/esimd/common.hpp | 64 ++-- sycl/include/sycl/ext/intel/esimd/memory.hpp | 353 ++++++------------ .../ext/intel/esimd/memory_properties.hpp | 119 +++++- .../ext/intel/experimental/esimd/memory.hpp | 68 ++-- sycl/test/esimd/lsc.cpp | 4 +- sycl/test/esimd/memory_properties.cpp | 18 +- 6 files changed, 300 insertions(+), 326 deletions(-) diff --git a/sycl/include/sycl/ext/intel/esimd/common.hpp b/sycl/include/sycl/ext/intel/esimd/common.hpp index 6c208706df0b5..25f5188ad38bb 100644 --- a/sycl/include/sycl/ext/intel/esimd/common.hpp +++ b/sycl/include/sycl/ext/intel/esimd/common.hpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include // for uint* types @@ -344,46 +345,6 @@ template <__ESIMD_NS::native::lsc::atomic_op Op> constexpr int get_num_args() { } // namespace detail -/// L1, L2 or L3 cache hints. -enum class cache_hint : uint8_t { - none = 0, - /// load/store/atomic: do not cache data to cache; - uncached = 1, - - // load: cache data to cache; - cached = 2, - - /// store: write data into cache level and mark the cache line as "dirty". - /// Upon eviction, the "dirty" data will be written into the furthest - /// subsequent cache; - write_back = 3, - - /// store: immediately write data to the subsequent furthest cache, marking - /// the cache line in the current cache as "not dirty"; - write_through = 4, - - /// load: cache data to cache using the evict-first policy to minimize cache - /// pollution caused by temporary streaming data that may only be accessed - /// once or twice; - /// store/atomic: same as write-through, but use the evict-first policy - /// to limit cache pollution by streaming; - streaming = 5, - - /// load: asserts that the cache line containing the data will not be read - /// again until it’s overwritten, therefore the load operation can invalidate - /// the cache line and discard "dirty" data. If the assertion is violated - /// (the cache line is read again) then behavior is undefined. - read_invalidate = 6, - - // TODO: Implement the verification of this enum in check_cache_hint(). - /// load, L2 cache only, next gen GPU after Xe required: asserts that - /// the L2 cache line containing the data will not be written until all - /// invocations of the shader or kernel execution are finished. - /// If the assertion is violated (the cache line is written), the behavior - /// is undefined. - const_cached = 7 -}; - /// The scope that fence() operation should apply to. 
/// Supported platforms: DG2, PVC enum class fence_scope : uint8_t { @@ -440,9 +401,6 @@ enum class memory_kind : uint8_t { local = 3, /// shared local memory }; -/// L1, L2 or L3 cache hint levels. L3 is reserved for future use. -enum class cache_level : uint8_t { L1 = 1, L2 = 2, L3 = 3 }; - namespace detail { /// Data size or format to read or store @@ -632,6 +590,26 @@ void check_cache_hint() { } } +template constexpr bool has_cache_hints() { + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); + return L1H != cache_hint::none || L2H != cache_hint::none; +} + +// Currently, this is just a wrapper around 'check_cache_hint' function. +// It accepts the compile-time properties that may include cache-hints +// to be verified. +template +void check_cache_hints() { + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); + check_cache_hint(); +} + constexpr lsc_data_size expand_data_size(lsc_data_size DS) { if (DS == lsc_data_size::u8) return lsc_data_size::u8u32; diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 5e930f90eaa75..b92f798d08f48 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -889,59 +889,6 @@ using DeviceAccessorOffsetT = uint64_t; using DeviceAccessorOffsetT = uint32_t; #endif -template -__ESIMD_API std::enable_if_t, simd> -block_load_impl(const T *p, simd_mask<1> pred, FlagsT flags) { - // Verify input template arguments. - check_cache_hint(); - constexpr auto Alignment = - FlagsT::template alignment<__ESIMD_DNS::__raw_t>; - static_assert( - (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) || - (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4), - "Incorrect alignment for the data type"); - - constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T); - constexpr int SmallIntFactor32Bit = - sizeof(uint32_t) / sizeof(T) > 1 ? sizeof(uint32_t) / sizeof(T) : 1; - static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0, - "Number of elements is not supported by Transposed load"); - - // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0) we can load QWORDs. - // Don't do it for 4-byte vectors (unless it is greater than 256-bytes), - // because it would require a bit-cast, which is supposed to be NO-OP, but - // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways. - constexpr bool Use64BitData = - Alignment >= __ESIMD_DNS::OperandSize::QWORD && - (NElts * sizeof(T)) % sizeof(uint64_t) == 0 && - (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256); - constexpr int SmallIntFactor = - Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit; - constexpr int FactoredNElts = NElts / SmallIntFactor; - check_lsc_vector_size(); - - // Prepare template arguments for the call of intrinsic. - using LoadElemT = __ESIMD_DNS::__raw_t< - std::conditional_t>>; - constexpr uint16_t AddressScale = 1; - constexpr int ImmOffset = 0; - constexpr lsc_data_size ActualDS = - Use64BitData ? lsc_data_size::u64 : lsc_data_size::u32; - constexpr lsc_vector_size VS = to_lsc_vector_size(); - constexpr auto Transposed = lsc_data_order::transpose; - constexpr int N = 1; - - // Prepare non-template arguments and call the intrinsic. 
- simd Addrs = reinterpret_cast(p); - simd Result = - __esimd_lsc_load_stateless(pred.data(), - Addrs.data()); - return Result.template bit_cast_view(); -} - /// USM pointer transposed gather with 1 channel. /// Supported platforms: DG2, PVC /// VISA instruction: lsc_load.ugm @@ -964,25 +911,22 @@ block_load_impl(const T *p, simd_mask<1> pred, FlagsT flags) { /// /// @tparam T is element type. /// @tparam NElts is the number of elements to load per address. -/// @tparam L1H is L1 cache hint. -/// @tparam L2H is L2 cache hint. +/// @tparam PropertyListT is the list of optional cache-hint properties and +/// the required alignment property. /// @param p is the base pointer. /// @param pred is operation predicate. Zero means operation is skipped /// entirely, non-zero - operation is performed. /// @param pass_thru contains the vector which elements are copied /// to the returned result when the corresponding element of \p pred is 0. -/// @param flags is the alignment specifier type tag. /// @return is a vector of type T and size NElts. /// -template -__ESIMD_API std::enable_if_t, simd> -block_load_impl(const T *p, simd_mask<1> pred, simd pass_thru, - FlagsT flags) { +template +__ESIMD_API std::enable_if_t, simd> +block_load_impl(const T *p, simd_mask<1> pred, simd pass_thru) { // Verify input template arguments. - check_cache_hint(); - constexpr auto Alignment = - FlagsT::template alignment<__ESIMD_DNS::__raw_t>; + check_cache_hints(); + constexpr size_t Alignment = + PropertyListT::template get_property().value; static_assert( (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) || (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4), @@ -1011,6 +955,10 @@ block_load_impl(const T *p, simd_mask<1> pred, simd pass_thru, using LoadElemT = __ESIMD_DNS::__raw_t< std::conditional_t>>; + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size ActualDS = @@ -1051,35 +999,34 @@ block_load_impl(const T *p, simd_mask<1> pred, simd pass_thru, /// /// @tparam T is element type. /// @tparam NElts is the number of elements to load per address. -/// @tparam L1H is L1 cache hint. -/// @tparam L2H is L2 cache hint. +/// @tparam PropertyListT is the list of optional cache-hint properties and +/// the required alignment property. /// @tparam AccessorT is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offset is the zero-based offset in bytes. /// @param pred is operation predicate. Zero means operation is skipped /// entirely, non-zero - operation is performed. The default is '1' - perform /// the operation. -/// @param flags is the alignment specifier type tag. /// @return is a vector of type T and size NElts. The elements of the returned /// vector for which the corresponding element in \p pred is 0 are undefined. /// -template +template __ESIMD_API std::enable_if_t && - is_simd_flag_type_v, + is_property_list_v, simd> block_load_impl(AccessorT acc, DeviceAccessorOffsetT offset, - simd_mask<1> pred, FlagsT flags) { + simd_mask<1> pred) { #ifdef __ESIMD_FORCE_STATELESS_MEM - return block_load_impl(accessorToPointer(acc, offset), - pred, flags); + simd PassThru; // Intentionally undefined. + return block_load_impl( + accessorToPointer(acc, offset), pred, PassThru); #else // !__ESIMD_FORCE_STATELESS_MEM // Verify input template arguments. 
- check_cache_hint(); - constexpr auto Alignment = - FlagsT::template alignment<__ESIMD_DNS::__raw_t>; + check_cache_hints(); + constexpr size_t Alignment = + detail::getPropertyValue(sizeof(T)); static_assert( (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) || (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4), @@ -1108,7 +1055,10 @@ __ESIMD_API using LoadElemT = __ESIMD_DNS::__raw_t< std::conditional_t>>; - + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size ActualDS = @@ -1149,8 +1099,8 @@ __ESIMD_API /// /// @tparam T is element type. /// @tparam NElts is the number of elements to load per address. -/// @tparam L1H is L1 cache hint. -/// @tparam L2H is L2 cache hint. +/// @tparam PropertyListT is the list of optional cache-hint properties and +/// the required alignment property. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offset is the zero-based offset in bytes. @@ -1159,26 +1109,24 @@ __ESIMD_API /// Otherwise, the operation is performed and the result if it copied to /// the result. /// @param pass_thru contains the values copied to the result if \p pred is 0. -/// @param flags is the alignment specifier type tag. /// @return is a vector of type T and size NElts /// -template +template __ESIMD_API std::enable_if_t && - is_simd_flag_type_v, + is_property_list_v, simd> block_load_impl(AccessorT acc, DeviceAccessorOffsetT offset, - simd_mask<1> pred, simd pass_thru, FlagsT flags) { + simd_mask<1> pred, simd pass_thru) { #ifdef __ESIMD_FORCE_STATELESS_MEM - return block_load_impl(accessorToPointer(acc, offset), - pred, pass_thru, flags); + return block_load_impl( + accessorToPointer(acc, offset), pred, pass_thru); #else // !__ESIMD_FORCE_STATELESS_MEM // Verify input template arguments. 
- check_cache_hint(); - constexpr auto Alignment = - FlagsT::template alignment<__ESIMD_DNS::__raw_t>; + check_cache_hints(); + constexpr size_t Alignment = + PropertyListT::template get_property().value; static_assert( (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) || (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4), @@ -1207,7 +1155,10 @@ __ESIMD_API using LoadElemT = __ESIMD_DNS::__raw_t< std::conditional_t>>; - + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size ActualDS = @@ -1229,13 +1180,12 @@ __ESIMD_API #endif // !__ESIMD_FORCE_STATELESS_MEM } -template -__ESIMD_API std::enable_if_t> -block_store_impl(T *p, simd vals, simd_mask<1> pred, FlagsT flags) { - detail::check_cache_hint(); - constexpr auto Alignment = - FlagsT::template alignment<__ESIMD_DNS::__raw_t>; +template +__ESIMD_API std::enable_if_t> +block_store_impl(T *p, simd vals, simd_mask<1> pred) { + detail::check_cache_hints(); + constexpr size_t Alignment = + detail::getPropertyValue(sizeof(T)); static_assert( (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) || (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4), @@ -1265,6 +1215,10 @@ block_store_impl(T *p, simd vals, simd_mask<1> pred, FlagsT flags) { using StoreType = __ESIMD_DNS::__raw_t< std::conditional_t>>; + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size ActualDS = @@ -1281,22 +1235,21 @@ block_store_impl(T *p, simd vals, simd_mask<1> pred, FlagsT flags) { vals.data())); } -template +template __ESIMD_API std::enable_if_t && - is_simd_flag_type_v> + detail::is_property_list_v> block_store_impl(AccessorT acc, DeviceAccessorOffsetT offset, - simd vals, simd_mask<1> pred, FlagsT flags) { + simd vals, simd_mask<1> pred) { #ifdef __ESIMD_FORCE_STATELESS_MEM - block_store_impl(accessorToPointer(acc, offset), vals, - pred, flags); + block_store_impl(accessorToPointer(acc, offset), + vals, pred); #else // Verify input template arguments. - check_cache_hint(); - constexpr auto Alignment = - FlagsT::template alignment<__ESIMD_DNS::__raw_t>; + check_cache_hints(); + constexpr size_t Alignment = + detail::getPropertyValue(sizeof(T)); static_assert( (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) || (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4), @@ -1327,7 +1280,10 @@ __ESIMD_API using StoreElemT = __ESIMD_DNS::__raw_t< std::conditional_t>>; - + constexpr cache_hint L1H = + getPropertyValue(cache_hint::none); + constexpr cache_hint L2H = + getPropertyValue(cache_hint::none); constexpr uint16_t AddressScale = 1; constexpr int ImmOffset = 0; constexpr lsc_data_size ActualDS = @@ -1444,26 +1400,16 @@ template , simd> block_load(const T *ptr, PropertyListT props = {}) { - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - static_assert(!PropertyListT::template has_property(), - "L3 cache hint is reserved. The old/experimental L3 LSC cache " - "hint is cache_level::L2 now."); - constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 
4 : sizeof(T); - constexpr size_t Alignment = - detail::getPropertyValue(DefaultAlignment); - if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none) { - detail::check_cache_hint(); - + using NewPropertyListT = + detail::add_alignment_property_t; + if constexpr (detail::has_cache_hints()) { + simd PassThru; // Intentionally undefined. simd_mask<1> Mask = 1; - return detail::block_load_impl( - ptr, Mask, overaligned_tag{}); + return detail::block_load_impl(ptr, Mask, PassThru); } else { + constexpr size_t Alignment = + NewPropertyListT::template get_property().value; return block_load(ptr, overaligned_tag{}); } } @@ -1549,26 +1495,14 @@ block_load(const T *ptr, size_t byte_offset, PropertyListT props = {}) { template -__ESIMD_API std::enable_if_t< - ext::oneapi::experimental::is_property_list_v, simd> -block_load(const T *ptr, simd_mask<1> pred, PropertyListT props = {}) { - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - static_assert(!PropertyListT::template has_property(), - "L3 cache hint is reserved. The old/experimental L3 LSC cache " - "hint is cache_level::L2 now."); - - detail::check_cache_hint(); +__ESIMD_API + std::enable_if_t, simd> + block_load(const T *ptr, simd_mask<1> pred, PropertyListT props = {}) { constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T); - constexpr size_t Alignment = - detail::getPropertyValue(DefaultAlignment); - - return detail::block_load_impl( - ptr, pred, overaligned_tag{}); + using NewPropertyListT = + detail::add_alignment_property_t; + simd PassThru; // Intentionally uninitialized. + return detail::block_load_impl(ptr, pred, PassThru); } /// simd block_load(const T* ptr, size_t byte_offset, @@ -1655,23 +1589,10 @@ __ESIMD_API std::enable_if_t< ext::oneapi::experimental::is_property_list_v, simd> block_load(const T *ptr, simd_mask<1> pred, simd pass_thru, PropertyListT props = {}) { - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - static_assert(!PropertyListT::template has_property(), - "L3 cache hint is reserved. The old/experimental L3 LSC cache " - "hint is cache_level::L2 now."); - - detail::check_cache_hint(); constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 
4 : sizeof(T); - constexpr size_t Alignment = - detail::getPropertyValue(DefaultAlignment); - - return detail::block_load_impl( - ptr, pred, pass_thru, overaligned_tag{}); + using NewPropertyListT = + detail::add_alignment_property_t; + return detail::block_load_impl(ptr, pred, pass_thru); } /// simd block_load(const T* ptr, size_t byte_offset, @@ -1898,11 +1819,14 @@ block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, constexpr bool IsLegacySize = Size == OWord || Size == 2 * OWord || Size == 4 * OWord || Size == 8 * OWord; - if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || - !IsLegacySize) { - return detail::block_load_impl( - acc, byte_offset, simd_mask<1>(1), overaligned_tag{}); + using NewPropertyListT = + detail::add_alignment_property_t; + if constexpr (detail::has_cache_hints() || !IsLegacySize) { + return detail::block_load_impl(acc, byte_offset, + simd_mask<1>(1)); } else { + constexpr size_t Alignment = + NewPropertyListT::template get_property().value; return block_load(acc, byte_offset, overaligned_tag{}); } #endif // !__ESIMD_FORCE_STATELESS_MEM @@ -2016,10 +1940,10 @@ block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, // If the alignment property is not passed, then assume the byte_offset // is element-aligned and is at leat 4-bytes. constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T); - constexpr size_t Alignment = - detail::getPropertyValue(DefaultAlignment); - return detail::block_load_impl( - acc, byte_offset, pred, pass_thru, overaligned_tag{}); + using NewPropertyListT = + detail::add_alignment_property_t; + return detail::block_load_impl(acc, byte_offset, pred, + pass_thru); } /// simd @@ -2227,28 +2151,14 @@ block_load(AccessorT acc, simd_mask<1> pred, PropertyListT /* props */ = {}) { template -__ESIMD_API std::enable_if_t< - ext::oneapi::experimental::is_property_list_v> -block_store(T *ptr, simd vals, PropertyListT props = {}) { - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - static_assert(!PropertyListT::template has_property(), - "L3 cache hint is reserved. The old/experimental L3 LSC cache " - "hint is cache_level::L2 now."); - if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none) { - detail::check_cache_hint(); - constexpr int DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T); - constexpr size_t Alignment = - detail::getPropertyValue( - DefaultAlignment); - +__ESIMD_API std::enable_if_t> +block_store(T *ptr, simd vals, PropertyListT /* props */ = {}) { + if constexpr (detail::has_cache_hints()) { + constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T); + using NewPropertyListT = + detail::add_alignment_property_t; simd_mask<1> Mask = 1; - detail::block_store_impl( - ptr, vals, Mask, overaligned_tag{}); + detail::block_store_impl(ptr, vals, Mask); } else { // If the alignment property is not passed, then assume the pointer // is OWORD-aligned. @@ -2340,26 +2250,13 @@ block_store(T *ptr, size_t byte_offset, simd vals, template -__ESIMD_API std::enable_if_t< - ext::oneapi::experimental::is_property_list_v> +__ESIMD_API std::enable_if_t> block_store(T *ptr, simd vals, simd_mask<1> pred, - PropertyListT props = {}) { - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - static_assert(!PropertyListT::template has_property(), - "L3 cache hint is reserved. 
The old/experimental L3 LSC cache " - "hint is cache_level::L2 now."); - + PropertyListT /* props */ = {}) { constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T); - constexpr size_t Alignment = - detail::getPropertyValue(DefaultAlignment); - - detail::block_store_impl(ptr, vals, pred, - overaligned_tag{}); + using NewPropertyListT = + detail::add_alignment_property_t; + detail::block_store_impl(ptr, vals, pred); } /// void block_store(T* ptr, size_t byte_offset, // (usm-bs-4) @@ -2485,27 +2382,19 @@ block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, block_store(detail::accessorToPointer(acc, byte_offset), vals, props); #else - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - static_assert(!PropertyListT::template has_property(), - "L3 cache hint is reserved. The old/experimental L3 LSC cache " - "hint is cache_level::L2 now."); constexpr int DefaultLSCAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T); constexpr size_t Alignment = detail::getPropertyValue( DefaultLSCAlignment); constexpr bool AlignmentRequiresLSC = PropertyListT::template has_property() && Alignment < 16; - if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || + if constexpr (detail::has_cache_hints() || AlignmentRequiresLSC) { - detail::check_cache_hint(); + using NewPropertyListT = + detail::add_alignment_property_t; simd_mask<1> Mask = 1; - detail::block_store_impl( - acc, byte_offset, vals, Mask, overaligned_tag{}); + detail::block_store_impl(acc, byte_offset, vals, + Mask); } else { using Tx = detail::__raw_t; constexpr unsigned Sz = sizeof(Tx) * N; @@ -2570,11 +2459,7 @@ block_store(AccessorT acc, simd vals, PropertyListT props = {}) { constexpr auto L2Hint = detail::getPropertyValue( cache_hint::none); - static_assert(!PropertyListT::template has_property(), - "L3 cache hint is reserved. The old/experimental L3 LSC cache " - "hint is cache_level::L2 now."); properties Props{cache_hint_L1, cache_hint_L2, alignment<16>}; - block_store(acc, 0, vals, Props); } @@ -2618,22 +2503,11 @@ __ESIMD_API std::enable_if_t< detail::accessor_mode_cap::can_write>> block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset, simd vals, simd_mask<1> pred, PropertyListT props = {}) { - constexpr auto L1Hint = - detail::getPropertyValue( - cache_hint::none); - constexpr auto L2Hint = - detail::getPropertyValue( - cache_hint::none); - static_assert(!PropertyListT::template has_property(), - "L3 cache hint is reserved. The old/experimental L3 LSC cache " - "hint is cache_level::L2 now."); - constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T); - constexpr size_t Alignment = - detail::getPropertyValue(DefaultAlignment); - - detail::block_store_impl(acc, byte_offset, vals, pred, - overaligned_tag{}); + using NewPropertyListT = + detail::add_alignment_property_t; + detail::block_store_impl(acc, byte_offset, vals, + pred); } /// void block_store(AccessorT acc, simd vals, // (acc-bs-4) @@ -2677,9 +2551,6 @@ block_store(AccessorT acc, simd vals, simd_mask<1> pred, constexpr auto L2Hint = detail::getPropertyValue( cache_hint::none); - static_assert(!PropertyListT::template has_property(), - "L3 cache hint is reserved. 
The old/experimental L3 LSC cache " - "hint is cache_level::L2 now."); properties Props{cache_hint_L1, cache_hint_L2, alignment<16>}; block_store(acc, 0, vals, pred, Props); } diff --git a/sycl/include/sycl/ext/intel/esimd/memory_properties.hpp b/sycl/include/sycl/ext/intel/esimd/memory_properties.hpp index 4da1a64225bed..5b2f920914466 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory_properties.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory_properties.hpp @@ -9,9 +9,10 @@ #pragma once #include -#include +#include #include #include +#include #define SYCL_EXT_INTEL_ESIMD_MEMORY_PROPERTIES 1 @@ -19,6 +20,49 @@ namespace sycl { inline namespace _V1 { namespace ext::intel::esimd { +/// L1, L2 or L3 cache hint levels. L3 is reserved for future use. +enum class cache_level : uint8_t { L1 = 1, L2 = 2, L3 = 3 }; + +/// L1, L2 or L3 cache hints. +enum class cache_hint : uint8_t { + none = 0, + /// load/store/atomic: do not cache data to cache; + uncached = 1, + + // load: cache data to cache; + cached = 2, + + /// store: write data into cache level and mark the cache line as "dirty". + /// Upon eviction, the "dirty" data will be written into the furthest + /// subsequent cache; + write_back = 3, + + /// store: immediately write data to the subsequent furthest cache, marking + /// the cache line in the current cache as "not dirty"; + write_through = 4, + + /// load: cache data to cache using the evict-first policy to minimize cache + /// pollution caused by temporary streaming data that may only be accessed + /// once or twice; + /// store/atomic: same as write-through, but use the evict-first policy + /// to limit cache pollution by streaming; + streaming = 5, + + /// load: asserts that the cache line containing the data will not be read + /// again until it’s overwritten, therefore the load operation can invalidate + /// the cache line and discard "dirty" data. If the assertion is violated + /// (the cache line is read again) then behavior is undefined. + read_invalidate = 6, + + // TODO: Implement the verification of this enum in check_cache_hint(). + /// load, L2 cache only, next gen GPU after Xe required: asserts that + /// the L2 cache line containing the data will not be written until all + /// invocations of the shader or kernel execution are finished. + /// If the assertion is violated (the cache line is written), the behavior + /// is undefined. + const_cached = 7 +}; + template class properties : public sycl::ext::oneapi::experimental::properties { @@ -110,12 +154,18 @@ using default_cache_hint_L2 = cache_hint_L2_key::value_t; using default_cache_hint_L3 = cache_hint_L3_key::value_t; namespace detail { + +template +using is_property_list = ext::oneapi::experimental::is_property_list; + +template +inline constexpr bool is_property_list_v = is_property_list::value; + /// Helper-function that returns the value of the compile time property `KeyT` /// if `PropertiesT` includes it. If it does not then the default value /// \p DefaultValue is returned. template >> + typename = std::enable_if_t>> constexpr auto getPropertyValue(KeyValueT DefaultValue) { if constexpr (!PropertiesT::template has_property()) { return DefaultValue; @@ -129,8 +179,69 @@ constexpr auto getPropertyValue(KeyValueT DefaultValue) { return ValueT.value; } } -} // namespace detail +/// This helper returns the ext::oneapi::experimental::properties class for +/// ext::oneapi::experimental::properties and it's child in esimd namespace. 
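+/// (That is, it maps the derived esimd::properties class back to the
+/// underlying ext::oneapi::experimental::properties type so that generic
+/// utilities such as has_property() and merged_properties_t below can be
+/// applied uniformly to either form.)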
+template struct get_ext_oneapi_properties; +template +struct get_ext_oneapi_properties< + ext::oneapi::experimental::properties> { + using type = ext::oneapi::experimental::properties; +}; +template +struct get_ext_oneapi_properties> { + using type = ext::oneapi::experimental::properties; +}; + +/// Simply returns 'PropertyListT' as it already has the alignment property. +template +struct add_alignment_property_helper { + using type = PropertyListT; +}; +/// Returns a new property list type that contains the properties from +/// 'PropertyListT' and the newly added alignment property. +template +struct add_alignment_property_helper { + using ExpPropertyListT = + typename get_ext_oneapi_properties::type; + using AlignmentPropList = + typename ext::oneapi::experimental::detail::properties_t< + alignment_key::value_t>; + + using type = + ext::oneapi::experimental::detail::merged_properties_t; +}; + +// Creates and adds a compile-time property 'alignment' if +// the given property list 'PropertyListT' does not yet have the 'alignment' +// property in it. +template +class add_alignment_property { + using ExpPropertyListT = + typename get_ext_oneapi_properties::type; + +public: + using type = typename add_alignment_property_helper< + ExpPropertyListT, Alignment, + ExpPropertyListT::template has_property()>::type; +}; +template +using add_alignment_property_t = + typename add_alignment_property::type; + +// Creates the type for the list of L1, L2, and alignment properties. +template +struct make_L1_L2_alignment_properties { + using type = ext::oneapi::experimental::detail::properties_t< + alignment_key::value_t, cache_hint_L1_key::value_t, + cache_hint_L2_key::value_t>; +}; +template +using make_L1_L2_alignment_properties_t = + typename make_L1_L2_alignment_properties::type; + +} // namespace detail } // namespace ext::intel::esimd namespace ext::oneapi::experimental { diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index f52cf8ab855bd..b4a4924512f34 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -913,11 +913,11 @@ lsc_gather(AccessorTy acc, __ESIMD_NS::simd offsets, /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. /// @tparam L2H is L2 cache hint. +/// @tparam Flags is the alignment specifier type tag. /// @param p is the base pointer. /// @param pred is operation predicate. Zero means operation is skipped /// entirely, non-zero - operation is performed. The default is '1' - /// perform the operation. -/// @param flags is the alignment specifier type tag. /// @return is a vector of type T and size NElts. The elements of the /// returned vector for which the corresponding element in \p pred is 0 /// are undefined. @@ -927,9 +927,12 @@ template __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v, __ESIMD_NS::simd> -lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1, - FlagsT flags = FlagsT{}) { - return __ESIMD_DNS::block_load_impl(p, pred, flags); +lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1, FlagsT = {}) { + using PropertyListT = __ESIMD_DNS::make_L1_L2_alignment_properties_t< + L1H, L2H, FlagsT::template alignment<__ESIMD_NS::simd>>; + __ESIMD_NS::simd PassThru; // Intentionally undefined. 
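+  // Note: PassThru may be left undefined because this overload's contract
+  // (see @return above) says result elements whose predicate bit is 0 are
+  // undefined, so the merge operand never needs a meaningful value.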
+ return __ESIMD_DNS::block_load_impl(p, pred, + PassThru); } /// A variation of lsc_block_load without predicate parameter to simplify use @@ -951,11 +954,11 @@ lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1, /// /// @tparam T is element type. /// @tparam NElts is the number of elements to load per address. +/// @tparam FlagsT is the alignment specifier type tag. /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. /// @tparam L2H is L2 cache hint. /// @param p is the base pointer. -/// @param flags is the alignment specifier type tag. /// @return is a vector of type T and size NElts. The elements of the /// returned vector for which the corresponding element in \p pred is 0 /// are undefined. @@ -965,9 +968,11 @@ template __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v, __ESIMD_NS::simd> -lsc_block_load(const T *p, FlagsT flags) { - return __ESIMD_DNS::block_load_impl( - p, __ESIMD_NS::simd_mask<1>(1), flags); +lsc_block_load(const T *p, FlagsT) { + using PropertyListT = __ESIMD_DNS::make_L1_L2_alignment_properties_t< + L1H, L2H, FlagsT::template alignment<__ESIMD_NS::simd>>; + return __ESIMD_DNS::block_load_impl( + p, __ESIMD_NS::simd_mask<1>(1)); } /// USM pointer transposed gather with 1 channel. @@ -993,12 +998,12 @@ lsc_block_load(const T *p, FlagsT flags) { /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. /// @tparam L2H is L2 cache hint. +/// @tparam FlagsT is the alignment specifier type tag. /// @param p is the base pointer. /// @param pred is operation predicate. Zero means operation is skipped /// entirely, non-zero - operation is performed. /// @param pass_thru contains the vector which elements are copied /// to the returned result when the corresponding element of \p pred is 0. -/// @param flags is the alignment specifier type tag. /// @return is a vector of type T and size NElts. /// template , __ESIMD_NS::simd> lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred, - __ESIMD_NS::simd pass_thru, FlagsT flags = FlagsT{}) { - return __ESIMD_DNS::block_load_impl(p, pred, pass_thru, - flags); + __ESIMD_NS::simd pass_thru, FlagsT = {}) { + using PropertyListT = __ESIMD_DNS::make_L1_L2_alignment_properties_t< + L1H, L2H, FlagsT::template alignment<__ESIMD_NS::simd>>; + return __ESIMD_DNS::block_load_impl(p, pred, + pass_thru); } /// Accessor-based transposed gather with 1 channel. @@ -1034,12 +1041,12 @@ lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred, /// @tparam L1H is L1 cache hint. /// @tparam L2H is L2 cache hint. /// @tparam AccessorTy is the \ref sycl::accessor type. +/// @param FlagsT is the alignment specifier type tag. /// @param acc is the SYCL accessor. /// @param offset is the zero-based offset in bytes. /// @param pred is operation predicate. Zero means operation is skipped /// entirely, non-zero - operation is performed. The default is '1' - perform /// the operation. -/// @param flags is the alignment specifier type tag. /// @return is a vector of type T and size NElts. The elements of the returned /// vector for which the corresponding element in \p pred is 0 are undefined. 
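/// (Internally, the L1H/L2H hints and the FlagsT alignment are folded into a
/// single compile-time property list via make_L1_L2_alignment_properties_t.)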
/// @@ -1054,8 +1061,10 @@ __ESIMD_API std::enable_if_t< __ESIMD_NS::simd> lsc_block_load(AccessorTy acc, __ESIMD_DNS::DeviceAccessorOffsetT offset, __ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) { - return __ESIMD_DNS::block_load_impl(acc, offset, pred, - flags); + using PropertyListT = __ESIMD_DNS::make_L1_L2_alignment_properties_t< + L1H, L2H, FlagsT::template alignment<__ESIMD_NS::simd>>; + return __ESIMD_DNS::block_load_impl(acc, offset, + pred); } template > lsc_block_load(AccessorTy acc, __ESIMD_DNS::DeviceAccessorOffsetT offset, __ESIMD_NS::simd_mask<1> pred, - __ESIMD_NS::simd pass_thru, FlagsT flags = FlagsT{}) { - return __ESIMD_DNS::block_load_impl(acc, offset, pred, - pass_thru, flags); + __ESIMD_NS::simd pass_thru, FlagsT = {}) { + using PropertyListT = __ESIMD_DNS::make_L1_L2_alignment_properties_t< + L1H, L2H, FlagsT::template alignment<__ESIMD_NS::simd>>; + return __ESIMD_DNS::block_load_impl(acc, offset, + pred, pass_thru); } template offsets, /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. /// @tparam L2H is L2 cache hint. +/// @tparam Flags is the alignment specifier type tag. /// @param p is the base pointer. /// @param vals is values to store. /// @param pred is operation predicate. Zero means operation is skipped /// entirely, non-zero - operation is performed. The default is '1' - perform /// the operation. -/// @param flags is the alignment specifier type tag. /// template __ESIMD_API std::enable_if_t<__ESIMD_NS::is_simd_flag_type_v> lsc_block_store(T *p, __ESIMD_NS::simd vals, - __ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) { - return __ESIMD_DNS::block_store_impl(p, vals, pred, - flags); + __ESIMD_NS::simd_mask<1> pred = 1, FlagsT = {}) { + using PropertyListT = __ESIMD_DNS::make_L1_L2_alignment_properties_t< + L1H, L2H, FlagsT::template alignment<__ESIMD_NS::simd>>; + return __ESIMD_DNS::block_store_impl(p, vals, pred); } /// A variation of lsc_block_store without predicate parameter to simplify @@ -1636,6 +1648,7 @@ lsc_block_store(T *p, __ESIMD_NS::simd vals, FlagsT flags) { /// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. /// @tparam L2H is L2 cache hint. +/// @tparam Flags is the alignment specifier type tag. /// @tparam AccessorTy is the \ref sycl::accessor type. /// @param acc is the SYCL accessor. /// @param offset is the zero-based offset in bytes. @@ -1643,7 +1656,6 @@ lsc_block_store(T *p, __ESIMD_NS::simd vals, FlagsT flags) { /// @param pred is operation predicate. Zero means operation is skipped /// entirely, non-zero - operation is performed. The default is '1' - perform /// the operation. -/// @param flags is the alignment specifier type tag. 
/// template > lsc_block_store(AccessorTy acc, __ESIMD_DNS::DeviceAccessorOffsetT offset, __ESIMD_NS::simd vals, - __ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) { - __ESIMD_DNS::block_store_impl(acc, offset, vals, pred, - flags); + __ESIMD_NS::simd_mask<1> pred = 1, FlagsT = {}) { + using PropertyListT = __ESIMD_DNS::make_L1_L2_alignment_properties_t< + L1H, L2H, FlagsT::template alignment<__ESIMD_NS::simd>>; + __ESIMD_DNS::block_store_impl(acc, offset, vals, + pred); } template data1 = 1; lsc_block_store(ptr, data1); - // CHECK: call <4 x i32> @llvm.genx.lsc.load.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0) + // CHECK: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}}) simd data2 = lsc_block_load(ptr); //CHECK: call void @llvm.genx.lsc.prefetch.stateless.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 1, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0) @@ -71,7 +71,7 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void foo(AccType &acc) { lsc_block_store(acc, surf_offset, data1); // CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.load.bti.v4i32.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}) - // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.load.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0) + // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}}) simd data4 = lsc_block_load(acc, surf_offset); // CHECK-STATEFUL: call void @llvm.genx.lsc.prefetch.bti.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 1, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}) diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index ebf7f74a05206..e2a25c0dbdc2b 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -104,26 +104,26 @@ test_block_load(AccType &acc, LocalAccType &local_acc, float *ptrf, const int *ptri = reinterpret_cast(ptrf); const int8_t *ptrb = reinterpret_cast(ptrf); - // CHECK: call <4 x float> @llvm.genx.lsc.load.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0) + // CHECK: call <4 x float> @llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}}) auto d1 = block_load(ptrf, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.load.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0) + // CHECK: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}}) auto d2 = block_load(ptri, byte_offset32, props_a); - // CHECK: call <4 x float> @llvm.genx.lsc.load.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0) + // CHECK: call <4 x float> 
@llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}}) auto d3 = block_load(ptrf, byte_offset64, props_b); - // CHECK: call <4 x float> @llvm.genx.lsc.load.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0) + // CHECK: call <4 x float> @llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}}) simd_mask<1> mask = 1; auto d4 = block_load(ptrf, mask, props_a); // CHECK: call <4 x float> @llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}}) auto d5 = block_load(ptrf, mask, pass_thru, props_b); - // CHECK: call <4 x float> @llvm.genx.lsc.load.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0) + // CHECK: call <4 x float> @llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}}) auto d6 = block_load(ptrf, byte_offset32, mask, props_a); - // CHECK: call <4 x i32> @llvm.genx.lsc.load.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0) + // CHECK: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}}) auto d7 = block_load(ptri, byte_offset64, mask, props_b); // CHECK: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}}) @@ -149,15 +149,15 @@ test_block_load(AccType &acc, LocalAccType &local_acc, float *ptrf, // Verify ACCESSOR-based block_load. 
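  // The stateless checks below expect the .merge. form of the LSC load
  // intrinsic (lsc.load.merge.stateless); the generated GPU code is identical
  // to the previous non-merge form.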
// CHECK-STATEFUL: call <4 x float> @llvm.genx.lsc.load.bti.v4f32.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}) - // CHECK-STATELESS: call <4 x float> @llvm.genx.lsc.load.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0) + // CHECK-STATELESS: call <4 x float> @llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}}) auto a1 = block_load(acc, props_a); // CHECK-STATEFUL: call <4 x i32> @llvm.genx.lsc.load.bti.v4i32.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}) - // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.load.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0) + // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}}) auto a2 = block_load(acc, byte_offset32, props_a); // CHECK-STATEFUL: call <4 x float> @llvm.genx.lsc.load.bti.v4f32.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}) - // CHECK-STATELESS: call <4 x float> @llvm.genx.lsc.load.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0) + // CHECK-STATELESS: call <4 x float> @llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}}) auto a3 = block_load(acc, byte_offset64, props_b); // CHECK-STATEFUL: call <4 x float> @llvm.genx.lsc.load.merge.bti.v4f32.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x float> {{[^)]+}})
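
For reference, here is a minimal usage sketch of the reworked user-facing API (not part of the patch). The kernel scaffolding, the property values, and the vector length are illustrative assumptions; the calls mirror those exercised in memory_properties.cpp above, with cache hints supplied as compile-time properties instead of L1H/L2H template parameters.

#include <sycl/ext/intel/esimd.hpp>

using namespace sycl::ext::intel::esimd;

// Copy 16 floats with a streaming L1 hint and an uncached L2 hint.
// alignment<16> overrides the default alignment assumption (4 bytes for
// element types of 4 bytes or smaller, sizeof(T) otherwise).
SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void copy16(const float *src, float *dst) {
  properties props{cache_hint_L1<cache_hint::streaming>,
                   cache_hint_L2<cache_hint::uncached>, alignment<16>};
  simd<float, 16> data = block_load<float, 16>(src, props);
  block_store(dst, data, props);
}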