diff --git a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp index f2f3b8deef45d..e7eed0c5ecfc1 100644 --- a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp @@ -655,6 +655,39 @@ __esimd_lsc_load_stateless(__ESIMD_DNS::simd_mask_storage_t pred, } #endif // __SYCL_DEVICE_ONLY__ +/// USM pointer scatter. +/// Supported platforms: DG2, PVC +/// +/// Scatters elements to specific address. +/// +/// @tparam Ty is element type. +/// @tparam L1H is L1 cache hint. +/// @tparam L2H is L2 cache hint. +/// @tparam AddressScale is the address scale. +/// @tparam ImmOffset is the immediate offset added to each address. +/// @tparam DS is the data size. +/// @tparam VS is the number of elements to load per address. +/// @tparam Transposed indicates if the data is transposed during the transfer. +/// @tparam N is the SIMD size of operation (the number of addresses to access) +/// @param pred is predicates. +/// @param addrs is the prefetch addresses. +/// @param vals is values to store. +template +__ESIMD_INTRIN void __esimd_lsc_store_stateless( + __ESIMD_DNS::simd_mask_storage_t pred, + __ESIMD_DNS::vector_type_t addrs, + __ESIMD_DNS::vector_type_t()> vals) +#ifdef __SYCL_DEVICE_ONLY__ + ; +#else // __SYCL_DEVICE_ONLY__ +{ + __ESIMD_UNSUPPORTED_ON_HOST; +} +#endif // __SYCL_DEVICE_ONLY__ + // \brief Raw sends. // // @param modifier the send message flags (Bit-0: isSendc, Bit-1: isEOT). diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 36773ed1355f5..a4d0b80c87cdd 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -574,6 +574,58 @@ __ESIMD_API #endif // !__ESIMD_FORCE_STATELESS_MEM } +template +__ESIMD_API std::enable_if_t> +block_store_impl(T *p, simd vals, simd_mask<1> pred, FlagsT flags) { + detail::check_cache_hint(); + constexpr auto Alignment = + FlagsT::template alignment<__ESIMD_DNS::__raw_t>; + static_assert( + (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) || + (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4), + "Incorrect alignment for the data type"); + + constexpr int SmallIntFactor64Bit = sizeof(uint64_t) / sizeof(T); + constexpr int SmallIntFactor32Bit = + std::max(static_cast(1), sizeof(uint32_t) / sizeof(T)); + static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0, + "Number of elements is not supported by Transposed store"); + + // If alignment >= 8 and (NElts * sizeof(T)) % 8 == 0) we can store QWORDs. + // Don't do it for 4-byte vectors (unless it is greater than 256-bytes), + // because it would require a bit-cast, which is supposed to be NO-OP, but + // might confuse GPU BE sometimes. 1- and 2-byte vectors are casted anyways. + constexpr bool Use64BitData = + Alignment >= __ESIMD_DNS::OperandSize::QWORD && + (NElts * sizeof(T)) % sizeof(uint64_t) == 0 && + (sizeof(T) != sizeof(uint32_t) || NElts * sizeof(T) > 256); + + constexpr int SmallIntFactor = + Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit; + constexpr int FactoredNElts = NElts / SmallIntFactor; + + check_lsc_vector_size(); + + using StoreType = __ESIMD_DNS::__raw_t< + std::conditional_t>>; + constexpr uint16_t AddressScale = 1; + constexpr int ImmOffset = 0; + constexpr lsc_data_size ActualDS = + Use64BitData ? 
lsc_data_size::u64 : lsc_data_size::u32; + constexpr lsc_vector_size VS = to_lsc_vector_size(); + constexpr auto Transposed = lsc_data_order::transpose; + constexpr int N = 1; + simd Addrs = reinterpret_cast(p); + + __esimd_lsc_store_stateless( + pred.data(), Addrs.data(), + sycl::bit_cast<__ESIMD_DNS::vector_type_t>( + vals.data())); +} + } // namespace detail /// Stores elements of the vector \p vals to a contiguous block of memory @@ -593,7 +645,7 @@ __ESIMD_API template > __ESIMD_API std::enable_if_t> -block_store(Tx *addr, simd vals, Flags = {}) { +block_store(Tx *addr, simd vals, Flags) { using T = typename detail::__raw_t; using VecT = typename simd::raw_vector_type; constexpr size_t Align = Flags::template alignment>; @@ -1301,6 +1353,289 @@ block_load(AccessorT acc, simd_mask<1> pred, PropertyListT props = {}) { return block_load(acc, 0, pred, PassThru, props); } +/// Each of the following block store functions stores a contiguous memory block +/// to the address referenced by the USM pointer 'ptr', or from 'ptr + +/// offset', where 'offset' is the offset in bytes (not in elements!) with data +/// specified by 'vals'. +/// The parameter 'pred' is the one element predicate. If it is set to 1, then +/// all 'N' elements are stored. Otherwise, the block store operation is a +/// NO-OP. The parameter 'props' specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::cache_hint_L3, esimd::alignment. +/// +/// void block_store(T* ptr, simd vals, props={}); // (1) +/// void block_store(T* ptr, size_t byte_offset, // (2) +/// simd vals, props={}); + +/// void block_store(T* ptr, simd vals, // (3) +/// simd_mask<1> pred, props={}); + +/// void block_store(T* ptr, size_t byte_offset, // (4) +/// simd vals, simd_mask<1> pred, props={}); +/// +/// void block_store(T* ptr, simd vals, props={}); // (1) +/// This function stores a contiguous memory block to USM pointer \p ptr +/// with data specified by \p vals. +/// +/// There may be temporary restrictions depending on L1, L2 cache hints, +/// See details in the 'Restrictions' section below. The restrictions will be +/// relaxed in the future. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default assumed alignment is 16 bytes if \p props does not specify any +/// L1 or L2 cache hints, and the minimally required element-size +/// alignment otherwise. Note that additional/temporary restrictions may apply +/// (see Restrictions below). +/// +/// Restrictions - cache hint imposed - temporary: +/// If L1 or L2 cache hint is passed, then: +/// R1: The pointer must be at least 4-byte aligned for elements of 4-bytes or +/// smaller and 8-byte aligned for 8-byte elements. +/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64, +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128, +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256, +/// or 512(only if alignment is 8-bytes or more). 
+/// R3: The target device must be DG2, PVC or newer GPU. +template +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +block_store(T *ptr, simd vals, PropertyListT props = {}) { + constexpr auto L1Hint = + detail::getPropertyValue( + cache_hint::none); + constexpr auto L2Hint = + detail::getPropertyValue( + cache_hint::none); + static_assert(!PropertyListT::template has_property(), + "L3 cache hint is reserved. The old/experimental L3 LSC cache " + "hint is cache_level::L2 now."); + if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none) { + detail::check_cache_hint(); + constexpr int DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T); + constexpr size_t Alignment = + detail::getPropertyValue( + DefaultAlignment); + + simd_mask<1> Mask = 1; + detail::block_store_impl( + ptr, vals, Mask, overaligned_tag{}); + } else { + // If the alignment property is not passed, then assume the pointer + // is OWORD-aligned. + constexpr size_t Alignment = + detail::getPropertyValue( + detail::OperandSize::OWORD); + block_store(ptr, vals, overaligned_tag{}); + } +} + +/// void block_store(T* ptr, size_t byte_offset, // (2) +/// simd vals, props={}); +/// This function stores a contiguous memory block to USM pointer \p ptr and +/// byte-offset \p byte_offset with data specified by \p vals. +/// +/// There may be temporary restrictions depending on L1, L2 cache hints, +/// See details in the 'Restrictions' section below. The restrictions will be +/// relaxed in the future. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default assumed alignment is 16 bytes if \p props does not specify any +/// L1 or L2 cache hints, and the minimally required element-size +/// alignment otherwise. Note that additional/temporary restrictions may apply +/// (see Restrictions below). +/// +/// Restrictions - cache hint imposed - temporary: +/// If L1 or L2 cache hint is passed, then: +/// R1: The pointer plus byte offset must be at least 4-byte aligned for +/// elements of 4-bytes or smaller and 8-byte aligned for 8-byte elements. +/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64, +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128, +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256, +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. +template +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +block_store(T *ptr, size_t byte_offset, simd vals, + PropertyListT props = {}) { + T *AdjustedPtr = + reinterpret_cast(reinterpret_cast(ptr) + byte_offset); + block_store(AdjustedPtr, vals, props); +} + +/// void block_store(T* ptr, simd vals, // (3) +/// simd_mask<1> pred, props={}); +/// This function stores a contiguous memory block to USM pointer \p ptr +/// with data specified by \p vals. If the predicate \p pred is set to 0, +/// then the store is omitted. +/// +/// There are temporary restrictions. 
See details in the 'Restrictions' +/// section below. The restrictions will be relaxed in the future. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default assumed alignment is the minimally required element-size +/// alignment. Note that additional/temporary restrictions apply (see +/// Restrictions below). +/// +/// Restrictions - predicate imposed - temporary: +/// R1: The pointer must be at least 4-byte aligned for elements of 4-bytes or +/// smaller and 8-byte aligned for 8-byte elements. +/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64, +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128, +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256, +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. +template +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +block_store(T *ptr, simd vals, simd_mask<1> pred, + PropertyListT props = {}) { + constexpr auto L1Hint = + detail::getPropertyValue( + cache_hint::none); + constexpr auto L2Hint = + detail::getPropertyValue( + cache_hint::none); + static_assert(!PropertyListT::template has_property(), + "L3 cache hint is reserved. The old/experimental L3 LSC cache " + "hint is cache_level::L2 now."); + + constexpr size_t DefaultAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T); + constexpr size_t Alignment = + detail::getPropertyValue(DefaultAlignment); + + detail::block_store_impl(ptr, vals, pred, + overaligned_tag{}); +} + +/// void block_store(T* ptr, size_t byte_offset, // (4) +/// simd vals, simd_mask<1> pred, props={}); +/// This function stores a contiguous memory block to USM pointer \p ptr +/// and byte-offset \p byte_offset with data specified by \p vals. +/// If the predicate \p pred is set to 0, then the store is omitted. +/// +/// There may be temporary restrictions depending on L1, L2 cache hints, +/// See details in the 'Restrictions' section below. The restrictions will be +/// relaxed in the future. +/// +/// The parameter \p props specifies the optional compile-time properties +/// of the type esimd::properties and may include esimd::cache_hint_L1, +/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored. +/// +/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then +/// the cache_hint::none value is assumed by default. +/// +/// Alignment: If \p props does not specify the 'alignment' property, then +/// the default assumed alignment is 16 bytes if \p props does not specify any +/// L1 or L2 cache hints and \p pred is set to 1, and +// the minimally required element-size alignment otherwise. +/// Note that additional/temporary restrictions may apply +/// (see Restrictions below). 
+/// +/// Restrictions - cache hint or predicate imposed - temporary: +/// If a predicate, L1 or L2 cache hint is passed, then: +/// R1: The pointer plus byte offset must be at least 4-byte aligned for +/// elements of 4-bytes or smaller and 8-byte aligned for 8-byte elements. +/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64; +/// for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64, +/// or 128(only if alignment is 8-bytes or more); +/// for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128, +/// or 256(only if alignment is 8-bytes or more); +/// for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256, +/// or 512(only if alignment is 8-bytes or more). +/// R3: The target device must be DG2, PVC or newer GPU. +template +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +block_store(T *ptr, size_t byte_offset, simd vals, simd_mask<1> pred, + PropertyListT props = {}) { + T *AdjustedPtr = + reinterpret_cast(reinterpret_cast(ptr) + byte_offset); + block_store(AdjustedPtr, vals, pred, props); +} + +template , + typename PropertyListT = + ext::oneapi::experimental::detail::empty_properties_t> +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +block_store(T *ptr, simd_view vals, + PropertyListT props = {}) { + block_store(ptr, vals.read(), props); +} + +template , + typename PropertyListT = + ext::oneapi::experimental::detail::empty_properties_t> +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +block_store(T *ptr, size_t byte_offset, simd_view vals, + PropertyListT props = {}) { + block_store(ptr, byte_offset, vals.read(), props); +} + +template , + typename PropertyListT = + ext::oneapi::experimental::detail::empty_properties_t> +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +block_store(T *ptr, simd_view vals, simd_mask<1> pred, + PropertyListT props = {}) { + block_store(ptr, vals.read(), pred, props); +} + +template , + typename PropertyListT = + ext::oneapi::experimental::detail::empty_properties_t> +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +block_store(T *ptr, size_t byte_offset, simd_view vals, + simd_mask<1> pred, PropertyListT props = {}) { + block_store(ptr, byte_offset, vals.read(), pred, props); +} + /// @} sycl_esimd_memory_block /// Stores elements of a vector to a contiguous block of memory represented by diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp index aa22578589c06..4d8fc2c00e2b4 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp @@ -282,39 +282,6 @@ __ESIMD_INTRIN void __esimd_lsc_store_bti( } #endif // __SYCL_DEVICE_ONLY__ -/// USM pointer scatter. -/// Supported platforms: DG2, PVC -/// -/// Scatters elements to specific address. -/// -/// @tparam Ty is element type. -/// @tparam L1H is L1 cache hint. -/// @tparam L3H is L3 cache hint. -/// @tparam AddressScale is the address scale. -/// @tparam ImmOffset is the immediate offset added to each address. -/// @tparam DS is the data size. -/// @tparam VS is the number of elements to load per address. -/// @tparam Transposed indicates if the data is transposed during the transfer. -/// @tparam N is the SIMD size of operation (the number of addresses to access) -/// @param pred is predicates. -/// @param addrs is the prefetch addresses. 
-/// @param vals is values to store. -template -__ESIMD_INTRIN void __esimd_lsc_store_stateless( - __ESIMD_DNS::simd_mask_storage_t pred, - __ESIMD_DNS::vector_type_t addrs, - __ESIMD_DNS::vector_type_t()> vals) -#ifdef __SYCL_DEVICE_ONLY__ - ; -#else // __SYCL_DEVICE_ONLY__ -{ - __ESIMD_UNSUPPORTED_ON_HOST; -} -#endif // __SYCL_DEVICE_ONLY__ - /// 2D USM pointer block load. /// Supported platforms: PVC /// diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index f214f79079599..39af99b98ac6c 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -1765,7 +1765,7 @@ lsc_scatter(AccessorTy acc, __ESIMD_NS::simd offsets, /// /// @tparam T is element type. /// @tparam NElts is the number of elements to store per address. -/// @tparam DS is the data size. +/// @tparam DS is the data size (unused/obsolete). /// @tparam L1H is L1 cache hint. /// @tparam L3H is L3 cache hint. /// @param p is the base pointer. @@ -1781,62 +1781,8 @@ template > lsc_block_store(T *p, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask<1> pred = 1, FlagsT flags = FlagsT{}) { - detail::check_lsc_data_size(); - detail::check_lsc_cache_hint(); - constexpr auto Alignment = - FlagsT::template alignment<__ESIMD_DNS::__raw_t>; - static_assert( - (Alignment >= __ESIMD_DNS::OperandSize::DWORD && sizeof(T) <= 4) || - (Alignment >= __ESIMD_DNS::OperandSize::QWORD && sizeof(T) > 4), - "Incorrect alignment for the data type"); - - // Prepare template arguments for the call of intrinsic. - constexpr uint16_t _AddressScale = 1; - constexpr int _ImmOffset = 0; - constexpr lsc_data_size _DS = detail::finalize_data_size(); - static_assert(_DS == lsc_data_size::u16 || _DS == lsc_data_size::u8 || - _DS == lsc_data_size::u32 || _DS == lsc_data_size::u64, - "Conversion data types are not supported"); - constexpr detail::lsc_data_order _Transposed = - detail::lsc_data_order::transpose; - constexpr int N = 1; - __ESIMD_NS::simd Addrs = reinterpret_cast(p); - - constexpr int SmallIntFactor32Bit = - (_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1); - static_assert(NElts > 0 && NElts % SmallIntFactor32Bit == 0, - "Number of elements is not supported by Transposed store"); - - constexpr bool Use64BitData = - Alignment >= __ESIMD_DNS::OperandSize::QWORD && - (sizeof(T) == 8 || - (DS == lsc_data_size::default_size && NElts / SmallIntFactor32Bit > 64 && - (NElts * sizeof(T)) % 8 == 0)); - constexpr int SmallIntFactor64Bit = - (_DS == lsc_data_size::u16) - ? 4 - : (_DS == lsc_data_size::u8 ? 8 - : (_DS == lsc_data_size::u32 ? 2 : 1)); - constexpr int SmallIntFactor = - Use64BitData ? SmallIntFactor64Bit : SmallIntFactor32Bit; - constexpr int FactoredNElts = NElts / SmallIntFactor; - constexpr lsc_data_size ActualDS = Use64BitData - ? 
__ESIMD_ENS::lsc_data_size::u64 - : __ESIMD_ENS::lsc_data_size::u32; - - detail::check_lsc_vector_size(); - constexpr detail::lsc_vector_size _VS = - detail::to_lsc_vector_size(); - - using StoreType = __ESIMD_DNS::__raw_t< - std::conditional_t>>; - - __esimd_lsc_store_stateless( - pred.data(), Addrs.data(), - sycl::bit_cast<__ESIMD_DNS::vector_type_t>( - vals.data())); + return __ESIMD_DNS::block_store_impl(p, vals, pred, + flags); } /// A variation of lsc_block_store without predicate parameter to simplify diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_load.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_load.hpp index d42ea46d8f93c..be666613a4f72 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_load.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_load.hpp @@ -13,49 +13,11 @@ #include #include "../../esimd_test_utils.hpp" +#include "common.hpp" using namespace sycl; using namespace sycl::ext::intel::esimd; -template -constexpr cache_hint getCacheHint(PropertiesT) { - if constexpr (PropertiesT::template has_property()) { - constexpr auto ValueT = PropertiesT::template get_property(); - return ValueT.hint; - } else { - return cache_hint::none; - } -} - -template -constexpr size_t getAlignment(PropertiesT, size_t DefaultAlignment) { - if constexpr (PropertiesT::template has_property< - sycl::ext::intel::esimd::alignment_key>()) { - constexpr auto ValueT = PropertiesT::template get_property< - sycl::ext::intel::esimd::alignment_key>(); - return ValueT.value; - } else { - return DefaultAlignment; - } -} - -template -constexpr size_t getAlignment(PropertiesT Props) { - constexpr cache_hint L1Hint = - getCacheHint(Props); - constexpr cache_hint L2Hint = - getCacheHint(Props); - constexpr bool RequiresPVC = - L1Hint != cache_hint::none || L2Hint != cache_hint::none || UseMask; - - constexpr bool IsMaxLoadSizePVC = RequiresPVC && (N * sizeof(T) > 256); - constexpr size_t RequiredAlignment = - IsMaxLoadSizePVC ? 8 : (RequiresPVC ? 4 : sizeof(T)); - constexpr size_t RequestedAlignment = getAlignment(Props, RequiredAlignment); - static_assert(RequestedAlignment >= RequiredAlignment, "Too small alignment"); - return RequestedAlignment; -} - // Returns true iff verification is passed. template bool verify(const T *In, const T *Out, size_t Size, int N, diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_store.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_store.hpp new file mode 100644 index 0000000000000..c9c69354af3e5 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_store.hpp @@ -0,0 +1,178 @@ +//==------- block_store.hpp - DPC++ ESIMD on-device test ----------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-------------------------------------------------------------------===// + +#include "common.hpp" + +using namespace sycl; +using namespace sycl::ext::intel::esimd; + +template +bool testUSM(queue Q, uint32_t Groups, uint32_t Threads, + StorePropertiesT StoreProperties) { + + uint16_t Size = Groups * Threads * N; + using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t; + + std::cout << "USM case: T=" << esimd_test::type_name() << ",N=" << N + << ",UseMask=" << UseMask << ",UseProperties=" << UseProperties + << std::endl; + + sycl::range<1> GlobalRange{Groups}; + sycl::range<1> LocalRange{Threads}; + sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange}; + constexpr size_t Alignment = getAlignment(StoreProperties); + T *Out = sycl::aligned_alloc_shared(Alignment, Size, Q); + T Out_val = esimd_test::getRandomValue(); + for (int i = 0; i < Size; i++) + Out[i] = Out_val; + + try { + Q.submit([&](handler &cgh) { + cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { + uint16_t GlobalID = ndi.get_global_id(0); + uint32_t ElemOff = GlobalID * N; + // TODO: these 2 lines work-around the problem with scalar + // conversions to bfloat16. It could be just: "simd + // PassThru(ElemOffset, 1);" + simd PassThruInt(ElemOff, 1); + simd Vals = PassThruInt; + if constexpr (UseMask) { + simd_mask<1> Mask = (GlobalID + 1) % 1; + block_store(Out + ElemOff, Vals, Mask, StorePropertiesT{}); + Vals = block_load(Out + ElemOff); + Vals += 1; + block_store(Out, ElemOff * sizeof(T), Vals, Mask, + StorePropertiesT{}); + Vals = block_load(Out + ElemOff); + Vals += 2; + auto View = Vals.template select(); + block_store(Out, ElemOff * sizeof(T), View, Mask, + StorePropertiesT{}); + Vals = block_load(Out + ElemOff); + Vals += 3; + View = Vals.template select(); + block_store(Out + ElemOff, View, Mask, StorePropertiesT{}); + } else { + if constexpr (UseProperties) + block_store(Out + ElemOff, Vals, StorePropertiesT{}); + + else + block_store(Out + ElemOff, Vals); + + Vals = block_load(Out + ElemOff); + Vals += 1; + if constexpr (UseProperties) + block_store(Out, ElemOff * sizeof(T), Vals, StorePropertiesT{}); + else + block_store(Out, ElemOff * sizeof(T), Vals); + + Vals = block_load(Out + ElemOff); + Vals += 2; + auto View = Vals.template select(); + if constexpr (UseProperties) + block_store(Out, ElemOff * sizeof(T), View, + StorePropertiesT{}); + else + block_store(Out, ElemOff * sizeof(T), View); + + Vals = block_load(Out + ElemOff); + Vals += 3; + View = Vals.template select(); + if constexpr (UseProperties) + block_store(Out + ElemOff, View, StorePropertiesT{}); + else + block_store(Out + ElemOff, View); + } + }); + }).wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + sycl::free(Out, Q); + return false; + } + + bool Passed = true; + + for (int i = 0; i < Size; i++) { + bool IsMaskSet = (i / N + 1) % 1; + Tuint Expected = sycl::bit_cast(Out_val); + if (!UseMask || IsMaskSet) + Expected = sycl::bit_cast((T)(i + 6)); + Tuint Computed = sycl::bit_cast(Out[i]); + if (Computed != Expected) { + Passed = false; + std::cout << "Out[" << i << "] = " << std::to_string(Computed) << " vs " + << std::to_string(Expected) << std::endl; + } + } + + sycl::free(Out, Q); + + return Passed; +} + +template bool test_block_store(queue Q) { + constexpr bool CheckMask = true; + constexpr bool CheckProperties = true; + properties AlignOnlyProps{alignment}; + + bool Passed = true; 
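+
+  // Note: each testUSM<T, N, UseMask, UseProperties> call below writes an
+  // increasing sequence, then repeatedly reloads, increments and re-stores
+  // it so that all block_store() forms are exercised: (ptr, vals),
+  // (ptr, byte_offset, vals) and their simd_view counterparts, with or
+  // without the predicate and the compile-time properties argument.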
+ + // Test block_store() that is available on Gen12 and PVC. + Passed &= testUSM(Q, 2, 4, AlignOnlyProps); + Passed &= testUSM(Q, 1, 4, AlignOnlyProps); + Passed &= testUSM(Q, 2, 8, AlignOnlyProps); + Passed &= testUSM(Q, 2, 4, AlignOnlyProps); + Passed &= testUSM(Q, 2, 4, AlignOnlyProps); + Passed &= + testUSM(Q, 2, 4, AlignOnlyProps); + Passed &= + testUSM(Q, 2, 4, AlignOnlyProps); + // Intentionally check non-power-of-2 simd size - it must work. + Passed &= + testUSM(Q, 2, 4, AlignOnlyProps); + // TODO: Enable after failure fixed + // Passed &= + // testUSM(Q, 1, 4, AlignOnlyProps); + // Intentionally check big simd size - it must work. + Passed &= + testUSM(Q, 2, 4, AlignOnlyProps); + Passed &= + testUSM(Q, 1, 4, AlignOnlyProps); + + // Test block_store() without passing compile-time properties argument. + Passed &= + testUSM(Q, 2, 4, AlignOnlyProps); + Passed &= + testUSM(Q, 2, 4, AlignOnlyProps); + + if constexpr (TestPVCFeatures) { + // Using cache hints adds the requirement to run tests on PVC. + // Also, PVC variant currently requires power-or-two elements and + // the number of bytes loaded per call must not exceed 512. + properties PVCProps{cache_hint_L1, + cache_hint_L2, alignment<16>}; + + if constexpr (sizeof(T) >= 4) // only d/q words are supported now + Passed &= testUSM(Q, 2, 4, PVCProps); + if constexpr (sizeof(T) >= 2) // only d/q words are supported now + Passed &= testUSM(Q, 5, 5, PVCProps); + Passed &= testUSM(Q, 5, 5, PVCProps); + Passed &= testUSM(Q, 5, 5, PVCProps); + Passed &= testUSM(Q, 5, 5, PVCProps); + Passed &= testUSM(Q, 2, 4, PVCProps); + Passed &= testUSM(Q, 7, 1, PVCProps); + if constexpr (128 * sizeof(T) <= 512) + Passed &= testUSM(Q, 1, 4, PVCProps); + if constexpr (256 * sizeof(T) <= 512) + Passed &= testUSM(Q, 1, 4, PVCProps); + } // TestPVCFeatures + + return Passed; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/common.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/common.hpp new file mode 100644 index 0000000000000..31ba3dad663bc --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/common.hpp @@ -0,0 +1,54 @@ +//==------- common.hpp - DPC++ ESIMD on-device test ----------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "../../esimd_test_utils.hpp" + +using namespace sycl; +using namespace sycl::ext::intel::esimd; + +template +constexpr cache_hint getCacheHint(PropertiesT) { + if constexpr (PropertiesT::template has_property()) { + constexpr auto ValueT = PropertiesT::template get_property(); + return ValueT.hint; + } else { + return cache_hint::none; + } +} + +template +constexpr size_t getAlignment(PropertiesT, size_t DefaultAlignment) { + if constexpr (PropertiesT::template has_property< + sycl::ext::intel::esimd::alignment_key>()) { + constexpr auto ValueT = PropertiesT::template get_property< + sycl::ext::intel::esimd::alignment_key>(); + return ValueT.value; + } else { + return DefaultAlignment; + } +} + +template +constexpr size_t getAlignment(PropertiesT Props) { + constexpr cache_hint L1Hint = + getCacheHint(Props); + constexpr cache_hint L2Hint = + getCacheHint(Props); + constexpr bool RequiresPVC = + L1Hint != cache_hint::none || L2Hint != cache_hint::none || UseMask; + + constexpr bool IsMaxLoadSizePVC = RequiresPVC && (N * sizeof(T) > 256); + constexpr size_t RequiredAlignment = + IsMaxLoadSizePVC ? 8 : (RequiresPVC ? 4 : sizeof(T)); + constexpr size_t RequestedAlignment = getAlignment(Props, RequiredAlignment); + static_assert(RequestedAlignment >= RequiredAlignment, "Too small alignment"); + return RequestedAlignment; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/block_store_usm.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/block_store_usm.cpp new file mode 100644 index 0000000000000..1e95762c30c47 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/block_store_usm.cpp @@ -0,0 +1,37 @@ +//==------- block_store_usm.cpp - DPC++ ESIMD on-device test ---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::block_store() functions accepting USM pointer +// and optional compile-time esimd::properties. +// The block_store() calls in this test do not use cache-hint +// properties to not impose using PVC features. + +#include "Inputs/block_store.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr bool TestPVCFeatures = true; + bool Passed = true; + + Passed &= test_block_store(Q); + Passed &= test_block_store(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= test_block_store(Q); + Passed &= test_block_store(Q); + Passed &= test_block_store(Q); + Passed &= test_block_store(Q); + if (Q.get_device().has(sycl::aspect::fp64)) + Passed &= test_block_store(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/block_store_usm_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/block_store_usm_pvc.cpp new file mode 100644 index 0000000000000..1f6f2a6fad1ee --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/block_store_usm_pvc.cpp @@ -0,0 +1,42 @@ +//==--- block_store_usm_pvc.cpp - DPC++ ESIMD on-device test----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------===// +// REQUIRES: gpu-intel-pvc +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::block_store() functions accepting USM pointer +// and optional compile-time esimd::properties. +// The block_store() calls in this test use cache-hint +// properties which require PVC+ target device. + +#include "Inputs/block_store.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr bool TestPVCFeatures = true; + bool Passed = true; + + Passed &= test_block_store(Q); + Passed &= test_block_store(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= test_block_store(Q); + Passed &= test_block_store(Q); + Passed &= test_block_store(Q); + Passed &= test_block_store(Q); + Passed &= test_block_store(Q); + Passed &= test_block_store(Q); + if (Q.get_device().has(sycl::aspect::fp64)) + Passed &= test_block_store(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +}
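
For reviewers, a minimal usage sketch (not part of the patch) of the block_store() overloads added above. It assumes a DG2/PVC-class device for the predicated and cache-hinted calls; the buffer size, offsets and property values are illustrative only.

//==--- block_store usage sketch (illustrative, not part of the patch) ---==//
#include <sycl/ext/intel/esimd.hpp>
#include <sycl/sycl.hpp>

using namespace sycl;
using namespace sycl::ext::intel::esimd;

int main() {
  queue Q{gpu_selector_v};
  constexpr int N = 16;
  // Allocate 2*N elements, 16-byte aligned, so the byte-offset forms below
  // stay in bounds and satisfy the default alignment assumption.
  float *Ptr = aligned_alloc_shared<float>(16, 2 * N, Q);

  Q.single_task([=]() SYCL_ESIMD_KERNEL {
     simd<float, N> Vals(0.f, 1.f); // 0, 1, 2, ...

     // (1) Plain store; without cache hints the pointer is assumed to be
     //     16-byte aligned unless an alignment property says otherwise.
     block_store(Ptr, Vals);

     // (2) Store at a byte offset (bytes, not elements).
     block_store(Ptr, N * sizeof(float), Vals, properties{alignment<16>});

     // (3) Predicated store; the whole block is written only if Pred[0] == 1.
     //     This form requires a DG2/PVC-class device.
     simd_mask<1> Pred = 1;
     block_store(Ptr, Vals, Pred, properties{alignment<16>});

     // (4) Predicated store at a byte offset with L1/L2 cache hints
     //     (also requires DG2, PVC or a newer GPU).
     block_store(Ptr, N * sizeof(float), Vals, Pred,
                 properties{cache_hint_L1<cache_hint::write_back>,
                            cache_hint_L2<cache_hint::write_back>,
                            alignment<16>});
   }).wait();

  sycl::free(Ptr, Q);
  return 0;
}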