diff --git a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp
index 71af45386aafa..1b4c6d9d08b56 100644
--- a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp
@@ -169,6 +169,34 @@ __ESIMD_INTRIN void __esimd_lsc_prefetch_stateless(
     __ESIMD_DNS::simd_mask_storage_t<N> pred,
     __ESIMD_DNS::vector_type_t<uintptr_t, N> addrs) __ESIMD_INTRIN_END;
 
+/// Surface-based prefetch gather.
+/// Supported platforms: DG2, PVC
+///
+/// Prefetches elements located at surface.
+///
+/// @tparam Ty is element type.
+/// @tparam L1H is L1 cache hint.
+/// @tparam L2H is L2 cache hint.
+/// @tparam AddressScale is the address scale.
+/// @tparam ImmOffset is the immediate offset added to each address.
+/// @tparam DS is the data size.
+/// @tparam VS is the number of elements to load per address.
+/// @tparam Transposed indicates if the data is transposed during the transfer.
+/// @tparam N is the SIMD size of operation (the number of addresses to access)
+/// @tparam SurfIndAliasTy is the \ref sycl::accessor type.
+/// @param pred is predicates.
+/// @param offsets is the zero-based offsets in bytes.
+/// @param surf_ind is the surface index.
+template <typename Ty, __ESIMD_NS::cache_hint L1H, __ESIMD_NS::cache_hint L2H,
+          uint16_t AddressScale, int ImmOffset, __ESIMD_DNS::lsc_data_size DS,
+          __ESIMD_DNS::lsc_vector_size VS,
+          __ESIMD_DNS::lsc_data_order Transposed, int N,
+          typename SurfIndAliasTy>
+__ESIMD_INTRIN void
+__esimd_lsc_prefetch_bti(__ESIMD_DNS::simd_mask_storage_t<N> pred,
+                         __ESIMD_DNS::vector_type_t<uint32_t, N> offsets,
+                         SurfIndAliasTy surf_ind) __ESIMD_INTRIN_END;
+
 // Read a block of data from SLM at the given offset.
 template <typename Ty, int N, size_t Align>
 __ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N>
diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
index b92f798d08f48..21c7b0d8c43de 100644
--- a/sycl/include/sycl/ext/intel/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -2879,6 +2879,98 @@ prefetch_impl(const T *p, Toffset offset, simd_mask<1> pred) {
       addrs.data());
 }
 
+#ifndef __ESIMD_FORCE_STATELESS_MEM
+/// Accessor-based prefetch gather.
+/// Supported platforms: DG2, PVC
+/// VISA instruction: lsc_load.ugm
+///
+/// Prefetches elements located at surface.
+///
+/// @tparam T is element type.
+/// @tparam NElts is the number of elements to load per address.
+/// @tparam DS is the data size.
+/// @tparam L1H is L1 cache hint.
+/// @tparam L2H is L2 cache hint.
+/// @tparam N is the number of channels (platform dependent).
+/// @tparam AccessorTy is the \ref sycl::accessor type.
+/// @tparam OffsetT is the type of \c byte_offsets.
+/// @param acc is the SYCL accessor.
+/// @param byte_offsets is the zero-based offsets in bytes.
+/// @param pred is predicates.
+///
+template <typename T, int NElts, lsc_data_size DS, cache_hint L1H,
+          cache_hint L2H, int N, typename AccessorTy, typename OffsetT>
+__ESIMD_API std::enable_if_t<
+    is_device_accessor_with_v<AccessorTy, accessor_mode_cap::can_read>>
+prefetch_impl(AccessorTy acc, simd<OffsetT, N> byte_offsets,
+              simd_mask<N> pred) {
+  static_assert(std::is_integral_v<OffsetT>,
+                "Prefetch must have integral byte_offset type");
+  static_assert(sizeof(OffsetT) <= 4,
+                "Implicit truncation of 64-bit byte_offset to 32-bit is "
+                "disabled. Use -fsycl-esimd-force-stateless-mem or explicitly "
+                "convert offsets to a 32-bit vector");
+  check_lsc_vector_size<NElts>();
+  check_lsc_data_size<T, DS>();
+  check_cache_hint<cache_action::prefetch, L1H, L2H>();
+  constexpr uint16_t AddressScale = 1;
+  constexpr int ImmOffset = 0;
+  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
+  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
+  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
+  using MsgT = typename lsc_expand_type<T>::type;
+  simd<uint32_t, N> ByteOffsets32 = convert<uint32_t>(byte_offsets);
+  auto SI = get_surface_index(acc);
+  __esimd_lsc_prefetch_bti<MsgT, L1H, L2H, AddressScale, ImmOffset, EDS, LSCVS,
+                           Transposed, N>(pred.data(), ByteOffsets32.data(),
+                                          SI);
+}
+
+/// Accessor-based transposed prefetch gather with 1 channel.
+/// Supported platforms: DG2, PVC
+/// VISA instruction: lsc_load.ugm
+///
+/// Prefetches elements located at surface.
+///
+/// @tparam T is element type.
+/// @tparam NElts is the number of elements to load per address.
+/// @tparam DS is the data size.
+/// @tparam L1H is L1 cache hint.
+/// @tparam L2H is L2 cache hint.
+/// @tparam AccessorTy is the \ref sycl::accessor type.
+/// @tparam OffsetT is the type of \c byte_offset.
+/// @param acc is the SYCL accessor.
+/// @param byte_offset is the zero-based offset in bytes.
+///
+template <typename T, int NElts, lsc_data_size DS, cache_hint L1H,
+          cache_hint L2H, typename AccessorTy, typename OffsetT>
+__ESIMD_API std::enable_if_t<
+    std::is_integral_v<OffsetT> &&
+    is_device_accessor_with_v<AccessorTy, accessor_mode_cap::can_read>>
+prefetch_impl(AccessorTy acc, OffsetT byte_offset, simd_mask<1> pred) {
+  static_assert(sizeof(OffsetT) <= 4,
+                "Implicit truncation of 64-bit byte_offset to 32-bit is "
+                "disabled. Use -fsycl-esimd-force-stateless-mem or explicitly "
+                "convert offsets to a 32-bit vector");
+  check_lsc_vector_size<NElts>();
+  check_lsc_data_size<T, DS>();
+  check_cache_hint<cache_action::prefetch, L1H, L2H>();
+  constexpr uint16_t AddressScale = 1;
+  constexpr int ImmOffset = 0;
+  constexpr lsc_data_size EDS = finalize_data_size<T, DS>();
+  static_assert(
+      EDS == lsc_data_size::u32 || EDS == lsc_data_size::u64,
+      "Transposed prefetch is supported only for data size u32 or u64");
+  constexpr lsc_vector_size LSCVS = to_lsc_vector_size<NElts>();
+  constexpr lsc_data_order Transposed = lsc_data_order::transpose;
+  constexpr int N = 1;
+  simd<uint32_t, N> offsets = byte_offset;
+  auto SI = get_surface_index(acc);
+  __esimd_lsc_prefetch_bti<T, L1H, L2H, AddressScale, ImmOffset, EDS, LSCVS,
+                           Transposed, N>(pred.data(), offsets.data(), SI);
+}
+#endif // __ESIMD_FORCE_STATELESS_MEM
 } // namespace detail
 
 /// @endcond ESIMD_DETAIL
 
@@ -8758,6 +8850,374 @@ prefetch(const T *p, PropertyListT props = {}) {
   prefetch<T, VS>(p, 0, Mask, props);
 }
 
+/// template <typename T, int N, int VS = 1, typename AccessorT,
+///           typename OffsetT,
+///           typename PropertyListT = empty_properties_t>
+/// void prefetch(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
+///               simd_mask<N / VS> mask,
+///               PropertyListT props = {}); // (acc-pf-1)
+/// void prefetch(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
+///               PropertyListT props = {}); // (acc-pf-2)
+///
+/// The next 2 functions are similar to the above and were added for
+/// convenience. They assume the VS parameter is set to 1 and do not require
+/// specifying the template parameters at function calls.
+/// template <typename T, int N, typename AccessorT, typename OffsetT,
+///           typename PropertyListT = empty_properties_t>
+/// void prefetch(AccessorT acc, simd<OffsetT, N> byte_offsets,
+///               simd_mask<N> mask,
+///               PropertyListT props = {}); // (acc-pf-3)
+/// void prefetch(AccessorT acc, simd<OffsetT, N> byte_offsets,
+///               PropertyListT props = {}); // (acc-pf-4)
+/// The next 2 functions are variations of the first 2 above (acc-pf-1,2)
+/// and were added only to support simd_view instead of simd for the
+/// byte_offsets operand.
+/// template <typename T, int N, int VS = 1, typename AccessorT,
+///           typename OffsetSimdViewT,
+///           typename PropertyListT = empty_properties_t>
+/// void prefetch(AccessorT acc, OffsetSimdViewT byte_offsets,
+///               simd_mask<N / VS> mask, PropertyListT props = {});//(acc-pf-5)
+/// void prefetch(AccessorT acc, OffsetSimdViewT byte_offsets,
+///               PropertyListT props = {}); //(acc-pf-6)
+///
+/// The next functions perform transposed 1-channel prefetch.
+/// template <typename T, int VS = 1, typename AccessorT, typename OffsetT,
+///           typename PropertyListT = empty_properties_t>
+/// void prefetch(AccessorT acc, OffsetT byte_offset, simd_mask<1> mask,
+///               PropertyListT props = {}); // (acc-pf-7)
+/// void prefetch(AccessorT acc, OffsetT byte_offset,
+///               PropertyListT props = {}); // (acc-pf-8)
+/// template <typename T, int VS = 1, typename AccessorT,
+///           typename PropertyListT = empty_properties_t>
+/// void prefetch(AccessorT acc, simd_mask<1> mask,
+///               PropertyListT props = {}); // (acc-pf-9)
+/// void prefetch(AccessorT acc, PropertyListT props = {}); // (acc-pf-10)
+///
+
+/// template <typename T, int N, int VS = 1, typename AccessorT,
+///           typename OffsetT,
+///           typename PropertyListT = empty_properties_t>
+/// void prefetch(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
+///               simd_mask<N / VS> mask,
+///               PropertyListT props = {}); // (acc-pf-1)
+/// Supported platforms: DG2, PVC only.
+/// Prefetches elements of the type 'T' from memory locations addressed
+/// by the accessor \p acc and byte offsets \p byte_offsets, to the cache.
+/// Access to any element's memory location can be disabled via the input
+/// vector of predicates \p mask. If mask[i] is unset, then the prefetch from
+/// (acc + byte_offsets[i]) is skipped.
+/// @tparam T Element type.
+/// @tparam N Number of elements to read.
+/// @tparam VS Vector size. It can also be read as the number of reads per
+/// each address. The parameter 'N' must be divisible by 'VS'.
+/// @param acc Accessor referencing the data to load.
+/// @param byte_offsets the vector of offsets in bytes. If force stateless
+/// memory is used the offsets can be up to 64 bits in size, otherwise up to
+/// 32 bits. For each i, (acc + byte_offsets[i]) must be element size aligned.
+/// @param mask The access mask.
+/// @param props The optional compile-time properties. Only cache hint
+/// properties are used.
+template <typename T, int N, int VS = 1, typename AccessorT, typename OffsetT,
+          typename PropertyListT =
+              ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_device_accessor_with_v<AccessorT,
+                                      detail::accessor_mode_cap::can_read> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+prefetch(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
+         simd_mask<N / VS> mask, PropertyListT props = {}) {
+#ifdef __ESIMD_FORCE_STATELESS_MEM
+  prefetch<T, N, VS>(detail::accessorToPointer<T>(acc), byte_offsets, mask,
+                     props);
+#else
+  static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS");
+
+  constexpr auto L1Hint =
+      detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
+          cache_hint::uncached);
+  constexpr auto L2Hint =
+      detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
+          cache_hint::cached);
+  detail::prefetch_impl<T, VS, detail::lsc_data_size::default_size, L1Hint,
+                        L2Hint>(acc, byte_offsets, mask);
+#endif // __ESIMD_FORCE_STATELESS_MEM
+}
+
+/// template <typename T, int N, int VS = 1, typename AccessorT,
+///           typename OffsetT,
+///           typename PropertyListT = empty_properties_t>
+/// void prefetch(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
+///               PropertyListT props = {}); // (acc-pf-2)
+/// Supported platforms: DG2, PVC only.
+/// Prefetches elements of the type 'T' from memory locations addressed
+/// by the accessor \p acc and byte offsets \p byte_offsets, into the cache.
+/// @tparam T Element type.
+/// @tparam N Number of elements to read.
+/// @tparam VS Vector size. It can also be read as the number of reads per
+/// each address. The parameter 'N' must be divisible by 'VS'.
+/// @param acc Accessor referencing the data to load.
+/// @param byte_offsets the vector of offsets in bytes. If force stateless
+/// memory is used the offsets can be up to 64 bits in size, otherwise up to
+/// 32 bits. For each i, (acc + byte_offsets[i]) must be element size aligned.
+/// @param props The optional compile-time properties. Only cache hint
+/// properties are used.
+template <typename T, int N, int VS = 1, typename AccessorT, typename OffsetT,
+          typename PropertyListT =
+              ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_device_accessor_with_v<AccessorT,
+                                      detail::accessor_mode_cap::can_read> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+prefetch(AccessorT acc, simd<OffsetT, N / VS> byte_offsets,
+         PropertyListT props = {}) {
+  simd_mask<N / VS> Mask = 1;
+  prefetch<T, N, VS>(acc, byte_offsets, Mask, props);
+}
+
+/// template <typename T, int N, typename AccessorT, typename OffsetT,
+///           typename PropertyListT = empty_properties_t>
+/// void prefetch(AccessorT acc, simd<OffsetT, N> byte_offsets,
+///               simd_mask<N> mask,
+///               PropertyListT props = {}); // (acc-pf-3)
+/// Supported platforms: DG2, PVC only.
+/// Prefetches elements of the type 'T' from memory locations addressed
+/// by the accessor \p acc and byte offsets \p byte_offsets, to the cache.
+/// Access to any element's memory location can be disabled via the input
+/// vector of predicates \p mask. If mask[i] is unset, then the prefetch from
+/// (acc + byte_offsets[i]) is skipped.
+/// @tparam T Element type.
+/// @tparam N Number of elements to read.
+/// @param acc Accessor referencing the data to load.
+/// @param byte_offsets the vector of offsets in bytes. If force stateless
+/// memory is used the offsets can be up to 64 bits in size, otherwise up to
+/// 32 bits. For each i, (acc + byte_offsets[i]) must be element size aligned.
+/// @param mask The access mask.
+/// @param props The optional compile-time properties. Only cache hint
+/// properties are used.
+template <typename T, int N, typename AccessorT, typename OffsetT,
+          typename PropertyListT =
+              ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_device_accessor_with_v<AccessorT,
+                                      detail::accessor_mode_cap::can_read> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+prefetch(AccessorT acc, simd<OffsetT, N> byte_offsets, simd_mask<N> mask,
+         PropertyListT props = {}) {
+  constexpr int VS = 1;
+  prefetch<T, N, VS>(acc, byte_offsets, mask, props);
+}
+
+/// template <typename T, int N, typename AccessorT, typename OffsetT,
+///           typename PropertyListT = empty_properties_t>
+/// void prefetch(AccessorT acc, simd<OffsetT, N> byte_offsets,
+///               PropertyListT props = {}); // (acc-pf-4)
+/// Supported platforms: DG2, PVC only.
+/// Prefetches elements of the type 'T' from memory locations addressed
+/// by the accessor \p acc and byte offsets \p byte_offsets, into the cache.
+/// @tparam T Element type.
+/// @tparam N Number of elements to read.
+/// @param acc Accessor referencing the data to load.
+/// @param byte_offsets the vector of offsets in bytes. If force stateless
+/// memory is used the offsets can be up to 64 bits in size, otherwise up to
+/// 32 bits. For each i, (acc + byte_offsets[i]) must be element size aligned.
+/// @param props The optional compile-time properties. Only cache hint
+/// properties are used.
+template <typename T, int N, typename AccessorT, typename OffsetT,
+          typename PropertyListT =
+              ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_device_accessor_with_v<AccessorT,
+                                      detail::accessor_mode_cap::can_read> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+prefetch(AccessorT acc, simd<OffsetT, N> byte_offsets,
+         PropertyListT props = {}) {
+  constexpr int VS = 1;
+  prefetch<T, N, VS>(acc, byte_offsets, props);
+}
+
+/// template <typename T, int N, int VS = 1, typename AccessorT,
+///           typename OffsetSimdViewT,
+///           typename PropertyListT = empty_properties_t>
+/// void prefetch(AccessorT acc, OffsetSimdViewT byte_offsets,
+///               simd_mask<N / VS> mask, PropertyListT props = {}); // (acc-pf-5)
+/// Supported platforms: DG2, PVC only.
+/// Prefetches elements of the type 'T' from memory locations
+/// addressed by the accessor \p acc and byte offsets \p byte_offsets to the
+/// cache. Access to any element's memory location can be disabled via the
+/// input vector of predicates \p mask. If mask[i] is unset, then the prefetch
+/// from (acc + byte_offsets[i]) is skipped.
+/// @tparam T Element type.
+/// @tparam N Number of elements to read.
+/// @tparam VS Vector size. It can also be read as the number of reads per
+/// each address. The parameter 'N' must be divisible by 'VS'.
+/// @param acc Accessor referencing the data to load.
+/// @param byte_offsets the vector of offsets in bytes. If force stateless
+/// memory is used the offsets can be up to 64 bits in size, otherwise up to
+/// 32 bits. For each i, (acc + byte_offsets[i]) must be element size aligned.
+/// @param mask The access mask.
+/// @param props The optional compile-time properties. Only cache hint
+/// properties are used.
+template <typename T, int N, int VS = 1, typename AccessorT,
+          typename OffsetSimdViewT,
+          typename PropertyListT =
+              ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_device_accessor_with_v<AccessorT,
+                                      detail::accessor_mode_cap::can_read> &&
+    detail::is_simd_view_type_v<OffsetSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+prefetch(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
+         PropertyListT props = {}) {
+  prefetch<T, N, VS>(acc, byte_offsets.read(), mask, props);
+}
+
+/// template <typename T, int N, int VS = 1, typename AccessorT,
+///           typename OffsetSimdViewT,
+///           typename PropertyListT = empty_properties_t>
+/// void prefetch(AccessorT acc, OffsetSimdViewT byte_offsets,
+///               PropertyListT props = {}); // (acc-pf-6)
+/// Supported platforms: DG2, PVC only.
+/// Prefetches elements of the type 'T' from memory locations
+/// addressed by the accessor \p acc and byte offsets \p byte_offsets to the
+/// cache.
+/// @tparam T Element type.
+/// @tparam N Number of elements to read.
+/// @tparam VS Vector size. It can also be read as the number of reads per
+/// each address. The parameter 'N' must be divisible by 'VS'.
+/// @param acc Accessor referencing the data to load.
+/// @param byte_offsets the vector of offsets in bytes. If force stateless
+/// memory is used the offsets can be up to 64 bits in size, otherwise up to
+/// 32 bits. For each i, (acc + byte_offsets[i]) must be element size aligned.
+/// @param props The optional compile-time properties. Only cache hint
+/// properties are used.
+template <typename T, int N, int VS = 1, typename AccessorT,
+          typename OffsetSimdViewT,
+          typename PropertyListT =
+              ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_device_accessor_with_v<AccessorT,
+                                      detail::accessor_mode_cap::can_read> &&
+    detail::is_simd_view_type_v<OffsetSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+prefetch(AccessorT acc, OffsetSimdViewT byte_offsets,
+         PropertyListT props = {}) {
+  prefetch<T, N, VS>(acc, byte_offsets.read(), props);
+}
+
+/// template <typename T, int VS = 1, typename AccessorT, typename OffsetT,
+///           typename PropertyListT = empty_properties_t>
+/// void prefetch(AccessorT acc, OffsetT byte_offset, simd_mask<1> mask,
+///               PropertyListT props = {}); // (acc-pf-7)
+/// Supported platforms: DG2, PVC only.
+/// Prefetches \p VS elements of the type 'T' from the contiguous memory
+/// block addressed by the accessor \p acc at the given \p byte_offset into
+/// the cache.
+/// @tparam T Element type.
+/// @tparam VS Vector size. It specifies the number of consecutive elements to
+/// prefetch.
+/// @param acc Accessor referencing the data to load.
+/// @param byte_offset offset from the base address.
+/// @param mask The access mask. If it is set to 0, then the prefetch is
+/// omitted.
+/// @param props The optional compile-time properties. Only cache hint
+/// properties are used.
+template <typename T, int VS = 1, typename AccessorT, typename OffsetT,
+          typename PropertyListT =
+              ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    std::is_integral_v<OffsetT> &&
+    detail::is_device_accessor_with_v<AccessorT,
+                                      detail::accessor_mode_cap::can_read> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+prefetch(AccessorT acc, OffsetT byte_offset, simd_mask<1> mask,
+         PropertyListT props = {}) {
+#ifdef __ESIMD_FORCE_STATELESS_MEM
+  prefetch<T, VS>(detail::accessorToPointer<T>(acc), byte_offset, mask, props);
+#else
+  constexpr auto L1Hint =
+      detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
+          cache_hint::uncached);
+  constexpr auto L2Hint =
+      detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
+          cache_hint::cached);
+  detail::prefetch_impl<T, VS, detail::lsc_data_size::default_size, L1Hint,
+                        L2Hint>(acc, byte_offset, mask);
+#endif // __ESIMD_FORCE_STATELESS_MEM
+}
+
+/// template <typename T, int VS = 1, typename AccessorT, typename OffsetT,
+///           typename PropertyListT = empty_properties_t>
+/// void prefetch(AccessorT acc, OffsetT byte_offset,
+///               PropertyListT props = {}); // (acc-pf-8)
+/// Supported platforms: DG2, PVC only.
+/// Prefetches \p VS elements of the type 'T' from the contiguous memory
+/// block addressed by the accessor \p acc at the given \p byte_offset into
+/// the cache.
+/// @tparam T Element type.
+/// @tparam VS Vector size. It specifies the number of consecutive elements to
+/// prefetch.
+/// @param acc Accessor referencing the data to load.
+/// @param byte_offset offset from the base address.
+/// @param props The optional compile-time properties. Only cache hint
+/// properties are used.
+template <typename T, int VS = 1, typename AccessorT, typename OffsetT,
+          typename PropertyListT =
+              ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    std::is_integral_v<OffsetT> &&
+    detail::is_device_accessor_with_v<AccessorT,
+                                      detail::accessor_mode_cap::can_read> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+prefetch(AccessorT acc, OffsetT byte_offset, PropertyListT props = {}) {
+  simd_mask<1> Mask = 1;
+  prefetch<T, VS>(acc, byte_offset, Mask, props);
+}
+
+/// template <typename T, int VS = 1, typename AccessorT,
+///           typename PropertyListT = empty_properties_t>
+/// void prefetch(AccessorT acc, simd_mask<1> mask,
+///               PropertyListT props = {}); //(acc-pf-9)
+/// Supported platforms: DG2, PVC only.
+/// Prefetches \p VS elements of the type 'T' from the contiguous memory
+/// block addressed by the accessor \p acc, starting at offset 0, into the
+/// cache.
+/// @tparam T Element type.
+/// @tparam VS Vector size. It specifies the number of consecutive elements to
+/// prefetch.
+/// @param acc Accessor referencing the data to load.
+/// @param mask The access mask. If it is set to 0, then the prefetch is
+/// omitted.
+/// @param props The optional compile-time properties. Only cache hint
+/// properties are used.
+template <typename T, int VS = 1, typename AccessorT,
+          typename PropertyListT =
+              ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_device_accessor_with_v<AccessorT,
+                                      detail::accessor_mode_cap::can_read> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+prefetch(AccessorT acc, simd_mask<1> mask, PropertyListT props = {}) {
+  prefetch<T, VS>(acc, 0, mask, props);
+}
+
+/// template <typename T, int VS = 1, typename AccessorT,
+///           typename PropertyListT = empty_properties_t>
+/// void prefetch(AccessorT acc, PropertyListT props = {}); // (acc-pf-10)
+/// Supported platforms: DG2, PVC only.
+/// Prefetches \p VS elements of the type 'T' from the contiguous memory
+/// block addressed by the accessor \p acc, starting at offset 0, into the
+/// cache.
+/// @tparam T Element type.
+/// @tparam VS Vector size. It specifies the number of consecutive elements to
+/// prefetch.
+/// @param acc Accessor referencing the data to load.
+/// @param props The optional compile-time properties. Only cache hint
+/// properties are used.
+template <typename T, int VS = 1, typename AccessorT,
+          typename PropertyListT =
+              ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_device_accessor_with_v<AccessorT,
+                                      detail::accessor_mode_cap::can_read> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+prefetch(AccessorT acc, PropertyListT props = {}) {
+  simd_mask<1> Mask = 1;
+  prefetch<T, VS>(acc, 0, Mask, props);
+}
+
 /// Variant of gather_rgba that uses local accessor as a parameter
 ///
 /// Gather and transpose pixels from the given memory locations defined by the
diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp
index 14230b3994e33..96780df8c6eb3 100644
--- a/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp
+++ b/sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp
@@ -62,34 +62,6 @@ __ESIMD_INTRIN void __esimd_raw_send_nbarrier_signal(
     __ESIMD_DNS::vector_type_t<Ty, N> msg_var, uint16_t pred = 1)
     __ESIMD_INTRIN_END;
 
-/// Surface-based prefetch gather.
-/// Supported platforms: DG2, PVC
-///
-/// Prefetches elements located at surface.
-///
-/// @tparam Ty is element type.
-/// @tparam L1H is L1 cache hint.
-/// @tparam L2H is L2 cache hint.
-/// @tparam AddressScale is the address scale.
-/// @tparam ImmOffset is the immediate offset added to each address.
-/// @tparam DS is the data size.
-/// @tparam VS is the number of elements to load per address.
-/// @tparam Transposed indicates if the data is transposed during the transfer.
-/// @tparam N is the SIMD size of operation (the number of addresses to access)
-/// @tparam SurfIndAliasTy is the \ref sycl::accessor type.
-/// @param pred is predicates.
-/// @param offsets is the zero-based offsets in bytes.
-/// @param surf_ind is the surface index.
-template <typename Ty, __ESIMD_NS::cache_hint L1H, __ESIMD_NS::cache_hint L2H,
-          uint16_t AddressScale, int ImmOffset, __ESIMD_DNS::lsc_data_size DS,
-          __ESIMD_DNS::lsc_vector_size VS,
-          __ESIMD_DNS::lsc_data_order Transposed, int N,
-          typename SurfIndAliasTy>
-__ESIMD_INTRIN void
-__esimd_lsc_prefetch_bti(__ESIMD_DNS::simd_mask_storage_t<N> pred,
-                         __ESIMD_DNS::vector_type_t<uint32_t, N> offsets,
-                         SurfIndAliasTy surf_ind) __ESIMD_INTRIN_END;
-
 /// 2D USM pointer block load.
 /// Supported platforms: PVC
 ///
diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
index fad2b2a802a09..4c50b02202468 100644
--- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
@@ -1293,30 +1293,13 @@ template <typename T, int NElts = 1,
 __ESIMD_API std::enable_if_t<__ESIMD_DNS::is_device_accessor_with_v<
     AccessorTy, __ESIMD_DNS::accessor_mode_cap::can_read>>
 lsc_prefetch(AccessorTy acc,
-#ifdef __ESIMD_FORCE_STATELESS_MEM
-             __ESIMD_NS::simd<uint64_t, N> offsets,
-#else
-             __ESIMD_NS::simd<uint32_t, N> offsets,
-#endif
+             __ESIMD_NS::simd<__ESIMD_DNS::DeviceAccessorOffsetT, N> offsets,
              __ESIMD_NS::simd_mask<N> pred = 1) {
 #ifdef __ESIMD_FORCE_STATELESS_MEM
-  return lsc_prefetch<T, NElts, DS, L1H, L2H>(
-      __ESIMD_DNS::accessorToPointer<T>(acc), offsets, pred);
+  lsc_prefetch<T, NElts, DS, L1H, L2H>(__ESIMD_DNS::accessorToPointer<T>(acc),
+                                       offsets, pred);
 #else
-  detail::check_lsc_vector_size<NElts>();
-  detail::check_lsc_data_size<T, DS>();
-  detail::check_lsc_cache_hint<detail::lsc_action::prefetch, L1H, L2H>();
-  constexpr uint16_t _AddressScale = 1;
-  constexpr int _ImmOffset = 0;
-  constexpr lsc_data_size _DS =
-      detail::expand_data_size(detail::finalize_data_size<T, DS>());
-  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
-  constexpr detail::lsc_data_order _Transposed =
-      detail::lsc_data_order::nontranspose;
-  using MsgT = typename detail::lsc_expand_type<T>::type;
-  auto si = __ESIMD_NS::get_surface_index(acc);
-  __esimd_lsc_prefetch_bti<MsgT, L1H, L2H, _AddressScale, _ImmOffset, _DS, _VS,
-                           _Transposed, N>(pred.data(), offsets.data(), si);
+  __ESIMD_DNS::prefetch_impl<T, NElts, DS, L1H, L2H>(acc, offsets, pred);
 #endif
 }
 
@@ -1362,24 +1345,8 @@ lsc_prefetch(AccessorTy acc, __ESIMD_DNS::DeviceAccessorOffsetT offset) {
   lsc_prefetch<T, NElts, DS, L1H, L2H>(
       __ESIMD_DNS::accessorToPointer<T>(acc, offset));
 #else
-  detail::check_lsc_vector_size<NElts>();
-  detail::check_lsc_data_size<T, DS>();
-  detail::check_lsc_cache_hint<detail::lsc_action::prefetch, L1H, L2H>();
-  constexpr uint16_t _AddressScale = 1;
-  constexpr int _ImmOffset = 0;
-  constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
-  static_assert(
-      _DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
-      "Transposed prefetch is supported only for data size u32 or u64");
-  constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
-  constexpr detail::lsc_data_order _Transposed =
-      detail::lsc_data_order::transpose;
-  constexpr int N = 1;
-  __ESIMD_NS::simd_mask<N> pred = 1;
-  __ESIMD_NS::simd<uint32_t, N> offsets = offset;
-  auto si = __ESIMD_NS::get_surface_index(acc);
-  __esimd_lsc_prefetch_bti<T, L1H, L2H, _AddressScale, _ImmOffset, _DS, _VS,
-                           _Transposed, N>(pred.data(), offsets.data(), si);
+  __ESIMD_NS::simd_mask<1> Mask = 1;
+  __ESIMD_DNS::prefetch_impl<T, NElts, DS, L1H, L2H>(acc, offset, Mask);
 #endif
 }
 
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/prefetch.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/prefetch.hpp
index 646dccd3a97b2..3cda02b9c0248 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/prefetch.hpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/prefetch.hpp
@@ -59,7 +59,6 @@ bool testUSM(queue Q, uint32_t MaskStride, PropertiesT) {
             << std::endl;
 
   uint16_t Size = Groups * Threads * N;
-  using Tuint = esimd_test::uint_type_t<sizeof(T)>;
 
   sycl::range<1> GlobalRange{Groups};
   sycl::range<1> LocalRange{Threads};
@@ -86,8 +85,6 @@ bool testUSM(queue Q, uint32_t MaskStride, PropertiesT) {
   for (int I = 0; I < NOffsets; I++)
     Pred[I] = (I % MaskStride == 0) ? 1 : 0;
 
-  using Tuint = esimd_test::uint_type_t<sizeof(T)>;
-
   simd<T, N> Vals;
   if constexpr (VS > 1) { // VS > 1 requires specifying <T, N, VS>
@@ -279,3 +276,251 @@ template <typename T> bool testUSM(queue Q) {
   }
   return Passed;
 }
+
+template <typename T, uint16_t N, uint16_t VS, bool UseMask,
+          bool UseProperties, typename PropertiesT>
+bool testACC(queue Q, uint32_t MaskStride, PropertiesT) {
+
+  static_assert(VS > 0 && N % VS == 0,
+                "Incorrect VS parameter. N must be divisible by VS.");
+  constexpr int NOffsets = N / VS;
+
+  uint32_t Groups = 8;
+  uint32_t Threads = 16;
+
+  std::cout << "Running case: T=" << esimd_test::type_name<T>() << ", N=" << N
+            << ", VS=" << VS << ", MaskStride=" << MaskStride
+            << ", Groups=" << Groups << ", Threads=" << Threads
+            << ", use_mask=" << UseMask << ", use_properties=" << UseProperties
+            << std::endl;
+
+  uint16_t Size = Groups * Threads * N;
+
+  sycl::range<1> GlobalRange{Groups};
+  sycl::range<1> LocalRange{Threads};
+  sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
+
+  T *Out = sycl::malloc_shared<T>(Size, Q);
+  std::memset(Out, 0, Size * sizeof(T));
+
+  T *In = sycl::malloc_shared<T>(Size * 2, Q);
+  for (int I = 0; I < Size; I++)
+    In[I] = esimd_test::getRandomValue<T>();
+
+  try {
+    buffer<T, 1> InBuf(In, Size * 2);
+    Q.submit([&](handler &CGH) {
+       accessor InAcc{InBuf, CGH};
+       CGH.parallel_for(Range, [=](sycl::nd_item<1> NDI) SYCL_ESIMD_KERNEL {
+         int GlobalID = NDI.get_global_id(0);
+         PropertiesT Props{};
+
+         uint32_t ByteOffset = GlobalID * N * sizeof(T);
+         simd<uint32_t, NOffsets> ByteOffsets(ByteOffset, VS * sizeof(T));
+         simd_view ByteOffsetsView =
+             ByteOffsets.template select<NOffsets, 1>();
+
+         simd_mask<NOffsets> Pred;
+         simd_mask<1> Pred_1 = 1;
+         for (int I = 0; I < NOffsets; I++)
+           Pred[I] = (I % MaskStride == 0) ? 1 : 0;
+
+         simd<T, N> Vals;
+         if constexpr (VS > 1) { // VS > 1 requires specifying <T, N, VS>
+           if constexpr (UseMask) {
+             if constexpr (UseProperties) {
+               if constexpr (sizeof(T) >= 4) {
+                 if (GlobalID % 4 == 0) // ByteOffset - simd
+                   prefetch<T, N, VS>(InAcc, ByteOffsets, Pred, Props);
+                 else if (GlobalID % 4 == 1)
+                   prefetch<T, N>(InAcc, Pred_1, Props);
+                 else if (GlobalID % 4 == 2)
+                   prefetch<T, N>(InAcc, ByteOffset, Pred_1, Props);
+                 else // ByteOffset - simd_view
+                   prefetch<T, N, VS>(InAcc, ByteOffsetsView, Pred, Props);
+               } else {
+                 if (GlobalID % 2 == 0) // ByteOffset - simd
+                   prefetch<T, N, VS>(InAcc, ByteOffsets, Pred, Props);
+                 else // ByteOffset - simd_view
+                   prefetch<T, N, VS>(InAcc, ByteOffsetsView, Pred, Props);
+               }
+             } else { // UseProperties is false
+               if constexpr (sizeof(T) >= 4) {
+                 if (GlobalID % 4 == 0) // ByteOffset - simd
+                   prefetch<T, N, VS>(InAcc, ByteOffsets, Pred);
+                 else if (GlobalID % 4 == 1)
+                   prefetch<T, N>(InAcc, Pred_1);
+                 else if (GlobalID % 4 == 2)
+                   prefetch<T, N>(InAcc, ByteOffset, Pred_1);
+                 else // ByteOffset - simd_view
+                   prefetch<T, N, VS>(InAcc, ByteOffsetsView, Pred);
+               } else {
+                 if (GlobalID % 2 == 0) // ByteOffset - simd
+                   prefetch<T, N, VS>(InAcc, ByteOffsets, Pred);
+                 else // ByteOffset - simd_view
+                   prefetch<T, N, VS>(InAcc, ByteOffsetsView, Pred);
+               }
+             }
+           } else { // UseMask is false
+             if constexpr (UseProperties) {
+               if constexpr (sizeof(T) >= 4) {
+                 if (GlobalID % 4 == 0) // ByteOffset - simd
+                   prefetch<T, N, VS>(InAcc, ByteOffsets, Props);
+                 else if (GlobalID % 4 == 1)
+                   prefetch<T, N>(InAcc, Props);
+                 else if (GlobalID % 4 == 2)
+                   prefetch<T, N>(InAcc, ByteOffset);
+                 else // ByteOffset - simd_view
+                   prefetch<T, N, VS>(InAcc, ByteOffsetsView, Props);
+               } else {
+                 if (GlobalID % 2 == 0) // ByteOffset - simd
+                   prefetch<T, N, VS>(InAcc, ByteOffsets, Props);
+                 else // ByteOffset - simd_view
+                   prefetch<T, N, VS>(InAcc, ByteOffsetsView, Props);
+               }
+             } else { // UseProperties is false
+               if constexpr (sizeof(T) >= 4) {
+                 if (GlobalID % 4 == 0) // ByteOffset - simd
+                   prefetch<T, N, VS>(InAcc, ByteOffsets);
+                 else if (GlobalID % 4 == 1)
+                   prefetch<T, N>(InAcc);
+                 else if (GlobalID % 4 == 2)
+                   prefetch<T, N>(InAcc, ByteOffset);
+                 else // ByteOffset - simd_view
+                   prefetch<T, N, VS>(InAcc, ByteOffsetsView);
+               } else {
+                 if (GlobalID % 2 == 0) // ByteOffset - simd
+                   prefetch<T, N, VS>(InAcc, ByteOffsets);
+                 else // ByteOffset - simd_view
+                   prefetch<T, N, VS>(InAcc, ByteOffsetsView);
+               }
+             }
+           }
+         } else {
+           // if (VS == 1) then <N, VS> can often be omitted - test that here.
+           // The C++ FE does the simd to simd_view matching.
+           if constexpr (UseMask) {
+             if constexpr (UseProperties) {
+               if constexpr (sizeof(T) >= 4) {
+                 if (GlobalID % 4 == 0) // ByteOffset - simd
+                   prefetch<T>(InAcc, ByteOffsets, Pred, Props);
+                 else if (GlobalID % 4 == 1)
+                   prefetch<T>(InAcc, Pred_1, Props);
+                 else if (GlobalID % 4 == 2)
+                   prefetch<T>(InAcc, ByteOffset, Pred_1, Props);
+                 else // ByteOffset - simd_view
+                   prefetch<T, N>(InAcc, ByteOffsetsView, Pred, Props);
+               } else {
+                 if (GlobalID % 2 == 0) // ByteOffset - simd
+                   prefetch<T>(InAcc, ByteOffsets, Pred, Props);
+                 else // ByteOffset - simd_view
+                   prefetch<T, N>(InAcc, ByteOffsetsView, Pred, Props);
+               }
+             } else { // UseProperties is false
+               if constexpr (sizeof(T) >= 4) {
+                 if (GlobalID % 4 == 0) // ByteOffset - simd
+                   prefetch<T>(InAcc, ByteOffsets, Pred);
+                 else if (GlobalID % 4 == 1)
+                   prefetch<T>(InAcc, Pred_1);
+                 else if (GlobalID % 4 == 2)
+                   prefetch<T>(InAcc, ByteOffset, Pred_1);
+                 else // ByteOffset - simd_view
+                   prefetch<T, N>(InAcc, ByteOffsetsView, Pred);
+               } else {
+                 if (GlobalID % 2 == 0) // ByteOffset - simd
+                   prefetch<T>(InAcc, ByteOffsets, Pred);
+                 else // ByteOffset - simd_view
+                   prefetch<T, N>(InAcc, ByteOffsetsView, Pred);
+               }
+             }
+           } else { // UseMask is false
+             if constexpr (UseProperties) {
+               if constexpr (sizeof(T) >= 4) {
+                 if (GlobalID % 4 == 0) // ByteOffset - simd
+                   prefetch<T>(InAcc, ByteOffsets, Props);
+                 else if (GlobalID % 4 == 1)
+                   prefetch<T>(InAcc, Props);
+                 else if (GlobalID % 4 == 2)
+                   prefetch<T>(InAcc, ByteOffset, Props);
+                 else // ByteOffset - simd_view
+                   prefetch<T, N>(InAcc, ByteOffsetsView, Props);
+               } else {
+                 if (GlobalID % 2 == 0) // ByteOffset - simd
+                   prefetch<T>(InAcc, ByteOffsets, Props);
+                 else // ByteOffset - simd_view
+                   prefetch<T, N>(InAcc, ByteOffsetsView, Props);
+               }
+             } else {
+               if constexpr (sizeof(T) >= 4) {
+                 if (GlobalID % 4 == 0) // ByteOffset - simd
+                   prefetch<T>(InAcc, ByteOffsets);
+                 else if (GlobalID % 4 == 1)
+                   prefetch<T>(InAcc);
+                 else if (GlobalID % 4 == 2)
+                   prefetch<T>(InAcc, ByteOffset);
+                 else // ByteOffset - simd_view
+                   prefetch<T, N>(InAcc, ByteOffsetsView);
+               } else {
+                 if (GlobalID % 2 == 0) // ByteOffset - simd
+                   prefetch<T>(InAcc, ByteOffsets);
+                 else // ByteOffset - simd_view
+                   prefetch<T, N>(InAcc, ByteOffsetsView);
+               }
+             }
+           }
+         } // end if (VS == 1)
+         Vals = gather<T, N, VS>(InAcc, ByteOffsets);
+         Vals.copy_to(Out + GlobalID * N);
+       });
+     }).wait();
+  } catch (sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+    sycl::free(In, Q);
+    sycl::free(Out, Q);
+    return false;
+  }
+
+  bool Passed = verify(In, Out, N, Size, VS);
+  if (!Passed)
+    std::cout << "Case FAILED" << std::endl;
+
+  sycl::free(In, Q);
+  sycl::free(Out, Q);
+  return Passed;
+}
+
+template <typename T> bool testACC(queue Q) {
+  constexpr bool UseMask = true;
+  constexpr bool UseProperties = true;
+
+  properties CacheProps{cache_hint_L1<cache_hint::streaming>,
+                        cache_hint_L2<cache_hint::cached>};
+
+  bool Passed = true;
+  Passed &= testACC<T, 1, 1, !UseMask, UseProperties>(Q, 2, CacheProps);
+  Passed &= testACC<T, 2, 1, UseMask, UseProperties>(Q, 2, CacheProps);
+  Passed &= testACC<T, 4, 1, !UseMask, UseProperties>(Q, 2, CacheProps);
+  Passed &= testACC<T, 8, 1, UseMask, UseProperties>(Q, 3, CacheProps);
+  Passed &= testACC<T, 16, 1, !UseMask, UseProperties>(Q, 3, CacheProps);
+  Passed &= testACC<T, 32, 1, UseMask, UseProperties>(Q, 3, CacheProps);
+  Passed &= testACC<T, 64, 1, !UseMask, UseProperties>(Q, 3, CacheProps);
+
+  Passed &= testACC<T, 1, 1, UseMask, !UseProperties>(Q, 2, CacheProps);
+  Passed &= testACC<T, 2, 1, !UseMask, !UseProperties>(Q, 2, CacheProps);
+  Passed &= testACC<T, 4, 1, UseMask, !UseProperties>(Q, 2, CacheProps);
+  Passed &= testACC<T, 8, 1, !UseMask, !UseProperties>(Q, 3, CacheProps);
+  Passed &= testACC<T, 16, 1, UseMask, !UseProperties>(Q, 3, CacheProps);
+  Passed &= testACC<T, 32, 1, !UseMask, !UseProperties>(Q, 2, CacheProps);
+  Passed &= testACC<T, 64, 1, UseMask, !UseProperties>(Q, 2, CacheProps);
+  Passed &= testACC<T, 64, 1, !UseMask, !UseProperties>(Q, 3, CacheProps);
+
+  // Check VS > 1. GPU supports only dwords and qwords in this mode.
+  if constexpr (sizeof(T) >= 4) {
+    Passed &= testACC<T, 16, 2, UseMask, UseProperties>(Q, 3, CacheProps);
+    Passed &= testACC<T, 32, 2, !UseMask, UseProperties>(Q, 3, CacheProps);
+    Passed &= testACC<T, 64, 2, UseMask, !UseProperties>(Q, 3, CacheProps);
+    Passed &= testACC<T, 16, 4, !UseMask, !UseProperties>(Q, 3, CacheProps);
+    Passed &= testACC<T, 32, 4, UseMask, UseProperties>(Q, 3, CacheProps);
+    Passed &= testACC<T, 64, 4, !UseMask, UseProperties>(Q, 3, CacheProps);
+  }
+  return Passed;
+}
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/prefetch_acc_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/prefetch_acc_dg2_pvc.cpp
new file mode 100644
index 0000000000000..347df1894a12d
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/prefetch_acc_dg2_pvc.cpp
@@ -0,0 +1,14 @@
+//==------- prefetch_acc_dg2_pvc.cpp - DPC++ ESIMD on-device test -------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-dg2 || gpu-intel-pvc
+// RUN: %{build} -fsycl-device-code-split=per_kernel -D__ESIMD_GATHER_SCATTER_LLVM_IR -o %t.out
+// RUN: %{run} %t.out
+// The test verifies esimd::prefetch() functions accepting accessor
+// and optional compile-time esimd::properties.
+// The prefetch() calls in this test require DG2 or PVC to run.
+#include "prefetch_acc_stateful_dg2_pvc.cpp"
\ No newline at end of file
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/prefetch_acc_stateful_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/prefetch_acc_stateful_dg2_pvc.cpp
new file mode 100644
index 0000000000000..e45fc89fe0901
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/prefetch_acc_stateful_dg2_pvc.cpp
@@ -0,0 +1,37 @@
+//==-- prefetch_acc_stateful_dg2_pvc.cpp - DPC++ ESIMD on-device test ----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-dg2 || gpu-intel-pvc
+// RUN: %{build} -fsycl-device-code-split=per_kernel -fno-sycl-esimd-force-stateless-mem -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies esimd::prefetch() functions accepting accessor
+// and optional compile-time esimd::properties.
+// The prefetch() calls in this test require DG2 or PVC to run.
+
+#include "Inputs/prefetch.hpp"
+
+int main() {
+  auto Q = queue{gpu_selector_v};
+  esimd_test::printTestLabel(Q);
+
+  bool Passed = true;
+
+  Passed &= testACC<int8_t>(Q);
+  Passed &= testACC<int16_t>(Q);
+  if (Q.get_device().has(sycl::aspect::fp16))
+    Passed &= testACC<sycl::half>(Q);
+  Passed &= testACC<uint32_t>(Q);
+  Passed &= testACC<float>(Q);
+  Passed &= testACC<int64_t>(Q);
+  Passed &= testACC<uint64_t>(Q);
+  if (Q.get_device().has(sycl::aspect::fp64))
+    Passed &= testACC<double>(Q);
+
+  std::cout << (Passed ? "Passed\n" : "FAILED\n");
+  return Passed ? 0 : 1;
+}
diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp
index e2a25c0dbdc2b..0adb42749984f 100644
--- a/sycl/test/esimd/memory_properties.cpp
+++ b/sycl/test/esimd/memory_properties.cpp
@@ -1691,4 +1691,75 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void test_prefetch(AccType &acc, float *ptrf,
   // CHECK-COUNT-2: call void @llvm.genx.lsc.prefetch.stateless.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 1, i8 2, i16 1, i32 0, i8 3, i8 7, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0)
   __ESIMD_NS::prefetch<float, 32>(ptrf, 0);
   __ESIMD_NS::prefetch<float, 32>(ptrf, 0, 1);
+
+  // Test Acc prefetch using this plan:
+  // 1) prefetch(acc, offsets): offsets is simd or simd_view
+  // 2) prefetch(acc, offsets, mask): offsets is simd or simd_view
+  // 3) prefetch(acc, offset): same as (1) above, but with offset as a scalar.
+  // 4) prefetch(acc, ...): same as (1) and (2) above, but with VS > 1.
+
+  // 1) prefetch(acc, offsets): offsets is simd or simd_view
+  // CHECK-STATEFUL-COUNT-2: call void @llvm.genx.lsc.prefetch.bti.v32i1.v32i32(<32 x i1> {{[^)]+}}, i8 0, i8 1, i8 2, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i32> {{[^)]+}}, i32 {{[^)]+}})
+  // CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 1, i8 2, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, i32 0)
+  prefetch<float, 32>(acc, ioffset_n32);
+  prefetch<float, 32>(acc, ioffset_n32_view);
+
+  // CHECK-STATEFUL-COUNT-2: call void @llvm.genx.lsc.prefetch.bti.v32i1.v32i32(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i32> {{[^)]+}}, i32 {{[^)]+}})
+  // CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, i32 0)
+  prefetch<float, 32>(acc, ioffset_n32, props_cache_load);
+  prefetch<float, 32>(acc, ioffset_n32_view, props_cache_load);
+
+  // 2) prefetch(acc, offsets, mask): offsets is simd or simd_view
+  // CHECK-STATEFUL-COUNT-2: call void @llvm.genx.lsc.prefetch.bti.v32i1.v32i32(<32 x i1> {{[^)]+}}, i8 0, i8 1, i8 2, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i32> {{[^)]+}}, i32 {{[^)]+}})
+  // CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 1, i8 2, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, i32 0)
+  prefetch<float, 32>(acc, ioffset_n32, mask_n32);
+  prefetch<float, 32>(acc, ioffset_n32_view, mask_n32);
+
+  // CHECK-STATEFUL-COUNT-2: call void @llvm.genx.lsc.prefetch.bti.v32i1.v32i32(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i32> {{[^)]+}}, i32 {{[^)]+}})
+  // CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, i32 0)
+  prefetch<float, 32>(acc, ioffset_n32, mask_n32, props_cache_load);
+  prefetch<float, 32>(acc, ioffset_n32_view, mask_n32, props_cache_load);
+
+  // 3) prefetch(acc, offset): offset is scalar
+  // CHECK-STATEFUL-COUNT-5: call void @llvm.genx.lsc.prefetch.bti.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 1, i8 2, i16 1, i32 0, i8 3, i8 1, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}})
+  // CHECK-STATELESS-COUNT-5: call void @llvm.genx.lsc.prefetch.stateless.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 1, i8 2, i16 1, i32 0, i8 3, i8 1, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0)
+  prefetch<float>(acc);
+  prefetch<float>(acc, byte_offset32);
+  prefetch<float>(acc, mask_n1);
+  prefetch<float>(acc, byte_offset32, mask_n1);
+  prefetch<float, 1>(acc, byte_offset32, mask_n1);
+
+  // CHECK-STATEFUL-COUNT-5: call void @llvm.genx.lsc.prefetch.bti.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}})
+  // CHECK-STATELESS-COUNT-5: call void @llvm.genx.lsc.prefetch.stateless.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0)
+  prefetch<float>(acc, byte_offset32, props_cache_load);
+  prefetch<float>(acc, props_cache_load);
+  prefetch<float>(acc, mask_n1, props_cache_load);
+  prefetch<float>(acc, byte_offset32, mask_n1, props_cache_load);
+  prefetch<float, 1>(acc, byte_offset32, mask_n1, props_cache_load);
+
+  // 4) prefetch(acc, ...): same as (1), (2) above, but with VS > 1.
+  // CHECK-STATEFUL-COUNT-2: call void @llvm.genx.lsc.prefetch.bti.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 0, i8 1, i8 2, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, i32 {{[^)]+}})
+  // CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 1, i8 2, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0)
+  prefetch<float, 32, 2>(acc, ioffset_n16);
+  prefetch<float, 32, 2>(acc, ioffset_n16_view);
+
+  // CHECK-STATEFUL-COUNT-2: call void @llvm.genx.lsc.prefetch.bti.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, i32 {{[^)]+}})
+  // CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0)
+  prefetch<float, 32, 2>(acc, ioffset_n16, props_cache_load);
+  prefetch<float, 32, 2>(acc, ioffset_n16_view, props_cache_load);
+
+  // CHECK-STATEFUL-COUNT-2: call void @llvm.genx.lsc.prefetch.bti.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 0, i8 1, i8 2, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, i32 {{[^)]+}})
+  // CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 1, i8 2, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0)
+  prefetch<float, 32, 2>(acc, ioffset_n16, mask_n16);
+  prefetch<float, 32, 2>(acc, ioffset_n16_view, mask_n16);
+
+  // CHECK-STATEFUL-COUNT-2: call void @llvm.genx.lsc.prefetch.bti.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, i32 {{[^)]+}})
+  // CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0)
+  prefetch<float, 32, 2>(acc, ioffset_n16, mask_n16, props_cache_load);
+  prefetch<float, 32, 2>(acc, ioffset_n16_view, mask_n16, props_cache_load);
+
+  // CHECK-STATEFUL-COUNT-2: call void @llvm.genx.lsc.prefetch.bti.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 1, i8 2, i16 1, i32 0, i8 3, i8 7, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}})
+  // CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.prefetch.stateless.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 1, i8 2, i16 1, i32 0, i8 3, i8 7, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0)
+  prefetch<float, 32>(acc, 0);
+  prefetch<float, 32>(acc, 0, 1);
 }
\ No newline at end of file
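---

For reviewers, here is a minimal usage sketch of the accessor-based `esimd::prefetch` API this patch adds. It is not part of the diff: the queue/buffer setup, the kernel shape, the `N`/`Size` constants, and the chosen cache-hint values are assumptions made for illustration; only the `prefetch` entry points (tagged `(acc-pf-N)` per the doc comments above) and `gather` come from the patched headers.

```cpp
#include <sycl/ext/intel/esimd.hpp>
#include <sycl/sycl.hpp>

#include <vector>

using namespace sycl;
using namespace sycl::ext::intel::esimd;
namespace syclex = sycl::ext::oneapi::experimental;

int main() {
  constexpr int N = 32;      // elements handled per work-item (assumed)
  constexpr int Size = 1024; // total number of floats (assumed)
  queue Q{gpu_selector_v};
  std::vector<float> Data(Size, 1.0f);
  buffer<float, 1> Buf(Data.data(), Size);

  Q.submit([&](handler &CGH) {
     accessor Acc{Buf, CGH, read_only};
     CGH.parallel_for(range<1>{Size / N}, [=](id<1> Id) SYCL_ESIMD_KERNEL {
       uint32_t Offset = static_cast<uint32_t>(Id[0]) * N * sizeof(float);
       // Optional compile-time cache hints; hint values are illustrative.
       syclex::properties Props{cache_hint_L1<cache_hint::cached>,
                                cache_hint_L2<cache_hint::cached>};
       // Block prefetch of N consecutive floats at a scalar byte offset
       // (acc-pf-8 above).
       prefetch<float, N>(Acc, Offset, Props);
       // Gather-style prefetch, one element per per-lane byte offset
       // (acc-pf-2 above).
       simd<uint32_t, N> Offsets(Offset, sizeof(float));
       prefetch<float, N>(Acc, Offsets);
       simd<float, N> Vals = gather<float, N>(Acc, Offsets);
       (void)Vals; // a real kernel would compute with Vals here
     });
   }).wait();
}
```

In stateful mode these calls should lower to the `llvm.genx.lsc.prefetch.bti.*` intrinsics checked in `memory_properties.cpp`, and to `llvm.genx.lsc.prefetch.stateless.*` when stateless memory is forced, so one source exercises both code paths added by this patch.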