From b0d92ffe66c18a97f0e6fc39df043a73f60a937d Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Wed, 7 Feb 2024 08:41:29 -0800 Subject: [PATCH 01/26] [SYCL] Split sycl::clamp implementation in two (#12637) SYCL2020, revision 8 defines two separate clamp functions - once in integer functions and once in common functions. Follow the same in the implementation so that clamp's handling is uniform with other common/integer functions. --- .../sycl/detail/builtins/common_functions.inc | 25 +++---------------- .../detail/builtins/integer_functions.inc | 9 +++++++ sycl/source/builtins/common_functions.cpp | 12 ++------- sycl/source/builtins/integer_functions.cpp | 4 +++ 4 files changed, 19 insertions(+), 31 deletions(-) diff --git a/sycl/include/sycl/detail/builtins/common_functions.inc b/sycl/include/sycl/detail/builtins/common_functions.inc index fb10964934cf0..022cab78e51db 100644 --- a/sycl/include/sycl/detail/builtins/common_functions.inc +++ b/sycl/include/sycl/detail/builtins/common_functions.inc @@ -72,32 +72,15 @@ min(T x, detail::get_elem_type_t y) { detail::simplify_if_swizzle_t{y}); } -#undef BUILTIN_COMMON - -#ifdef __SYCL_DEVICE_ONLY__ -DEVICE_IMPL_TEMPLATE(THREE_ARGS, clamp, builtin_enable_generic_t, - [](auto... xs) { - using ElemTy = detail::get_elem_type_t; - if constexpr (std::is_integral_v) { - if constexpr (std::is_signed_v) { - return __spirv_ocl_s_clamp(xs...); - } else { - return __spirv_ocl_u_clamp(xs...); - } - } else { - return __spirv_ocl_fclamp(xs...); - } - }) -#else -HOST_IMPL_TEMPLATE(THREE_ARGS, clamp, builtin_enable_generic_t, common, - default_ret_type) -#endif +BUILTIN_COMMON(THREE_ARGS, clamp, __spirv_ocl_fclamp) template -detail::builtin_enable_generic_non_scalar_t +detail::builtin_enable_common_non_scalar_t clamp(T x, detail::get_elem_type_t y, detail::get_elem_type_t z) { return clamp(detail::simplify_if_swizzle_t{x}, detail::simplify_if_swizzle_t{y}, detail::simplify_if_swizzle_t{z}); } + +#undef BUILTIN_COMMON } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/detail/builtins/integer_functions.inc b/sycl/include/sycl/detail/builtins/integer_functions.inc index dfeb815e52494..455abf07aa250 100644 --- a/sycl/include/sycl/detail/builtins/integer_functions.inc +++ b/sycl/include/sycl/detail/builtins/integer_functions.inc @@ -122,6 +122,15 @@ min(T x, detail::get_elem_type_t y) { detail::simplify_if_swizzle_t{y}); } +BUILTIN_GENINT_SU(THREE_ARGS, clamp) +template +detail::builtin_enable_integer_non_scalar_t +clamp(T x, detail::get_elem_type_t y, detail::get_elem_type_t z) { + return clamp(detail::simplify_if_swizzle_t{x}, + detail::simplify_if_swizzle_t{y}, + detail::simplify_if_swizzle_t{z}); +} + BUILTIN_GENINT(ONE_ARG, clz) BUILTIN_GENINT(ONE_ARG, ctz) BUILTIN_GENINT(ONE_ARG, popcount) diff --git a/sycl/source/builtins/common_functions.cpp b/sycl/source/builtins/common_functions.cpp index 09742649ca24d..30d0645f2b517 100644 --- a/sycl/source/builtins/common_functions.cpp +++ b/sycl/source/builtins/common_functions.cpp @@ -63,16 +63,8 @@ BUILTIN_COMMON(TWO_ARGS, max, BUILTIN_COMMON(TWO_ARGS, min, [](auto x, auto y) -> decltype(x) { return (y < x ? y : x); }) -// clamp is implemented for INTEGER_TYPES as well, so expand/inline -// BUILTIN_COMMON manually. -HOST_IMPL(clamp, [](auto x, auto y, auto z) -> decltype(x) { - using ElemTy = detail::get_elem_type_t; - if constexpr (std::is_integral_v) { - return std::min(std::max(x, y), z); - } else { - return std::fmin(std::fmax(x, y), z); - } +BUILTIN_COMMON(THREE_ARGS, clamp, [](auto x, auto y, auto z) -> decltype(x) { + return std::fmin(std::fmax(x, y), z); }) -EXPORT_SCALAR_AND_VEC_1_16(THREE_ARGS, clamp, INTEGER_TYPES, FP_TYPES) } // namespace _V1 } // namespace sycl diff --git a/sycl/source/builtins/integer_functions.cpp b/sycl/source/builtins/integer_functions.cpp index 26c4dd9a5788f..381d6f1fa0a10 100644 --- a/sycl/source/builtins/integer_functions.cpp +++ b/sycl/source/builtins/integer_functions.cpp @@ -214,6 +214,10 @@ BUILTIN_GENINT_SU(TWO_ARGS, max, BUILTIN_GENINT_SU(TWO_ARGS, min, [](auto x, auto y) -> decltype(x) { return y < x ? y : x; }) +BUILTIN_GENINT_SU(THREE_ARGS, clamp, [](auto x, auto y, auto z) -> decltype(x) { + return std::min(std::max(x, y), z); +}) + template static inline constexpr T __clz_impl(T x, T m, T n = 0) { return (x & m) ? n : __clz_impl(x, T(m >> 1), ++n); } From e53e5d599454fe3775b360da92b0f774f2ea523d Mon Sep 17 00:00:00 2001 From: John Pennycook Date: Wed, 7 Feb 2024 09:31:01 -0800 Subject: [PATCH 02/26] [SYCL][Doc] Fix broken link in forward_progress (#12652) sycl_ext_oneapi_launch_queries was originally part of sycl_ext_oneapi_forward_progress. When we split things into two extensions, we forgot to update one of the links. Signed-off-by: John Pennycook --- .../proposed/sycl_ext_oneapi_forward_progress.asciidoc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_forward_progress.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_forward_progress.asciidoc index 4c1df1640d021..227b23f1ffdac 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_forward_progress.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_forward_progress.asciidoc @@ -20,7 +20,7 @@ == Notice [%hardbreaks] -Copyright (C) 2022-2023 Intel Corporation. All rights reserved. +Copyright (C) 2022-2024 Intel Corporation. All rights reserved. Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks of The Khronos Group Inc. OpenCL(TM) is a trademark of Apple Inc. used by @@ -383,9 +383,11 @@ and work-groups to also provide concurrent forward progress guarantees). In such a case, an implementation must satisfy the strongest request(s). Devices may not be able to provide the requested forward progress guarantees -for all launch configurations. The <> defined in a -later section allow developers to identify valid launch configurations for -specific combinations of properties. +for all launch configurations. Developers should use the launch queries defined +by the +link:../proposed/sycl_ext_oneapi_launch_queries.asciidoc[sycl_ext_oneapi_launch_queries] +extension to identify valid launch configurations for specific combinations of +properties. [NOTE] ==== From 96179391391caca00b4712fae950cd75af551ff2 Mon Sep 17 00:00:00 2001 From: fineg74 <61437305+fineg74@users.noreply.github.com> Date: Wed, 7 Feb 2024 10:19:18 -0800 Subject: [PATCH 03/26] [SYCL][ESIMD]Implement slm_scatter accepting compile time properties (#12591) This implements the unified memory API for slm_scatter with local memory --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 216 +++++++++++++++-- .../ext/intel/experimental/esimd/memory.hpp | 16 +- .../unified_memory_api/Inputs/scatter.hpp | 219 ++++++++++++++++++ .../ESIMD/unified_memory_api/slm_scatter.cpp | 33 +++ .../slm_scatter_dg2_pvc.cpp | 37 +++ sycl/test/esimd/memory_properties.cpp | 56 +++++ 6 files changed, 539 insertions(+), 38 deletions(-) create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index d923821027339..7f4833017d38a 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -2685,11 +2685,13 @@ block_store(AccessorT acc, simd vals, simd_mask<1> pred, namespace detail { template ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t< - (sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16 || N == 32) && - (std::is_same_v || - is_accessor_with_v)> + std::is_same_v || + is_accessor_with_v> scatter_impl(AccessorTy acc, simd vals, simd offsets, uint32_t glob_offset, simd_mask mask) { + + static_assert(sizeof(T) <= 4 && detail::isPowerOf2(N, 32), + "Unexpected type or vector length"); constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding(); // TODO (performance) use hardware-supported scale once BE supports it constexpr int16_t scale = 0; @@ -2820,10 +2822,9 @@ gather_impl(AccessorT acc, simd byte_offsets, /// @return is a vector of type T and size N * NElts. /// template -__ESIMD_API __ESIMD_NS::simd -slm_gather_impl(__ESIMD_NS::simd offsets, - __ESIMD_NS::simd_mask pred, - __ESIMD_NS::simd pass_thru) { +__ESIMD_API simd slm_gather_impl(simd offsets, + simd_mask pred, + simd pass_thru) { check_lsc_vector_size(); check_lsc_data_size(); constexpr uint16_t AddressScale = 1; @@ -2832,9 +2833,8 @@ slm_gather_impl(__ESIMD_NS::simd offsets, constexpr lsc_vector_size LSCVS = to_lsc_vector_size(); constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; using MsgT = typename lsc_expand_type::type; - __ESIMD_NS::simd PassThruExpanded = - lsc_format_input(pass_thru); - __ESIMD_NS::simd Result = + simd PassThruExpanded = lsc_format_input(pass_thru); + simd Result = __esimd_lsc_load_merge_slm(pred.data(), offsets.data(), @@ -2842,6 +2842,37 @@ slm_gather_impl(__ESIMD_NS::simd offsets, return lsc_format_ret(Result); } +/// SLM scatter implementation. +/// Supported platforms: DG2, PVC +/// VISA instruction: lsc_store.slm +/// +/// Scatters elements located to slm. +/// +/// @tparam T is element type. +/// @tparam NElts is the number of elements to store per address. +/// @tparam DS is the data size. +/// @tparam N is the number of channels (platform dependent). +/// @param offsets is the zero-based offsets for SLM buffer in bytes. +/// @param vals is values to store. +/// @param pred is predicates. +/// +template +__ESIMD_API void slm_scatter_impl(simd offsets, + simd vals, simd_mask pred) { + check_lsc_vector_size(); + check_lsc_data_size(); + constexpr uint16_t AddressScale = 1; + constexpr int ImmOffset = 0; + constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); + constexpr lsc_vector_size LSCVS = to_lsc_vector_size(); + constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; + using MsgT = typename lsc_expand_type::type; + simd Tmp = lsc_format_input(vals); + __esimd_lsc_store_slm( + pred.data(), offsets.data(), Tmp.data()); +} + } // namespace detail /// @endcond ESIMD_DETAIL @@ -3903,7 +3934,7 @@ slm_gather(simd byte_offsets, simd_mask mask, static_assert(Alignment >= sizeof(T), "slm_gather() requires at least element-size alignment"); - if constexpr (VS > 1 || (!detail::isPowerOf2(N, 32) && + if constexpr (VS > 1 || (!(detail::isPowerOf2(N, 32) && sizeof(T) <= 4) && !detail::isMaskedGatherScatterLLVMAvailable())) { simd PassThru; // Intentionally undefined return detail::slm_gather_impl( @@ -4118,7 +4149,7 @@ slm_gather(OffsetSimdViewT byte_offsets, simd_mask mask, /// @param byte_offsets the vector of 32-bit offsets in bytes. /// For each i, (byte_offsets[i]) must be element size aligned. /// @param props The optional compile-time properties. Only 'alignment' -/// and cache hint properties are used. +/// property is used. /// @return A vector of elements read. template __ESIMD_API T slm_scalar_load(uint32_t offset) { return Res[0]; } -/// Scatter operation over the Shared Local Memory. -/// This API has almost the same interface as the @ref accessor_scatter -/// "accessor-based scatter", except that it does not have the accessor and -/// the global offset parameters. -/// -template -__ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32) && - (sizeof(T) <= 4)> -slm_scatter(simd offsets, simd vals, simd_mask mask = 1) { - detail::LocalAccessorMarker acc; - detail::scatter_impl(acc, vals, offsets, 0, mask); +/// template +/// void slm_scatter(simd byte_offsets, +/// simd vals, simd_mask mask, +/// PropertyListT props = {}); // (slm-sc-1) +/// void slm_scatter(simd byte_offsets, +/// simd vals, PropertyListT props = {}); // (slm-sc-2) +/// +/// The next 2 functions are variations of the first 2 above (slm-sc-1,2) +/// and were added only to support simd_view instead of simd for byte_offsets. +/// template +/// void slm_scatter(OffsetSimdViewT byte_offsets, +/// simd vals, simd_mask mask, +/// PropertyListT props = {}); // (slm-sc-3) +/// void slm_scatter(OffsetSimdViewT byte_offsets, +/// simd vals, PropertyListT props = {}); // (slm-sc-4) + +/// template +/// void slm_scatter(simd byte_offsets, +/// simd vals, simd_mask mask, +/// PropertyListT props = {}); // (slm-sc-1) +/// Stores ("scatters") elements of the type 'T' to Shared Local Memory +/// locations addressed by byte offsets \p byte_offsets. Storage of any element +/// can be disabled via the input vector of predicates \p mask. +/// If mask[i] is unset, then the storage to (byte_offsets[i]) is skipped. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, (byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector of values to store. +/// @param mask The access mask, defaults to all 1s. +/// @param props The optional compile-time properties. Only 'alignment' property +/// is used. +template +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +slm_scatter(simd byte_offsets, simd vals, + simd_mask mask, PropertyListT props = {}) { + static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS"); + + constexpr size_t Alignment = + detail::getPropertyValue(sizeof(T)); + static_assert(Alignment >= sizeof(T), + "slm_scatter() requires at least element-size alignment"); + + // Use LSC lowering if VS > 1. + if constexpr (VS > 1 || !(detail::isPowerOf2(N, 32) && sizeof(T) <= 4)) { + __ESIMD_DNS::slm_scatter_impl( + byte_offsets, vals, mask); + } else { + detail::LocalAccessorMarker acc; + detail::scatter_impl(acc, vals, byte_offsets, 0, mask); + } +} + +/// template +/// void slm_scatter(simd byte_offsets, simd vals, +/// PropertyListT props = {}); // (slm-sc-2) +/// Stores ("scatters") elements of the type 'T' to Shared Local Memory +/// locations addressed by byte offsets \p byte_offsets. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of reads per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors.. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, (byte_offsets[i]) must be element size aligned. +/// @param vals The vector of values to store. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +template +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +slm_scatter(simd byte_offsets, simd vals, + PropertyListT props = {}) { + simd_mask Mask = 1; + slm_scatter(byte_offsets, vals, Mask, props); +} + +/// template +/// void slm_scatter( +/// OffsetSimdViewT byte_offsets, simd vals, +/// simd_mask mask, PropertyListT props = {}); // (slm-sc-3) +/// Stores ("scatters") elements of the type 'T' to Shared Local Memory +/// locations addressed by byte offsets \p byte_offsets. +/// Storage to any element's memory location can be disabled via the +/// input vector of predicates \p mask. If mask[i] is unset, then the storage to +/// (byte_offsets[i]) is skipped. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of reads per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors.. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, (byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector of values to store. +/// @param mask The access mask, defaults to all 1s. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +template +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +slm_scatter(OffsetSimdViewT byte_offsets, simd vals, + simd_mask mask, PropertyListT props = {}) { + slm_scatter(byte_offsets.read(), vals, mask, props); +} + +/// void slm_scatter( +/// OffsetSimdViewT byte_offsets, simd vals, +/// PropertyListT props = {}); // (slm-sc-4) +/// Loads ("gathers") elements of the type 'T' from Shared Local Memory +/// locations addressed by byte offsets \p byte_offsets, and returns the loaded +/// elements. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of reads per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, (byte_offsets[i]) must be element size aligned. +/// @param vals The vector of values to store. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +template +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +slm_scatter(OffsetSimdViewT byte_offsets, simd vals, + PropertyListT props = {}) { + return slm_scatter(byte_offsets.read(), vals, props); } /// Store a scalar value into the Shared Local Memory. diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index 7fb0ae833e793..9d5054b4392ba 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -1424,21 +1424,7 @@ template offsets, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask pred = 1) { - detail::check_lsc_vector_size(); - detail::check_lsc_data_size(); - constexpr uint16_t _AddressScale = 1; - constexpr int _ImmOffset = 0; - constexpr lsc_data_size _DS = - detail::expand_data_size(detail::finalize_data_size()); - constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size(); - constexpr detail::lsc_data_order _Transposed = - detail::lsc_data_order::nontranspose; - using MsgT = typename detail::lsc_expand_type::type; - using CstT = __ESIMD_DNS::uint_type_t; - __ESIMD_NS::simd Tmp = vals.template bit_cast_view(); - __esimd_lsc_store_slm( - pred.data(), offsets.data(), Tmp.data()); + __ESIMD_DNS::slm_scatter_impl(offsets, vals, pred); } /// Transposed SLM scatter with 1 channel. diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index 755c436f0b329..8ea1fcf4a08ad 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -230,3 +230,222 @@ template bool testUSM(queue Q) { return Passed; } + +template +bool testSLM(queue Q, uint32_t MaskStride, + ScatterPropertiesT ScatterProperties) { + constexpr uint32_t Groups = 8; + constexpr uint32_t Threads = 1; + constexpr size_t Size = Groups * Threads * N; + static_assert(VS > 0 && N % VS == 0, + "Incorrect VS parameter. N must be divisible by VS."); + constexpr int NOffsets = N / VS; + using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t; + + std::cout << "SLM case: T=" << esimd_test::type_name() << ",N=" << N + << ", VS=" << VS << ",UseMask=" << UseMask + << ",UseProperties=" << UseProperties << std::endl; + + sycl::range<1> GlobalRange{Groups}; + sycl::range<1> LocalRange{Threads}; + sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange}; + + T *Out = static_cast(sycl::malloc_shared(Size * sizeof(T), Q)); + for (size_t i = 0; i < Size; i++) + Out[i] = i; + + try { + Q.submit([&](handler &cgh) { + cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { + ScatterPropertiesT Props{}; + uint16_t GlobalID = ndi.get_global_id(0); + uint16_t LocalID = ndi.get_local_id(0); + uint32_t GlobalElemOffset = GlobalID * N; + uint32_t LocalElemOffset = LocalID * N; + + constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T); + slm_init(); + + if (LocalID == 0) { + for (int I = 0; I < Threads * N; I += 8) { + simd InVec(Out + GlobalElemOffset + I); + simd Offsets(I * sizeof(T), sizeof(T)); + slm_scatter(Offsets, InVec); + } + } + barrier(); + + simd ByteOffsets(LocalElemOffset * sizeof(T), + VS * sizeof(T)); + auto ByteOffsetsView = ByteOffsets.template select(); + + simd Vals = slm_gather(ByteOffsets, Props); + Vals *= 2; + + auto ValsView = Vals.template select(); + simd_mask Pred = 0; + for (int I = 0; I < NOffsets; I++) + Pred[I] = (I % MaskStride == 0) ? 1 : 0; + if constexpr (VS > 1) { // VS > 1 requires specifying + if constexpr (UseMask) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals, Pred, Props); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals, Pred, Props); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView, Pred, Props); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView, Pred, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals, Pred); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals, Pred); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView, Pred); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView, Pred); + } + } else { // UseMask == false + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals, Props); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals, Props); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView, Props); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView); + } + } + } else { // VS == 1 + if constexpr (UseMask) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals, Pred, Props); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals, Pred, Props); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView, Pred, Props); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView, Pred, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals, Pred); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals, Pred); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView, Pred); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView, Pred); + } + } else { // UseMask == false + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals, Props); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals, Props); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView, Props); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + slm_scatter(ByteOffsets, Vals); + else if (GlobalID % 4 == 1) + slm_scatter(ByteOffsetsView, Vals); + else if (GlobalID % 4 == 2) + slm_scatter(ByteOffsets, ValsView); + else if (GlobalID % 4 == 3) + slm_scatter(ByteOffsetsView, ValsView); + } + } + } + barrier(); + if (LocalID == 0) { + for (int I = 0; I < Threads * N; I++) { + simd Offsets(I * sizeof(T), sizeof(T)); + simd OutVec = slm_gather(Offsets); + OutVec.copy_to(Out + GlobalElemOffset + I); + } + } + }); + }).wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + sycl::free(Out, Q); + return false; + } + + bool Passed = verify(Out, N, Size, VS, MaskStride, UseMask); + + sycl::free(Out, Q); + + return Passed; +} + +template bool testSLM(queue Q) { + constexpr bool CheckMask = true; + constexpr bool CheckProperties = true; + properties EmptyProps; + properties AlignElemProps{alignment}; + + bool Passed = true; + + // Test scatter() that is available on Gen12 and PVC. + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 1, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 1, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + + // // Test scatter() without passing compile-time properties argument. + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + Passed &= testSLM(Q, 2, EmptyProps); + + if constexpr (Features == TestFeatures::PVC || + Features == TestFeatures::DG2) { + properties LSCProps{alignment}; + Passed &= testSLM(Q, 2, LSCProps); + Passed &= testSLM(Q, 2, LSCProps); + Passed &= testSLM(Q, 2, LSCProps); + Passed &= testSLM(Q, 2, LSCProps); + + Passed &= testSLM(Q, 2, LSCProps); + + // Check VS > 1. GPU supports only dwords and qwords in this mode. + if constexpr (sizeof(T) >= 4) { + // TODO: This test case causes flaky fail. Enable it after the issue + // in GPU driver is fixed. + // Passed &= + // testSLM(Q, 2, AlignElemProps) + Passed &= + testSLM(Q, 2, AlignElemProps); + Passed &= + testSLM(Q, 2, AlignElemProps); + Passed &= + testSLM(Q, 2, AlignElemProps); + } + } // TestPVCFeatures + + return Passed; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp new file mode 100644 index 0000000000000..96c49fcdae8af --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp @@ -0,0 +1,33 @@ +//==------- slm_scatter.cpp - DPC++ ESIMD on-device test ---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------===// +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::slm_scatter() functions accepting +// optional compile-time esimd::properties. +// The scatter() calls in this test do not use DG2/PVC features. + +#include "Inputs/scatter.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::Generic; + bool Passed = true; + + Passed &= testSLM(Q); + Passed &= testSLM(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testSLM(Q); + Passed &= testSLM(Q); + Passed &= testSLM(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp new file mode 100644 index 0000000000000..4ebe20e376cf1 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_dg2_pvc.cpp @@ -0,0 +1,37 @@ +//==------- slm_scatter_dg2_pvc.cpp - DPC++ ESIMD on-device test--------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------===// +// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::slm_scatter() functions accepting +// optional compile-time esimd::properties. +// The slm_scatter() calls in this test use DG2 or PVC features. + +#include "Inputs/scatter.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::PVC; + bool Passed = true; + + Passed &= testSLM(Q); + Passed &= testSLM(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testSLM(Q); + Passed &= testSLM(Q); + Passed &= testSLM(Q); + Passed &= testSLM(Q); + if (Q.get_device().has(sycl::aspect::fp64)) + Passed &= testSLM(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index 3f653adb5e476..eb629935347db 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -1302,6 +1302,7 @@ test_slm_gather_scatter(int byte_offset32) { simd slm; simd pass_thru; auto pass_thru_view = pass_thru.select<32, 1>(); + auto slm_view = slm.select<32, 1>(); // Test SLM gather using this plan: // 1) slm_gather(offsets): offsets is simd or simd_view @@ -1373,4 +1374,59 @@ test_slm_gather_scatter(int byte_offset32) { props_align4); slm = slm_gather(ioffset_n16_view, mask_n16, pass_thru_view, props_align4); + + // Test SLM scatter using this plan: + // 1) slm_scatter(offsets, vals): offsets/vals is simd or simd_view + // 2) slm_scatter(offsets, vals, mask): offsets/vals is simd or simd_view + // 3) slm_scatter(...): same as (1), (2) above, but with VS > 1. + + // 1) slm_scatter(offsets): offsets is simd or simd_view + // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + slm_scatter(ioffset_n32, slm); + slm_scatter(ioffset_n32_view, slm); + slm_scatter(ioffset_n32, slm_view); + slm_scatter(ioffset_n32_view, slm_view); + + // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + slm_scatter(ioffset_n32, slm, props_align8); + slm_scatter(ioffset_n32_view, slm, props_align8); + slm_scatter(ioffset_n32, slm_view, props_align8); + slm_scatter(ioffset_n32_view, slm_view, props_align8); + + // 2) slm_gather(offsets, mask): offsets is simd or simd_view + // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + slm_scatter(ioffset_n32, slm, mask_n32); + slm_scatter(ioffset_n32_view, slm, mask_n32); + slm_scatter(ioffset_n32, slm_view, mask_n32); + slm_scatter(ioffset_n32_view, slm_view, mask_n32); + + // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + slm_scatter(ioffset_n32, slm, mask_n32, props_align8); + slm_scatter(ioffset_n32_view, slm, mask_n32, props_align8); + slm_scatter(ioffset_n32, slm_view, mask_n32, props_align8); + slm_scatter(ioffset_n32_view, slm_view, mask_n32, props_align8); + + // 4) slm_gather(...): same as (1), (2), above, but with VS > 1. + // CHECK-COUNT-16: call void @llvm.genx.lsc.store.slm.v16i1.v16i32.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, <32 x i32>{{[^)]+}}, i32 0) + // 4a) check VS > 1. no 'mask' operand first. + slm_scatter(ioffset_n16, slm); + slm_scatter(ioffset_n16_view, slm); + slm_scatter(ioffset_n16, slm_view); + slm_scatter(ioffset_n16_view, slm_view); + + slm_scatter(ioffset_n16, slm, props_align4); + slm_scatter(ioffset_n16_view, slm, props_align4); + slm_scatter(ioffset_n16, slm_view, props_align4); + slm_scatter(ioffset_n16_view, slm_view, props_align4); + + // 4b) check VS > 1. Pass the 'mask' operand this time. + slm_scatter(ioffset_n16, slm, mask_n16); + slm_scatter(ioffset_n16_view, slm, mask_n16); + slm_scatter(ioffset_n16, slm_view, mask_n16); + slm_scatter(ioffset_n16_view, slm_view, mask_n16); + + slm_scatter(ioffset_n16, slm, mask_n16, props_align4); + slm_scatter(ioffset_n16_view, slm, mask_n16, props_align4); + slm_scatter(ioffset_n16, slm_view, mask_n16, props_align4); + slm_scatter(ioffset_n16_view, slm_view, mask_n16, props_align4); } From 557df1aa57c810732f1635abb85045f407c5d0f3 Mon Sep 17 00:00:00 2001 From: Nikita Kornev Date: Wed, 7 Feb 2024 19:26:21 +0100 Subject: [PATCH 04/26] [SYCL][DOC] Extend sycl_ext_oneapi_root_group (#12643) Adding new enum "execution_scope" and few member functions --------- Co-authored-by: John Pennycook --- .../sycl_ext_oneapi_root_group.asciidoc | 96 ++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc index 0ef2126d86fa1..44eb1b376d113 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc +++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_root_group.asciidoc @@ -182,7 +182,8 @@ inline constexpr use_root_sync_key::value_t use_root_sync; === The `root_group` class The `root_group` class implements all member functions common to the -`sycl::group` and `sycl::sub_group` classes. +`sycl::group` and `sycl::sub_group` classes and also contains own +additional functions. [source,c++] ---- @@ -191,6 +192,13 @@ namespace ext { namespace oneapi { namespace experimental { +enum class execution_scope { + work_item, + sub_group, + work_group, + root_group, +}; + template class root_group { public: @@ -221,6 +229,31 @@ public: bool leader() const; + template + std::enable_if_t<(Scope == execution_scope::work_item || + Scope == execution_scope::work_group), + id> + get_id() const; + + template + std::enable_if_t> get_id() const; + + template + size_t get_linear_id() const; + + template + std::enable_if_t<(Scope == execution_scope::work_item || + Scope == execution_scope::work_group), + range> + get_range() const; + + template + std::enable_if_t> + get_range() const; + + template + size_t get_linear_range() const; + }; } // namespace experimental @@ -307,6 +340,67 @@ work-item is the leader of the root-group, and `false` for all other work-items in the root-group. The leader of the root-group is guaranteed to be the work-item for which `get_local_id()` returns 0. +[source,c++] +---- +template +std::enable_if_t<(Scope == execution_scope::work_item || + Scope == execution_scope::work_group), + id> +get_id() const; +---- +_Returns_: An `id` representing the index of the current work-group or work-item at `Scope` +hierarchy level within the `root_group` object. + +[source,c++] +---- +template +std::enable_if_t> get_id() const; +---- +_Returns_: An `id` representing the index of the current sub-group within the +`root_group` object. + +[source,c++] +---- +template +size_t get_linear_id() const; +---- +_Constraints_: `Scope` must be narrower than +`execution_scope::root_group`. + +_Returns_: A linearized number of the current work-group or work-item at `Scope` hierarchy +level within the `root_group` object. + +[source,c++] +---- +template +std::enable_if_t<(Scope == execution_scope::work_item || + Scope == execution_scope::work_group), + range> +get_range() const; +---- +_Returns_: A `range` representing the number of work-groups or work-items of `Scope` +hierarchy level within the `root_group` object. + +[source,c++] +---- +template +std::enable_if_t> +get_range() const; +---- +_Returns_: A `range` representing the number of sub-groups within the `root_group` +object. + +[source,c++] +---- +template +size_t get_linear_range() const; +---- +_Constraints_: `Scope` must be narrower than +`execution_scope::root_group`. + +_Returns_: The number of work-groups or work-items of `Scope` hierarchy level within the +`root_group` object. + === Using a `root_group` From 7bacc52cd2e14955cdd80784aa9d6961c39d6cfc Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Wed, 7 Feb 2024 20:06:14 +0000 Subject: [PATCH 05/26] [SYCL][ESIMD][E2E] Fix aot_mixed.cpp (#12650) Arg was wrong. Manually tested this. CI is current broken because of this, Signed-off-by: Nick Sarnie --- sycl/test-e2e/ESIMD/aot_mixed.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/ESIMD/aot_mixed.cpp b/sycl/test-e2e/ESIMD/aot_mixed.cpp index 32c10969ee51b..5204d413e69ad 100644 --- a/sycl/test-e2e/ESIMD/aot_mixed.cpp +++ b/sycl/test-e2e/ESIMD/aot_mixed.cpp @@ -7,9 +7,9 @@ //===----------------------------------------------------------------------===// // TODO: Enable on other GPUs once internal ticket is fixed // REQUIRES: ocloc && gpu-intel-gen12 -// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen -Xs "-device tgllp" -o %t.sycl.out -DENABLE_SYCL=0 %s +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device tgllp" -o %t.sycl.out -DENABLE_SYCL=0 %s // RUN: %{run} %t.sycl.out -// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen -Xs "-device tgllp" -o %t.out %s +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device tgllp" -o %t.out %s // RUN: %{run} %t.out // This test checks the following ESIMD ahead-of-time compilation scenarios: From 5f1d98a6e9da8065e8c211479113385ff16cd71b Mon Sep 17 00:00:00 2001 From: Vyacheslav Klochkov Date: Wed, 7 Feb 2024 15:04:47 -0600 Subject: [PATCH 06/26] [ESIMD][E2E] Enable few cases in atomic_update after gpu driver update (#12639) Signed-off-by: Klochkov, Vyacheslav N --- .../Inputs/atomic_update.hpp | 16 ++++---------- .../Inputs/atomic_update_slm.hpp | 22 +++---------------- 2 files changed, 7 insertions(+), 31 deletions(-) diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp index 3da82ee8e15d1..ef8646ae4892a 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp @@ -715,13 +715,10 @@ bool test_int_types_and_sizes(queue q, const Config &cfg) { q, cfg); passed &= test_int_types<64, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>( q, cfg); - // non power of two values are supported only in newer driver. - // TODO: Enable this when the new driver reaches test infrastructure - // (v27556). -#if 0 - passed &= test_int_types<12, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg); - passed &= test_int_types<33, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg); -#endif + passed &= test_int_types<12, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>( + q, cfg); + passed &= test_int_types<33, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>( + q, cfg); } return passed; @@ -742,13 +739,8 @@ bool test_fp_types_and_sizes(queue q, const Config &cfg) { passed &= test_fp_types<32, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg); passed &= test_fp_types<64, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg); - // non power of two values are supported only in newer driver. - // TODO: Enable this when the new driver reaches test infrastructure - // (v27556). -#if 0 passed &= test_fp_types<12, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg); passed &= test_fp_types<35, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg); -#endif } return passed; } diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp index 92f000711e137..6b69d99158cb6 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp @@ -612,14 +612,9 @@ bool test_fp_types(queue q) { if constexpr (Features == TestFeatures::DG2 || Features == TestFeatures::PVC) { - // TODO: fmin/fmax/fcmpxchg for double requires a newer GPU driver. - if constexpr (!std::is_same_v, ImplLSCFmax> && - !std::is_same_v, ImplLSCFmin> && - !std::is_same_v, ImplLSCFcmpwr>) { - if (q.get_device().has(sycl::aspect::atomic64) && - q.get_device().has(sycl::aspect::fp64)) { - passed &= run_test(q); - } + if (q.get_device().has(sycl::aspect::atomic64) && + q.get_device().has(sycl::aspect::fp64)) { + passed &= run_test(q); } } return passed; @@ -633,7 +628,6 @@ bool test_int_types_and_sizes(queue q) { passed &= test_int_types<2, Op, UseMask, Features, UseAcc, SignMask>(q); passed &= test_int_types<4, Op, UseMask, Features, UseAcc, SignMask>(q); passed &= test_int_types<8, Op, UseMask, Features, UseAcc, SignMask>(q); - // TODO: N=16 and N=32 does not pass on Gen12 with mask due to older driver. if (UseMask && Features == TestFeatures::Generic && esimd_test::isGPUDriverGE(q, esimd_test::GPUDriverOS::LinuxAndWindows, "26918", "101.4953", false)) { @@ -645,13 +639,8 @@ bool test_int_types_and_sizes(queue q) { if constexpr (Features == TestFeatures::DG2 || Features == TestFeatures::PVC) { passed &= test_int_types<64, Op, UseMask, Features, UseAcc, SignMask>(q); - // non power of two values are supported only in newer driver. - // TODO: Enable this when the new driver reaches test infrastructure - // (v27556). -#if 0 passed &= test_int_types<12, Op, UseMask, Features, UseAcc, SignMask>(q); passed &= test_int_types<33, Op, UseMask, Features, UseAcc, SignMask>(q); -#endif } return passed; @@ -672,13 +661,8 @@ bool test_fp_types_and_sizes(queue q) { if constexpr (Features == TestFeatures::DG2 || Features == TestFeatures::PVC) { passed &= test_fp_types<64, Op, UseMask, Features, UseAcc>(q); - // non power of two values are supported only in newer driver. - // TODO: Enable this when the new driver reaches test infrastructure - // (v27556). -#if 0 passed &= test_fp_types<33, Op, UseMask, Features, UseAcc>(q); passed &= test_fp_types<65, Op, UseMask, Features, UseAcc>(q); -#endif } return passed; } From 8185f30da27161289487b664a3bb1df37ce647f9 Mon Sep 17 00:00:00 2001 From: lbushi25 <113361374+lbushi25@users.noreply.github.com> Date: Thu, 8 Feb 2024 03:32:34 -0500 Subject: [PATCH 07/26] Rewrite tests that fail when usm_shared_allocations not supported. (#12636) Nearly 40 E2E tests use malloc_shared and do not expect an exception to be thrown in case of failure which is not what the spec says, namely, malloc_shared should throw when the usm_shared_allocations aspect is not supported by the device. At the moment malloc_shared simply returns nullptr, however, once the implementation of malloc_shared is changed to throw an exception these tests will produce failures. This PR is the first of a series of PR's to rewrite these tests so that they don't fail once malloc_shared is changed to conform to the spec. These changes will only affect devices that do not support the relevant aspect. This is done either: - by having llvm-lit require that the aspect is supported for tests that rely heavily on malloc_shared, - by disabling, through if statements, sections of code that use malloc_shared for tests that use it sparsely, - by completely rewriting the malloc_shared code with equivalent buffer/accessor code that is guaranteed to work on all platforms. The latter approach is taken on tests that do not have the purpose of testing malloc_shared but simply use it to allocate memory for the test. These changes aim to eliminate test failures while maintaining test coverage. Subsequent PR's will omit this description and refer to this PR instead. --- .../Annotated_arg_ptr/annotated_arg.cpp | 2 +- .../Annotated_arg_ptr/annotated_ptr.cpp | 2 +- .../Annotated_usm/annotated_usm_kind.cpp | 35 +++--- sycl/test-e2e/Basic/group_local_memory.cpp | 26 ++-- sycl/test-e2e/Basic/large-range.cpp | 46 +++---- sycl/test-e2e/Basic/span.cpp | 2 +- sycl/test-e2e/Basic/wrapped_usm_pointers.cpp | 2 +- .../Complex/sycl_complex_math_test.cpp | 112 ++++++++---------- .../Complex/sycl_complex_operator_test.cpp | 79 ++++++------ .../Complex/sycl_complex_pow_test.cpp | 65 +++++----- .../Complex/sycl_complex_stream_test.cpp | 8 +- 11 files changed, 185 insertions(+), 194 deletions(-) diff --git a/sycl/test-e2e/Annotated_arg_ptr/annotated_arg.cpp b/sycl/test-e2e/Annotated_arg_ptr/annotated_arg.cpp index 8fad623752873..ea572bfc4c5d7 100644 --- a/sycl/test-e2e/Annotated_arg_ptr/annotated_arg.cpp +++ b/sycl/test-e2e/Annotated_arg_ptr/annotated_arg.cpp @@ -1,6 +1,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// +// REQUIRES: aspect-usm_shared_allocations #include "common.hpp" diff --git a/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr.cpp b/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr.cpp index 7e4a35d09b994..c478fd96afb34 100644 --- a/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr.cpp +++ b/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr.cpp @@ -1,6 +1,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// +// REQUIRES: aspect-usm_shared_allocations #include "common.hpp" diff --git a/sycl/test-e2e/Annotated_usm/annotated_usm_kind.cpp b/sycl/test-e2e/Annotated_usm/annotated_usm_kind.cpp index 812a0bb115737..8be1db398a1a1 100644 --- a/sycl/test-e2e/Annotated_usm/annotated_usm_kind.cpp +++ b/sycl/test-e2e/Annotated_usm/annotated_usm_kind.cpp @@ -124,26 +124,27 @@ template void testUsmKind(sycl::queue &q) { [&]() { return TAnnotated(dev, Ctx, properties{usm_kind_host}); }, [&]() { return ATHost(1, q); }, [&]() { return ATHost(1, Ctx); }, [&]() { return ATAnnotated(1, dev, Ctx, alloc::host); }}); - - CheckUsmKindAll( - alloc::shared, - std::tuple{ - [&]() { return MShared(q); }, [&]() { return MShared(dev, Ctx); }, - [&]() { return MAnnotated(dev, Ctx, alloc::shared); }, - [&]() { return MAnnotated(dev, Ctx, properties{usm_kind_shared}); }, - [&]() { return AShared(1, q); }, - [&]() { return AShared(1, dev, Ctx); }, - [&]() { return AAnnotated(1, dev, Ctx, alloc::shared); }, - [&]() { return TShared(q); }, [&]() { return TShared(dev, Ctx); }, - [&]() { return TAnnotated(dev, Ctx, alloc::shared); }, - [&]() { return TAnnotated(dev, Ctx, properties{usm_kind_shared}); }, - [&]() { return ATShared(1, q); }, - [&]() { return ATShared(1, dev, Ctx); }, - [&]() { return ATAnnotated(1, dev, Ctx, alloc::shared); }}); + if (dev.has(sycl::aspect::usm_shared_allocations)) { + CheckUsmKindAll( + alloc::shared, + std::tuple{ + [&]() { return MShared(q); }, [&]() { return MShared(dev, Ctx); }, + [&]() { return MAnnotated(dev, Ctx, alloc::shared); }, + [&]() { return MAnnotated(dev, Ctx, properties{usm_kind_shared}); }, + [&]() { return AShared(1, q); }, + [&]() { return AShared(1, dev, Ctx); }, + [&]() { return AAnnotated(1, dev, Ctx, alloc::shared); }, + [&]() { return TShared(q); }, [&]() { return TShared(dev, Ctx); }, + [&]() { return TAnnotated(dev, Ctx, alloc::shared); }, + [&]() { return TAnnotated(dev, Ctx, properties{usm_kind_shared}); }, + [&]() { return ATShared(1, q); }, + [&]() { return ATShared(1, dev, Ctx); }, + [&]() { return ATAnnotated(1, dev, Ctx, alloc::shared); }}); + } } int main() { sycl::queue q; testUsmKind(q); return 0; -} \ No newline at end of file +} diff --git a/sycl/test-e2e/Basic/group_local_memory.cpp b/sycl/test-e2e/Basic/group_local_memory.cpp index 8185378071c75..b5bb26917ec0d 100644 --- a/sycl/test-e2e/Basic/group_local_memory.cpp +++ b/sycl/test-e2e/Basic/group_local_memory.cpp @@ -7,17 +7,21 @@ constexpr int N = 5; int main() { sycl::queue q; - int *ptr = sycl::malloc_shared(N, q); - q.parallel_for(sycl::nd_range<1>{1, 1}, [=](sycl::nd_item<1> it) { - auto g = it.get_group(); - auto mem = sycl::ext::oneapi::group_local_memory(g, 1, 2, 3, 4, 5); - auto ref = *mem; - for (int i = 0; i < N; ++i) { - ptr[i] = ref[i]; - } - }).wait(); + sycl::buffer buf{sycl::range{N}}; + q.submit([&](sycl::handler &h) { + sycl::accessor acc{buf, h}; + h.parallel_for(sycl::nd_range<1>{1, 1}, [=](sycl::nd_item<1> it) { + auto g = it.get_group(); + auto mem = + sycl::ext::oneapi::group_local_memory(g, 1, 2, 3, 4, 5); + auto ref = *mem; + for (int i = 0; i < N; ++i) { + acc[i] = ref[i]; + } + }); + }); + sycl::host_accessor result{buf}; for (int i = 0; i < N; ++i) { - assert(ptr[i] == (i + 1)); + assert(result[i] == (i + 1)); } - sycl::free(ptr, q); } diff --git a/sycl/test-e2e/Basic/large-range.cpp b/sycl/test-e2e/Basic/large-range.cpp index 354276065a739..f2c38cbb5dd8d 100644 --- a/sycl/test-e2e/Basic/large-range.cpp +++ b/sycl/test-e2e/Basic/large-range.cpp @@ -26,15 +26,18 @@ void check_sum(std::string_view desc, const ContainerT &data, size_t N) { template void test_regular(std::string_view desc, queue &q, size_t B, RangeT range) { auto N = range.size(); - std::vector accumulators_v(B, 0, usm_allocator(q)); - auto *accumulators = accumulators_v.data(); - - q.parallel_for(range, [=](auto it) { - atomic_ref ref( - accumulators[it.get_linear_id() % B]); - ++ref; - }).wait(); - + std::vector accumulators_v(B, 0); + { + sycl::buffer accumulator_buf{accumulators_v}; + q.submit([&](sycl::handler &h) { + sycl::accessor accumulators{accumulator_buf, h}; + h.parallel_for(range, [=](auto it) { + atomic_ref ref( + accumulators[it.get_linear_id() % B]); + ++ref; + }); + }); + } // destruction of accumulator_buf here writes back data to accumulators_v check_sum(desc, accumulators_v, N); } @@ -42,18 +45,19 @@ template void test_spec_constant(std::string_view desc, queue &q, size_t B, RangeT range) { auto N = range.size(); - std::vector accumulators_v(B, 0, usm_allocator(q)); - auto *accumulators = accumulators_v.data(); - - q.submit([&](handler &cgh) { - cgh.set_specialization_constant(2); - cgh.parallel_for(range, [=](auto it, kernel_handler h) { - atomic_ref ref( - accumulators[it.get_linear_id() % B]); - ref += h.get_specialization_constant(); - }); - }).wait(); - + std::vector accumulators_v(B, 0); + { + sycl::buffer accumulators_buf{accumulators_v}; + q.submit([&](handler &cgh) { + sycl::accessor accumulators{accumulators_buf, cgh}; + cgh.set_specialization_constant(2); + cgh.parallel_for(range, [=](auto it, kernel_handler h) { + atomic_ref ref( + accumulators[it.get_linear_id() % B]); + ref += h.get_specialization_constant(); + }); + }); + } // destruction of accumulators_buf here writes data back to accumulators_v check_sum(desc, accumulators_v, N * 2); } diff --git a/sycl/test-e2e/Basic/span.cpp b/sycl/test-e2e/Basic/span.cpp index 43d16ff4d16d6..ec341d39d0eac 100644 --- a/sycl/test-e2e/Basic/span.cpp +++ b/sycl/test-e2e/Basic/span.cpp @@ -3,7 +3,7 @@ // // Fails to release USM pointer on HIP for NVIDIA // XFAIL: hip_nvidia - +// REQUIRES: aspect-usm_shared_allocations #include #include diff --git a/sycl/test-e2e/Basic/wrapped_usm_pointers.cpp b/sycl/test-e2e/Basic/wrapped_usm_pointers.cpp index 7b81493db6dce..d2fabb6f7e967 100644 --- a/sycl/test-e2e/Basic/wrapped_usm_pointers.cpp +++ b/sycl/test-e2e/Basic/wrapped_usm_pointers.cpp @@ -1,6 +1,6 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out - +// REQUIRES: aspect-usm_shared_allocations //==---------- wrapped_usm_pointer.cpp - test pointers in struct ---------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. diff --git a/sycl/test-e2e/Complex/sycl_complex_math_test.cpp b/sycl/test-e2e/Complex/sycl_complex_math_test.cpp index 9bb5122ffc7f7..1178bdf0b5ffe 100644 --- a/sycl/test-e2e/Complex/sycl_complex_math_test.cpp +++ b/sycl/test-e2e/Complex/sycl_complex_math_test.cpp @@ -13,31 +13,26 @@ bool operator()(sycl::queue &Q, cmplx init, \ cmplx ref = cmplx(0, 0), bool use_ref = false) { \ bool pass = true; \ - \ auto std_in = init_std_complex(init.re, init.im); \ experimental::complex cplx_input{init.re, init.im}; \ - \ - auto *cplx_out = sycl::malloc_shared>(1, Q); \ - \ + sycl::buffer> cplx_out_buf{sycl::range{1}}; \ /*Get std::complex output*/ \ std::complex std_out{ref.re, ref.im}; \ if (!use_ref) \ std_out = std::math_func(std_in); \ - \ /*Check cplx::complex output from device*/ \ - Q.single_task([=]() { \ - cplx_out[0] = experimental::math_func(cplx_input); \ - }).wait(); \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); \ + Q.submit([&](sycl::handler &h) { \ + sycl::accessor cplx_out{cplx_out_buf, h}; \ + h.single_task( \ + [=]() { cplx_out[0] = experimental::math_func(cplx_input); }); \ + }); \ + sycl::host_accessor cplx_out_acc{cplx_out_buf}; \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); \ \ /*Check cplx::complex output from host*/ \ - cplx_out[0] = experimental::math_func(cplx_input); \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); \ - \ - sycl::free(cplx_out, Q); \ + cplx_out_acc[0] = experimental::math_func(cplx_input); \ \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); \ return pass; \ } \ }; @@ -73,8 +68,7 @@ TEST_MATH_OP_TYPE(tanh) \ auto std_in = init_std_complex(init.re, init.im); \ experimental::complex cplx_input{init.re, init.im}; \ - \ - auto *cplx_out = sycl::malloc_shared(1, Q); \ + sycl::buffer cplx_out_buf{sycl::range{1}}; \ \ /*Get std::complex output*/ \ T std_out = ref.re; \ @@ -82,19 +76,18 @@ TEST_MATH_OP_TYPE(tanh) std_out = std::math_func(std_in); \ \ /*Check cplx::complex output from device*/ \ - Q.single_task([=]() { \ - cplx_out[0] = experimental::math_func(cplx_input); \ - }).wait(); \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); \ + Q.submit([&](sycl::handler &h) { \ + sycl::accessor cplx_out{cplx_out_buf, h}; \ + h.single_task( \ + [=]() { cplx_out[0] = experimental::math_func(cplx_input); }); \ + }); \ + sycl::host_accessor cplx_out_acc{cplx_out_buf}; \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); \ \ /*Check cplx::complex output from host*/ \ - cplx_out[0] = experimental::math_func(cplx_input); \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); \ - \ - sycl::free(cplx_out, Q); \ + cplx_out_acc[0] = experimental::math_func(cplx_input); \ \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); \ return pass; \ } \ }; @@ -121,23 +114,21 @@ TEST_MATH_OP_TYPE(imag) std::complex std_out = ref; \ if (!use_ref) \ std_out = std::math_func(std_in); \ - \ - auto *cplx_out = sycl::malloc_shared>(1, Q); \ - \ + sycl::buffer> cplx_out_buf{sycl::range{1}}; \ /*Check cplx::complex output from device*/ \ - Q.single_task([=]() { \ - cplx_out[0] = experimental::math_func(std_in); \ - }).wait(); \ + Q.submit([&](sycl::handler &h) { \ + sycl::accessor cplx_out{cplx_out_buf, h}; \ + h.single_task( \ + [=]() { cplx_out[0] = experimental::math_func(std_in); }); \ + }); \ + sycl::host_accessor cplx_out_acc{cplx_out_buf}; \ \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); \ \ /*Check cplx::complex output from host*/ \ - cplx_out[0] = experimental::math_func(std_in); \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); \ - \ - sycl::free(cplx_out, Q); \ + cplx_out_acc[0] = experimental::math_func(std_in); \ \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); \ return pass; \ } \ }; @@ -161,23 +152,21 @@ TEST_MATH_OP_TYPE(proj) T std_out = ref; \ if (!use_ref) \ std_out = std::math_func(std_in); \ - \ - auto *cplx_out = sycl::malloc_shared(1, Q); \ - \ + sycl::buffer cplx_out_buf{sycl::range{1}}; \ /*Check cplx::complex output from device*/ \ - Q.single_task([=]() { \ - cplx_out[0] = experimental::math_func(init); \ - }).wait(); \ + Q.submit([&](sycl::handler &h) { \ + sycl::accessor cplx_out{cplx_out_buf, h}; \ + h.single_task( \ + [=]() { cplx_out[0] = experimental::math_func(std_in); }); \ + }); \ + sycl::host_accessor cplx_out_acc{cplx_out_buf}; \ \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); \ \ /*Check cplx::complex output from host*/ \ - cplx_out[0] = experimental::math_func(init); \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); \ - \ - sycl::free(cplx_out, Q); \ + cplx_out_acc[0] = experimental::math_func(init); \ \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); \ return pass; \ } \ }; @@ -197,26 +186,25 @@ template struct test_polar { bool use_ref = false) { bool pass = true; - auto *cplx_out = sycl::malloc_shared>(1, Q); - + sycl::buffer> cplx_out_buf{sycl::range(1)}; /*Get std::complex output*/ std::complex std_out{ref.re, ref.im}; if (!use_ref) std_out = std::polar(init.re, init.im); /*Check cplx::complex output from device*/ - Q.single_task([=]() { - cplx_out[0] = experimental::polar(init.re, init.im); - }).wait(); - - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); + Q.submit([&](sycl::handler &h) { + sycl::accessor cplx_out{cplx_out_buf, h}; + h.single_task( + [=]() { cplx_out[0] = experimental::polar(init.re, init.im); }); + }); + sycl::host_accessor cplx_out_acc{cplx_out_buf}; + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); /*Check cplx::complex output from host*/ - cplx_out[0] = experimental::polar(init.re, init.im); - - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); + cplx_out_acc[0] = experimental::polar(init.re, init.im); - sycl::free(cplx_out, Q); + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); return pass; } diff --git a/sycl/test-e2e/Complex/sycl_complex_operator_test.cpp b/sycl/test-e2e/Complex/sycl_complex_operator_test.cpp index ed68332f60bdd..8894f5d965462 100644 --- a/sycl/test-e2e/Complex/sycl_complex_operator_test.cpp +++ b/sycl/test-e2e/Complex/sycl_complex_operator_test.cpp @@ -15,23 +15,19 @@ experimental::complex cplx_input1{init_re1, init_im1}; \ experimental::complex cplx_input2{init_re2, init_im2}; \ \ - auto *cplx_out = sycl::malloc_shared>(1, Q); \ - \ + sycl::buffer> cplx_out_buf{sycl::range{1}}; \ std::complex std_out; \ std_out = std_in1 op std_in2; \ + Q.submit([&](sycl::handler &h) { \ + sycl::accessor cplx_out{cplx_out_buf, h}; \ + h.single_task([=]() { cplx_out[0] = cplx_input1 op cplx_input2; }); \ + }); \ + sycl::host_accessor cplx_out_acc{cplx_out_buf}; \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); \ \ - Q.single_task([=]() { \ - cplx_out[0] = cplx_input1 op cplx_input2; \ - }).wait(); \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); \ - \ - cplx_out[0] = cplx_input1 op cplx_input2; \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); \ - \ - sycl::free(cplx_out, Q); \ + cplx_out_acc[0] = cplx_input1 op cplx_input2; \ \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); \ return pass; \ } \ }; @@ -53,28 +49,32 @@ test_op(test_div, /); experimental::complex cplx_input{init_re1, init_im1}; \ \ auto std_inout = init_std_complex(init_re2, init_im2); \ - auto *cplx_inout = sycl::malloc_shared>(1, Q); \ - cplx_inout[0].real(init_re2); \ - cplx_inout[0].imag(init_im2); \ - \ + experimental::complex cplx; \ + cplx.real(init_re2); \ + cplx.imag(init_im2); \ + sycl::buffer> cplx_out_buf{&cplx, \ + sycl::range{1}}; \ std_inout op_assign std_in; \ + Q.submit([&](sycl::handler &h) { \ + sycl::accessor cplx_out{cplx_out_buf, h}; \ + h.single_task([=]() { cplx_out[0] op_assign cplx_input; }); \ + }); \ + sycl::host_accessor cplx_out_acc{cplx_out_buf}; \ \ - Q.single_task([=]() { cplx_inout[0] op_assign cplx_input; }).wait(); \ + pass &= \ + check_results(cplx_out_acc[0], \ + std::complex(std_inout.real(), std_inout.imag()), \ + /*is_device*/ true); \ \ - pass &= check_results( \ - cplx_inout[0], std::complex(std_inout.real(), std_inout.imag()), \ - /*is_device*/ true); \ + cplx_out_acc[0].real(init_re2); \ + cplx_out_acc[0].imag(init_im2); \ \ - cplx_inout[0].real(init_re2); \ - cplx_inout[0].imag(init_im2); \ + cplx_out_acc[0] op_assign cplx_input; \ \ - cplx_inout[0] op_assign cplx_input; \ - \ - pass &= check_results( \ - cplx_inout[0], std::complex(std_inout.real(), std_inout.imag()), \ - /*is_device*/ false); \ - \ - sycl::free(cplx_inout, Q); \ + pass &= \ + check_results(cplx_out_acc[0], \ + std::complex(std_inout.real(), std_inout.imag()), \ + /*is_device*/ false); \ \ return pass; \ } \ @@ -99,19 +99,18 @@ test_op_assign(test_div_assign, /=); experimental::complex cplx_input{init_re1, init_im1}; \ \ std::complex std_out{}; \ - auto *cplx_out = sycl::malloc_shared>(1, Q); \ - \ + sycl::buffer> cplx_out_buf{sycl::range{1}}; \ std_out = op std_in; \ + Q.submit([&](sycl::handler &h) { \ + sycl::accessor cplx_out{cplx_out_buf, h}; \ + h.single_task([=]() { cplx_out[0] = op cplx_input; }); \ + }); \ + sycl::host_accessor cplx_out_acc{cplx_out_buf}; \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); \ \ - Q.single_task([=]() { cplx_out[0] = op cplx_input; }).wait(); \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); \ - \ - cplx_out[0] = op cplx_input; \ - \ - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); \ + cplx_out_acc[0] = op cplx_input; \ \ - sycl::free(cplx_out, Q); \ + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); \ \ return pass; \ } \ diff --git a/sycl/test-e2e/Complex/sycl_complex_pow_test.cpp b/sycl/test-e2e/Complex/sycl_complex_pow_test.cpp index c8555687eb49b..91d58a4cdeebb 100644 --- a/sycl/test-e2e/Complex/sycl_complex_pow_test.cpp +++ b/sycl/test-e2e/Complex/sycl_complex_pow_test.cpp @@ -51,26 +51,26 @@ template struct test_pow_cplx_cplx { experimental::complex cplx_input1{init1.re, init1.im}; experimental::complex cplx_input2{init2.re, init2.im}; - auto *cplx_out = sycl::malloc_shared>(1, Q); - + sycl::buffer> cplx_out_buf{sycl::range{1}}; // Get std::complex output std::complex std_out{ref.re, ref.im}; if (!use_ref) std_out = std::pow(std_in1, std_in2); // Check cplx::complex output from device - Q.single_task([=]() { - cplx_out[0] = experimental::pow(cplx_input1, cplx_input2); - }).wait(); - - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); + Q.submit([&](sycl::handler &h) { + sycl::accessor cplx_out{cplx_out_buf, h}; + h.single_task([=]() { + cplx_out[0] = experimental::pow(cplx_input1, cplx_input2); + }); + }); + sycl::host_accessor cplx_out_acc{cplx_out_buf}; + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); // Check cplx::complex output from host - cplx_out[0] = experimental::pow(cplx_input1, cplx_input2); - - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); + cplx_out_acc[0] = experimental::pow(cplx_input1, cplx_input2); - sycl::free(cplx_out, Q); + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); return pass; } @@ -87,26 +87,25 @@ template struct test_pow_cplx_deci { experimental::complex cplx_input{init1.re, init1.im}; T deci_input = init2.re; - auto *cplx_out = sycl::malloc_shared>(1, Q); - + sycl::buffer> cplx_out_buf{sycl::range{1}}; // Get std::complex output std::complex std_out{ref.re, ref.im}; if (!use_ref) std_out = std::pow(std_in, std_deci_in); // Check cplx::complex output from device - Q.single_task([=]() { - cplx_out[0] = experimental::pow(cplx_input, deci_input); - }).wait(); - - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); + Q.submit([&](sycl::handler &h) { + sycl::accessor cplx_out{cplx_out_buf, h}; + h.single_task( + [=]() { cplx_out[0] = experimental::pow(cplx_input, deci_input); }); + }); + sycl::host_accessor cplx_out_acc{cplx_out_buf}; + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); // Check cplx::complex output from host - cplx_out[0] = experimental::pow(cplx_input, deci_input); + cplx_out_acc[0] = experimental::pow(cplx_input, deci_input); - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); - - sycl::free(cplx_out, Q); + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); return pass; } @@ -123,26 +122,24 @@ template struct test_pow_deci_cplx { experimental::complex cplx_input{init2.re, init2.im}; T deci_input = init1.re; - auto *cplx_out = sycl::malloc_shared>(1, Q); - + sycl::buffer> cplx_out_buf{sycl::range{1}}; // Get std::complex output std::complex std_out{ref.re, ref.im}; if (!use_ref) std_out = std::pow(std_deci_in, std_in); // Check cplx::complex output from device - Q.single_task([=]() { - cplx_out[0] = experimental::pow(deci_input, cplx_input); - }).wait(); - - pass &= check_results(cplx_out[0], std_out, /*is_device*/ true); - + Q.submit([&](sycl::handler &h) { + sycl::accessor cplx_out{cplx_out_buf, h}; + h.single_task( + [=]() { cplx_out[0] = experimental::pow(deci_input, cplx_input); }); + }); + sycl::host_accessor cplx_out_acc{cplx_out_buf}; + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ true); // Check cplx::complex output from host - cplx_out[0] = experimental::pow(deci_input, cplx_input); - - pass &= check_results(cplx_out[0], std_out, /*is_device*/ false); + cplx_out_acc[0] = experimental::pow(deci_input, cplx_input); - sycl::free(cplx_out, Q); + pass &= check_results(cplx_out_acc[0], std_out, /*is_device*/ false); return pass; } diff --git a/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp b/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp index b9cb8c519366d..d8d645f6ac7dc 100644 --- a/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp +++ b/sycl/test-e2e/Complex/sycl_complex_stream_test.cpp @@ -7,17 +7,15 @@ template struct test_sycl_stream_operator { bool operator()(sycl::queue &Q, cmplx init) { - auto *cplx_out = sycl::malloc_shared>(1, Q); - cplx_out[0] = experimental::complex(init.re, init.im); - + experimental::complex cplx(init.re, init.im); + sycl::buffer> cplx_out_buf{&cplx, sycl::range{1}}; Q.submit([&](sycl::handler &CGH) { + sycl::accessor cplx_out{cplx_out_buf, CGH}; sycl::stream Out(512, 20, CGH); CGH.parallel_for<>(sycl::range<1>(1), [=](sycl::id<1> idx) { Out << cplx_out[idx] << sycl::endl; }); }).wait(); - - sycl::free(cplx_out, Q); return true; } }; From 96073b9ead9ac27cd4a63e7e00e65de2d39b895c Mon Sep 17 00:00:00 2001 From: Steffen Larsen Date: Thu, 8 Feb 2024 11:37:22 +0100 Subject: [PATCH 08/26] [SYCL] Change vec operators to be friends (#12396) This commit changes operators for sycl::vec to be defined like they are in the SYCL specification, i.e. friend functions instead of members. --------- Signed-off-by: Larsen, Steffen --- sycl/include/sycl/types.hpp | 451 ++++++++++++------ .../Basic/vector/vec_binary_scalar_order.cpp | 150 ++++++ sycl/test/basic_tests/types.cpp | 5 +- 3 files changed, 464 insertions(+), 142 deletions(-) create mode 100644 sycl/test-e2e/Basic/vector/vec_binary_scalar_order.cpp diff --git a/sycl/include/sycl/types.hpp b/sycl/include/sycl/types.hpp index a2f934a147c33..c12b9e6781eb9 100644 --- a/sycl/include/sycl/types.hpp +++ b/sycl/include/sycl/types.hpp @@ -987,6 +987,41 @@ template class vec { #endif #ifdef __SYCL_USE_EXT_VECTOR_TYPE__ +#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) +#define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT) \ + friend vec operator BINOP(const vec &Lhs, const vec &Rhs) { \ + vec Ret; \ + if constexpr (IsUsingArrayOnDevice) { \ + for (size_t I = 0; I < NumElements; ++I) { \ + Ret.setValue(I, (Lhs.getValue(I) BINOP Rhs.getValue(I))); \ + } \ + } else { \ + Ret.m_Data = Lhs.m_Data BINOP Rhs.m_Data; \ + if constexpr (std::is_same_v && CONVERT) { \ + Ret.ConvertToDataT(); \ + } \ + } \ + return Ret; \ + } \ + friend vec operator BINOP(const vec &Lhs, const DataT &Rhs) { \ + return Lhs BINOP vec(Rhs); \ + } \ + friend vec operator BINOP(const DataT &Lhs, const vec &Rhs) { \ + return vec(Lhs) BINOP Rhs; \ + } \ + friend vec &operator OPASSIGN(vec & Lhs, const vec & Rhs) { \ + Lhs = Lhs BINOP Rhs; \ + return Lhs; \ + } \ + template \ + friend typename std::enable_if_t operator OPASSIGN( \ + vec & Lhs, const DataT & Rhs) { \ + Lhs = Lhs BINOP vec(Rhs); \ + return Lhs; \ + } +#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) + +#if !defined(__INTEL_PREVIEW_BREAKING_CHANGES) #define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT) \ template \ vec operator BINOP(const EnableIfNotUsingArrayOnDevice &Rhs) const { \ @@ -1024,38 +1059,37 @@ template class vec { *this = *this BINOP vec(Rhs); \ return *this; \ } +#endif // !defined(__INTEL_PREVIEW_BREAKING_CHANGES) + #else // __SYCL_USE_EXT_VECTOR_TYPE__ #if defined(__INTEL_PREVIEW_BREAKING_CHANGES) #define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT) \ - vec operator BINOP(const vec &Rhs) const { \ + friend vec operator BINOP(const vec &Lhs, const vec &Rhs) { \ vec Ret{}; \ if constexpr (NativeVec) \ - Ret.m_Data = m_Data BINOP Rhs.m_Data; \ + Ret.m_Data = Lhs.m_Data BINOP Rhs.m_Data; \ else \ for (size_t I = 0; I < NumElements; ++I) \ - Ret.setValue(I, (DataT)(vec_data::get(getValue( \ + Ret.setValue(I, (DataT)(vec_data::get(Lhs.getValue( \ I)) BINOP vec_data::get(Rhs.getValue(I)))); \ return Ret; \ } \ - template \ - typename std::enable_if_t< \ - std::is_convertible_v && \ - (std::is_fundamental_v> || \ - detail::is_half_or_bf16_v>), \ - vec> \ - operator BINOP(const T & Rhs) const { \ - return *this BINOP vec(static_cast(Rhs)); \ + friend vec operator BINOP(const vec &Lhs, const DataT &Rhs) { \ + return Lhs BINOP vec(Rhs); \ } \ - vec &operator OPASSIGN(const vec & Rhs) { \ - *this = *this BINOP Rhs; \ - return *this; \ + friend vec operator BINOP(const DataT &Lhs, const vec &Rhs) { \ + return vec(Lhs) BINOP Rhs; \ + } \ + friend vec &operator OPASSIGN(vec & Lhs, const vec & Rhs) { \ + Lhs = Lhs BINOP Rhs; \ + return Lhs; \ } \ template \ - typename std::enable_if_t operator OPASSIGN( \ - const DataT & Rhs) { \ - *this = *this BINOP vec(Rhs); \ - return *this; \ + friend typename std::enable_if_t operator OPASSIGN( \ + vec & Lhs, const DataT & Rhs) { \ + Lhs = Lhs BINOP vec(Rhs); \ + return Lhs; \ } #endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) @@ -1120,6 +1154,42 @@ template class vec { // Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined // by SYCL device compiler only. #ifdef __SYCL_DEVICE_ONLY__ +#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) +#define __SYCL_RELLOGOP(RELLOGOP) \ + friend vec operator RELLOGOP(const vec & Lhs, \ + const vec & Rhs) { \ + vec Ret{}; \ + /* This special case is needed since there are no standard operator|| */ \ + /* or operator&& functions for std::array. */ \ + if constexpr (IsUsingArrayOnDevice && \ + (std::string_view(#RELLOGOP) == "||" || \ + std::string_view(#RELLOGOP) == "&&")) { \ + for (size_t I = 0; I < NumElements; ++I) { \ + /* We cannot use SetValue here as the operator is not a friend of*/ \ + /* Ret on Windows. */ \ + Ret[I] = static_cast(-(vec_data::get( \ + Lhs.getValue(I)) RELLOGOP vec_data::get(Rhs.getValue(I)))); \ + } \ + } else { \ + Ret = vec( \ + (typename vec::vector_t)( \ + Lhs.m_Data RELLOGOP Rhs.m_Data)); \ + if (NumElements == 1) /*Scalar 0/1 logic was applied, invert*/ \ + Ret *= -1; \ + } \ + return Ret; \ + } \ + friend vec operator RELLOGOP(const vec & Lhs, \ + const DataT & Rhs) { \ + return Lhs RELLOGOP vec(Rhs); \ + } \ + friend vec operator RELLOGOP(const DataT & Lhs, \ + const vec & Rhs) { \ + return vec(Lhs) RELLOGOP Rhs; \ + } +#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) + +#if !defined(__INTEL_PREVIEW_BREAKING_CHANGES) #define __SYCL_RELLOGOP(RELLOGOP) \ vec operator RELLOGOP(const vec & Rhs) const { \ vec Ret{}; \ @@ -1129,9 +1199,10 @@ template class vec { (std::string_view(#RELLOGOP) == "||" || \ std::string_view(#RELLOGOP) == "&&")) { \ for (size_t I = 0; I < NumElements; ++I) { \ - Ret.setValue(I, \ - -(vec_data::get(getValue(I)) \ - RELLOGOP vec_data::get(Rhs.getValue(I)))); \ + /* We cannot use SetValue here as the operator is not a friend of*/ \ + /* Ret on Windows. */ \ + Ret[I] = static_cast(-(vec_data::get( \ + getValue(I)) RELLOGOP vec_data::get(Rhs.getValue(I)))); \ } \ } else { \ Ret = vec( \ @@ -1150,13 +1221,38 @@ template class vec { operator RELLOGOP(const T & Rhs) const { \ return *this RELLOGOP vec(static_cast(Rhs)); \ } +#endif // !defined(__INTEL_PREVIEW_BREAKING_CHANGES) #else +#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) +#define __SYCL_RELLOGOP(RELLOGOP) \ + friend vec operator RELLOGOP(const vec & Lhs, \ + const vec & Rhs) { \ + vec Ret{}; \ + for (size_t I = 0; I < NumElements; ++I) { \ + /* We cannot use SetValue here as the operator is not a friend of*/ \ + /* Ret on Windows. */ \ + Ret[I] = static_cast(-(vec_data::get( \ + Lhs.getValue(I)) RELLOGOP vec_data::get(Rhs.getValue(I)))); \ + } \ + return Ret; \ + } \ + friend vec operator RELLOGOP(const vec & Lhs, \ + const DataT & Rhs) { \ + return Lhs RELLOGOP vec(Rhs); \ + } \ + friend vec operator RELLOGOP(const DataT & Lhs, \ + const vec & Rhs) { \ + return vec(Lhs) RELLOGOP Rhs; \ + } +#else // defined(__INTEL_PREVIEW_BREAKING_CHANGES) #define __SYCL_RELLOGOP(RELLOGOP) \ vec operator RELLOGOP(const vec & Rhs) const { \ vec Ret{}; \ for (size_t I = 0; I < NumElements; ++I) { \ - Ret.setValue(I, -(vec_data::get(getValue(I)) \ - RELLOGOP vec_data::get(Rhs.getValue(I)))); \ + /* We cannot use SetValue here as the operator is not a friend of*/ \ + /* Ret on Windows. */ \ + Ret[I] = static_cast(-(vec_data::get( \ + getValue(I)) RELLOGOP vec_data::get(Rhs.getValue(I)))); \ } \ return Ret; \ } \ @@ -1168,6 +1264,7 @@ template class vec { operator RELLOGOP(const T & Rhs) const { \ return *this RELLOGOP vec(static_cast(Rhs)); \ } +#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) #endif __SYCL_RELLOGOP(==) @@ -1184,6 +1281,18 @@ template class vec { #ifdef __SYCL_UOP #error "Undefine __SYCL_UOP macro" #endif +#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) +#define __SYCL_UOP(UOP, OPASSIGN) \ + friend vec &operator UOP(vec & Rhs) { \ + Rhs OPASSIGN vec_data::get(1); \ + return Rhs; \ + } \ + friend vec operator UOP(vec &Lhs, int) { \ + vec Ret(Lhs); \ + Lhs OPASSIGN vec_data::get(1); \ + return Ret; \ + } +#else // defined(__INTEL_PREVIEW_BREAKING_CHANGES) #define __SYCL_UOP(UOP, OPASSIGN) \ vec &operator UOP() { \ *this OPASSIGN vec_data::get(1); \ @@ -1194,6 +1303,7 @@ template class vec { *this OPASSIGN vec_data::get(1); \ return Ret; \ } +#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) __SYCL_UOP(++, +=) __SYCL_UOP(--, -=) @@ -1203,150 +1313,84 @@ template class vec { // operator~() available only when: dataT != float && dataT != double // && dataT != half - template - typename std::enable_if_t> && - (!IsUsingArrayOnDevice && !IsUsingArrayOnHost), - vec> - operator~() const { - vec Ret{(typename vec::DataType) ~m_Data}; - if constexpr (std::is_same_v) { - Ret.ConvertToDataT(); - } - return Ret; - } - template - typename std::enable_if_t> && - (IsUsingArrayOnDevice || IsUsingArrayOnHost), - vec> - operator~() const { - vec Ret{}; - for (size_t I = 0; I < NumElements; ++I) { - Ret.setValue(I, ~getValue(I)); + friend vec operator~(const vec &Rhs) { + if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) { + vec Ret{}; + for (size_t I = 0; I < NumElements; ++I) { + Ret.setValue(I, ~Rhs.getValue(I)); + } + return Ret; + } else { + vec Ret{(typename vec::DataType) ~Rhs.m_Data}; + if constexpr (std::is_same_v) { + Ret.ConvertToDataT(); + } + return Ret; } - return Ret; } - template -#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) - using OpNotRet = detail::rel_t; -#else - using OpNotRet = T; -#endif // __INTEL_PREVIEW_BREAKING_CHANGES - // operator! - template - EnableIfNotUsingArray, N>> operator!() const { - return vec{(typename vec::DataType) !m_Data} -#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) - .template as, N>>(); -#else - ; -#endif // __INTEL_PREVIEW_BREAKING_CHANGES - } - - // std::byte neither supports ! unary op or casting, so special handling is - // needed. And, worse, Windows has a conflict with 'byte'. + friend vec, NumElements> operator!(const vec &Rhs) { + if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) { + vec Ret{}; + for (size_t I = 0; I < NumElements; ++I) { #if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) - template - typename std::enable_if_t && - (IsUsingArrayOnDevice || IsUsingArrayOnHost), - vec, N>> - operator!() const { - vec Ret{}; - for (size_t I = 0; I < NumElements; ++I) { - Ret.setValue(I, std::byte{!vec_data::get(getValue(I))}); + // std::byte neither supports ! unary op or casting, so special handling + // is needed. And, worse, Windows has a conflict with 'byte'. + if constexpr (std::is_same_v) { + Ret.setValue(I, std::byte{!vec_data::get(Rhs.getValue(I))}); + } else +#endif // (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0) + { + Ret.setValue(I, !vec_data::get(Rhs.getValue(I))); + } + } + return Ret.template as, NumElements>>(); + } else { + return vec{(typename vec::DataType) !Rhs.m_Data} + .template as, NumElements>>(); } -#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) - return Ret.template as, N>>(); -#else - return Ret; -#endif // __INTEL_PREVIEW_BREAKING_CHANGES } - template - typename std::enable_if_t && - (IsUsingArrayOnDevice || IsUsingArrayOnHost), - vec, N>> - operator!() const { - vec Ret{}; - for (size_t I = 0; I < NumElements; ++I) - Ret.setValue(I, !vec_data::get(getValue(I))); -#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) - return Ret.template as, N>>(); -#else - return Ret; -#endif // __INTEL_PREVIEW_BREAKING_CHANGES - } -#else - template - EnableIfUsingArray, N>> operator!() const { - vec Ret{}; - for (size_t I = 0; I < NumElements; ++I) - Ret.setValue(I, !vec_data::get(getValue(I))); -#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) - return Ret.template as, N>>(); -#else - return Ret; -#endif // __INTEL_PREVIEW_BREAKING_CHANGES - } -#endif - // operator + - template EnableIfNotUsingArray operator+() const { - return vec{+m_Data}; - } - - template EnableIfUsingArray operator+() const { - vec Ret{}; - for (size_t I = 0; I < NumElements; ++I) - Ret.setValue(I, vec_data::get(+vec_data::get(getValue(I)))); - return Ret; - } - - // operator - - template EnableIfNotUsingArray operator-() const { - namespace oneapi = sycl::ext::oneapi; - if constexpr (IsBfloat16 && NumElements == 1) { - vec Ret{}; - oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(m_Data); - oneapi::bfloat16 w = -v; - Ret.m_Data = oneapi::detail::bfloat16ToBits(w); - } else if constexpr (IsBfloat16) { + friend vec operator+(const vec &Lhs) { + if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) { vec Ret{}; - for (size_t I = 0; I < NumElements; ++I) { - oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(m_Data[I]); - oneapi::bfloat16 w = -v; - Ret.m_Data[I] = oneapi::detail::bfloat16ToBits(w); - } + for (size_t I = 0; I < NumElements; ++I) + Ret.setValue( + I, vec_data::get(+vec_data::get(Lhs.getValue(I)))); return Ret; } else { - vec Ret{-m_Data}; - if constexpr (std::is_same_v) { - Ret.ConvertToDataT(); - } - return Ret; + return vec{+Lhs.m_Data}; } } - template EnableIfUsingArray operator-() const { + // operator - + friend vec operator-(const vec &Lhs) { namespace oneapi = sycl::ext::oneapi; vec Ret{}; if constexpr (IsBfloat16 && NumElements == 1) { - oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(m_Data); + oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data); oneapi::bfloat16 w = -v; Ret.m_Data = oneapi::detail::bfloat16ToBits(w); } else if constexpr (IsBfloat16) { for (size_t I = 0; I < NumElements; I++) { - oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(m_Data[I]); + oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data[I]); oneapi::bfloat16 w = -v; Ret.m_Data[I] = oneapi::detail::bfloat16ToBits(w); } - } else { + } else if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) { for (size_t I = 0; I < NumElements; ++I) - Ret.setValue(I, - vec_data::get(-vec_data::get(getValue(I)))); + Ret.setValue( + I, vec_data::get(-vec_data::get(Lhs.getValue(I)))); + return Ret; + } else { + Ret = vec{-Lhs.m_Data}; + if constexpr (std::is_same_v) { + Ret.ConvertToDataT(); + } + return Ret; } - return Ret; } #endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) @@ -1770,6 +1814,19 @@ class SwizzleOp { #ifdef __SYCL_OPASSIGN #error "Undefine __SYCL_OPASSIGN macro." #endif +#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) +#define __SYCL_OPASSIGN(OPASSIGN, OP) \ + friend SwizzleOp &operator OPASSIGN(SwizzleOp & Lhs, const DataT & Rhs) { \ + Lhs.operatorHelper(vec_t(Rhs)); \ + return Lhs; \ + } \ + template \ + friend SwizzleOp &operator OPASSIGN(SwizzleOp & Lhs, \ + const RhsOperation & Rhs) { \ + Lhs.operatorHelper(Rhs); \ + return Lhs; \ + } +#else // defined(__INTEL_PREVIEW_BREAKING_CHANGES) #define __SYCL_OPASSIGN(OPASSIGN, OP) \ SwizzleOp &operator OPASSIGN(const DataT & Rhs) { \ operatorHelper(vec_t(Rhs)); \ @@ -1780,6 +1837,7 @@ class SwizzleOp { operatorHelper(Rhs); \ return *this; \ } +#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) __SYCL_OPASSIGN(+=, std::plus) __SYCL_OPASSIGN(-=, std::minus) @@ -1796,6 +1854,18 @@ class SwizzleOp { #ifdef __SYCL_UOP #error "Undefine __SYCL_UOP macro" #endif +#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) +#define __SYCL_UOP(UOP, OPASSIGN) \ + friend SwizzleOp &operator UOP(SwizzleOp & Rhs) { \ + Rhs OPASSIGN static_cast(1); \ + return Rhs; \ + } \ + friend vec_t operator UOP(SwizzleOp &Lhs, int) { \ + vec_t Ret = Lhs; \ + Lhs OPASSIGN static_cast(1); \ + return Ret; \ + } +#else // defined(__INTEL_PREVIEW_BREAKING_CHANGES) #define __SYCL_UOP(UOP, OPASSIGN) \ SwizzleOp &operator UOP() { \ *this OPASSIGN static_cast(1); \ @@ -1806,11 +1876,36 @@ class SwizzleOp { *this OPASSIGN static_cast(1); \ return Ret; \ } +#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) __SYCL_UOP(++, +=) __SYCL_UOP(--, -=) #undef __SYCL_UOP +#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) + template + friend typename std::enable_if_t< + std::is_same_v && std::is_integral_v>, vec_t> + operator~(const SwizzleOp &Rhs) { + vec_t Tmp = Rhs; + return ~Tmp; + } + + friend vec_rel_t operator!(const SwizzleOp &Rhs) { + vec_t Tmp = Rhs; + return !Tmp; + } + + friend vec_t operator+(const SwizzleOp &Rhs) { + vec_t Tmp = Rhs; + return +Tmp; + } + + friend vec_t operator-(const SwizzleOp &Rhs) { + vec_t Tmp = Rhs; + return -Tmp; + } +#else // defined(__INTEL_PREVIEW_BREAKING_CHANGES) template typename std::enable_if_t>, vec_t> operator~() { @@ -1832,6 +1927,80 @@ class SwizzleOp { vec_t Tmp = *this; return -Tmp; } +#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) + +#if defined(__INTEL_PREVIEW_BREAKING_CHANGES) +// scalar BINOP vec<> +// scalar BINOP SwizzleOp +// vec<> BINOP SwizzleOp +#ifdef __SYCL_BINOP +#error "Undefine __SYCL_BINOP macro" +#endif +#define __SYCL_BINOP(BINOP) \ + friend vec_t operator BINOP(const DataT &Lhs, const SwizzleOp &Rhs) { \ + vec_t Tmp = Rhs; \ + return Lhs BINOP Tmp; \ + } \ + friend vec_t operator BINOP(const SwizzleOp &Lhs, const DataT &Rhs) { \ + vec_t Tmp = Lhs; \ + return Tmp BINOP Rhs; \ + } \ + friend vec_t operator BINOP(const vec_t &Lhs, const SwizzleOp &Rhs) { \ + vec_t Tmp = Rhs; \ + return Lhs BINOP Tmp; \ + } \ + friend vec_t operator BINOP(const SwizzleOp &Lhs, const vec_t &Rhs) { \ + vec_t Tmp = Lhs; \ + return Tmp BINOP Rhs; \ + } + + __SYCL_BINOP(+) + __SYCL_BINOP(-) + __SYCL_BINOP(*) + __SYCL_BINOP(/) + __SYCL_BINOP(%) + __SYCL_BINOP(&) + __SYCL_BINOP(|) + __SYCL_BINOP(^) + __SYCL_BINOP(>>) + __SYCL_BINOP(<<) +#undef __SYCL_BINOP + +// scalar RELLOGOP vec<> +// scalar RELLOGOP SwizzleOp +// vec<> RELLOGOP SwizzleOp +#ifdef __SYCL_RELLOGOP +#error "Undefine __SYCL_RELLOGOP macro" +#endif +#define __SYCL_RELLOGOP(RELLOGOP) \ + friend vec_rel_t operator RELLOGOP(const DataT &Lhs, const SwizzleOp &Rhs) { \ + vec_t Tmp = Rhs; \ + return Lhs RELLOGOP Tmp; \ + } \ + friend vec_rel_t operator RELLOGOP(const SwizzleOp &Lhs, const DataT &Rhs) { \ + vec_t Tmp = Lhs; \ + return Tmp RELLOGOP Rhs; \ + } \ + friend vec_rel_t operator RELLOGOP(const vec_t &Lhs, const SwizzleOp &Rhs) { \ + vec_t Tmp = Rhs; \ + return Lhs RELLOGOP Tmp; \ + } \ + friend vec_rel_t operator RELLOGOP(const SwizzleOp &Lhs, const vec_t &Rhs) { \ + vec_t Tmp = Lhs; \ + return Tmp RELLOGOP Rhs; \ + } + + __SYCL_RELLOGOP(==) + __SYCL_RELLOGOP(!=) + __SYCL_RELLOGOP(>) + __SYCL_RELLOGOP(<) + __SYCL_RELLOGOP(>=) + __SYCL_RELLOGOP(<=) + // TODO: limit to integral types. + __SYCL_RELLOGOP(&&) + __SYCL_RELLOGOP(||) +#undef __SYCL_RELLOGOP +#endif // defined(__INTEL_PREVIEW_BREAKING_CHANGES) template > @@ -2265,6 +2434,7 @@ class SwizzleOp { }; } // namespace detail +#if !defined(__INTEL_PREVIEW_BREAKING_CHANGES) // scalar BINOP vec<> // scalar BINOP SwizzleOp // vec<> BINOP SwizzleOp @@ -2374,6 +2544,7 @@ __SYCL_RELLOGOP(<=) __SYCL_RELLOGOP(&&) __SYCL_RELLOGOP(||) #undef __SYCL_RELLOGOP +#endif // !defined(__INTEL_PREVIEW_BREAKING_CHANGES) namespace detail { diff --git a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.cpp b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.cpp new file mode 100644 index 0000000000000..158b52ab5f27e --- /dev/null +++ b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order.cpp @@ -0,0 +1,150 @@ +// REQUIRES: preview-breaking-changes-supported +// RUN: %{build} -fpreview-breaking-changes -o %t.out +// RUN: %{run} %t.out + +// This test currently fails on AMD HIP due to an unresolved memcmp function. +// XFAIL: hip_amd + +// Checks scalar/vec operator ordering. + +#include + +template +using rel_t = std::conditional_t< + sizeof(T) == 1, int8_t, + std::conditional_t< + sizeof(T) == 2, int16_t, + std::conditional_t>>>; + +template +bool CheckResult(sycl::vec V, T2 Ref) { + if constexpr (IsRelOp) { + // Check that all elements have the same boolean representation as the + // scalar. + for (size_t I = 0; I < N; ++I) + if (static_cast(V[I]) != static_cast(Ref)) + return false; + return true; + } else { + // Check that all elements are equal to the scalar. + for (size_t I = 0; I < N; ++I) + if (V[I] != Ref) + return false; + return true; + } +} + +#define CHECK(Q, C, T, N, IS_RELOP, OP) \ + { \ + using VecT = sycl::vec; \ + using ResT = sycl::vec, T>, N>; \ + constexpr T RefVal = 2; \ + VecT InVec{static_cast(RefVal)}; \ + { \ + VecT OutVecsDevice[2]; \ + T OutRefsDevice[2]; \ + { \ + sycl::buffer OutVecsBuff{OutVecsDevice, 2}; \ + sycl::buffer OutRefsBuff{OutRefsDevice, 2}; \ + Q.submit([&](sycl::handler &CGH) { \ + sycl::accessor OutVecsAcc{OutVecsBuff, CGH, sycl::read_write}; \ + sycl::accessor OutRefsAcc{OutRefsBuff, CGH, sycl::read_write}; \ + CGH.single_task([=]() { \ + auto OutVec1 = InVec OP RefVal; \ + auto OutVec2 = RefVal OP InVec; \ + static_assert(std::is_same_v); \ + static_assert(std::is_same_v); \ + OutVecsAcc[0] = OutVec1; \ + OutVecsAcc[1] = OutVec2; \ + OutRefsAcc[0] = RefVal OP RefVal; \ + OutRefsAcc[1] = RefVal OP RefVal; \ + }); \ + }); \ + } \ + if (!CheckResult(OutVecsDevice[0], OutRefsDevice[0])) { \ + std::cout << ("Check of vector " #OP \ + " scalar from device failed for " #T " and " #N) \ + << std::endl; \ + ++C; \ + } \ + if (!CheckResult(OutVecsDevice[1], OutRefsDevice[1])) { \ + std::cout << ("Check of scalar " #OP \ + " vector from device failed for " #T " and " #N) \ + << std::endl; \ + ++C; \ + } \ + } \ + { \ + auto OutVec1 = InVec OP RefVal; \ + auto OutVec2 = RefVal OP InVec; \ + static_assert(std::is_same_v); \ + static_assert(std::is_same_v); \ + if (!CheckResult(OutVec1, RefVal OP RefVal)) { \ + std::cout << ("Check of vector " #OP \ + " scalar from host failed for " #T " and " #N) \ + << std::endl; \ + ++C; \ + } \ + if (!CheckResult(OutVec2, RefVal OP RefVal)) { \ + std::cout << ("Check of scalar " #OP \ + " vector from host failed for " #T " and " #N) \ + << std::endl; \ + ++C; \ + } \ + } \ + } + +#define CHECK_SIZES(Q, C, T, IS_RELOP, OP) \ + CHECK(Q, C, T, 1, IS_RELOP, OP) \ + CHECK(Q, C, T, 2, IS_RELOP, OP) \ + CHECK(Q, C, T, 4, IS_RELOP, OP) \ + CHECK(Q, C, T, 8, IS_RELOP, OP) \ + CHECK(Q, C, T, 16, IS_RELOP, OP) + +// NOTE: For the sake of compile-time we pick only a few operators per category. +#define CHECK_SIZES_AND_COMMON_OPS(Q, C, T) \ + CHECK_SIZES(Q, Failures, T, false, *) \ + CHECK_SIZES(Q, Failures, T, true, &&) \ + CHECK_SIZES(Q, Failures, T, true, ==) \ + CHECK_SIZES(Q, Failures, T, true, <) \ + CHECK_SIZES(Q, Failures, T, true, >=) +#define CHECK_SIZES_AND_INT_ONLY_OPS(Q, C, T) \ + CHECK_SIZES(Q, Failures, T, false, %) \ + CHECK_SIZES(Q, Failures, T, false, >>) \ + CHECK_SIZES(Q, Failures, T, false, ^) + +int main() { + sycl::queue Q; + int Failures = 0; + + // Check operators on types with requirements if they are supported. + if (Q.get_device().has(sycl::aspect::fp16)) { + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, sycl::half); + } + if (Q.get_device().has(sycl::aspect::fp64)) { + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, double); + } + + // Check all operators without requirements. + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, float); + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int8_t); + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int16_t); + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int32_t); + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, int64_t); + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint8_t); + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint16_t); + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint32_t); + CHECK_SIZES_AND_COMMON_OPS(Q, Failures, uint64_t); + + // Check integer only operators. + CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int8_t); + CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int16_t); + CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int32_t); + CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, int64_t); + CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint8_t); + CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint16_t); + CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint32_t); + CHECK_SIZES_AND_INT_ONLY_OPS(Q, Failures, uint64_t); + return Failures; +} diff --git a/sycl/test/basic_tests/types.cpp b/sycl/test/basic_tests/types.cpp index 6aab1e433c7a7..14a1070567274 100644 --- a/sycl/test/basic_tests/types.cpp +++ b/sycl/test/basic_tests/types.cpp @@ -134,12 +134,13 @@ template inline void checkVecNotReturnType() { using Vector = sycl::vec; #if defined(__INTEL_PREVIEW_BREAKING_CHANGES) using ExpectedVector = sycl::vec; + using OpNotResult = decltype(operator!(std::declval())); #else using ExpectedVector = sycl::vec; -#endif using OpNotResult = decltype(std::declval().operator!()); +#endif static_assert(std::is_same_v, - "Incorrect vec::operator! return type"); + "Incorrect operator! return type"); } // the math built-in testing ensures that the vec binary ops get tested, From 0eac61876fce2d524f16b9b1d4239a7733f9cbbd Mon Sep 17 00:00:00 2001 From: Dmitry Sidorov Date: Thu, 8 Feb 2024 11:59:38 +0100 Subject: [PATCH 09/26] [SYCL][Matrix] Correct Prefetch instruction usage (#12623) Signed-off-by: Sidorov, Dmitry --- sycl/include/CL/__spirv/spirv_ops.hpp | 9 +++++---- sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp | 5 +++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp index ea1a6580d30e6..9af5b7e75ae38 100644 --- a/sycl/include/CL/__spirv/spirv_ops.hpp +++ b/sycl/include/CL/__spirv/spirv_ops.hpp @@ -174,10 +174,11 @@ extern __DPCPP_SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL * __spirv_VectorInsertDynamic(__spv::__spirv_JointMatrixINTEL *, Ts val, size_t i); -template -extern __DPCPP_SYCL_EXTERNAL void __spirv_JointMatrixPrefetchINTEL( - T *Ptr, std::size_t coordX, std::size_t coordY, unsigned int CacheLevel, - __spv::MatrixLayout Layout, std::size_t Stride); +template +extern __DPCPP_SYCL_EXTERNAL void __spirv_CooperativeMatrixPrefetchINTEL( + T *Ptr, std::size_t coordX, std::size_t coordY, std::size_t NumRows, + std::size_t NumCols, unsigned int CacheLevel, __spv::MatrixLayout Layout, + std::size_t Stride); #ifndef __SPIRV_BUILTIN_DECLARATIONS__ #error \ diff --git a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp index 98aea6f04a48b..a07e9c144ba6a 100644 --- a/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp +++ b/sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp @@ -524,8 +524,9 @@ joint_matrix_prefetch(Group sg, T *Ptr, size_t stride, // Will be removed once SPIRV implementation also uses offsetpointer size_t coordX = 0; size_t coordY = 0; - __spirv_JointMatrixPrefetchINTEL( - Ptr, coordX, coordY, detail::PropertyMetaInfo::value, + __spirv_CooperativeMatrixPrefetchINTEL( + Ptr, coordX, coordY, NumRows, NumCols, + detail::PropertyMetaInfo::value, sycl::detail::joint_matrix_layout_to_spv(Layout), stride); #endif // defined(__NVPTX__) #else From b1d81d739fda76d7c45f36799676bff2e52e1fb9 Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Thu, 8 Feb 2024 11:33:22 +0000 Subject: [PATCH 10/26] [SYCL][Graph] Disable USM tests on gpu-intel-dg2 (#12648) After PR https://github.com/intel/llvm/pull/12279, the USM tests fail in post-commit CIs (https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479). We temporarily disable these tests during the bug investigation. --- sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp | 4 ++++ sycl/test-e2e/Graph/Explicit/basic_usm.cpp | 4 ++++ sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp | 4 ++++ sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp | 4 ++++ sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp | 4 ++++ sycl/test-e2e/Graph/Explicit/empty_node.cpp | 4 ++++ sycl/test-e2e/Graph/Explicit/host_task.cpp | 4 ++++ sycl/test-e2e/Graph/Explicit/host_task2.cpp | 4 ++++ sycl/test-e2e/Graph/Explicit/host_task_last.cpp | 4 ++++ sycl/test-e2e/Graph/Explicit/host_task_multiple_deps.cpp | 4 ++++ sycl/test-e2e/Graph/Explicit/host_task_single.cpp | 4 ++++ sycl/test-e2e/Graph/Explicit/host_task_successive.cpp | 4 ++++ sycl/test-e2e/Graph/Explicit/memadvise.cpp | 5 ++++- sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp | 4 ++++ sycl/test-e2e/Graph/Explicit/prefetch.cpp | 5 ++++- sycl/test-e2e/Graph/Explicit/queue_constructor_usm.cpp | 5 +++++ sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp | 4 ++++ sycl/test-e2e/Graph/Explicit/repeated_exec.cpp | 4 ++++ sycl/test-e2e/Graph/Explicit/sub_graph.cpp | 4 ++++ .../Graph/Explicit/sub_graph_execute_without_parent.cpp | 4 ++++ .../Graph/Explicit/sub_graph_multiple_submission.cpp | 4 ++++ sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp | 4 ++++ .../test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp | 4 ++++ sycl/test-e2e/Graph/Explicit/usm_copy.cpp | 5 ++++- sycl/test-e2e/Graph/Explicit/usm_fill.cpp | 4 ++++ sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp | 4 ++++ sycl/test-e2e/Graph/Explicit/usm_memset.cpp | 5 ++++- .../Graph/RecordReplay/add_nodes_after_finalize.cpp | 5 ++++- sycl/test-e2e/Graph/RecordReplay/after_use.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/empty_node.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/host_task.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/host_task2.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/host_task_in_order.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/host_task_last.cpp | 4 ++++ .../test-e2e/Graph/RecordReplay/host_task_multiple_deps.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/host_task_single.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/host_task_successive.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/memadvise.cpp | 5 ++++- sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/prefetch.cpp | 5 ++++- sycl/test-e2e/Graph/RecordReplay/queue_constructor_usm.cpp | 5 +++++ sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp | 4 ++++ .../Graph/RecordReplay/sub_graph_execute_without_parent.cpp | 4 ++++ .../Graph/RecordReplay/sub_graph_multiple_submission.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp | 4 ++++ .../Graph/RecordReplay/sub_graph_two_parent_graphs.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp | 6 ++++-- sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp | 4 ++++ sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp | 5 ++++- sycl/test-e2e/Graph/empty_graph.cpp | 4 ++++ 58 files changed, 234 insertions(+), 10 deletions(-) diff --git a/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp index cf17e3b6c3dba..b39f3b4fc889b 100644 --- a/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp +++ b/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm.cpp index 6e76ec42c079e..00940dcd6d68c 100644 --- a/sycl/test-e2e/Graph/Explicit/basic_usm.cpp +++ b/sycl/test-e2e/Graph/Explicit/basic_usm.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp index 040ce8c30c38b..fd8fcec4b89d2 100644 --- a/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp +++ b/sycl/test-e2e/Graph/Explicit/basic_usm_host.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp index bb12ff6006004..5855fc25ed4cc 100644 --- a/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp +++ b/sycl/test-e2e/Graph/Explicit/basic_usm_mixed.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp b/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp index f989a27a1ca39..3be8febbe008e 100644 --- a/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp +++ b/sycl/test-e2e/Graph/Explicit/basic_usm_shared.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/empty_node.cpp b/sycl/test-e2e/Graph/Explicit/empty_node.cpp index 7f7501175899c..301c53aa69795 100644 --- a/sycl/test-e2e/Graph/Explicit/empty_node.cpp +++ b/sycl/test-e2e/Graph/Explicit/empty_node.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/host_task.cpp b/sycl/test-e2e/Graph/Explicit/host_task.cpp index 167219bf4b8ef..62bcd167e484f 100644 --- a/sycl/test-e2e/Graph/Explicit/host_task.cpp +++ b/sycl/test-e2e/Graph/Explicit/host_task.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/host_task2.cpp b/sycl/test-e2e/Graph/Explicit/host_task2.cpp index f7d59f48aac09..78be02a7bc991 100644 --- a/sycl/test-e2e/Graph/Explicit/host_task2.cpp +++ b/sycl/test-e2e/Graph/Explicit/host_task2.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/host_task_last.cpp b/sycl/test-e2e/Graph/Explicit/host_task_last.cpp index aed2916284cc1..26dcf148d6f7d 100644 --- a/sycl/test-e2e/Graph/Explicit/host_task_last.cpp +++ b/sycl/test-e2e/Graph/Explicit/host_task_last.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/host_task_multiple_deps.cpp b/sycl/test-e2e/Graph/Explicit/host_task_multiple_deps.cpp index 3d03b7f6013ad..32c7157d0958c 100644 --- a/sycl/test-e2e/Graph/Explicit/host_task_multiple_deps.cpp +++ b/sycl/test-e2e/Graph/Explicit/host_task_multiple_deps.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/host_task_single.cpp b/sycl/test-e2e/Graph/Explicit/host_task_single.cpp index 6fc8744e04c18..b2236b9a0d91b 100644 --- a/sycl/test-e2e/Graph/Explicit/host_task_single.cpp +++ b/sycl/test-e2e/Graph/Explicit/host_task_single.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/host_task_successive.cpp b/sycl/test-e2e/Graph/Explicit/host_task_successive.cpp index da26b8b182296..2147d075715af 100644 --- a/sycl/test-e2e/Graph/Explicit/host_task_successive.cpp +++ b/sycl/test-e2e/Graph/Explicit/host_task_successive.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/memadvise.cpp b/sycl/test-e2e/Graph/Explicit/memadvise.cpp index 11e0d6296290f..1a8313478ddb5 100644 --- a/sycl/test-e2e/Graph/Explicit/memadvise.cpp +++ b/sycl/test-e2e/Graph/Explicit/memadvise.cpp @@ -2,7 +2,10 @@ // RUN: %if linux && (level_zero || cuda) %{ env SYCL_PI_TRACE=2 %{run} %t.out 2>&1 FileCheck %s %} %else %{ %{run} %t.out %} // Mem advise command not supported for OpenCL -// UNSUPPORTED: opencl +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: opencl, gpu-intel-dg2 // Since Mem advise is only a memory hint that doesn't // impact results but only performances, we verify diff --git a/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp index a83775b8ecb8a..e9c76ad01113b 100644 --- a/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp +++ b/sycl/test-e2e/Graph/Explicit/multiple_exec_graphs.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/prefetch.cpp b/sycl/test-e2e/Graph/Explicit/prefetch.cpp index 8e0e6e15c292d..4b670ebc89357 100644 --- a/sycl/test-e2e/Graph/Explicit/prefetch.cpp +++ b/sycl/test-e2e/Graph/Explicit/prefetch.cpp @@ -2,7 +2,10 @@ // RUN: %if linux && (level_zero || cuda) %{ env SYCL_PI_TRACE=2 %{run} %t.out 2>&1 FileCheck %s %} %else %{ %{run} %t.out %} // prefetch command not supported for OpenCL -// UNSUPPORTED: opencl +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: opencl, gpu-intel-dg2 // Since Prefetch is only a memory hint that doesn't // impact results but only performances, we verify diff --git a/sycl/test-e2e/Graph/Explicit/queue_constructor_usm.cpp b/sycl/test-e2e/Graph/Explicit/queue_constructor_usm.cpp index a4f3710eea9f0..0760b5805e6d9 100644 --- a/sycl/test-e2e/Graph/Explicit/queue_constructor_usm.cpp +++ b/sycl/test-e2e/Graph/Explicit/queue_constructor_usm.cpp @@ -4,6 +4,11 @@ // RUN: %if level_zero %{env UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s %} // // CHECK-NOT: LEAK +// +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp b/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp index a2589cda54320..85d3b106bc3cd 100644 --- a/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp +++ b/sycl/test-e2e/Graph/Explicit/queue_shortcuts.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp b/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp index 8b1f6ba3d4455..935f431b09332 100644 --- a/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp +++ b/sycl/test-e2e/Graph/Explicit/repeated_exec.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph.cpp index 830cc5a42a668..c60aa2f63e34f 100644 --- a/sycl/test-e2e/Graph/Explicit/sub_graph.cpp +++ b/sycl/test-e2e/Graph/Explicit/sub_graph.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp index bc3cb1c8f352d..772c07599300f 100644 --- a/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp +++ b/sycl/test-e2e/Graph/Explicit/sub_graph_execute_without_parent.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp index 92add086ee20d..17cdde9b87956 100644 --- a/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp +++ b/sycl/test-e2e/Graph/Explicit/sub_graph_multiple_submission.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp index 05ecfbd982a01..7fd72c79722d4 100644 --- a/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp +++ b/sycl/test-e2e/Graph/Explicit/sub_graph_nested.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp b/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp index 8fa8e0c334de2..d43e7e221e68c 100644 --- a/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp +++ b/sycl/test-e2e/Graph/Explicit/sub_graph_two_parent_graphs.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/usm_copy.cpp b/sycl/test-e2e/Graph/Explicit/usm_copy.cpp index 323d3e35935ac..6b60911d4627e 100644 --- a/sycl/test-e2e/Graph/Explicit/usm_copy.cpp +++ b/sycl/test-e2e/Graph/Explicit/usm_copy.cpp @@ -7,7 +7,10 @@ // // // USM copy command not supported for OpenCL -// UNSUPPORTED: opencl +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: opencl, gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/usm_fill.cpp b/sycl/test-e2e/Graph/Explicit/usm_fill.cpp index 86076768f5d32..966809cf5a8d3 100644 --- a/sycl/test-e2e/Graph/Explicit/usm_fill.cpp +++ b/sycl/test-e2e/Graph/Explicit/usm_fill.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp b/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp index e0100d42d7267..53feb52cad798 100644 --- a/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp +++ b/sycl/test-e2e/Graph/Explicit/usm_fill_shared.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/Explicit/usm_memset.cpp b/sycl/test-e2e/Graph/Explicit/usm_memset.cpp index e9bb9ebab0fa9..50821fd8d008b 100644 --- a/sycl/test-e2e/Graph/Explicit/usm_memset.cpp +++ b/sycl/test-e2e/Graph/Explicit/usm_memset.cpp @@ -6,7 +6,10 @@ // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // // USM memset command not supported for OpenCL -// UNSUPPORTED: opencl +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: opencl, gpu-intel-dg2 #define GRAPH_E2E_EXPLICIT diff --git a/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp index ba81666277e44..c0729ebb3a015 100644 --- a/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/add_nodes_after_finalize.cpp @@ -7,7 +7,10 @@ // // // Temporarily disabled until failure is addressed. -// UNSUPPORTED: windows +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: windows, gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/after_use.cpp b/sycl/test-e2e/Graph/RecordReplay/after_use.cpp index 767ca8b425ab7..4a87a09e0fbce 100644 --- a/sycl/test-e2e/Graph/RecordReplay/after_use.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/after_use.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 // This test attempts recording a set of kernels after they have already been // executed once before. diff --git a/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp b/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp index c12038ba46185..951a016994261 100644 --- a/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/barrier_with_work.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #include "../graph_common.hpp" diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp index ffddf70716250..bcdfb702b78c7 100644 --- a/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp index a90931934f0ce..b71fd0a690220 100644 --- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_host.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp index 410850f5ff7a8..74bc18a7b3496 100644 --- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_mixed.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp b/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp index 2f19c2706380b..51e4597354473 100644 --- a/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/basic_usm_shared.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp b/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp index 70128db712f78..2155b226e2fc3 100644 --- a/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/empty_node.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task.cpp index 582a3fb3f14ff..4f4d50fe3b3f3 100644 --- a/sycl/test-e2e/Graph/RecordReplay/host_task.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/host_task.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task2.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task2.cpp index f947a13ffd63d..cd867eb4caadd 100644 --- a/sycl/test-e2e/Graph/RecordReplay/host_task2.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/host_task2.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_in_order.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_in_order.cpp index e40fe3a0e963c..c3037c78d8eaf 100644 --- a/sycl/test-e2e/Graph/RecordReplay/host_task_in_order.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/host_task_in_order.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 // This test uses a host_task when adding a command_graph node to an // in-order queue. diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_last.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_last.cpp index c8dfa02c63ea7..64d625197f089 100644 --- a/sycl/test-e2e/Graph/RecordReplay/host_task_last.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/host_task_last.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_multiple_deps.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_multiple_deps.cpp index 7e312df58d092..a8ff9b53637cd 100644 --- a/sycl/test-e2e/Graph/RecordReplay/host_task_multiple_deps.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/host_task_multiple_deps.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_single.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_single.cpp index 6d86cef96c862..4e2730592ef95 100644 --- a/sycl/test-e2e/Graph/RecordReplay/host_task_single.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/host_task_single.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_successive.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_successive.cpp index 51da588ab9444..e5570cb61a2d3 100644 --- a/sycl/test-e2e/Graph/RecordReplay/host_task_successive.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/host_task_successive.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/memadvise.cpp b/sycl/test-e2e/Graph/RecordReplay/memadvise.cpp index a5b87f8143832..1f7989f9a0361 100644 --- a/sycl/test-e2e/Graph/RecordReplay/memadvise.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/memadvise.cpp @@ -2,7 +2,10 @@ // RUN: %if linux && (level_zero || cuda) %{ env SYCL_PI_TRACE=2 %{run} %t.out 2>&1 FileCheck %s %} %else %{ %{run} %t.out %} // Mem advise command not supported for OpenCL -// UNSUPPORTED: opencl +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: opencl, gpu-intel-dg2 // Since Mem advise is only a memory hint that doesn't // impact results but only performances, we verify diff --git a/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp index 67b5335de4383..0709d25e225bd 100644 --- a/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/multiple_exec_graphs.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/prefetch.cpp b/sycl/test-e2e/Graph/RecordReplay/prefetch.cpp index dca297f7772b3..a5a18c08c6b66 100644 --- a/sycl/test-e2e/Graph/RecordReplay/prefetch.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/prefetch.cpp @@ -2,7 +2,10 @@ // RUN: %if linux && (level_zero || cuda) %{ env SYCL_PI_TRACE=2 %{run} %t.out 2>&1 FileCheck %s %} %else %{ %{run} %t.out %} // prefetch command not supported for OpenCL -// UNSUPPORTED: opencl +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: opencl, gpu-intel-dg2 // Since Prefetch is only a memory hint that doesn't // impact results but only performances, we verify diff --git a/sycl/test-e2e/Graph/RecordReplay/queue_constructor_usm.cpp b/sycl/test-e2e/Graph/RecordReplay/queue_constructor_usm.cpp index aa81923251cb6..a88e462461a04 100644 --- a/sycl/test-e2e/Graph/RecordReplay/queue_constructor_usm.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/queue_constructor_usm.cpp @@ -4,6 +4,11 @@ // RUN: %if level_zero %{env UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s %} // // CHECK-NOT: LEAK +// +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp b/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp index 99a28bd745853..d5e9520417a71 100644 --- a/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/queue_shortcuts.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp b/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp index badf7293f49cb..9e2416541e091 100644 --- a/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/repeated_exec.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp index 2096a7c7a21f7..37e29e16edf40 100644 --- a/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp index f1aeef51f1c31..b7442d24c6f1d 100644 --- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_execute_without_parent.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp index d26f99a34c290..57b4c1cd3abdd 100644 --- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_multiple_submission.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp index f2b7ca35a77d7..e78794733dc42 100644 --- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_nested.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp index a1d0373290cc6..8c3c5f0980c55 100644 --- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_two_parent_graphs.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp index 09486c4ebc8da..1fdf104310623 100644 --- a/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/usm_copy.cpp @@ -5,9 +5,11 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // -// // USM copy command not supported for OpenCL -// UNSUPPORTED: opencl +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable the tests while investigating the bug. +// UNSUPPORTED: opencl, gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp index 8e89ecc693ed7..66a9589d9e242 100644 --- a/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/usm_fill.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp index 401cdc7aa430f..ed959ca5a400b 100644 --- a/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/usm_fill_shared.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp b/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp index 137b3df19cf02..86c533110f26b 100644 --- a/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/usm_memset.cpp @@ -6,7 +6,10 @@ // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // USM memset command not supported for OpenCL -// UNSUPPORTED: opencl +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: opencl, gpu-intel-dg2 #define GRAPH_E2E_RECORD_REPLAY diff --git a/sycl/test-e2e/Graph/empty_graph.cpp b/sycl/test-e2e/Graph/empty_graph.cpp index ff4984603d908..c574d65e9357f 100644 --- a/sycl/test-e2e/Graph/empty_graph.cpp +++ b/sycl/test-e2e/Graph/empty_graph.cpp @@ -5,6 +5,10 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero && linux %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 UR_L0_LEAKS_DEBUG=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // +// Post-commit test failed +// https://github.com/intel/llvm/actions/runs/7814201804/job/21315560479 +// Temporarily disable USM based tests while investigating the bug. +// UNSUPPORTED: gpu-intel-dg2 // Tests the ability to finalize and submit a command graph which doesn't // contain any nodes. From 4c8569c1ca3153b56ed03af71184292c8afdfa6d Mon Sep 17 00:00:00 2001 From: HPS-1 <65581261+HPS-1@users.noreply.github.com> Date: Thu, 8 Feb 2024 07:29:38 -0500 Subject: [PATCH 11/26] [SYCL] Fix handling of unsupported alignment by aligned_alloc_xxx (#12569) Implementation is supposed to return `nullptr` when requested alignment is not supported. Since our runtime performs all allocations through Unified Runtime that means for us that any alignment which is not a power of two is unsupported. Note that the resulting alignment may not be the same as requested one (per the SYCL 2020 specification) and therefore we can't just rely on return value of underlying non-templated version of the alloc function and have to perform the check explicitly. There is an issue with some backends not properly returning error on an unsupported alignment, it wis reported in #11642 --------- Signed-off-by: Hu, Peisen --- sycl/include/sycl/usm.hpp | 31 +++++++++++ sycl/test-e2e/USM/align.cpp | 102 ++++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) create mode 100755 sycl/test-e2e/USM/align.cpp diff --git a/sycl/include/sycl/usm.hpp b/sycl/include/sycl/usm.hpp index f9ab634edc81a..ff941e865dff8 100644 --- a/sycl/include/sycl/usm.hpp +++ b/sycl/include/sycl/usm.hpp @@ -154,6 +154,13 @@ __SYCL_EXPORT void *aligned_alloc( const property_list &propList, const detail::code_location &CodeLoc = detail::code_location::current()); +/// +// Helper function used to determine if the Alignment argument is a power of 2 +/// +inline size_t is_not_power_of_two(size_t Alignment) { + return (Alignment & (Alignment - 1)); +} + /// // Template forms /// @@ -179,6 +186,9 @@ T *aligned_alloc_device( size_t Alignment, size_t Count, const device &Dev, const context &Ctxt, const property_list &PropList = {}, const detail::code_location &CodeLoc = detail::code_location::current()) { + if (is_not_power_of_two(Alignment)) { + return nullptr; + } return static_cast(aligned_alloc_device(max(Alignment, alignof(T)), Count * sizeof(T), Dev, Ctxt, PropList, CodeLoc)); @@ -189,6 +199,9 @@ T *aligned_alloc_device( size_t Alignment, size_t Count, const queue &Q, const property_list &PropList = {}, const detail::code_location &CodeLoc = detail::code_location::current()) { + if (is_not_power_of_two(Alignment)) { + return nullptr; + } return aligned_alloc_device(Alignment, Count, Q.get_device(), Q.get_context(), PropList, CodeLoc); } @@ -230,6 +243,9 @@ T *aligned_alloc_host( size_t Alignment, size_t Count, const context &Ctxt, const property_list &PropList = {}, const detail::code_location &CodeLoc = detail::code_location::current()) { + if (is_not_power_of_two(Alignment)) { + return nullptr; + } return static_cast(aligned_alloc_host(std ::max(Alignment, alignof(T)), Count * sizeof(T), Ctxt, PropList, CodeLoc)); @@ -240,6 +256,9 @@ T *aligned_alloc_host( size_t Alignment, size_t Count, const queue &Q, const property_list &PropList = {}, const detail::code_location &CodeLoc = detail::code_location::current()) { + if (is_not_power_of_two(Alignment)) { + return nullptr; + } return aligned_alloc_host(Alignment, Count, Q.get_context(), PropList, CodeLoc); } @@ -249,6 +268,9 @@ T *aligned_alloc_shared( size_t Alignment, size_t Count, const device &Dev, const context &Ctxt, const property_list &PropList = {}, const detail::code_location &CodeLoc = detail::code_location::current()) { + if (is_not_power_of_two(Alignment)) { + return nullptr; + } return static_cast(aligned_alloc_shared(max(Alignment, alignof(T)), Count * sizeof(T), Dev, Ctxt, PropList, CodeLoc)); @@ -259,6 +281,9 @@ T *aligned_alloc_shared( size_t Alignment, size_t Count, const queue &Q, const property_list &PropList = {}, const detail::code_location &CodeLoc = detail::code_location::current()) { + if (is_not_power_of_two(Alignment)) { + return nullptr; + } return aligned_alloc_shared(Alignment, Count, Q.get_device(), Q.get_context(), PropList, CodeLoc); } @@ -286,6 +311,9 @@ T *aligned_alloc( size_t Alignment, size_t Count, const device &Dev, const context &Ctxt, usm::alloc Kind, const property_list &PropList = {}, const detail::code_location &CodeLoc = detail::code_location::current()) { + if (is_not_power_of_two(Alignment)) { + return nullptr; + } return static_cast(aligned_alloc(max(Alignment, alignof(T)), Count * sizeof(T), Dev, Ctxt, Kind, PropList, CodeLoc)); @@ -296,6 +324,9 @@ T *aligned_alloc( size_t Alignment, size_t Count, const queue &Q, usm::alloc Kind, const property_list &PropList = {}, const detail::code_location &CodeLoc = detail::code_location::current()) { + if (is_not_power_of_two(Alignment)) { + return nullptr; + } return aligned_alloc(Alignment, Count, Q.get_device(), Q.get_context(), Kind, PropList, CodeLoc); } diff --git a/sycl/test-e2e/USM/align.cpp b/sycl/test-e2e/USM/align.cpp new file mode 100755 index 0000000000000..9659ec78b9a88 --- /dev/null +++ b/sycl/test-e2e/USM/align.cpp @@ -0,0 +1,102 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// UNSUPPORTED: gpu + +// E2E tests for annotated USM allocation functions with alignment arguments +// that are not powers of 2. Note this test does not work on gpu because some +// tests expect non-templated aligned_alloc_xxx functions to return nullptr, +// e.g. when the alignment argument is not a power of 2, while they fail to do +// so when run on gpu. This maybe because the gpu runtime has different +// behavior. Therefore, GPU is unsupported until issue #12638 gets resolved. + +#include + +#include +#include + +using namespace sycl; +using namespace ext::oneapi::experimental; +using namespace ext::intel::experimental; +using alloc = usm::alloc; + +template void testAlign(sycl::queue &q, unsigned align) { + const sycl::context &Ctx = q.get_context(); + auto dev = q.get_device(); + + constexpr int N = 10; + assert(align > 0 || (align & (align - 1)) == 0); + + auto ADevice = [&](size_t align, auto... args) { + return aligned_alloc_device(align, N, args...); + }; + auto AHost = [&](size_t align, auto... args) { + return aligned_alloc_host(align, N, args...); + }; + auto AShared = [&](size_t align, auto... args) { + return aligned_alloc_shared(align, N, args...); + }; + auto AAnnotated = [&](size_t align, auto... args) { + return aligned_alloc(align, N, args...); + }; + + auto ATDevice = [&](size_t align, auto... args) { + return aligned_alloc_device(align, N, args...); + }; + auto ATHost = [&](size_t align, auto... args) { + return aligned_alloc_host(align, N, args...); + }; + auto ATShared = [&](size_t align, auto... args) { + return aligned_alloc_shared(align, N, args...); + }; + auto ATAnnotated = [&](size_t align, auto... args) { + return aligned_alloc(align, N, args...); + }; + + // Test cases that are expected to return null + auto check_null = [&q](auto AllocFn, int Line, int Case) { + decltype(AllocFn()) Ptr = AllocFn(); + if (Ptr != nullptr) { + free(Ptr, q); + std::cout << "Failed at line " << Line << ", case " << Case << std::endl; + assert(false && "The return is not null!"); + } + }; + + auto CheckNullAll = [&](auto Funcs, int Line = __builtin_LINE()) { + std::apply( + [&](auto... Fs) { + int Case = 0; + (void)std::initializer_list{ + (check_null(Fs, Line, Case++), 0)...}; + }, + Funcs); + }; + + CheckNullAll(std::tuple{ + // Case: aligned_alloc_xxx with no alignment property, and the alignment + // argument is not a power of 2, the result is nullptr + [&]() { return ADevice(3, q); }, [&]() { return ADevice(5, dev, Ctx); }, + [&]() { return AHost(7, q); }, [&]() { return AHost(9, Ctx); }, + [&]() { return AShared(114, q); }, + [&]() { return AShared(1023, dev, Ctx); }, + [&]() { return AAnnotated(15, q, alloc::device); }, + [&]() { return AAnnotated(17, dev, Ctx, alloc::host); } + // Case: aligned_alloc_xxx with no alignment property, and the + // alignment argument is not a power of 2, the result is nullptr + , + [&]() { return ATDevice(3, q); }, [&]() { return ATDevice(5, dev, Ctx); }, + [&]() { return ATHost(7, q); }, [&]() { return ATHost(9, Ctx); }, + [&]() { return ATShared(1919, q); }, + [&]() { return ATShared(11, dev, Ctx); }, + [&]() { return ATAnnotated(15, q, alloc::device); }, + [&]() { return ATAnnotated(17, dev, Ctx, alloc::host); }}); +} + +int main() { + sycl::queue q; + testAlign(q, 4); + testAlign(q, 128); + testAlign>(q, 4); + return 0; +} From 170be1f07ee0fc92add0bbee3df35f30cdecc067 Mon Sep 17 00:00:00 2001 From: Jefferson Le Quellec Date: Thu, 8 Feb 2024 15:39:10 +0100 Subject: [PATCH 12/26] [SYCL][DOC] Update the 'constraints' for the complex specification (#12649) This PR apply the modifications asked in the https://github.com/intel/llvm/pull/11792 PR made by @Pennycook --------- Co-authored-by: John Pennycook --- .../experimental/sycl_ext_oneapi_complex.asciidoc | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_complex.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_complex.asciidoc index 41700a898fecb..e4eb444f1ed60 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_complex.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_complex.asciidoc @@ -10,6 +10,7 @@ :encoding: utf-8 :lang: en :dpcpp: pass:[DPC++] +:endnote: —{nbsp}end{nbsp}note // Set the default source code type in this document to C++, // for syntax highlighting purposes. This is needed because @@ -101,8 +102,8 @@ available only in host code as noted below. The complex type is trivially copyable and type trait `is_device_copyable` should resolve to `std::true_type`. -The `T` template parameter must be one of the types float, double, or -sycl::half. +_Constraints_: The `T` template parameter must be one of the types `float`, +`double`, or `sycl::half`. Note: When performing operations between complex numbers and decimals, the decimal is treated as a complex number with a real component equal to @@ -335,6 +336,13 @@ Additionally, this extension introduces support for the `real` and `imag` free functions, which returns the real and imaginary component of a number, respectively. +[_Note:_ The overloads of the functions `real(T)` and `imag(T)` match the +behavior in ISO C++ where `T` would be treated as a complex number with a zero +imaginary component. This is subject to the constraint that `T` must be one of +the types `float`, `double`, `sycl::half`, or evaluate to `true` for +`std::is_integral`. +_{endnote}_] + These functions are available in both host and device code, and each math function should follow the C++ standard for handling `NaN` and `Inf` values. From 698417a1571bb8f0d9c37487fa75f103ff8b1c37 Mon Sep 17 00:00:00 2001 From: Udit Agarwal <16324601+uditagarwal97@users.noreply.github.com> Date: Thu, 8 Feb 2024 07:37:50 -0800 Subject: [PATCH 13/26] [SYCL] Drop 'acc' in favor of 'fpga' from ONEAPI_DEVICE_SELECTOR (#12614) As per the ONEAPI_DEVICE_SELECTOR [documentation](https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#oneapi_device_selector), the device type can only be cpu, gpu, or fpga (or any combination of those). Currently, 'acc' is also accepted by ONEAPI_DEVICE_SELECTOR as a valid device type, which is incorrect. This PR modifies drops support of 'acc' in ONEAPI_DEVICE_SELECTOR in favor of 'fpga'. We have already updated existing test cases (#12551), testing scripts (#12596 ) to use 'fpga' with ONEAPI_DEVICE_SELECTOR. --- sycl/source/detail/allowlist.cpp | 6 ++++-- sycl/source/detail/config.cpp | 16 +++++++------- sycl/source/detail/config.hpp | 6 ++++-- sycl/source/detail/device_filter.cpp | 23 ++++++++++++++------- sycl/unittests/allowlist/ParseAllowList.cpp | 3 ++- 5 files changed, 35 insertions(+), 19 deletions(-) diff --git a/sycl/source/detail/allowlist.cpp b/sycl/source/detail/allowlist.cpp index 881a014c4831f..83309ec9f2d92 100644 --- a/sycl/source/detail/allowlist.cpp +++ b/sycl/source/detail/allowlist.cpp @@ -166,7 +166,8 @@ AllowListParsedT parseAllowList(const std::string &AllowListRaw) { // valid. E.g., for BackendName key, the allowed values are only ones // described in SyclBeMap ValidateEnumValues(BackendNameKeyName, getSyclBeMap()); - ValidateEnumValues(DeviceTypeKeyName, getSyclDeviceTypeMap()); + ValidateEnumValues(DeviceTypeKeyName, + getSyclDeviceTypeMap(true /*Enable 'acc'*/)); if (Key == DeviceVendorIdKeyName) { // DeviceVendorId should have hex format @@ -380,7 +381,8 @@ void applyAllowList(std::vector &PiDevices, Device, PI_DEVICE_INFO_TYPE, sizeof(sycl::detail::pi::PiDeviceType), &PiDevType, nullptr); sycl::info::device_type DeviceType = pi::cast(PiDevType); - for (const auto &SyclDeviceType : getSyclDeviceTypeMap()) { + for (const auto &SyclDeviceType : + getSyclDeviceTypeMap(true /*Enable 'acc'*/)) { if (SyclDeviceType.second == DeviceType) { const auto &DeviceTypeValue = SyclDeviceType.first; DeviceDesc[DeviceTypeKeyName] = DeviceTypeValue; diff --git a/sycl/source/detail/config.cpp b/sycl/source/detail/config.cpp index f7760aa227168..7ae96d42e220d 100644 --- a/sycl/source/detail/config.cpp +++ b/sycl/source/detail/config.cpp @@ -165,14 +165,16 @@ void dumpConfig() { // TODO: host device type will be removed once sycl_ext_oneapi_filter_selector // is removed. const std::array, 6> & -getSyclDeviceTypeMap() { +getSyclDeviceTypeMap(bool supportAcc) { static const std::array, 6> - SyclDeviceTypeMap = {{{"host", info::device_type::host}, - {"cpu", info::device_type::cpu}, - {"gpu", info::device_type::gpu}, - {"acc", info::device_type::accelerator}, - {"fpga", info::device_type::accelerator}, - {"*", info::device_type::all}}}; + SyclDeviceTypeMap = { + {{"host", info::device_type::host}, + {"cpu", info::device_type::cpu}, + {"gpu", info::device_type::gpu}, + /* Duplicate entries are fine as this map is one-directional.*/ + {supportAcc ? "acc" : "fpga", info::device_type::accelerator}, + {"fpga", info::device_type::accelerator}, + {"*", info::device_type::all}}}; return SyclDeviceTypeMap; } diff --git a/sycl/source/detail/config.hpp b/sycl/source/detail/config.hpp index 8f048e0f95f60..1079f32caa388 100644 --- a/sycl/source/detail/config.hpp +++ b/sycl/source/detail/config.hpp @@ -232,8 +232,10 @@ template <> class SYCLConfig { }; // Array is used by SYCL_DEVICE_ALLOWLIST and ONEAPI_DEVICE_SELECTOR. +// The 'supportAcc' parameter is used by SYCL_DEVICE_ALLOWLIST which, +// unlike ONEAPI_DEVICE_SELECTOR, also accepts 'acc' as a valid device type. const std::array, 6> & -getSyclDeviceTypeMap(); +getSyclDeviceTypeMap(bool supportAcc = false); // Array is used by SYCL_DEVICE_FILTER and SYCL_DEVICE_ALLOWLIST and // ONEAPI_DEVICE_SELECTOR @@ -514,7 +516,7 @@ template <> class SYCLConfig { return Result; std::string ValueStr{ValueRaw}; - auto DeviceTypeMap = getSyclDeviceTypeMap(); + auto DeviceTypeMap = getSyclDeviceTypeMap(true /*Enable 'acc'*/); // Iterate over all configurations. size_t Start = 0, End = 0; diff --git a/sycl/source/detail/device_filter.cpp b/sycl/source/detail/device_filter.cpp index 311ebeaa174b8..eb3d0f83ed26e 100644 --- a/sycl/source/detail/device_filter.cpp +++ b/sycl/source/detail/device_filter.cpp @@ -93,9 +93,13 @@ static void Parse_ODS_Device(ods_target &Target, std::string_view TopDeviceStr = DeviceSubTuple[0]; // Handle explicit device type (e.g. 'gpu'). - auto DeviceTypeMap = - getSyclDeviceTypeMap(); // <-- std::array> + auto DeviceTypeMap = getSyclDeviceTypeMap( +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + true /*Enable 'acc'*/ +#endif + ); // <-- std::array> + auto It = std::find_if(std::begin(DeviceTypeMap), std::end(DeviceTypeMap), [&](auto DtPair) { return TopDeviceStr == DtPair.first; }); @@ -262,7 +266,11 @@ Parse_ONEAPI_DEVICE_SELECTOR(const std::string &envString) { std::ostream &operator<<(std::ostream &Out, const ods_target &Target) { Out << Target.Backend; if (Target.DeviceType) { - auto DeviceTypeMap = getSyclDeviceTypeMap(); + auto DeviceTypeMap = getSyclDeviceTypeMap( +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + true /*Enable 'acc'*/ +#endif + ); auto Match = std::find_if( DeviceTypeMap.begin(), DeviceTypeMap.end(), [&](auto Pair) { return (Pair.second == Target.DeviceType); }); @@ -335,11 +343,12 @@ device_filter::device_filter(const std::string &FilterString) { if (TripleValueID >= Tokens.size()) { DeviceType = info::device_type::all; } else { - auto Iter = std::find_if(std::begin(getSyclDeviceTypeMap()), - std::end(getSyclDeviceTypeMap()), FindElement); + auto Iter = std::find_if( + std::begin(getSyclDeviceTypeMap(true /*Enable 'acc'*/)), + std::end(getSyclDeviceTypeMap(true /*Enable 'acc'*/)), FindElement); // If no match is found, set device_type 'all', // which actually means 'any device_type' will be a match. - if (Iter == getSyclDeviceTypeMap().end()) + if (Iter == getSyclDeviceTypeMap(true /*Enable 'acc'*/).end()) DeviceType = info::device_type::all; else { DeviceType = Iter->second; diff --git a/sycl/unittests/allowlist/ParseAllowList.cpp b/sycl/unittests/allowlist/ParseAllowList.cpp index faecab30aaeaf..40fbceb76616e 100644 --- a/sycl/unittests/allowlist/ParseAllowList.cpp +++ b/sycl/unittests/allowlist/ParseAllowList.cpp @@ -178,7 +178,8 @@ TEST(ParseAllowListTests, CheckAllValidBackendNameValuesAreProcessed) { TEST(ParseAllowListTests, CheckAllValidDeviceTypeValuesAreProcessed) { std::string AllowList; - for (const auto &SyclDeviceType : sycl::detail::getSyclDeviceTypeMap()) { + for (const auto &SyclDeviceType : + sycl::detail::getSyclDeviceTypeMap(true /*Enable 'acc'*/)) { if (!AllowList.empty()) AllowList += "|"; AllowList += "DeviceType:" + SyclDeviceType.first; From e2b48a4f2d40cd69d5324cbefdb3cb36c457380d Mon Sep 17 00:00:00 2001 From: Jefferson Le Quellec Date: Thu, 8 Feb 2024 16:56:38 +0100 Subject: [PATCH 14/26] [SYCL][DOC] Trim trailing whitespace and fix typo (#12646) This simple PR fix the `libspriv` typo and removes the unnecessary trailing whitespace. --- sycl/doc/design/CompilerAndRuntimeDesign.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/sycl/doc/design/CompilerAndRuntimeDesign.md b/sycl/doc/design/CompilerAndRuntimeDesign.md index c9d9e2b7eba90..ccc0f642f992f 100644 --- a/sycl/doc/design/CompilerAndRuntimeDesign.md +++ b/sycl/doc/design/CompilerAndRuntimeDesign.md @@ -567,8 +567,8 @@ Unlike other AOT targets, the bitcode module linked from intermediate compiled objects never goes through SPIR-V. Instead it is passed directly in bitcode form down to the NVPTX Back End. All produced bitcode depends on two libraries, `libdevice.bc` (provided by the CUDA SDK) and `libspirv-nvptx64--nvidiacl.bc` variants -(built by the libclc project). `libspirv-nvptx64--nvidiacl.bc` is not used directly. -Instead it is used to generate remangled variants +(built by the libclc project). `libspirv-nvptx64--nvidiacl.bc` is not used directly. +Instead it is used to generate remangled variants `remangled-l64-signed_char.libspirv-nvptx64--nvidiacl.bc` and `remangled-l32-signed_char.libspirv-nvptx64--nvidiacl.bc` to handle primitive type differences between Linux and Windows. @@ -600,14 +600,14 @@ path in SYCL kernels. ##### NVPTX Builtins -Builtins are implemented in OpenCL C within libclc. OpenCL C treats `long` +Builtins are implemented in OpenCL C within libclc. OpenCL C treats `long` types as 64 bit and has no `long long` types while Windows DPC++ treats `long` -types like 32-bit integers and `long long` types like 64-bit integers. -Differences between the primitive types can cause applications to use -incompatible libclc built-ins. A remangler creates multiple libspriv files -with different remangled function names to support both Windows and Linux. -When building a SYCL application targeting the CUDA backend the driver -will link the device code with +types like 32-bit integers and `long long` types like 64-bit integers. +Differences between the primitive types can cause applications to use +incompatible libclc built-ins. A remangler creates multiple libspirv files +with different remangled function names to support both Windows and Linux. +When building a SYCL application targeting the CUDA backend the driver +will link the device code with `remangled-l32-signed_char.libspirv-nvptx64--nvidiacl.bc` if the host target is Windows or it will link the device code with `remangled-l64-signed_char.libspirv-nvptx64--nvidiacl.bc` if the host target is @@ -916,7 +916,7 @@ template class multi_ptr { // DecoratedType::type == "__attribute__((opencl_global)) T" // See sycl/include/sycl/access/access.hpp for more details using pointer_t = typename DecoratedType::type *; - + pointer_t m_Pointer; public: pointer_t get() { return m_Pointer; } From 48f60d5c603de9dc74dd30936976c0d94c0afdaf Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Thu, 8 Feb 2024 17:06:11 +0100 Subject: [PATCH 15/26] [SYCL][NFC] Reduce ambiguity between discarding SYCL and PI events (#12547) The old variable name for support of discarding PI events (MHasDiscardEventsSupport) made it easy to confuse with discarding SYCL events (MDiscardEvents), especially since we could have a situation where MHasDiscardEventsSupport is false, but MDiscardEvents is true. Rename the variable for discarding PI events and reword its comment to make the distinction clearer. --- sycl/source/detail/queue_impl.cpp | 2 +- sycl/source/detail/queue_impl.hpp | 27 ++++++++++++----------- sycl/source/detail/scheduler/commands.cpp | 10 ++++----- sycl/source/handler.cpp | 2 +- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 545cff3b9c5c3..ab066ed1906a4 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -354,7 +354,7 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr &Self, getExtendDependencyList(DepEvents, MutableDepEvents, Lock); if (areEventsSafeForSchedulerBypass(ExpandedDepEvents, MContext)) { - if (MHasDiscardEventsSupport) { + if (MSupportsDiscardingPiEvents) { MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents), /*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr); return createDiscardedEvent(); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index ddd6a71d7db80..0fe4242cc9472 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -115,8 +115,8 @@ class queue_impl { MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MHasDiscardEventsSupport(MDiscardEvents && - (MHostQueue ? true : MIsInorder)), + MSupportsDiscardingPiEvents(MDiscardEvents && + (MHostQueue ? true : MIsInorder)), MQueueID{ MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} { if (has_property()) { @@ -292,8 +292,8 @@ class queue_impl { MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MHasDiscardEventsSupport(MDiscardEvents && - (MHostQueue ? true : MIsInorder)), + MSupportsDiscardingPiEvents(MDiscardEvents && + (MHostQueue ? true : MIsInorder)), MQueueID{ MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} { queue_impl_interop(PiQueue); @@ -317,8 +317,8 @@ class queue_impl { MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MHasDiscardEventsSupport(MDiscardEvents && - (MHostQueue ? true : MIsInorder)) { + MSupportsDiscardingPiEvents(MDiscardEvents && + (MHostQueue ? true : MIsInorder)) { queue_impl_interop(PiQueue); } @@ -374,7 +374,9 @@ class queue_impl { bool is_host() const { return MHostQueue; } /// \return true if this queue has discard_events support. - bool has_discard_events_support() const { return MHasDiscardEventsSupport; } + bool supportsDiscardingPiEvents() const { + return MSupportsDiscardingPiEvents; + } bool isInOrder() const { return MIsInorder; } @@ -970,12 +972,11 @@ class queue_impl { const bool MIsProfilingEnabled; protected: - // This flag says if we can discard events based on a queue "setup" which will - // be common for all operations submitted to the queue. This is a must - // condition for discarding, but even if it's true, in some cases, we won't be - // able to discard events, because the final decision is made right before the - // operation itself. - const bool MHasDiscardEventsSupport; + // Indicates whether the queue supports discarding PI events for tasks + // submitted to it. This condition is necessary but not sufficient, PI events + // should be discarded only if they also don't represent potential implicit + // dependencies for future tasks in other queues. + const bool MSupportsDiscardingPiEvents; // Command graph which is associated with this queue for the purposes of // recording commands to it. diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 23b6eec33886e..955adae8423dc 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2704,7 +2704,7 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { } sycl::detail::pi::PiEvent *Event = - (MQueue->has_discard_events_support() && + (MQueue->supportsDiscardingPiEvents() && MCommandGroup->getRequirements().size() == 0) ? nullptr : &MEvent->getHandleRef(); @@ -2851,11 +2851,11 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { auto RawEvents = getPiEvents(EventImpls); flushCrossQueueDeps(EventImpls, getWorkerQueue()); - bool DiscardEvent = (MQueue->has_discard_events_support() && - MCommandGroup->getRequirements().size() == 0); + bool DiscardPiEvent = (MQueue->supportsDiscardingPiEvents() && + MCommandGroup->getRequirements().size() == 0); sycl::detail::pi::PiEvent *Event = - DiscardEvent ? nullptr : &MEvent->getHandleRef(); - detail::EventImplPtr EventImpl = DiscardEvent ? nullptr : MEvent; + DiscardPiEvent ? nullptr : &MEvent->getHandleRef(); + detail::EventImplPtr EventImpl = DiscardPiEvent ? nullptr : MEvent; switch (MCommandGroup->getType()) { diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 02ffef951d1b5..70681a7504358 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -292,7 +292,7 @@ event handler::finalize() { }; bool DiscardEvent = false; - if (MQueue->has_discard_events_support()) { + if (MQueue->supportsDiscardingPiEvents()) { // Kernel only uses assert if it's non interop one bool KernelUsesAssert = !(MKernel && MKernel->isInterop()) && From 2dc080eba3b071853b7587d53654f8b544c5f2fa Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Thu, 8 Feb 2024 17:18:08 +0100 Subject: [PATCH 16/26] [SYCL][NFC] Refresh inline asm E2E tests (#12642) This PR should improve tests stability and transparency (when it comes to unsupported tests). List of changes: - switched from `[[intel::reqd_sub_group_size]]` to `[[sycl::reqd_sub_group_size]]` attribute; - removed uses of deprecated `device::has_extension` API: there is no need for that check, because, `[[sycl::reqd_sub_group_size]]` is a core SYCL 2020 functionality; - simplified helpers to remove unnecessary extra arguments which could be inferred otherwise; - updated `REQUIRES` directives to include required sub-group size requirements in there. Adjusted calls to `launchInlineASMTest` helper to consistently pass required sub-group size in there as well; --- .../InlineAsm/Negative/asm_bad_opcode.cpp | 6 +-- .../Negative/asm_bad_operand_syntax.cpp | 6 +-- .../Negative/asm_duplicate_label.cpp | 6 +-- .../Negative/asm_illegal_exec_size.cpp | 6 +-- .../InlineAsm/Negative/asm_missing_label.cpp | 6 +-- .../InlineAsm/Negative/asm_missing_region.cpp | 6 +-- .../InlineAsm/Negative/asm_simple.cpp | 6 +-- .../InlineAsm/Negative/asm_undefined_decl.cpp | 6 +-- .../InlineAsm/Negative/asm_undefined_pred.cpp | 6 +-- .../InlineAsm/Negative/asm_wrong_declare.cpp | 6 +-- sycl/test-e2e/InlineAsm/asm_16_empty.cpp | 6 +-- .../test-e2e/InlineAsm/asm_16_matrix_mult.cpp | 6 +-- .../InlineAsm/asm_16_no_input_int.cpp | 6 +-- sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp | 6 +-- sycl/test-e2e/InlineAsm/asm_8_empty.cpp | 6 +-- .../test-e2e/InlineAsm/asm_8_no_input_int.cpp | 6 +-- .../InlineAsm/asm_arbitrary_ops_order.cpp | 6 +-- sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp | 6 +-- sycl/test-e2e/InlineAsm/asm_float_add.cpp | 6 +-- sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp | 6 +-- sycl/test-e2e/InlineAsm/asm_float_neg.cpp | 6 +-- sycl/test-e2e/InlineAsm/asm_if.cpp | 6 +-- sycl/test-e2e/InlineAsm/asm_imm_arg.cpp | 6 +-- sycl/test-e2e/InlineAsm/asm_loop.cpp | 6 +-- sycl/test-e2e/InlineAsm/asm_mul.cpp | 6 +-- .../InlineAsm/asm_multiple_instructions.cpp | 6 +-- sycl/test-e2e/InlineAsm/asm_no_operands.cpp | 18 ++++---- sycl/test-e2e/InlineAsm/asm_no_output.cpp | 6 +-- sycl/test-e2e/InlineAsm/asm_plus_mod.cpp | 6 +-- sycl/test-e2e/InlineAsm/asm_switch.cpp | 6 +-- sycl/test-e2e/InlineAsm/include/asmhelper.h | 41 ++++++++----------- sycl/test-e2e/InlineAsm/letter_example.cpp | 9 ++-- sycl/test-e2e/InlineAsm/malloc_shared_32.cpp | 9 ++-- .../InlineAsm/malloc_shared_in_out_dif.cpp | 9 ++-- .../InlineAsm/malloc_shared_no_input.cpp | 9 ++-- 35 files changed, 125 insertions(+), 144 deletions(-) diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp index 935df8cab7afa..a16b903c09a84 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp index cf21d2cd84714..ea7434dd31498 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp index 50f63b777d647..58978e203d8a8 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp index 053c48e53e353..98d9b1ff1520d 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp index 4754d32a57b77..47f0fd98311ae 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp index cf7ab9f6d8a6e..395eb4af68c03 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp index 4a24d6dbd441e..e36a15cf1cbf4 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp index ec6f1026d9bea..53b64cf2ba2fd 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp index b76357962f0fc..0ce42082fe2ef 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp index 3b2d8aa1353ee..86151cdc7cab7 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp @@ -1,6 +1,6 @@ // UNSUPPORTED: cuda || hip // UNSUPPORTED: ze_debug -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -13,7 +13,7 @@ struct KernelFunctor { void operator()(sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>{16}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile(".decl tmp1 v_type=G type=d num_elts=16 align=GRF\n" ".decl tmp2 v_type=G type=d num_elts=16 align=GRF\n" @@ -25,6 +25,6 @@ struct KernelFunctor { int main() { KernelFunctor f; - launchInlineASMTest(f, /* sg size */ true, /* exception expected */ true); + launchInlineASMTest(f, {16}, /* exception expected */ true); return 0; } diff --git a/sycl/test-e2e/InlineAsm/asm_16_empty.cpp b/sycl/test-e2e/InlineAsm/asm_16_empty.cpp index 71ca270f231cd..78321716777ac 100644 --- a/sycl/test-e2e/InlineAsm/asm_16_empty.cpp +++ b/sycl/test-e2e/InlineAsm/asm_16_empty.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda || hip_nvidia -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -19,7 +19,7 @@ template struct KernelFunctor : WithOutputBuffer { cgh); cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { C[wiID] = 43; #if defined(__SYCL_DEVICE_ONLY__) asm volatile(""); @@ -30,7 +30,7 @@ template struct KernelFunctor : WithOutputBuffer { int main() { KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; if (verify_all_the_same(f.getOutputBufferData(), 43)) diff --git a/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp b/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp index c28616ee70750..00c3eb3830f38 100644 --- a/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp +++ b/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -19,7 +19,7 @@ template struct KernelFunctor : WithOutputBuffer { cgh); cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { volatile int output = 0; #if defined(__SYCL_DEVICE_ONLY__) asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d" : "=rw"(output)); @@ -33,7 +33,7 @@ template struct KernelFunctor : WithOutputBuffer { int main() { KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; if (verify_all_the_same(f.getOutputBufferData(), 7)) diff --git a/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp b/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp index c28616ee70750..00c3eb3830f38 100644 --- a/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp +++ b/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -19,7 +19,7 @@ template struct KernelFunctor : WithOutputBuffer { cgh); cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { volatile int output = 0; #if defined(__SYCL_DEVICE_ONLY__) asm volatile("mov (M1,16) %0(0,0)<1> 0x7:d" : "=rw"(output)); @@ -33,7 +33,7 @@ template struct KernelFunctor : WithOutputBuffer { int main() { KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; if (verify_all_the_same(f.getOutputBufferData(), 7)) diff --git a/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp b/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp index 11d647cdfbe1d..812e30a7c9f56 100644 --- a/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp +++ b/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -19,7 +19,7 @@ template struct KernelFunctor : WithOutputBuffer { cgh); cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { for (int i = 0; i < 10; ++i) { #if defined(__SYCL_DEVICE_ONLY__) asm("fence_sw"); @@ -35,7 +35,7 @@ template struct KernelFunctor : WithOutputBuffer { int main() { KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; if (verify_all_the_same(f.getOutputBufferData(), 45)) diff --git a/sycl/test-e2e/InlineAsm/asm_8_empty.cpp b/sycl/test-e2e/InlineAsm/asm_8_empty.cpp index d43b9cd83ac81..26d0952225901 100644 --- a/sycl/test-e2e/InlineAsm/asm_8_empty.cpp +++ b/sycl/test-e2e/InlineAsm/asm_8_empty.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda || hip_nvidia -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-8 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -19,7 +19,7 @@ template struct KernelFunctor : WithOutputBuffer { cgh); cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(8)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(8)]] { C[wiID] = 43; #if defined(__SYCL_DEVICE_ONLY__) asm volatile(""); @@ -30,7 +30,7 @@ template struct KernelFunctor : WithOutputBuffer { int main() { KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); - if (!launchInlineASMTest(f, true, false, {8})) + if (!launchInlineASMTest(f, {8})) return 0; if (verify_all_the_same(f.getOutputBufferData(), 43)) diff --git a/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp b/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp index ab15d58164e3f..e671acc825e7b 100644 --- a/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp +++ b/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-8 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -19,7 +19,7 @@ template struct KernelFunctor : WithOutputBuffer { cgh); cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(8)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(8)]] { volatile int output = 0; #if defined(__SYCL_DEVICE_ONLY__) asm volatile("mov (M1,8) %0(0,0)<1> 0x7:d" : "=rw"(output)); @@ -33,7 +33,7 @@ template struct KernelFunctor : WithOutputBuffer { int main() { KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); - if (!launchInlineASMTest(f, true, false, {8})) + if (!launchInlineASMTest(f, {8})) return 0; if (verify_all_the_same(f.getOutputBufferData(), 7)) diff --git a/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp b/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp index 401dfbcacb63c..739feed41d6c9 100644 --- a/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp +++ b/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -33,7 +33,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm("mad (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0> %3(0, " "0)<1;1,0>" @@ -56,7 +56,7 @@ int main() { } KernelFunctor<> f(inputA, inputB, inputC); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; auto &D = f.getOutputBufferData(); diff --git a/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp b/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp index dd7cbc6739816..60d200d1c99df 100644 --- a/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp +++ b/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -29,7 +29,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { // declaration of temp within and outside the scope #if defined(__SYCL_DEVICE_ONLY__) asm("{\n" @@ -59,7 +59,7 @@ int main() { } KernelFunctor<> f(inputA, inputB); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; auto &C = f.getOutputBufferData(); diff --git a/sycl/test-e2e/InlineAsm/asm_float_add.cpp b/sycl/test-e2e/InlineAsm/asm_float_add.cpp index bd4298b0ddad3..b6374c96d2010 100644 --- a/sycl/test-e2e/InlineAsm/asm_float_add.cpp +++ b/sycl/test-e2e/InlineAsm/asm_float_add.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -30,7 +30,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm("add (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" : "=rw"(C[wiID]) @@ -51,7 +51,7 @@ int main() { } KernelFunctor<> f(inputA, inputB); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; auto &C = f.getOutputBufferData(); diff --git a/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp b/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp index 51cfa81149118..948d983554c4e 100644 --- a/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp +++ b/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -27,7 +27,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm("mul (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2" : "=rw"(B[wiID]) @@ -45,7 +45,7 @@ int main() { input[i] = (float)1 / std::pow(2, i); KernelFunctor<> f(input); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; auto &B = f.getOutputBufferData(); diff --git a/sycl/test-e2e/InlineAsm/asm_float_neg.cpp b/sycl/test-e2e/InlineAsm/asm_float_neg.cpp index 39176ab355c96..43ad56e41222d 100644 --- a/sycl/test-e2e/InlineAsm/asm_float_neg.cpp +++ b/sycl/test-e2e/InlineAsm/asm_float_neg.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -25,7 +25,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm("mov (M1, 16) %0(0, 0)<1> (-)%1(0, 0)<1;1,0>" : "=rw"(B[wiID]) @@ -45,7 +45,7 @@ int main() { input[i] = 1.0 / i; KernelFunctor<> f(input); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; auto &R = f.getOutputBufferData(); diff --git a/sycl/test-e2e/InlineAsm/asm_if.cpp b/sycl/test-e2e/InlineAsm/asm_if.cpp index 9cee76efee2e0..679980a62aaf7 100644 --- a/sycl/test-e2e/InlineAsm/asm_if.cpp +++ b/sycl/test-e2e/InlineAsm/asm_if.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -18,7 +18,7 @@ template struct KernelFunctor : WithOutputBuffer { bool switchField = false; CGH.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { int Output = 0; #if defined(__SYCL_DEVICE_ONLY__) asm volatile("{\n" @@ -42,7 +42,7 @@ template struct KernelFunctor : WithOutputBuffer { int main() { KernelFunctor<> Functor(DEFAULT_PROBLEM_SIZE); - if (!launchInlineASMTest(Functor)) + if (!launchInlineASMTest(Functor, {16})) return 0; if (verify_all_the_same(Functor.getOutputBufferData(), 7)) diff --git a/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp b/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp index 48415e1445af0..d2a53f14691da 100644 --- a/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp +++ b/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -26,7 +26,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm("add (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2" : "=rw"(B[wiID]) @@ -44,7 +44,7 @@ int main() { input[i] = i; KernelFunctor<> f(input); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; auto &B = f.getOutputBufferData(); diff --git a/sycl/test-e2e/InlineAsm/asm_loop.cpp b/sycl/test-e2e/InlineAsm/asm_loop.cpp index 0c1cc06568d6a..56518dffbf4e3 100644 --- a/sycl/test-e2e/InlineAsm/asm_loop.cpp +++ b/sycl/test-e2e/InlineAsm/asm_loop.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -29,7 +29,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { CGH); CGH.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm volatile("{\n" ".decl P1 v_type=P num_elts=16\n" @@ -67,7 +67,7 @@ int main() { } KernelFunctor<> Functor(InputA, InputB); - if (!launchInlineASMTest(Functor)) + if (!launchInlineASMTest(Functor, {16})) return 0; auto &C = Functor.getOutputBufferData(); diff --git a/sycl/test-e2e/InlineAsm/asm_mul.cpp b/sycl/test-e2e/InlineAsm/asm_mul.cpp index 3f2b91c53cf59..b038a79f8abb1 100644 --- a/sycl/test-e2e/InlineAsm/asm_mul.cpp +++ b/sycl/test-e2e/InlineAsm/asm_mul.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -28,7 +28,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm("mul (M1, 16) %0(0, 0)<1> %1(0, 0)<1;1,0> %2(0, 0)<1;1,0>" : "=rw"(C[wiID]) @@ -49,7 +49,7 @@ int main() { } KernelFunctor<> f(inputA, inputB); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; auto &C = f.getOutputBufferData(); diff --git a/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp b/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp index f51f5f58501ca..cdcf08b0f809d 100644 --- a/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp +++ b/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -DTO_PASS -o %t.out.pass // RUN: %{run} %t.out.pass // RUN: %{build} -o %t.out @@ -36,7 +36,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(TO_PASS) // The code below passing verification volatile int output = -1; @@ -85,7 +85,7 @@ int main() { } KernelFunctor<> f(inputA, inputB, inputC); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; if (verify_all_the_same(f.getOutputBufferData(), diff --git a/sycl/test-e2e/InlineAsm/asm_no_operands.cpp b/sycl/test-e2e/InlineAsm/asm_no_operands.cpp index 5037fd68af41c..5b770bb5baa0d 100644 --- a/sycl/test-e2e/InlineAsm/asm_no_operands.cpp +++ b/sycl/test-e2e/InlineAsm/asm_no_operands.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -12,10 +12,7 @@ int main() { sycl::queue Queue; sycl::device Device = Queue.get_device(); - auto Vec = Device.get_info(); - if (!isInlineASMSupported(Device) || - std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") == - std::end(Vec)) { + if (!isInlineASMSupported(Device)) { std::cout << "Skipping test\n"; return 0; } @@ -25,12 +22,13 @@ int main() { // Submitting command group(work) to queue Queue.submit([&](sycl::handler &cgh) { // Executing kernel - cgh.parallel_for( - NumOfWorkItems, - [=](sycl::id<1> WIid) [[intel::reqd_sub_group_size(16)]] { + // clang-format off + cgh.parallel_for(NumOfWorkItems, + [=](sycl::id<1> WIid) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) - asm("barrier"); + asm("barrier"); #endif - }); + }); + // clang-format on }); } diff --git a/sycl/test-e2e/InlineAsm/asm_no_output.cpp b/sycl/test-e2e/InlineAsm/asm_no_output.cpp index a6d44b35a63dd..0622f4f8edb38 100644 --- a/sycl/test-e2e/InlineAsm/asm_no_output.cpp +++ b/sycl/test-e2e/InlineAsm/asm_no_output.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -19,7 +19,7 @@ template struct KernelFunctor : WithOutputBuffer { cgh); cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { volatile int local_var = 47; local_var += C[0]; #if defined(__SYCL_DEVICE_ONLY__) @@ -37,7 +37,7 @@ template struct KernelFunctor : WithOutputBuffer { int main() { KernelFunctor<> f(DEFAULT_PROBLEM_SIZE); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; if (verify_all_the_same(f.getOutputBufferData(), 0)) diff --git a/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp b/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp index 10157938517bf..db2659d8e7077 100644 --- a/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp +++ b/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -25,7 +25,7 @@ struct KernelFunctor : WithInputBuffers, WithOutputBuffer { cgh.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) asm("add (M1, 16) %0(0, 0)<1> %0(0, 0)<1;1,0> %1(0, 0)<1;1,0>" : "+rw"(B[wiID]) @@ -47,7 +47,7 @@ int main() { } KernelFunctor<> f(inputA, inputB); - if (!launchInlineASMTest(f)) + if (!launchInlineASMTest(f, {16})) return 0; auto &B = f.getOutputBufferData(); diff --git a/sycl/test-e2e/InlineAsm/asm_switch.cpp b/sycl/test-e2e/InlineAsm/asm_switch.cpp index e77a7e6edc531..ccd81fd34af45 100644 --- a/sycl/test-e2e/InlineAsm/asm_switch.cpp +++ b/sycl/test-e2e/InlineAsm/asm_switch.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -18,7 +18,7 @@ template struct KernelFunctor : WithOutputBuffer { int switchField = 2; CGH.parallel_for>( sycl::range<1>{this->getOutputBufferSize()}, - [=](sycl::id<1> wiID) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> wiID) [[sycl::reqd_sub_group_size(16)]] { int Output = 0; #if defined(__SYCL_DEVICE_ONLY__) asm volatile("{\n" @@ -62,7 +62,7 @@ template struct KernelFunctor : WithOutputBuffer { int main() { KernelFunctor<> Functor(DEFAULT_PROBLEM_SIZE); - if (!launchInlineASMTest(Functor)) + if (!launchInlineASMTest(Functor, {16})) return 0; if (verify_all_the_same(Functor.getOutputBufferData(), 7)) diff --git a/sycl/test-e2e/InlineAsm/include/asmhelper.h b/sycl/test-e2e/InlineAsm/include/asmhelper.h index f96d056dcfec7..abc7d24668691 100644 --- a/sycl/test-e2e/InlineAsm/include/asmhelper.h +++ b/sycl/test-e2e/InlineAsm/include/asmhelper.h @@ -91,8 +91,8 @@ auto exception_handler = [](sycl::exception_list exceptions) { }; template -bool launchInlineASMTestImpl(F &f, bool requires_particular_sg_size = true, - std::vector RequiredSGSizes = {}) { +bool launchInlineASMTestImpl(F &f, + const std::vector &RequiredSGSizes = {}) { sycl::queue deviceQueue(sycl::gpu_selector_v, exception_handler); sycl::device device = deviceQueue.get_device(); @@ -101,22 +101,19 @@ bool launchInlineASMTestImpl(F &f, bool requires_particular_sg_size = true, return false; } - auto Vec = device.get_info(); - if (requires_particular_sg_size && - std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") == - std::end(Vec)) { - std::cout << "Skipping test\n"; - return false; - } - - auto sg_sizes = device.get_info(); - if (std::any_of(RequiredSGSizes.begin(), RequiredSGSizes.end(), - [&](size_t RequiredSGSize) { - return std::find(sg_sizes.begin(), sg_sizes.end(), - RequiredSGSize) == sg_sizes.end(); - })) { - std::cout << "Skipping test\n"; - return false; + if (!RequiredSGSizes.empty()) { + auto supported_sg_sizes = + device.get_info(); + if (std::any_of(RequiredSGSizes.begin(), RequiredSGSizes.end(), + [&](size_t RequiredSGSize) { + return std::find(supported_sg_sizes.begin(), + supported_sg_sizes.end(), + RequiredSGSize) == + supported_sg_sizes.end(); + })) { + std::cout << "Skipping test\n"; + return false; + } } deviceQueue.submit(f).wait_and_throw(); @@ -128,13 +125,11 @@ bool launchInlineASMTestImpl(F &f, bool requires_particular_sg_size = true, /// /// \returns false if test wasn't launched (i.e.was skipped) and true otherwise template -bool launchInlineASMTest(F &f, bool requires_particular_sg_size = true, - bool exception_expected = false, - std::vector RequiredSGSizes = {}) { +bool launchInlineASMTest(F &f, const std::vector &RequiredSGSizes = {}, + bool exception_expected = false) { bool result = false; try { - result = launchInlineASMTestImpl(f, requires_particular_sg_size, - RequiredSGSizes); + result = launchInlineASMTestImpl(f, RequiredSGSizes); } catch (sycl::exception &e) { std::string what = e.what(); if (exception_expected && diff --git a/sycl/test-e2e/InlineAsm/letter_example.cpp b/sycl/test-e2e/InlineAsm/letter_example.cpp index 393c362c33fe0..c09c6d9f75ae4 100644 --- a/sycl/test-e2e/InlineAsm/letter_example.cpp +++ b/sycl/test-e2e/InlineAsm/letter_example.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16,aspect-usm_shared_allocations // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -15,10 +15,7 @@ int main() { sycl::queue q; sycl::device Device = q.get_device(); - auto Vec = Device.get_info(); - if (!isInlineASMSupported(Device) || - std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") == - std::end(Vec)) { + if (!isInlineASMSupported(Device)) { std::cout << "Skipping test\n"; return 0; } @@ -31,7 +28,7 @@ int main() { q.submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>(problem_size), - [=](sycl::id<1> idx) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> idx) [[sycl::reqd_sub_group_size(16)]] { #if defined(__SYCL_DEVICE_ONLY__) int i = idx[0]; asm volatile("{\n.decl V52 v_type=G type=d num_elts=16 align=GRF\n" diff --git a/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp b/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp index be2ee81836261..c541ac4d17838 100644 --- a/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp +++ b/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-32,aspect-usm_shared_allocations // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -16,10 +16,7 @@ int main() { sycl::device Device = q.get_device(); - auto Vec = Device.get_info(); - if (!isInlineASMSupported(Device) || - std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") == - std::end(Vec)) { + if (!isInlineASMSupported(Device)) { std::cout << "Skipping test\n"; return 0; } @@ -40,7 +37,7 @@ int main() { q.submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>(problem_size), - [=](sycl::id<1> idx) [[intel::reqd_sub_group_size(32)]] { + [=](sycl::id<1> idx) [[sycl::reqd_sub_group_size(32)]] { int i = idx[0]; #if defined(__SYCL_DEVICE_ONLY__) asm volatile(R"a( diff --git a/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp b/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp index 0470bb2431e9a..5ea597d5d1064 100644 --- a/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp +++ b/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16,aspect-usm_shared_allocations // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -16,10 +16,7 @@ int main() { sycl::device Device = q.get_device(); - auto Vec = Device.get_info(); - if (!isInlineASMSupported(Device) || - std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") == - std::end(Vec)) { + if (!isInlineASMSupported(Device)) { std::cout << "Skipping test\n"; return 0; } @@ -37,7 +34,7 @@ int main() { q.submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>(problem_size), - [=](sycl::id<1> idx) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> idx) [[sycl::reqd_sub_group_size(16)]] { int i = idx[0]; volatile int tmp = a[i]; tmp += 1; diff --git a/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp b/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp index 76ad3f6e95260..3da4e4c7e7e71 100644 --- a/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp +++ b/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp @@ -1,5 +1,5 @@ // UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux +// REQUIRES: gpu,linux,sg-16,aspect-usm_shared_allocations // RUN: %{build} -o %t.out // RUN: %{run} %t.out @@ -15,10 +15,7 @@ int main() { sycl::queue q; sycl::device Device = q.get_device(); - auto Vec = Device.get_info(); - if (!isInlineASMSupported(Device) || - std::find(Vec.begin(), Vec.end(), "cl_intel_required_subgroup_size") == - std::end(Vec)) { + if (!isInlineASMSupported(Device)) { std::cout << "Skipping test\n"; return 0; } @@ -31,7 +28,7 @@ int main() { q.submit([&](sycl::handler &cgh) { cgh.parallel_for( sycl::range<1>(problem_size), - [=](sycl::id<1> idx) [[intel::reqd_sub_group_size(16)]] { + [=](sycl::id<1> idx) [[sycl::reqd_sub_group_size(16)]] { int i = idx[0]; #if defined(__SYCL_DEVICE_ONLY__) asm volatile("mov (M1, 16) %0(0,0)<1> 0x7:d" : "=rw"(a[i])); From 89eccced17d0654659516b2709327bd09729729e Mon Sep 17 00:00:00 2001 From: Maksim Sabianin Date: Thu, 8 Feb 2024 17:37:03 +0100 Subject: [PATCH 17/26] [SYCL] Move module splitting functionality from sycl-post-link to SYCLLowerIR (#12622) This is a part of migration to New Offloading model and clang-linker-wrapper tool. The signature of the verifyNoCrossModuleDeviceGlobalUsage is changed so that it returns an Error instead of aborting the executable. --- .../llvm/SYCLLowerIR}/ModuleSplitter.h | 10 +++-- llvm/lib/SYCLLowerIR/CMakeLists.txt | 1 + .../SYCLLowerIR}/ModuleSplitter.cpp | 43 ++++++++++++------- llvm/tools/sycl-post-link/CMakeLists.txt | 1 - .../sycl-post-link/SYCLDeviceRequirements.cpp | 2 +- llvm/tools/sycl-post-link/sycl-post-link.cpp | 9 ++-- 6 files changed, 42 insertions(+), 24 deletions(-) rename llvm/{tools/sycl-post-link => include/llvm/SYCLLowerIR}/ModuleSplitter.h (97%) rename llvm/{tools/sycl-post-link => lib/SYCLLowerIR}/ModuleSplitter.cpp (97%) diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.h b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h similarity index 97% rename from llvm/tools/sycl-post-link/ModuleSplitter.h rename to llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h index aaf6108325765..eb09e7528ff49 100644 --- a/llvm/tools/sycl-post-link/ModuleSplitter.h +++ b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h @@ -10,7 +10,8 @@ // of the split is new modules containing corresponding callgraph. //===----------------------------------------------------------------------===// -#pragma once +#ifndef LLVM_SYCLLOWERIR_MODULE_SPLITTER_H +#define LLVM_SYCLLOWERIR_MODULE_SPLITTER_H #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringRef.h" @@ -18,6 +19,7 @@ #include "llvm/Support/Error.h" #include +#include #include namespace llvm { @@ -229,8 +231,8 @@ class ModuleSplitterBase { // For device global variables with the 'device_image_scope' property, // the function checks that there are no usages of a single device global // variable from kernels grouped to different modules. Otherwise, an error is - // issued and the tool is aborted. - void verifyNoCrossModuleDeviceGlobalUsage(); + // returned. + Error verifyNoCrossModuleDeviceGlobalUsage(); virtual ~ModuleSplitterBase() = default; @@ -262,3 +264,5 @@ void dumpEntryPoints(const Module &M, bool OnlyKernelsAreEntryPoints = false, } // namespace module_split } // namespace llvm + +#endif // LLVM_SYCLLOWERIR_MODULE_SPLITTER_H diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt index cfafbe1a66c87..b2afa8150aa4f 100644 --- a/llvm/lib/SYCLLowerIR/CMakeLists.txt +++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt @@ -62,6 +62,7 @@ add_llvm_component_library(LLVMSYCLLowerIR LowerInvokeSimd.cpp LowerWGLocalMemory.cpp LowerWGScope.cpp + ModuleSplitter.cpp MutatePrintfAddrspace.cpp SYCLAddOptLevelAttribute.cpp SYCLPropagateAspectsUsage.cpp diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.cpp b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp similarity index 97% rename from llvm/tools/sycl-post-link/ModuleSplitter.cpp rename to llvm/lib/SYCLLowerIR/ModuleSplitter.cpp index 87cbf42da2df2..92ff992141945 100644 --- a/llvm/tools/sycl-post-link/ModuleSplitter.cpp +++ b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp @@ -8,9 +8,7 @@ // See comments in the header. //===----------------------------------------------------------------------===// -#include "ModuleSplitter.h" -#include "Support.h" - +#include "llvm/SYCLLowerIR/ModuleSplitter.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringExtras.h" @@ -23,6 +21,7 @@ #include "llvm/SYCLLowerIR/DeviceGlobals.h" #include "llvm/SYCLLowerIR/LowerInvokeSimd.h" #include "llvm/SYCLLowerIR/SYCLUtils.h" +#include "llvm/Support/Error.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/GlobalDCE.h" #include "llvm/Transforms/IPO/StripDeadPrototypes.h" @@ -426,14 +425,15 @@ class ModuleSplitter : public ModuleSplitterBase { DependencyGraph CG; }; } // namespace + namespace llvm { namespace module_split { -void ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() { +Error ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() { const Module &M = getInputModule(); // Early exit if there is only one group if (Groups.size() < 2) - return; + return Error::success(); // Reverse the EntryPointGroupMap to get a map of entry point -> module's name unsigned EntryPointNumber = 0; @@ -451,19 +451,25 @@ void ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() { std::optional VarEntryPointModule{}; auto CheckEntryPointModule = [&VarEntryPointModule, &EntryPointModules, - &GV](const auto *F) { + &GV](const auto *F) -> Error { auto EntryPointModulesIt = EntryPointModules.find(F); - assert(EntryPointModulesIt != EntryPointModules.end() && - "There is no group for an entry point"); + if (EntryPointModulesIt == EntryPointModules.end()) + return createStringError(inconvertibleErrorCode(), + "There is no group for an entry point"); + if (!VarEntryPointModule.has_value()) { VarEntryPointModule = EntryPointModulesIt->second; - return; - } - if (EntryPointModulesIt->second != *VarEntryPointModule) { - error("device_global variable '" + Twine(GV.getName()) + - "' with property \"device_image_scope\" is used in more " - "than one device image."); + return Error::success(); } + + if (EntryPointModulesIt->second != *VarEntryPointModule) + return createStringError( + inconvertibleErrorCode(), + "device_global variable '" + Twine(GV.getName()) + + "' with property \"device_image_scope\" is used in more " + "than one device image."); + + return Error::success(); }; SmallSetVector Workqueue; @@ -478,13 +484,18 @@ void ModuleSplitterBase::verifyNoCrossModuleDeviceGlobalUsage() { continue; } if (auto *F = dyn_cast(U)) { - if (EntryPointModules.count(F)) - CheckEntryPointModule(F); + if (EntryPointModules.count(F)) { + auto E = CheckEntryPointModule(F); + if (E) + return E; + } } for (auto *UU : U->users()) Workqueue.insert(UU); } } + + return Error::success(); } #ifndef NDEBUG diff --git a/llvm/tools/sycl-post-link/CMakeLists.txt b/llvm/tools/sycl-post-link/CMakeLists.txt index 40bd3f899e487..3905e836aaae8 100644 --- a/llvm/tools/sycl-post-link/CMakeLists.txt +++ b/llvm/tools/sycl-post-link/CMakeLists.txt @@ -24,7 +24,6 @@ include_directories( add_llvm_tool(sycl-post-link sycl-post-link.cpp - ModuleSplitter.cpp SpecConstants.cpp SYCLDeviceLibReqMask.cpp SYCLKernelParamOptInfo.cpp diff --git a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp index 70d3a90f51785..5255ce7bf2a66 100644 --- a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp +++ b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp @@ -7,11 +7,11 @@ //===----------------------------------------------------------------------===// #include "SYCLDeviceRequirements.h" -#include "ModuleSplitter.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Module.h" +#include "llvm/SYCLLowerIR/ModuleSplitter.h" #include "llvm/Support/PropertySetIO.h" #include diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index 85728be8aabf0..068962b63c1fc 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -13,7 +13,6 @@ // - specialization constant intrinsic transformation //===----------------------------------------------------------------------===// -#include "ModuleSplitter.h" #include "SYCLDeviceLibReqMask.h" #include "SYCLDeviceRequirements.h" #include "SYCLKernelParamOptInfo.h" @@ -40,6 +39,7 @@ #include "llvm/SYCLLowerIR/ESIMD/LowerESIMD.h" #include "llvm/SYCLLowerIR/HostPipes.h" #include "llvm/SYCLLowerIR/LowerInvokeSimd.h" +#include "llvm/SYCLLowerIR/ModuleSplitter.h" #include "llvm/SYCLLowerIR/SYCLUtils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" @@ -1009,8 +1009,11 @@ processInputModule(std::unique_ptr M) { Modified |= SplitOccurred; // FIXME: this check is not performed for ESIMD splits - if (DeviceGlobals) - Splitter->verifyNoCrossModuleDeviceGlobalUsage(); + if (DeviceGlobals) { + auto E = Splitter->verifyNoCrossModuleDeviceGlobalUsage(); + if (E) + error(toString(std::move(E))); + } // It is important that we *DO NOT* preserve all the splits in memory at the // same time, because it leads to a huge RAM consumption by the tool on bigger From 7f5ade2bd05cc8686bd50f6356000dc7eba5bcb7 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Thu, 8 Feb 2024 09:44:15 -0800 Subject: [PATCH 18/26] [NFC][SYCL] Move helpers from builtins_preview.hpp to their single uses (#12654) There are some issues with them as well (both in naming and bugs in implementation), to be fixed in a separate PR to ease review. --- sycl/include/sycl/builtins_preview.hpp | 23 ------------------- .../detail/builtins/integer_functions.inc | 12 ++++++++++ .../detail/builtins/relational_functions.inc | 14 +++++++++++ 3 files changed, 26 insertions(+), 23 deletions(-) diff --git a/sycl/include/sycl/builtins_preview.hpp b/sycl/include/sycl/builtins_preview.hpp index bc497c540693b..dfc44e5848f91 100644 --- a/sycl/include/sycl/builtins_preview.hpp +++ b/sycl/include/sycl/builtins_preview.hpp @@ -175,12 +175,6 @@ auto builtin_delegate_to_scalar(FuncTy F, const Ts &...x) { } } -template -struct any_elem_type - : std::bool_constant, float, double, half, char, signed char, short, - int, long, long long, unsigned char, unsigned short, unsigned int, - unsigned long, unsigned long long>> {}; template struct fp_elem_type : std::bool_constant< @@ -188,16 +182,6 @@ struct fp_elem_type template struct float_elem_type : std::bool_constant, float>> {}; -template -struct integer_elem_type - : std::bool_constant< - check_type_in_v, char, signed char, short, int, - long, long long, unsigned char, unsigned short, - unsigned int, unsigned long, unsigned long long>> {}; -template -struct suint32_elem_type - : std::bool_constant< - check_type_in_v, int32_t, uint32_t>> {}; template struct same_basic_shape : std::bool_constant> {}; @@ -244,13 +228,6 @@ struct builtin_enable SHAPE_CHECKER, EXTRA_CONDITIONS, Ts...>::type; \ } } // namespace detail - -BUILTIN_CREATE_ENABLER(builtin_enable_generic, default_ret_type, any_elem_type, - any_shape, same_elem_type) -BUILTIN_CREATE_ENABLER(builtin_enable_generic_scalar, default_ret_type, - any_elem_type, scalar_only, same_elem_type) -BUILTIN_CREATE_ENABLER(builtin_enable_generic_non_scalar, default_ret_type, - any_elem_type, non_scalar_only, same_elem_type) } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/detail/builtins/integer_functions.inc b/sycl/include/sycl/detail/builtins/integer_functions.inc index 455abf07aa250..dab0cee8a647a 100644 --- a/sycl/include/sycl/detail/builtins/integer_functions.inc +++ b/sycl/include/sycl/detail/builtins/integer_functions.inc @@ -12,6 +12,18 @@ namespace sycl { inline namespace _V1 { +namespace detail { +template +struct integer_elem_type + : std::bool_constant< + check_type_in_v, char, signed char, short, int, + long, long long, unsigned char, unsigned short, + unsigned int, unsigned long, unsigned long long>> {}; +template +struct suint32_elem_type + : std::bool_constant< + check_type_in_v, int32_t, uint32_t>> {}; +} // namespace detail BUILTIN_CREATE_ENABLER(builtin_enable_integer, default_ret_type, integer_elem_type, any_shape, same_elem_type) BUILTIN_CREATE_ENABLER(builtin_enable_integer_non_scalar, default_ret_type, diff --git a/sycl/include/sycl/detail/builtins/relational_functions.inc b/sycl/include/sycl/detail/builtins/relational_functions.inc index d23f144676cdb..0a38d79d20589 100644 --- a/sycl/include/sycl/detail/builtins/relational_functions.inc +++ b/sycl/include/sycl/detail/builtins/relational_functions.inc @@ -13,6 +13,13 @@ namespace sycl { inline namespace _V1 { namespace detail { +template +struct any_elem_type + : std::bool_constant, float, double, half, char, signed char, short, + int, long, long long, unsigned char, unsigned short, unsigned int, + unsigned long, unsigned long long>> {}; + template struct rel_ret_traits : std::conditional, bool, @@ -21,6 +28,13 @@ struct rel_ret_traits same_size_signed_int_t>>> { }; } // namespace detail + +BUILTIN_CREATE_ENABLER(builtin_enable_generic, default_ret_type, any_elem_type, + any_shape, same_elem_type) +BUILTIN_CREATE_ENABLER(builtin_enable_generic_scalar, default_ret_type, + any_elem_type, scalar_only, same_elem_type) +BUILTIN_CREATE_ENABLER(builtin_enable_generic_non_scalar, default_ret_type, + any_elem_type, non_scalar_only, same_elem_type) BUILTIN_CREATE_ENABLER(builtin_enable_rel, rel_ret_traits, fp_elem_type, non_scalar_only, same_elem_type) From 573e28b4085c89c1f65ab70da5d83d34ae9f0bb7 Mon Sep 17 00:00:00 2001 From: Piotr Balcer Date: Thu, 8 Feb 2024 20:35:04 +0100 Subject: [PATCH 19/26] [UR] update UR commit (#12658) --- sycl/plugins/unified_runtime/CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 6a91815bc890f..2ea2b2f37622a 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -57,13 +57,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) include(FetchContent) set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") - # commit 1cd402ead34a54459a6acb85777fbec105a178a0 - # Merge: 5b89ee8b c8e150c5 + # commit 9babc4d092a92c1036791d26ef328e5eeaf19803 + # Merge: 3be8f205 90498ec5 # Author: aarongreig - # Date: Tue Feb 6 14:48:55 2024 +0000 - # Merge pull request #1218 from Bensuo/maxime/imm-cmd-list-support - # [EXP][CMDBUF] L0 Immediate command-list support - set(UNIFIED_RUNTIME_TAG 1cd402ead34a54459a6acb85777fbec105a178a0) + # Date: Thu Feb 8 15:44:54 2024 +0000 + # Merge pull request #1321 from pbalcer/adapter-compute-constructor + # [L0] move adapter init into its constructor from urAdapterGet + set(UNIFIED_RUNTIME_TAG 9babc4d092a92c1036791d26ef328e5eeaf19803) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") From 7ee7e906e858c229cc256185560968754b773091 Mon Sep 17 00:00:00 2001 From: fineg74 <61437305+fineg74@users.noreply.github.com> Date: Thu, 8 Feb 2024 11:50:46 -0800 Subject: [PATCH 20/26] [SYCL][ESIMD] Use LLVM IR for USM/SLM scatter (#12628) --- llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp | 38 +++++++++++++++++++ .../ext/intel/esimd/detail/memory_intrin.hpp | 14 +++++++ sycl/include/sycl/ext/intel/esimd/memory.hpp | 19 +++++++++- .../ESIMD/unified_memory_api/scatter_usm.cpp | 6 +-- .../unified_memory_api/scatter_usm_legacy.cpp | 21 ++++++++++ .../ESIMD/unified_memory_api/slm_scatter.cpp | 8 ++-- .../unified_memory_api/slm_scatter_legacy.cpp | 20 ++++++++++ sycl/test/esimd/memory_properties.cpp | 25 +++++++++--- 8 files changed, 137 insertions(+), 14 deletions(-) create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_legacy.cpp create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_legacy.cpp diff --git a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp index 6ec1102f402ba..15c5e7c9a625e 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp @@ -1004,6 +1004,37 @@ static void translateGatherLoad(CallInst &CI, bool IsSLM) { CI.replaceAllUsesWith(LI); } +static void translateScatterStore(CallInst &CI, bool IsSLM) { + IRBuilder<> Builder(&CI); + constexpr int AlignmentTemplateArgIdx = 2; + APInt Val = parseTemplateArg(CI, AlignmentTemplateArgIdx, + ESIMDIntrinDesc::GenXArgConversion::TO_I64); + Align AlignValue(Val.getZExtValue()); + + auto ValsOp = CI.getArgOperand(0); + auto OffsetsOp = CI.getArgOperand(1); + auto MaskOp = CI.getArgOperand(2); + auto DataType = ValsOp->getType(); + + // Convert the mask from to . + Value *Zero = ConstantInt::get(MaskOp->getType(), 0); + MaskOp = Builder.CreateICmp(ICmpInst::ICMP_NE, MaskOp, Zero); + + // The address space may be 3-SLM, 1-global or private. + // At the moment of calling 'scatter()' operation the pointer passed to it + // is already 4-generic. Thus, simply use 4-generic for global and private + // and let GPU BE deduce the actual address space from the use-def graph. + unsigned AS = IsSLM ? 3 : 4; + auto ElemType = DataType->getScalarType(); + auto NumElems = (cast(DataType))->getElementCount(); + auto VPtrType = VectorType::get(PointerType::get(ElemType, AS), NumElems); + auto VPtrOp = Builder.CreateIntToPtr(OffsetsOp, VPtrType); + + auto SI = Builder.CreateMaskedScatter(ValsOp, VPtrOp, AlignValue, MaskOp); + SI->setDebugLoc(CI.getDebugLoc()); + CI.replaceAllUsesWith(SI); +} + // TODO Specify document behavior for slm_init and nbarrier_init when: // 1) they are called not from kernels // 2) there are multiple such calls reachable from a kernel @@ -1987,6 +2018,13 @@ size_t SYCLLowerESIMDPass::runOnFunction(Function &F, continue; } + if (Name.starts_with("__esimd_scatter_st") || + Name.starts_with("__esimd_slm_scatter_st")) { + translateScatterStore(*CI, Name.starts_with("__esimd_slm_scatter_st")); + ToErase.push_back(CI); + continue; + } + if (Name.starts_with("__esimd_nbarrier_init")) { translateNbarrierInit(*CI); ToErase.push_back(CI); diff --git a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp index d723d03353cb2..c9d0d6cc4d410 100644 --- a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp @@ -213,6 +213,20 @@ __ESIMD_INTRIN __ESIMD_DNS::vector_type_t __esimd_slm_gather_ld( __ESIMD_DNS::simd_mask_storage_t pred, __ESIMD_DNS::vector_type_t pass_thru) __ESIMD_INTRIN_END; +// Scatter data to given global or private addresses. +template +__ESIMD_INTRIN void +__esimd_scatter_st(__ESIMD_DNS::vector_type_t vals, + __ESIMD_DNS::vector_type_t vptr, + __ESIMD_DNS::simd_mask_storage_t pred) __ESIMD_INTRIN_END; + +// Scatter data to given SLM addresses. +template +__ESIMD_INTRIN void __esimd_slm_scatter_st( + __ESIMD_DNS::vector_type_t vals, + __ESIMD_DNS::vector_type_t vptr, + __ESIMD_DNS::simd_mask_storage_t pred) __ESIMD_INTRIN_END; + /// Surface-based gather. /// Supported platforms: DG2, PVC /// diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 7f4833017d38a..4ffd5c418138c 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -724,11 +724,20 @@ scatter(T *p, simd byte_offsets, simd vals, // Use LSC lowering if L1/L2 or VS > 1. if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || - VS > 1 || !__ESIMD_DNS::isPowerOf2(N, 32)) { + VS > 1 || + (!__ESIMD_DNS::isPowerOf2(N, 32) && + !detail::isMaskedGatherScatterLLVMAvailable())) { static_assert(VS == 1 || sizeof(T) >= 4, "VS > 1 is supprted only for 4- and 8-byte elements"); return detail::scatter_impl(p, byte_offsets, vals, mask); + } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) { + simd Addrs(reinterpret_cast(p)); + Addrs = Addrs + convert(byte_offsets); + using MsgT = detail::__raw_t; + __esimd_scatter_st( + sycl::bit_cast<__ESIMD_DNS::vector_type_t>(vals.data()), + Addrs.data(), mask.data()); } else { using Tx = detail::__raw_t; simd byte_offsets_i = convert(byte_offsets); @@ -4227,9 +4236,15 @@ slm_scatter(simd byte_offsets, simd vals, "slm_scatter() requires at least element-size alignment"); // Use LSC lowering if VS > 1. - if constexpr (VS > 1 || !(detail::isPowerOf2(N, 32) && sizeof(T) <= 4)) { + if constexpr (VS > 1 || (!(detail::isPowerOf2(N, 32) && sizeof(T) <= 4) && + !detail::isMaskedGatherScatterLLVMAvailable())) { __ESIMD_DNS::slm_scatter_impl( byte_offsets, vals, mask); + } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) { + using MsgT = detail::__raw_t; + __esimd_slm_scatter_st( + sycl::bit_cast<__ESIMD_DNS::vector_type_t>(vals.data()), + byte_offsets.data(), mask.data()); } else { detail::LocalAccessorMarker acc; detail::scatter_impl(acc, vals, byte_offsets, 0, mask); diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp index 929d3c6fc04f7..ff331a421ccef 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp @@ -5,13 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===------------------------------------------------------------------===// -// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{build} -fsycl-device-code-split=per_kernel -D__ESIMD_GATHER_SCATTER_LLVM_IR -o %t.out // RUN: %{run} %t.out // The test verifies esimd::scatter() functions accepting USM pointer // and optional compile-time esimd::properties. -// The scatter() calls in this test do not use cache-hint -// properties to not impose using DG2/PVC features. +// The scatter() calls in this test do not use cache-hint properties +// or VS > 1 (number of stores per offset) to not impose using PVC features. #include "Inputs/scatter.hpp" diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_legacy.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_legacy.cpp new file mode 100644 index 0000000000000..08ac29ba6b605 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_legacy.cpp @@ -0,0 +1,21 @@ +//==------- scatter_usm_legacy.cpp - DPC++ ESIMD on-device test -----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Use per-kernel compilation to have more information about failing cases. +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::scatter() functions accepting USM pointer +// and optional compile-time esimd::properties. +// The scatter() calls in this test do not use cache-hint properties +// or VS > 1 (number of stores per offset) to not impose using PVC features. +// +// TODO: Remove this test when GPU driver issue with llvm.masked.scatter is +// resolved and ESIMD starts using llvm.masked.scatter by default. +// "-D__ESIMD_GATHER_SCATTER_LLVM_IR" is not used here. + +#include "scatter_usm.cpp" diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp index 96c49fcdae8af..ffa0a718e7689 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp @@ -5,12 +5,12 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===------------------------------------------------------------------===// -// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{build} -fsycl-device-code-split=per_kernel -D__ESIMD_GATHER_SCATTER_LLVM_IR -o %t.out // RUN: %{run} %t.out -// The test verifies esimd::slm_scatter() functions accepting -// optional compile-time esimd::properties. -// The scatter() calls in this test do not use DG2/PVC features. +// The test verifies esimd::slm_scatter() functions accepting optional +// compile-time esimd::properties. The slm_scatter() calls in this test do not +// use VS > 1 (number of stores per offset) to not impose using PVC features. #include "Inputs/scatter.hpp" diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_legacy.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_legacy.cpp new file mode 100644 index 0000000000000..0aca53311ef6b --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_legacy.cpp @@ -0,0 +1,20 @@ +//==------- slm_scatter_legacy.cpp - DPC++ ESIMD on-device test -----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Use per-kernel compilation to have more information about failing cases. +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::slm_scatter() functions accepting optional +// compile-time esimd::properties. The slm_scatter() calls in this test do not +// use VS > 1 (number of stores per offset) to not impose using PVC features. +// +// TODO: Remove this test when GPU driver issue with llvm.masked.scatter is +// resolved and ESIMD starts using llvm.masked.scatter by default. +// "-D__ESIMD_GATHER_SCATTER_LLVM_IR" is not used here. + +#include "slm_scatter.cpp" diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index eb629935347db..8305bd9b83b18 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -1224,7 +1224,7 @@ test_gather_scatter(AccType &acc, LocalAccType &local_acc, float *ptrf, acc_res = gather(local_acc, ioffset_n32, 0); acc_res = gather(local_acc, ioffset_n32, 0, mask_n32); - // CHECK-COUNT-4: call void @llvm.genx.svm.scatter.v32i1.v32i64.v32f32(<32 x i1> {{[^)]+}}, i32 0, <32 x i64> {{[^)]+}}, <32 x float> {{[^)]+}}) + // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p4(<32 x float> {{[^)]+}}, <32 x ptr addrspace(4)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}) scatter(ptrf, ioffset_n32, usm, mask_n32); scatter(ptrf, ioffset_n32, usm); @@ -1281,6 +1281,14 @@ test_gather_scatter(AccType &acc, LocalAccType &local_acc, float *ptrf, scatter(ptrf, ioffset_n16_view, usm_view, mask_n16); scatter(ptrf, ioffset_n16_view, usm_view); + + simd ioffset_n10(byte_offset32, 8); + simd usm_n10; + + // Check special case to verify that for cases when N is not power of 2 llvm + // intrinsic is used + // CHECK-COUNT-1: call void @llvm.masked.scatter.v10f32.v10p4(<10 x float> {{[^)]+}}, <10 x ptr addrspace(4)> {{[^)]+}}, i32 4, <10 x i1> {{[^)]+}}) + scatter(ptrf, ioffset_n10, usm_n10); } // CHECK-LABEL: define {{.*}} @_Z23test_slm_gather_scatter{{.*}} @@ -1381,26 +1389,26 @@ test_slm_gather_scatter(int byte_offset32) { // 3) slm_scatter(...): same as (1), (2) above, but with VS > 1. // 1) slm_scatter(offsets): offsets is simd or simd_view - // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}) slm_scatter(ioffset_n32, slm); slm_scatter(ioffset_n32_view, slm); slm_scatter(ioffset_n32, slm_view); slm_scatter(ioffset_n32_view, slm_view); - // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 8, <32 x i1> {{[^)]+}}) slm_scatter(ioffset_n32, slm, props_align8); slm_scatter(ioffset_n32_view, slm, props_align8); slm_scatter(ioffset_n32, slm_view, props_align8); slm_scatter(ioffset_n32_view, slm_view, props_align8); // 2) slm_gather(offsets, mask): offsets is simd or simd_view - // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}) slm_scatter(ioffset_n32, slm, mask_n32); slm_scatter(ioffset_n32_view, slm, mask_n32); slm_scatter(ioffset_n32, slm_view, mask_n32); slm_scatter(ioffset_n32_view, slm_view, mask_n32); - // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 8, <32 x i1> {{[^)]+}}) slm_scatter(ioffset_n32, slm, mask_n32, props_align8); slm_scatter(ioffset_n32_view, slm, mask_n32, props_align8); slm_scatter(ioffset_n32, slm_view, mask_n32, props_align8); @@ -1429,4 +1437,11 @@ test_slm_gather_scatter(int byte_offset32) { slm_scatter(ioffset_n16_view, slm, mask_n16, props_align4); slm_scatter(ioffset_n16, slm_view, mask_n16, props_align4); slm_scatter(ioffset_n16_view, slm_view, mask_n16, props_align4); + + simd ioffset_n10(byte_offset32, 8); + simd usm_n10; + // Check special case to verify that for cases when N is not power of 2 llvm + // intrinsic is used + // CHECK-COUNT-1: call void @llvm.masked.scatter.v10f32.v10p3(<10 x float> {{[^)]+}}, <10 x ptr addrspace(3)> {{[^)]+}}, i32 4, <10 x i1> {{[^)]+}}) + slm_scatter(ioffset_n10, usm_n10); } From 34051496442355315f6f096c6022dbff9b2e510c Mon Sep 17 00:00:00 2001 From: Vyacheslav Klochkov Date: Thu, 8 Feb 2024 15:11:35 -0600 Subject: [PATCH 21/26] [ESIMD] Fix obsolete TODO comments + enable more cases in ballot.cpp (#12657) Signed-off-by: Klochkov, Vyacheslav N --- sycl/test-e2e/ESIMD/api/ballot.cpp | 12 +++++------- sycl/test-e2e/ESIMD/dpas/dpas_common.hpp | 5 +---- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/sycl/test-e2e/ESIMD/api/ballot.cpp b/sycl/test-e2e/ESIMD/api/ballot.cpp index a636bcee147e0..4f87e562972db 100644 --- a/sycl/test-e2e/ESIMD/api/ballot.cpp +++ b/sycl/test-e2e/ESIMD/api/ballot.cpp @@ -71,15 +71,13 @@ template bool test(queue &Q) { template bool test(queue &Q) { bool Pass = true; - // TODO: uncomment calls below once simd<...>.copy_from() starts supporting - // sizes other than 8, 16 and 32. - // Pass &= test(Q); + Pass &= test(Q); Pass &= test(Q); - // Pass &= test(Q); + Pass &= test(Q); Pass &= test(Q); - // Pass &= test(Q); - // Pass &= test(Q); - // Pass &= test(Q); + Pass &= test(Q); + Pass &= test(Q); + Pass &= test(Q); Pass &= test(Q); return Pass; diff --git a/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp b/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp index 3d50a0915afe1..4c9e528646545 100644 --- a/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp +++ b/sycl/test-e2e/ESIMD/dpas/dpas_common.hpp @@ -94,7 +94,6 @@ template struct DpasNaturalOperandType { static constexpr bool is_bf16 = T == dpas_argument_type::bf16; static constexpr bool is_tf32 = T == dpas_argument_type::tf32; - // TODO: support tf32 here. using type = std::conditional_t< is_sint, signed char, std::conditional_t< @@ -149,7 +148,7 @@ void writeToHorizontallyPackedMatrix(void *VVec, int Row, int Col, ElemT *Vec = reinterpret_cast(VVec); // 1. Find and read the target 'unsigned int' element. - // THe unpacked matrix has dimensions: NumRows*NumCols + // The unpacked matrix dimensions are NumRows*NumCols. constexpr int ElemsInElemT = sizeof(ElemT) * 8 / ElemBitSize; int UnpackedLinearIndex = Row * NumCols + Col; int PackedLinearIndex = UnpackedLinearIndex / ElemsInElemT; @@ -160,7 +159,6 @@ void writeToHorizontallyPackedMatrix(void *VVec, int Row, int Col, } else { ElemT TargetElem = Vec[PackedLinearIndex]; // TargetElem has 2 or more elements in it. Need to extract one. - // TODO: for now assume that is the case only for 2 or 4-bit integers. assert((ElemBitSize == 2 || ElemBitSize == 4) && "Unexpected element type"); unsigned int Offset = (UnpackedLinearIndex % ElemsInElemT) * ElemBitSize; @@ -196,7 +194,6 @@ ReadT readFromHorizontallyPackedMatrix(void *VVec, int Row, int Col) { return static_cast(TargetElem); } else { // TargetElem has 2 or more elements in it. Need to extract one. - // TODO: for now assume that is the case only for 2 or 4-bit integers. assert((ElemBitSize == 2 || ElemBitSize == 4) && "Unexpected element type"); unsigned int Offset = (UnpackedLinearIndex % ElemsInElemT) * ElemBitSize; unsigned int Mask = (static_cast(1) << ElemBitSize) - 1; From dbd65abb45fea9ca0fd80feba56085141acf0944 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Thu, 8 Feb 2024 14:30:32 -0800 Subject: [PATCH 22/26] [NFC][SYCL] Rename builtin_enabler_* for bitselect (#12664) --- .../detail/builtins/relational_functions.inc | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/sycl/include/sycl/detail/builtins/relational_functions.inc b/sycl/include/sycl/detail/builtins/relational_functions.inc index 0a38d79d20589..fb0b3f7682b0a 100644 --- a/sycl/include/sycl/detail/builtins/relational_functions.inc +++ b/sycl/include/sycl/detail/builtins/relational_functions.inc @@ -14,7 +14,7 @@ namespace sycl { inline namespace _V1 { namespace detail { template -struct any_elem_type +struct bitselect_elem_type : std::bool_constant, float, double, half, char, signed char, short, int, long, long long, unsigned char, unsigned short, unsigned int, @@ -29,12 +29,8 @@ struct rel_ret_traits }; } // namespace detail -BUILTIN_CREATE_ENABLER(builtin_enable_generic, default_ret_type, any_elem_type, - any_shape, same_elem_type) -BUILTIN_CREATE_ENABLER(builtin_enable_generic_scalar, default_ret_type, - any_elem_type, scalar_only, same_elem_type) -BUILTIN_CREATE_ENABLER(builtin_enable_generic_non_scalar, default_ret_type, - any_elem_type, non_scalar_only, same_elem_type) +BUILTIN_CREATE_ENABLER(builtin_enable_bitselect, default_ret_type, + bitselect_elem_type, any_shape, same_elem_type) BUILTIN_CREATE_ENABLER(builtin_enable_rel, rel_ret_traits, fp_elem_type, non_scalar_only, same_elem_type) @@ -133,13 +129,14 @@ BUILTIN_REL(ONE_ARG, signbit, __spirv_SignBitSet) #ifdef __SYCL_DEVICE_ONLY__ DEVICE_IMPL_TEMPLATE( - THREE_ARGS, bitselect, builtin_enable_generic_t, [](auto... xs) { - using ret_ty = detail::builtin_enable_generic_t; + THREE_ARGS, bitselect, builtin_enable_bitselect_t, [](auto... xs) { + using ret_ty = + detail::builtin_enable_bitselect_t; using detail::builtins::convert_result; return convert_result(__spirv_ocl_bitselect(xs...)); }) #else -HOST_IMPL_TEMPLATE(THREE_ARGS, bitselect, builtin_enable_generic_t, rel, +HOST_IMPL_TEMPLATE(THREE_ARGS, bitselect, builtin_enable_bitselect_t, rel, default_ret_type) #endif From d69a4c2ac50eddee27425a0a07b78bc84e194dfd Mon Sep 17 00:00:00 2001 From: ldrumm Date: Thu, 8 Feb 2024 23:21:16 +0000 Subject: [PATCH 23/26] Revert "[ESIMD][E2E] Enable few cases in atomic_update after gpu driver update" (#12669) Reverts intel/llvm#12639 This regressed atomic_update_acc_dg2_pvc_cmpxchg.cpp --- .../Inputs/atomic_update.hpp | 16 ++++++++++---- .../Inputs/atomic_update_slm.hpp | 22 ++++++++++++++++--- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp index ef8646ae4892a..3da82ee8e15d1 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp @@ -715,10 +715,13 @@ bool test_int_types_and_sizes(queue q, const Config &cfg) { q, cfg); passed &= test_int_types<64, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>( q, cfg); - passed &= test_int_types<12, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>( - q, cfg); - passed &= test_int_types<33, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>( - q, cfg); + // non power of two values are supported only in newer driver. + // TODO: Enable this when the new driver reaches test infrastructure + // (v27556). +#if 0 + passed &= test_int_types<12, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg); + passed &= test_int_types<33, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg); +#endif } return passed; @@ -739,8 +742,13 @@ bool test_fp_types_and_sizes(queue q, const Config &cfg) { passed &= test_fp_types<32, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg); passed &= test_fp_types<64, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg); + // non power of two values are supported only in newer driver. + // TODO: Enable this when the new driver reaches test infrastructure + // (v27556). +#if 0 passed &= test_fp_types<12, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg); passed &= test_fp_types<35, Op, UseMask, UseLSCFeatures, UseAcc>(q, cfg); +#endif } return passed; } diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp index 6b69d99158cb6..92f000711e137 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp @@ -612,9 +612,14 @@ bool test_fp_types(queue q) { if constexpr (Features == TestFeatures::DG2 || Features == TestFeatures::PVC) { - if (q.get_device().has(sycl::aspect::atomic64) && - q.get_device().has(sycl::aspect::fp64)) { - passed &= run_test(q); + // TODO: fmin/fmax/fcmpxchg for double requires a newer GPU driver. + if constexpr (!std::is_same_v, ImplLSCFmax> && + !std::is_same_v, ImplLSCFmin> && + !std::is_same_v, ImplLSCFcmpwr>) { + if (q.get_device().has(sycl::aspect::atomic64) && + q.get_device().has(sycl::aspect::fp64)) { + passed &= run_test(q); + } } } return passed; @@ -628,6 +633,7 @@ bool test_int_types_and_sizes(queue q) { passed &= test_int_types<2, Op, UseMask, Features, UseAcc, SignMask>(q); passed &= test_int_types<4, Op, UseMask, Features, UseAcc, SignMask>(q); passed &= test_int_types<8, Op, UseMask, Features, UseAcc, SignMask>(q); + // TODO: N=16 and N=32 does not pass on Gen12 with mask due to older driver. if (UseMask && Features == TestFeatures::Generic && esimd_test::isGPUDriverGE(q, esimd_test::GPUDriverOS::LinuxAndWindows, "26918", "101.4953", false)) { @@ -639,8 +645,13 @@ bool test_int_types_and_sizes(queue q) { if constexpr (Features == TestFeatures::DG2 || Features == TestFeatures::PVC) { passed &= test_int_types<64, Op, UseMask, Features, UseAcc, SignMask>(q); + // non power of two values are supported only in newer driver. + // TODO: Enable this when the new driver reaches test infrastructure + // (v27556). +#if 0 passed &= test_int_types<12, Op, UseMask, Features, UseAcc, SignMask>(q); passed &= test_int_types<33, Op, UseMask, Features, UseAcc, SignMask>(q); +#endif } return passed; @@ -661,8 +672,13 @@ bool test_fp_types_and_sizes(queue q) { if constexpr (Features == TestFeatures::DG2 || Features == TestFeatures::PVC) { passed &= test_fp_types<64, Op, UseMask, Features, UseAcc>(q); + // non power of two values are supported only in newer driver. + // TODO: Enable this when the new driver reaches test infrastructure + // (v27556). +#if 0 passed &= test_fp_types<33, Op, UseMask, Features, UseAcc>(q); passed &= test_fp_types<65, Op, UseMask, Features, UseAcc>(q); +#endif } return passed; } From db04b1ee8e955d342e07ad2cc2a02dbd4317be93 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Thu, 8 Feb 2024 16:01:15 -0800 Subject: [PATCH 24/26] [NFC][SYCL] Move builtin tests sycl/test/basic_tests/{*builtins*.cpp,builtins/} (#12667) While at it, fix formatting and modernize include to SYCL 2020 (``). --- .../builtins_implicitly_convertible_args.cpp | 0 .../{ => builtins}/builtins_templates.cpp | 0 .../{ => builtins}/relational_builtins.cpp | 14 +++++++------- 3 files changed, 7 insertions(+), 7 deletions(-) rename sycl/test/basic_tests/{ => builtins}/builtins_implicitly_convertible_args.cpp (100%) rename sycl/test/basic_tests/{ => builtins}/builtins_templates.cpp (100%) rename sycl/test/basic_tests/{ => builtins}/relational_builtins.cpp (96%) diff --git a/sycl/test/basic_tests/builtins_implicitly_convertible_args.cpp b/sycl/test/basic_tests/builtins/builtins_implicitly_convertible_args.cpp similarity index 100% rename from sycl/test/basic_tests/builtins_implicitly_convertible_args.cpp rename to sycl/test/basic_tests/builtins/builtins_implicitly_convertible_args.cpp diff --git a/sycl/test/basic_tests/builtins_templates.cpp b/sycl/test/basic_tests/builtins/builtins_templates.cpp similarity index 100% rename from sycl/test/basic_tests/builtins_templates.cpp rename to sycl/test/basic_tests/builtins/builtins_templates.cpp diff --git a/sycl/test/basic_tests/relational_builtins.cpp b/sycl/test/basic_tests/builtins/relational_builtins.cpp similarity index 96% rename from sycl/test/basic_tests/relational_builtins.cpp rename to sycl/test/basic_tests/builtins/relational_builtins.cpp index ab9f3aa5c4326..eabd7e26c35cb 100644 --- a/sycl/test/basic_tests/relational_builtins.cpp +++ b/sycl/test/basic_tests/builtins/relational_builtins.cpp @@ -4,7 +4,7 @@ // NOTE: Compile the test fully to ensure the library exports the right host // symbols. -#include +#include // Some helper macros to verify return type of the builtins. To be used like // this @@ -19,15 +19,15 @@ template struct CheckHelper { template static auto call(F f) { return f(Args()...); } }; -#define CHECK(EXPECTED, FUNC, ...) \ +#define CHECK(EXPECTED, FUNC, ...) \ { \ auto ret = CheckHelper<__VA_ARGS__>::call( \ - [](auto... args) { return cl::sycl::FUNC(args...); }); \ - static_assert(std::is_same_v); \ + [](auto... args) { return sycl::FUNC(args...); }); \ + static_assert(std::is_same_v); \ } void foo() { - using namespace cl::sycl; + using namespace sycl; using boolm = marray; using int16v = vec; @@ -247,9 +247,9 @@ void foo() { } int main() { - cl::sycl::queue q; + sycl::queue q; foo(); // Verify host. - q.submit([&](cl::sycl::handler &cgh) { + q.submit([&](sycl::handler &cgh) { cgh.single_task([]() { foo(); // verify device }); From 6098a75b3092da2a9efc34b169b09e76e0336b4c Mon Sep 17 00:00:00 2001 From: Udit Agarwal <16324601+uditagarwal97@users.noreply.github.com> Date: Thu, 8 Feb 2024 23:40:51 -0800 Subject: [PATCH 25/26] [SYCL][E2E] Fix the test script after changing sycl-ls output (#12672) In PR#https://github.com/intel/llvm/pull/12596, we changed the output of sycl-ls to use 'fpga' instead of 'acc'. This has caused several tests to fail when fpga is available on the system. This PR fixes that. In retrospect, we should have enabled testing on FPGA in pre-commit testing as this bug could have been caught earlier. --- sycl/test-e2e/lit.cfg.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py index a25f775edc9e1..ef44dc972797b 100644 --- a/sycl/test-e2e/lit.cfg.py +++ b/sycl/test-e2e/lit.cfg.py @@ -669,7 +669,7 @@ features.update(sg_size_features) be, dev = sycl_device.split(":") - features.add(dev.replace("acc", "accelerator")) + features.add(dev.replace("fpga", "accelerator")) # Use short names for LIT rules. features.add(be) From 1f37b5ed804123d9068d78a739f8451584fdafca Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Fri, 9 Feb 2024 02:33:50 -0800 Subject: [PATCH 26/26] [SYCL] Fix SFINAE rules for integer builtins/bitselect (#12671) In case of vectors/swizzles of integer types only fixed width types are allowed per SYCL 2020 revision 8. Update the implementation to match that. --- sycl/include/sycl/builtins_preview.hpp | 9 +- sycl/include/sycl/builtins_utils_scalar.hpp | 11 ++ .../sycl/detail/builtins/helper_macros.hpp | 8 ++ .../detail/builtins/integer_functions.inc | 7 +- .../detail/builtins/relational_functions.inc | 10 +- sycl/source/builtins/host_helper_macros.hpp | 7 + sycl/source/builtins/integer_functions.cpp | 3 +- sycl/source/builtins/relational_functions.cpp | 3 +- .../builtins/builtin_unit_tests.cpp | 133 ++++++++++++++++++ 9 files changed, 181 insertions(+), 10 deletions(-) create mode 100644 sycl/test/basic_tests/builtins/builtin_unit_tests.cpp diff --git a/sycl/include/sycl/builtins_preview.hpp b/sycl/include/sycl/builtins_preview.hpp index dfc44e5848f91..91fd50d2ec4f2 100644 --- a/sycl/include/sycl/builtins_preview.hpp +++ b/sycl/include/sycl/builtins_preview.hpp @@ -137,7 +137,14 @@ auto builtin_marray_impl(FuncTy F, const Ts &...x) { marray Res; constexpr auto N = T::size(); for (size_t I = 0; I < N / 2; ++I) { - auto PartialRes = F(to_vec2(x, I * 2)...); + auto PartialRes = [&]() { + using elem_ty = get_elem_type_t; + if constexpr (std::is_integral_v) + return F(to_vec2(x, I * 2) + .template as, 2>>()...); + else + return F(to_vec2(x, I * 2)...); + }(); std::memcpy(&Res[I * 2], &PartialRes, sizeof(decltype(PartialRes))); } if (N % 2) diff --git a/sycl/include/sycl/builtins_utils_scalar.hpp b/sycl/include/sycl/builtins_utils_scalar.hpp index 62d86df045cdf..31f267892243b 100644 --- a/sycl/include/sycl/builtins_utils_scalar.hpp +++ b/sycl/include/sycl/builtins_utils_scalar.hpp @@ -128,6 +128,17 @@ template struct get_unsigned_int_by_size { template struct same_size_unsigned_int { using type = typename get_unsigned_int_by_size::type; }; +template +using same_size_unsigned_int_t = typename same_size_unsigned_int::type; + +template struct get_fixed_sized_int { + static_assert(std::is_integral_v); + using type = + std::conditional_t, same_size_signed_int_t, + same_size_unsigned_int_t>; +}; +template +using get_fixed_sized_int_t = typename get_fixed_sized_int::type; // Utility trait for getting an upsampled integer type. // NOTE: For upsampling we look for an integer of double the size of the diff --git a/sycl/include/sycl/detail/builtins/helper_macros.hpp b/sycl/include/sycl/detail/builtins/helper_macros.hpp index 49d4af8981d70..38014c4b62c8f 100644 --- a/sycl/include/sycl/detail/builtins/helper_macros.hpp +++ b/sycl/include/sycl/detail/builtins/helper_macros.hpp @@ -48,6 +48,11 @@ FOR_EACH4_A6(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, ARG3, \ ARG4, ARG5, ARG6) \ BASE_CASE(FIXED1, FIXED2, FIXED3, FIXED4, ARG7) +#define FOR_EACH4_A8(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, \ + ARG3, ARG4, ARG5, ARG6, ARG7, ARG8) \ + FOR_EACH4_A7(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, ARG3, \ + ARG4, ARG5, ARG6, ARG7) \ + BASE_CASE(FIXED1, FIXED2, FIXED3, FIXED4, ARG8) #define FOR_EACH4_A11(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, \ ARG3, ARG4, ARG5, ARG6, ARG7, ARG8, ARG9, ARG10, ARG11) \ FOR_EACH4_A7(BASE_CASE, FIXED1, FIXED2, FIXED3, FIXED4, ARG1, ARG2, ARG3, \ @@ -169,6 +174,9 @@ unsigned char, unsigned short, unsigned int, unsigned long, unsigned long long // 11 types #define INTEGER_TYPES SIGNED_TYPES, UNSIGNED_TYPES +// 8 types +#define FIXED_WIDTH_INTEGER_TYPES \ + int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t #define DEVICE_IMPL_TEMPLATE_CUSTOM_DELEGATE( \ NUM_ARGS, NAME, ENABLER, DELEGATOR, NS, /*SCALAR_VEC_IMPL*/...) \ diff --git a/sycl/include/sycl/detail/builtins/integer_functions.inc b/sycl/include/sycl/detail/builtins/integer_functions.inc index dab0cee8a647a..44699765ff7fd 100644 --- a/sycl/include/sycl/detail/builtins/integer_functions.inc +++ b/sycl/include/sycl/detail/builtins/integer_functions.inc @@ -16,9 +16,10 @@ namespace detail { template struct integer_elem_type : std::bool_constant< - check_type_in_v, char, signed char, short, int, - long, long long, unsigned char, unsigned short, - unsigned int, unsigned long, unsigned long long>> {}; + (is_vec_or_swizzle_v && + check_type_in_v, FIXED_WIDTH_INTEGER_TYPES>) || + (!is_vec_or_swizzle_v && + check_type_in_v, INTEGER_TYPES>)> {}; template struct suint32_elem_type : std::bool_constant< diff --git a/sycl/include/sycl/detail/builtins/relational_functions.inc b/sycl/include/sycl/detail/builtins/relational_functions.inc index fb0b3f7682b0a..d63a7716aa9b7 100644 --- a/sycl/include/sycl/detail/builtins/relational_functions.inc +++ b/sycl/include/sycl/detail/builtins/relational_functions.inc @@ -15,10 +15,12 @@ inline namespace _V1 { namespace detail { template struct bitselect_elem_type - : std::bool_constant, float, double, half, char, signed char, short, - int, long, long long, unsigned char, unsigned short, unsigned int, - unsigned long, unsigned long long>> {}; + : std::bool_constant< + check_type_in_v, FP_TYPES> || + (is_vec_or_swizzle_v && + check_type_in_v, FIXED_WIDTH_INTEGER_TYPES>) || + (!is_vec_or_swizzle_v && + check_type_in_v, INTEGER_TYPES>)> {}; template struct rel_ret_traits diff --git a/sycl/source/builtins/host_helper_macros.hpp b/sycl/source/builtins/host_helper_macros.hpp index 484b0bc95fb8b..41aac2148db71 100644 --- a/sycl/source/builtins/host_helper_macros.hpp +++ b/sycl/source/builtins/host_helper_macros.hpp @@ -56,6 +56,9 @@ #define EXPORT_VEC(NUM_ARGS, NAME, TYPE, VL) \ EXPORT_VEC_NS(NUM_ARGS, NAME, sycl, TYPE, VL) +#define EXPORT_VEC_1_16_IMPL(NUM_ARGS, NAME, NS, TYPE) \ + FOR_VEC_1_16(EXPORT_VEC_NS, NUM_ARGS, NAME, NS, TYPE) + #define EXPORT_SCALAR_AND_VEC_1_16_IMPL(NUM_ARGS, NAME, NS, TYPE) \ EXPORT_SCALAR_NS(NUM_ARGS, NAME, NS, TYPE) \ FOR_VEC_1_16(EXPORT_VEC_NS, NUM_ARGS, NAME, NS, TYPE) @@ -69,8 +72,12 @@ #define EXPORT_SCALAR_AND_VEC_1_16_NS(NUM_ARGS, NAME, NS, ...) \ FOR_EACH3(EXPORT_SCALAR_AND_VEC_1_16_IMPL, NUM_ARGS, NAME, NS, __VA_ARGS__) +#define EXPORT_VEC_1_16_NS(NUM_ARGS, NAME, NS, ...) \ + FOR_EACH3(EXPORT_VEC_1_16_IMPL, NUM_ARGS, NAME, NS, __VA_ARGS__) #define EXPORT_SCALAR_AND_VEC_1_16(NUM_ARGS, NAME, ...) \ EXPORT_SCALAR_AND_VEC_1_16_NS(NUM_ARGS, NAME, sycl, __VA_ARGS__) +#define EXPORT_VEC_1_16(NUM_ARGS, NAME, ...) \ + EXPORT_VEC_1_16_NS(NUM_ARGS, NAME, sycl, __VA_ARGS__) #define EXPORT_SCALAR_AND_VEC_2_4(NUM_ARGS, NAME, ...) \ FOR_EACH2(EXPORT_SCALAR_AND_VEC_2_4_IMPL, NUM_ARGS, NAME, __VA_ARGS__) diff --git a/sycl/source/builtins/integer_functions.cpp b/sycl/source/builtins/integer_functions.cpp index 381d6f1fa0a10..cd92b2180df73 100644 --- a/sycl/source/builtins/integer_functions.cpp +++ b/sycl/source/builtins/integer_functions.cpp @@ -76,7 +76,8 @@ namespace sycl { inline namespace _V1 { #define BUILTIN_GENINT(NUM_ARGS, NAME, IMPL) \ HOST_IMPL(NAME, IMPL) \ - EXPORT_SCALAR_AND_VEC_1_16(NUM_ARGS, NAME, INTEGER_TYPES) + FOR_EACH2(EXPORT_SCALAR, NUM_ARGS, NAME, INTEGER_TYPES) \ + EXPORT_VEC_1_16(NUM_ARGS, NAME, FIXED_WIDTH_INTEGER_TYPES) #define BUILTIN_GENINT_SU(NUM_ARGS, NAME, IMPL) \ BUILTIN_GENINT(NUM_ARGS, NAME, IMPL) diff --git a/sycl/source/builtins/relational_functions.cpp b/sycl/source/builtins/relational_functions.cpp index b54c55e283e5e..b8b7795f6fb79 100644 --- a/sycl/source/builtins/relational_functions.cpp +++ b/sycl/source/builtins/relational_functions.cpp @@ -103,6 +103,7 @@ HOST_IMPL(bitselect, [](auto x, auto y, auto z) { assert((ures & std::numeric_limits::max()) == ures); return bit_cast(static_cast(ures)); }) -EXPORT_SCALAR_AND_VEC_1_16(THREE_ARGS, bitselect, INTEGER_TYPES, FP_TYPES) +FOR_EACH2(EXPORT_SCALAR, THREE_ARGS, bitselect, INTEGER_TYPES, FP_TYPES) +EXPORT_VEC_1_16(THREE_ARGS, bitselect, FIXED_WIDTH_INTEGER_TYPES, FP_TYPES) } // namespace _V1 } // namespace sycl diff --git a/sycl/test/basic_tests/builtins/builtin_unit_tests.cpp b/sycl/test/basic_tests/builtins/builtin_unit_tests.cpp new file mode 100644 index 0000000000000..d241a90568fc3 --- /dev/null +++ b/sycl/test/basic_tests/builtins/builtin_unit_tests.cpp @@ -0,0 +1,133 @@ +// RUN: %clangxx -fsycl -fpreview-breaking-changes -fsyntax-only %s -Xclang -verify +// REQUIRES: preview-breaking-changes-supported + +#include + +using namespace sycl; +using namespace sycl::detail; + +namespace builtin_same_shape_v_tests { +using swizzle1 = decltype(std::declval>().swizzle<0>()); +using swizzle2 = decltype(std::declval>().swizzle<0, 0>()); +using swizzle3 = decltype(std::declval>().swizzle<0, 0, 1>()); + +static_assert(builtin_same_shape_v); +static_assert(builtin_same_shape_v); +static_assert(builtin_same_shape_v>); +static_assert(builtin_same_shape_v, marray>); +static_assert(builtin_same_shape_v>); +static_assert(builtin_same_shape_v, vec>); +static_assert(builtin_same_shape_v, swizzle2>); + +static_assert(!builtin_same_shape_v>); +static_assert(!builtin_same_shape_v>); +static_assert(!builtin_same_shape_v, vec>); +static_assert(!builtin_same_shape_v); +static_assert(!builtin_same_shape_v, swizzle1>); +static_assert(!builtin_same_shape_v); +} // namespace builtin_same_shape_v_tests + +namespace builtin_marray_impl_tests { +// Integer functions/relational bitselect only accept fixed-width integer +// element types for vector/swizzle elements. Make sure that our marray->vec +// delegator can handle that. + +auto foo(char x) { return x; } +auto foo(signed char x) { return x; } +auto foo(unsigned char x) { return x; } +auto foo(vec x) { return x; } +auto foo(vec x) { return x; } + +auto test() { + marray x; + marray y; + marray z; + auto TestOne = [](auto x) { + std::ignore = builtin_marray_impl([](auto x) { return foo(x); }, x); + }; + TestOne(x); + TestOne(y); + TestOne(z); +} +} // namespace builtin_marray_impl_tests + +namespace builtin_enable_integer_tests { +using swizzle1 = decltype(std::declval>().swizzle<0>()); +using swizzle2 = decltype(std::declval>().swizzle<0, 0>()); +template void ignore() {} + +void test() { + // clang-format off + ignore, + builtin_enable_integer_t, + builtin_enable_integer_t>(); + // clang-format on + + ignore>, + builtin_enable_integer_t>>(); + + ignore>(); + ignore, vec>>(); + ignore, swizzle2>>(); + ignore>(); + + { + // Only one of char/signed char maps onto int8_t. The other type isn't a + // valid vector element type for integer builtins. + + static_assert(std::is_signed_v); + + // clang-format off + // expected-error-re@*:* {{no type named 'type' in 'sycl::detail::builtin_enable>'}} + // expected-note@+1 {{in instantiation of template type alias 'builtin_enable_integer_t' requested here}} + ignore>, builtin_enable_integer_t>>(); + // clang-format on + } + + // expected-error@*:* {{no type named 'type' in 'sycl::detail::builtin_enable'}} + // expected-note@+1 {{in instantiation of template type alias 'builtin_enable_integer_t' requested here}} + ignore>(); +} +} // namespace builtin_enable_integer_tests + +namespace builtin_enable_bitselect_tests { +// Essentially the same as builtin_enable_integer_t + FP types support. +using swizzle1 = decltype(std::declval>().swizzle<0>()); +using swizzle2 = decltype(std::declval>().swizzle<0, 0>()); +template void ignore() {} + +void test() { + // clang-format off + ignore, + builtin_enable_bitselect_t, + builtin_enable_bitselect_t, + builtin_enable_bitselect_t>(); + // clang-format on + + ignore>, + builtin_enable_bitselect_t>, + builtin_enable_bitselect_t>>(); + + ignore>(); + ignore, vec>>(); + ignore, swizzle2>>(); + ignore>(); + + { + // Only one of char/signed char maps onto int8_t. The other type isn't a + // valid vector element type for integer builtins. + + static_assert(std::is_signed_v); + + // clang-format off + // expected-error-re@*:* {{no type named 'type' in 'sycl::detail::builtin_enable>'}} + // expected-note@+1 {{in instantiation of template type alias 'builtin_enable_bitselect_t' requested here}} + ignore>, builtin_enable_bitselect_t>>(); + // clang-format on + } + + // expected-error@*:* {{no type named 'type' in 'sycl::detail::builtin_enable'}} + // expected-note@+1 {{in instantiation of template type alias 'builtin_enable_bitselect_t' requested here}} + ignore>(); +} +} // namespace builtin_enable_bitselect_tests