From 0bf2e666c1d59ba5435bfa1d30f0c3ecb2db640d Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Wed, 31 Jan 2024 18:21:26 +0000 Subject: [PATCH 01/14] [SYCL][ESIMD] Implement unified memory API for scatter(usm, ...) (#12510) This implements the unified memory API for scatter with USM pointers. --------- Signed-off-by: Sarnie, Nick --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 264 +++++++++++++++--- .../ext/intel/experimental/esimd/common.hpp | 10 - .../ext/intel/experimental/esimd/memory.hpp | 25 +- .../unified_memory_api/Inputs/scatter.hpp | 227 +++++++++++++++ .../ESIMD/unified_memory_api/scatter_usm.cpp | 37 +++ .../scatter_usm_dg2_pvc.cpp | 38 +++ sycl/test/esimd/memory_properties.cpp | 60 ++++ 7 files changed, 597 insertions(+), 64 deletions(-) create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_dg2_pvc.cpp diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 7af684ee99cfb..5d8f13ce82619 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -179,6 +179,45 @@ __ESIMD_API simd gather_impl(const T *p, simd offsets, return lsc_format_ret(Result); } +/// USM pointer scatter. +/// Supported platforms: DG2, PVC +/// VISA instruction: lsc_store.ugm +/// +/// Scatters elements to specific address. +/// +/// @tparam T is element type. +/// @tparam NElts is the number of elements to store per address. +/// @tparam DS is the data size. +/// @tparam L1H is L1 cache hint. +/// @tparam L2H is L2 cache hint. +/// @tparam N is the number of channels (platform dependent). +/// @param p is the base pointer. +/// @param offsets is the zero-based offsets in bytes. +/// @param vals is values to store. +/// @param pred is predicates. +/// +template +__ESIMD_API void scatter_impl(T *p, simd offsets, + simd vals, simd_mask pred) { + static_assert(std::is_integral_v, "Unsupported offset type"); + check_lsc_vector_size(); + check_lsc_data_size(); + check_cache_hint(); + constexpr uint16_t AddressScale = 1; + constexpr int ImmOffset = 0; + constexpr lsc_data_size EDS = expand_data_size(finalize_data_size()); + constexpr lsc_vector_size VS = to_lsc_vector_size(); + constexpr lsc_data_order Transposed = lsc_data_order::nontranspose; + using MsgT = typename lsc_expand_type::type; + simd addrs = reinterpret_cast(p); + addrs += convert(offsets); + simd Tmp = lsc_format_input(vals); + __esimd_lsc_store_stateless(pred.data(), addrs.data(), + Tmp.data()); +} + // Returns true iff it is Ok to use llvm.masked.gather and llvm.masked.scatter. // By default (without use specifying __ESIMD_GATHER_SCATTER_LLVM_IR) it is // not used because of an issue in GPU driver, which does not recognize @@ -616,44 +655,203 @@ gather(const Tx *p, Toffset offset, simd_mask mask = 1) { return gather(p, simd(offset), mask); } +/// template +/// void scatter(T *p, simd byte_offsets, simd vals, +/// simd_mask mask, PropertyListT props = {}); // (usm-sc-1) + +/// template +/// void scatter(T *p, simd byte_offsets, simd vals, +/// PropertyListT props = {}); // (usm-sc-2) + +/// The next two functions are similar to usm-sc-{1,2} with the 'byte_offsets' +/// parameter represerented as 'simd_view'. + +/// template +/// void scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, +/// simd_mask mask, PropertyListT props = {}); // (usm-sc-3) + +/// template +/// void scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, +/// PropertyListT props = {}); // (usm-sc-4) + +/// template +/// void scatter(T *p, simd byte_offsets, simd vals, +/// simd_mask mask, PropertyListT props = {}); // (usm-sc-1) +/// /// Writes ("scatters") elements of the input vector to different memory /// locations. Each memory location is base address plus an offset - a /// value of the corresponding element in the input offset vector. Access to /// any element's memory location can be disabled via the input mask. -/// @tparam Tx Element type, must be of size 4 or less. -/// @tparam N Number of elements to write; can be \c 1, \c 2, \c 4, \c 8, \c 16 -/// or \c 32. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. /// @param p The base address. -/// @param offsets A vector of 32-bit or 64-bit offsets in bytes. For each lane -/// \c i, ((byte*)p + offsets[i]) must be element size aligned. +/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. /// @param vals The vector to scatter. -/// @param mask The access mask, defaults to all 1s. +/// @param mask The access mask. +/// @param props The optional compile-time properties. Only 'alignment' +/// and cache hint properties are used. +template +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +scatter(T *p, simd byte_offsets, simd vals, + simd_mask mask, PropertyListT props = {}) { + static_assert(std::is_integral_v, "Unsupported offset type"); + static_assert(N / VS >= 1 && N % VS == 0, "N must be divisible by VS"); + + constexpr size_t Alignment = + detail::getPropertyValue(sizeof(T)); + static_assert(Alignment >= sizeof(T), + "scatter() requires at least element-size alignment"); + constexpr auto L1Hint = + detail::getPropertyValue( + cache_hint::none); + constexpr auto L2Hint = + detail::getPropertyValue( + cache_hint::none); + + // Use LSC lowering if L1/L2 or VS > 1. + if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || + VS > 1 || !__ESIMD_DNS::isPowerOf2(N, 32)) { + static_assert(VS == 1 || sizeof(T) >= 4, + "VS > 1 is supprted only for 4- and 8-byte elements"); + return detail::scatter_impl(p, byte_offsets, vals, mask); + } else { + using Tx = detail::__raw_t; + simd byte_offsets_i = convert(byte_offsets); + simd addrs(reinterpret_cast(p)); + addrs = addrs + byte_offsets_i; + if constexpr (sizeof(T) == 1) { + simd D = __esimd_wrregion( + D.data(), vals.data(), 0); + __esimd_svm_scatter(), + detail::ElemsPerAddrEncoding<1>()>( + addrs.data(), D.data(), mask.data()); + } else if constexpr (sizeof(T) == 2) { + simd D = __esimd_wrregion( + D.data(), vals.data(), 0); + __esimd_svm_scatter(), + detail::ElemsPerAddrEncoding<2>()>( + addrs.data(), D.data(), mask.data()); + } else + __esimd_svm_scatter(), + detail::ElemsPerAddrEncoding<1>()>( + addrs.data(), vals.data(), mask.data()); + } +} + +// template +// void scatter(T *p, simd byte_offsets, simd vals, +// PropertyListT props = {}); // (usm-sc-2) /// -template -__ESIMD_API void scatter(Tx *p, simd offsets, simd vals, - simd_mask mask = 1) { - using T = detail::__raw_t; - static_assert(std::is_integral_v, "Unsupported offset type"); - static_assert(detail::isPowerOf2(N, 32), "Unsupported value of N"); - simd offsets_i = convert(offsets); - simd addrs(reinterpret_cast(p)); - addrs = addrs + offsets_i; - if constexpr (sizeof(T) == 1) { - simd D; - D = __esimd_wrregion(D.data(), vals.data(), 0); - __esimd_svm_scatter(), - detail::ElemsPerAddrEncoding<1>()>( - addrs.data(), D.data(), mask.data()); - } else if constexpr (sizeof(T) == 2) { - simd D; - D = __esimd_wrregion(D.data(), vals.data(), 0); - __esimd_svm_scatter(), - detail::ElemsPerAddrEncoding<2>()>( - addrs.data(), D.data(), mask.data()); - } else - __esimd_svm_scatter(), - detail::ElemsPerAddrEncoding<1>()>( - addrs.data(), vals.data(), mask.data()); +/// Writes ("scatters") elements of the input vector to different memory +/// locations. Each memory location is base address plus an offset - a +/// value of the corresponding element in the input offset vector. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param p The base address. +/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector to scatter. +/// @param props The optional compile-time properties. Only 'alignment' +/// and cache hint properties are used. +template +__ESIMD_API std::enable_if_t< + ext::oneapi::experimental::is_property_list_v> +scatter(T *p, simd byte_offsets, simd vals, + PropertyListT props = {}) { + simd_mask Mask = 1; + scatter(p, byte_offsets, vals, Mask, props); +} + +// template +// void scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, +// simd_mask mask, PropertyListT props = {}); // (usm-sc-3) +/// +/// Writes ("scatters") elements of the input vector to different memory +/// locations. Each memory location is base address plus an offset - a +/// value of the corresponding element in the input offset vector. Access to +/// any element's memory location can be disabled via the input mask. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param p The base address. +/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes +/// represented as a 'simd_view' object. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector to scatter. +/// @param mask The access mask. +/// @param props The optional compile-time properties. Only 'alignment' +/// and cache hint properties are used. +template +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, + simd_mask mask, PropertyListT props = {}) { + scatter(p, byte_offsets.read(), vals, mask, props); +} + +/// template +/// void scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, +/// PropertyListT props = {}); // (usm-sc-4) +/// +/// Writes ("scatters") elements of the input vector to different memory +/// locations. Each memory location is base address plus an offset - a +/// value of the corresponding element in the input offset vector. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param p The base address. +/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes +/// represented as a 'simd_view' object. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector to scatter. +/// @param props The optional compile-time properties. Only 'alignment' +/// and cache hint properties are used. +template +__ESIMD_API std::enable_if_t< + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +scatter(T *p, OffsetSimdViewT byte_offsets, simd vals, + PropertyListT props = {}) { + simd_mask Mask = 1; + scatter(p, byte_offsets.read(), vals, Mask, props); } /// A variation of \c scatter API with \c offsets represented as \c simd_view @@ -671,7 +869,7 @@ __ESIMD_API void scatter(Tx *p, simd offsets, simd vals, template __ESIMD_API void scatter(Tx *p, simd_view offsets, simd vals, simd_mask mask = 1) { - scatter(p, offsets.read(), vals, mask); + scatter(p, offsets.read(), vals, mask); } /// A variation of \c scatter API with \c offsets represented as scalar. @@ -688,7 +886,7 @@ __ESIMD_API void scatter(Tx *p, simd_view offsets, template __ESIMD_API std::enable_if_t && N == 1> scatter(Tx *p, Toffset offset, simd vals, simd_mask mask = 1) { - scatter(p, simd(offset), vals, mask); + scatter(p, simd(offset), vals, mask); } namespace detail { diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp index 774154dfc0f97..a71981de54bec 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/common.hpp @@ -102,16 +102,6 @@ template struct lsc_expand_type { using type = __ESIMD_DNS::lsc_expand_type::type; }; -template struct lsc_bitcast_type { -public: - using type = std::conditional_t< - sizeof(T) == 1, uint8_t, - std::conditional_t< - sizeof(T) == 2, uint16_t, - std::conditional_t>>>; -}; - } // namespace detail /// L1 or L3 cache hint kinds. diff --git a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp index 069898f14e747..7fb0ae833e793 100644 --- a/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp @@ -1434,7 +1434,7 @@ __ESIMD_API void lsc_slm_scatter(__ESIMD_NS::simd offsets, constexpr detail::lsc_data_order _Transposed = detail::lsc_data_order::nontranspose; using MsgT = typename detail::lsc_expand_type::type; - using CstT = typename detail::lsc_bitcast_type::type; + using CstT = __ESIMD_DNS::uint_type_t; __ESIMD_NS::simd Tmp = vals.template bit_cast_view(); __esimd_lsc_store_slm( @@ -1487,25 +1487,8 @@ template offsets, __ESIMD_NS::simd vals, __ESIMD_NS::simd_mask pred = 1) { - static_assert(std::is_integral_v, "Unsupported offset type"); - detail::check_lsc_vector_size(); - detail::check_lsc_data_size(); - detail::check_lsc_cache_hint(); - constexpr uint16_t _AddressScale = 1; - constexpr int _ImmOffset = 0; - constexpr lsc_data_size _DS = - detail::expand_data_size(detail::finalize_data_size()); - constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size(); - constexpr detail::lsc_data_order _Transposed = - detail::lsc_data_order::nontranspose; - using MsgT = typename detail::lsc_expand_type::type; - using _CstT = typename detail::lsc_bitcast_type::type; - __ESIMD_NS::simd Tmp = vals.template bit_cast_view<_CstT>(); - __ESIMD_NS::simd addrs = reinterpret_cast(p); - addrs += convert(offsets); - __esimd_lsc_store_stateless(pred.data(), addrs.data(), - Tmp.data()); + __ESIMD_DNS::scatter_impl(p, offsets, + vals, pred); } template ::type; - using _CstT = typename detail::lsc_bitcast_type::type; + using _CstT = __ESIMD_DNS::uint_type_t; __ESIMD_NS::simd Tmp = vals.template bit_cast_view<_CstT>(); auto si = __ESIMD_NS::get_surface_index(acc); __esimd_lsc_store_bti +bool verify(const T *Out, int N, int Size, int VS, uint32_t MaskStride, + bool UseMask) { + using Tuint = esimd_test::uint_type_t; + int NumErrors = 0; + int NOffsets = N / VS; + for (uint32_t I = 0; I < Size; I += N) { // Verify by 1 vector at once + for (int VSI = 0; VSI < VS; VSI++) { + for (int OffsetI = 0; OffsetI < NOffsets; OffsetI++) { + size_t OutIndex = I + VSI * NOffsets + OffsetI; + bool IsMaskSet = UseMask ? ((OutIndex / VS) % MaskStride == 0) : true; + Tuint Expected = sycl::bit_cast((T)OutIndex); + if (!UseMask || IsMaskSet) + Expected = sycl::bit_cast((T)(OutIndex * 2)); + Tuint Computed = sycl::bit_cast(Out[OutIndex]); + if (Computed != Expected && ++NumErrors < 16) { + std::cout << "Out[" << OutIndex << "] = " << std::to_string(Computed) + << " vs " << std::to_string(Expected) << std::endl; + } + } + } + } + return NumErrors == 0; +} + +template +bool testUSM(queue Q, uint32_t MaskStride, + ScatterPropertiesT ScatterProperties) { + uint32_t Groups = 8; + uint32_t Threads = 16; + size_t Size = Groups * Threads * N; + static_assert(VS > 0 && N % VS == 0, + "Incorrect VS parameter. N must be divisible by VS."); + constexpr int NOffsets = N / VS; + using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t; + + std::cout << "USM case: T=" << esimd_test::type_name() << ",N=" << N + << ", VS=" << VS << ",UseMask=" << UseMask + << ",UseProperties=" << UseProperties << std::endl; + + sycl::range<1> GlobalRange{Groups}; + sycl::range<1> LocalRange{Threads}; + sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange}; + + T *Out = static_cast(sycl::malloc_shared(Size * sizeof(T), Q)); + for (size_t i = 0; i < Size; i++) + Out[i] = i; + + try { + Q.submit([&](handler &cgh) { + cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { + ScatterPropertiesT Props{}; + uint16_t GlobalID = ndi.get_global_id(0); + simd ByteOffsets(GlobalID * N * sizeof(T), + VS * sizeof(T)); + auto ByteOffsetsView = ByteOffsets.template select(); + simd Vals = gather(Out, ByteOffsets); + Vals *= 2; + auto ValsView = Vals.template select(); + simd_mask Pred = 0; + for (int I = 0; I < NOffsets; I++) + Pred[I] = (I % MaskStride == 0) ? 1 : 0; + if constexpr (VS > 1) { // VS > 1 requires specifying + if constexpr (UseMask) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals, Pred, Props); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals, Pred, Props); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView, Pred, Props); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView, Pred, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals, Pred); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals, Pred); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView, Pred); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView, Pred); + } + } else { // UseMask == false + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals, Props); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals, Props); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView, Props); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView); + } + } + } else { // VS == 1 + if constexpr (UseMask) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals, Pred, Props); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals, Pred, Props); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView, Pred, Props); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView, Pred, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals, Pred); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals, Pred); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView, Pred); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView, Pred); + } + } else { // UseMask == false + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals, Props); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals, Props); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView, Props); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(Out, ByteOffsets, Vals); + else if (GlobalID % 4 == 1) + scatter(Out, ByteOffsetsView, Vals); + else if (GlobalID % 4 == 2) + scatter(Out, ByteOffsets, ValsView); + else if (GlobalID % 4 == 3) + scatter(Out, ByteOffsetsView, ValsView); + } + } + } + }); + }).wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + sycl::free(Out, Q); + return false; + } + + bool Passed = verify(Out, N, Size, VS, MaskStride, UseMask); + + sycl::free(Out, Q); + + return Passed; +} + +template bool testUSM(queue Q) { + constexpr bool CheckMask = true; + constexpr bool CheckProperties = true; + properties EmptyProps; + properties AlignElemProps{alignment}; + + bool Passed = true; + + // // Test scatter() that is available on Gen12 and PVC. + Passed &= testUSM(Q, 2, EmptyProps); + Passed &= testUSM(Q, 1, EmptyProps); + Passed &= testUSM(Q, 2, EmptyProps); + Passed &= testUSM(Q, 2, EmptyProps); + Passed &= testUSM(Q, 2, EmptyProps); + Passed &= testUSM(Q, 2, EmptyProps); + + Passed &= testUSM(Q, 2, EmptyProps); + + // // Test scatter() without passing compile-time properties argument. + Passed &= testUSM(Q, 2, EmptyProps); + Passed &= testUSM(Q, 2, EmptyProps); + + if constexpr (Features == TestFeatures::PVC || + Features == TestFeatures::DG2) { + properties LSCProps{cache_hint_L1, + cache_hint_L2, + alignment}; + Passed &= testUSM(Q, 2, LSCProps); + Passed &= testUSM(Q, 2, LSCProps); + Passed &= testUSM(Q, 2, LSCProps); + Passed &= testUSM(Q, 2, LSCProps); + + Passed &= testUSM(Q, 2, LSCProps); + + // Check VS > 1. GPU supports only dwords and qwords in this mode. + if constexpr (sizeof(T) >= 4) { + // TODO: This test case causes flaky fail. Enable it after the issue + // in GPU driver is fixed. + // Passed &= + // testUSM(Q, 2, AlignElemProps) + Passed &= + testUSM(Q, 2, AlignElemProps); + Passed &= + testUSM(Q, 2, AlignElemProps); + Passed &= + testUSM(Q, 2, AlignElemProps); + } + } // TestPVCFeatures + + return Passed; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp new file mode 100644 index 0000000000000..929d3c6fc04f7 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp @@ -0,0 +1,37 @@ +//==------- scatter_usm.cpp - DPC++ ESIMD on-device test ---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------===// +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::scatter() functions accepting USM pointer +// and optional compile-time esimd::properties. +// The scatter() calls in this test do not use cache-hint +// properties to not impose using DG2/PVC features. + +#include "Inputs/scatter.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::Generic; + bool Passed = true; + + Passed &= testUSM(Q); + Passed &= testUSM(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testUSM(Q); + Passed &= testUSM(Q); + Passed &= testUSM(Q); + Passed &= testUSM(Q); + if (Q.get_device().has(sycl::aspect::fp64)) + Passed &= testUSM(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_dg2_pvc.cpp new file mode 100644 index 0000000000000..aa466795e9b06 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_dg2_pvc.cpp @@ -0,0 +1,38 @@ +//==------- scatter_usm_dg2_pvc.cpp - DPC++ ESIMD on-device test--------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------===// +// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::scatter() functions accepting USM pointer +// and optional compile-time esimd::properties. +// The scatter() calls in this test uses cache-hint +// properties and requires DG2 or PVC. + +#include "Inputs/scatter.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::PVC; + bool Passed = true; + + Passed &= testUSM(Q); + Passed &= testUSM(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testUSM(Q); + Passed &= testUSM(Q); + Passed &= testUSM(Q); + Passed &= testUSM(Q); + if (Q.get_device().has(sycl::aspect::fp64)) + Passed &= testUSM(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index 737d7b4fabfad..31dbc3e889f4b 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -969,6 +969,8 @@ test_gather_scatter(AccType &acc, float *ptrf, int byte_offset32, simd pass_thru; auto pass_thru_view = pass_thru.select<32, 1>(); + auto usm_view = usm.select<32, 1>(); + // Test USM and ACC gather using this plan: // 1) gather(usm, offsets): offsets is simd or simd_view // 2) gather(usm, offsets, mask): offsets is simd or simd_view @@ -1151,6 +1153,64 @@ test_gather_scatter(AccType &acc, float *ptrf, int byte_offset32, props_align4); acc_res = gather(acc, ioffset_n16_view, mask_n16, pass_thru_view, props_align4); + + // CHECK-COUNT-4: call void @llvm.genx.svm.scatter.v32i1.v32i64.v32f32(<32 x i1> {{[^)]+}}, i32 0, <32 x i64> {{[^)]+}}, <32 x float> {{[^)]+}}) + scatter(ptrf, ioffset_n32, usm, mask_n32); + + scatter(ptrf, ioffset_n32, usm); + + scatter(ptrf, ioffset_n32, usm, mask_n32, props_align4); + + scatter(ptrf, ioffset_n32, usm, props_align4); + + // CHECK-COUNT-8: call void @llvm.genx.lsc.store.stateless.v32i1.v32i64.v32i32(<32 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, <32 x i32> {{[^)]+}}, i32 0) + scatter(ptrf, ioffset_n32, usm, mask_n32, props_cache_load); + scatter(ptrf, ioffset_n32, usm, props_cache_load); + + scatter(ptrf, ioffset_n32_view, usm, mask_n32, props_cache_load); + scatter(ptrf, ioffset_n32_view, usm, props_cache_load); + + scatter(ptrf, ioffset_n32, usm_view, mask_n32, props_cache_load); + scatter(ptrf, ioffset_n32, usm_view, props_cache_load); + + scatter(ptrf, ioffset_n32_view, usm_view, mask_n32, + props_cache_load); + scatter(ptrf, ioffset_n32_view, usm_view, props_cache_load); + + // VS > 1 + // CHECK-COUNT-8: call void @llvm.genx.lsc.store.stateless.v16i1.v16i64.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, <32 x i32> {{[^)]+}}, i32 0) + scatter(ptrf, ioffset_n16, usm, mask_n16, props_cache_load); + + scatter(ptrf, ioffset_n16, usm, props_cache_load); + + scatter(ptrf, ioffset_n16_view, usm, mask_n16, + props_cache_load); + scatter(ptrf, ioffset_n16_view, usm, props_cache_load); + + scatter(ptrf, ioffset_n16, usm_view, mask_n16, + props_cache_load); + scatter(ptrf, ioffset_n16, usm_view, props_cache_load); + + scatter(ptrf, ioffset_n16_view, usm_view, mask_n16, + props_cache_load); + scatter(ptrf, ioffset_n16_view, usm_view, props_cache_load); + + // CHECK-COUNT-8: call void @llvm.genx.lsc.store.stateless.v16i1.v16i64.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, <32 x i32> {{[^)]+}}, i32 0) + scatter(ptrf, ioffset_n16, usm, mask_n16); + + scatter(ptrf, ioffset_n16, usm); + + scatter(ptrf, ioffset_n16_view, usm, mask_n16); + + scatter(ptrf, ioffset_n16_view, usm); + + scatter(ptrf, ioffset_n16, usm_view, mask_n16); + + scatter(ptrf, ioffset_n16, usm_view); + + scatter(ptrf, ioffset_n16_view, usm_view, mask_n16); + + scatter(ptrf, ioffset_n16_view, usm_view); } // CHECK-LABEL: define {{.*}} @_Z23test_slm_gather_scatter{{.*}} From 16a368c8a711b79e84940d2aab4d1d217d0c5697 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Wed, 31 Jan 2024 11:20:46 -0800 Subject: [PATCH 02/14] [CI][NFC] Unify naming scheme for SYCL workflows. (#12525) All GitHub Actions workflows added by intel/llvm project follow similar naming notation: 1. Name starts with `sycl` prefix. 2. Use dash `-` instead of underscore `_` to separate words. --- .github/workflows/{sycl_aws.yml => sycl-aws.yml} | 0 .github/workflows/{sycl_containers.yaml => sycl-containers.yaml} | 0 .../{sycl_detect_changes.yml => sycl-detect-changes.yml} | 0 .github/workflows/{gh_pages.yml => sycl-docs.yml} | 0 .github/workflows/{sycl_linux_build.yml => sycl-linux-build.yml} | 0 ...ix_e2e_on_nightly.yml => sycl-linux-matrix-e2e-on-nightly.yml} | 0 ...{sycl_linux_precommit_aws.yml => sycl-linux-precommit-aws.yml} | 0 .../{sycl_linux_precommit.yml => sycl-linux-precommit.yml} | 0 .../{sycl_linux_run_tests.yml => sycl-linux-run-tests.yml} | 0 ...ycl_macos_build_and_test.yml => sycl-macos-build-and-test.yml} | 0 .github/workflows/{sycl_nightly.yml => sycl-nightly.yml} | 0 .github/workflows/{sycl_post_commit.yml => sycl-post-commit.yml} | 0 .../workflows/{sycl_stale_issues.yml => sycl-stale-issues.yml} | 0 .github/workflows/{sync-main.yml => sycl-sync-main.yml} | 0 .../{sycl_update_gpu_driver.yml => sycl-update-gpu-driver.yml} | 0 .../workflows/{sycl_windows_build.yml => sycl-windows-build.yml} | 0 .../{sycl_windows_precommit.yml => sycl-windows-precommit.yml} | 0 .../{sycl_windows_run_tests.yml => sycl-windows-run-tests.yml} | 0 18 files changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{sycl_aws.yml => sycl-aws.yml} (100%) rename .github/workflows/{sycl_containers.yaml => sycl-containers.yaml} (100%) rename .github/workflows/{sycl_detect_changes.yml => sycl-detect-changes.yml} (100%) rename .github/workflows/{gh_pages.yml => sycl-docs.yml} (100%) rename .github/workflows/{sycl_linux_build.yml => sycl-linux-build.yml} (100%) rename .github/workflows/{sycl_linux_matrix_e2e_on_nightly.yml => sycl-linux-matrix-e2e-on-nightly.yml} (100%) rename .github/workflows/{sycl_linux_precommit_aws.yml => sycl-linux-precommit-aws.yml} (100%) rename .github/workflows/{sycl_linux_precommit.yml => sycl-linux-precommit.yml} (100%) rename .github/workflows/{sycl_linux_run_tests.yml => sycl-linux-run-tests.yml} (100%) rename .github/workflows/{sycl_macos_build_and_test.yml => sycl-macos-build-and-test.yml} (100%) rename .github/workflows/{sycl_nightly.yml => sycl-nightly.yml} (100%) rename .github/workflows/{sycl_post_commit.yml => sycl-post-commit.yml} (100%) rename .github/workflows/{sycl_stale_issues.yml => sycl-stale-issues.yml} (100%) rename .github/workflows/{sync-main.yml => sycl-sync-main.yml} (100%) rename .github/workflows/{sycl_update_gpu_driver.yml => sycl-update-gpu-driver.yml} (100%) rename .github/workflows/{sycl_windows_build.yml => sycl-windows-build.yml} (100%) rename .github/workflows/{sycl_windows_precommit.yml => sycl-windows-precommit.yml} (100%) rename .github/workflows/{sycl_windows_run_tests.yml => sycl-windows-run-tests.yml} (100%) diff --git a/.github/workflows/sycl_aws.yml b/.github/workflows/sycl-aws.yml similarity index 100% rename from .github/workflows/sycl_aws.yml rename to .github/workflows/sycl-aws.yml diff --git a/.github/workflows/sycl_containers.yaml b/.github/workflows/sycl-containers.yaml similarity index 100% rename from .github/workflows/sycl_containers.yaml rename to .github/workflows/sycl-containers.yaml diff --git a/.github/workflows/sycl_detect_changes.yml b/.github/workflows/sycl-detect-changes.yml similarity index 100% rename from .github/workflows/sycl_detect_changes.yml rename to .github/workflows/sycl-detect-changes.yml diff --git a/.github/workflows/gh_pages.yml b/.github/workflows/sycl-docs.yml similarity index 100% rename from .github/workflows/gh_pages.yml rename to .github/workflows/sycl-docs.yml diff --git a/.github/workflows/sycl_linux_build.yml b/.github/workflows/sycl-linux-build.yml similarity index 100% rename from .github/workflows/sycl_linux_build.yml rename to .github/workflows/sycl-linux-build.yml diff --git a/.github/workflows/sycl_linux_matrix_e2e_on_nightly.yml b/.github/workflows/sycl-linux-matrix-e2e-on-nightly.yml similarity index 100% rename from .github/workflows/sycl_linux_matrix_e2e_on_nightly.yml rename to .github/workflows/sycl-linux-matrix-e2e-on-nightly.yml diff --git a/.github/workflows/sycl_linux_precommit_aws.yml b/.github/workflows/sycl-linux-precommit-aws.yml similarity index 100% rename from .github/workflows/sycl_linux_precommit_aws.yml rename to .github/workflows/sycl-linux-precommit-aws.yml diff --git a/.github/workflows/sycl_linux_precommit.yml b/.github/workflows/sycl-linux-precommit.yml similarity index 100% rename from .github/workflows/sycl_linux_precommit.yml rename to .github/workflows/sycl-linux-precommit.yml diff --git a/.github/workflows/sycl_linux_run_tests.yml b/.github/workflows/sycl-linux-run-tests.yml similarity index 100% rename from .github/workflows/sycl_linux_run_tests.yml rename to .github/workflows/sycl-linux-run-tests.yml diff --git a/.github/workflows/sycl_macos_build_and_test.yml b/.github/workflows/sycl-macos-build-and-test.yml similarity index 100% rename from .github/workflows/sycl_macos_build_and_test.yml rename to .github/workflows/sycl-macos-build-and-test.yml diff --git a/.github/workflows/sycl_nightly.yml b/.github/workflows/sycl-nightly.yml similarity index 100% rename from .github/workflows/sycl_nightly.yml rename to .github/workflows/sycl-nightly.yml diff --git a/.github/workflows/sycl_post_commit.yml b/.github/workflows/sycl-post-commit.yml similarity index 100% rename from .github/workflows/sycl_post_commit.yml rename to .github/workflows/sycl-post-commit.yml diff --git a/.github/workflows/sycl_stale_issues.yml b/.github/workflows/sycl-stale-issues.yml similarity index 100% rename from .github/workflows/sycl_stale_issues.yml rename to .github/workflows/sycl-stale-issues.yml diff --git a/.github/workflows/sync-main.yml b/.github/workflows/sycl-sync-main.yml similarity index 100% rename from .github/workflows/sync-main.yml rename to .github/workflows/sycl-sync-main.yml diff --git a/.github/workflows/sycl_update_gpu_driver.yml b/.github/workflows/sycl-update-gpu-driver.yml similarity index 100% rename from .github/workflows/sycl_update_gpu_driver.yml rename to .github/workflows/sycl-update-gpu-driver.yml diff --git a/.github/workflows/sycl_windows_build.yml b/.github/workflows/sycl-windows-build.yml similarity index 100% rename from .github/workflows/sycl_windows_build.yml rename to .github/workflows/sycl-windows-build.yml diff --git a/.github/workflows/sycl_windows_precommit.yml b/.github/workflows/sycl-windows-precommit.yml similarity index 100% rename from .github/workflows/sycl_windows_precommit.yml rename to .github/workflows/sycl-windows-precommit.yml diff --git a/.github/workflows/sycl_windows_run_tests.yml b/.github/workflows/sycl-windows-run-tests.yml similarity index 100% rename from .github/workflows/sycl_windows_run_tests.yml rename to .github/workflows/sycl-windows-run-tests.yml From 1b5daa810a2178dc7fa094e7962a52b88f085998 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Wed, 31 Jan 2024 11:25:19 -0800 Subject: [PATCH 03/14] Revert "[CI][NFC] Unify naming scheme for SYCL workflows." (#12567) Reverts intel/llvm#12525 In addition to file renaming, we need to update file names referenced inside the workflow files. --- .github/workflows/{sycl-docs.yml => gh_pages.yml} | 0 .github/workflows/{sycl-aws.yml => sycl_aws.yml} | 0 .github/workflows/{sycl-containers.yaml => sycl_containers.yaml} | 0 .../{sycl-detect-changes.yml => sycl_detect_changes.yml} | 0 .github/workflows/{sycl-linux-build.yml => sycl_linux_build.yml} | 0 ...ix-e2e-on-nightly.yml => sycl_linux_matrix_e2e_on_nightly.yml} | 0 .../{sycl-linux-precommit.yml => sycl_linux_precommit.yml} | 0 ...{sycl-linux-precommit-aws.yml => sycl_linux_precommit_aws.yml} | 0 .../{sycl-linux-run-tests.yml => sycl_linux_run_tests.yml} | 0 ...ycl-macos-build-and-test.yml => sycl_macos_build_and_test.yml} | 0 .github/workflows/{sycl-nightly.yml => sycl_nightly.yml} | 0 .github/workflows/{sycl-post-commit.yml => sycl_post_commit.yml} | 0 .../workflows/{sycl-stale-issues.yml => sycl_stale_issues.yml} | 0 .../{sycl-update-gpu-driver.yml => sycl_update_gpu_driver.yml} | 0 .../workflows/{sycl-windows-build.yml => sycl_windows_build.yml} | 0 .../{sycl-windows-precommit.yml => sycl_windows_precommit.yml} | 0 .../{sycl-windows-run-tests.yml => sycl_windows_run_tests.yml} | 0 .github/workflows/{sycl-sync-main.yml => sync-main.yml} | 0 18 files changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{sycl-docs.yml => gh_pages.yml} (100%) rename .github/workflows/{sycl-aws.yml => sycl_aws.yml} (100%) rename .github/workflows/{sycl-containers.yaml => sycl_containers.yaml} (100%) rename .github/workflows/{sycl-detect-changes.yml => sycl_detect_changes.yml} (100%) rename .github/workflows/{sycl-linux-build.yml => sycl_linux_build.yml} (100%) rename .github/workflows/{sycl-linux-matrix-e2e-on-nightly.yml => sycl_linux_matrix_e2e_on_nightly.yml} (100%) rename .github/workflows/{sycl-linux-precommit.yml => sycl_linux_precommit.yml} (100%) rename .github/workflows/{sycl-linux-precommit-aws.yml => sycl_linux_precommit_aws.yml} (100%) rename .github/workflows/{sycl-linux-run-tests.yml => sycl_linux_run_tests.yml} (100%) rename .github/workflows/{sycl-macos-build-and-test.yml => sycl_macos_build_and_test.yml} (100%) rename .github/workflows/{sycl-nightly.yml => sycl_nightly.yml} (100%) rename .github/workflows/{sycl-post-commit.yml => sycl_post_commit.yml} (100%) rename .github/workflows/{sycl-stale-issues.yml => sycl_stale_issues.yml} (100%) rename .github/workflows/{sycl-update-gpu-driver.yml => sycl_update_gpu_driver.yml} (100%) rename .github/workflows/{sycl-windows-build.yml => sycl_windows_build.yml} (100%) rename .github/workflows/{sycl-windows-precommit.yml => sycl_windows_precommit.yml} (100%) rename .github/workflows/{sycl-windows-run-tests.yml => sycl_windows_run_tests.yml} (100%) rename .github/workflows/{sycl-sync-main.yml => sync-main.yml} (100%) diff --git a/.github/workflows/sycl-docs.yml b/.github/workflows/gh_pages.yml similarity index 100% rename from .github/workflows/sycl-docs.yml rename to .github/workflows/gh_pages.yml diff --git a/.github/workflows/sycl-aws.yml b/.github/workflows/sycl_aws.yml similarity index 100% rename from .github/workflows/sycl-aws.yml rename to .github/workflows/sycl_aws.yml diff --git a/.github/workflows/sycl-containers.yaml b/.github/workflows/sycl_containers.yaml similarity index 100% rename from .github/workflows/sycl-containers.yaml rename to .github/workflows/sycl_containers.yaml diff --git a/.github/workflows/sycl-detect-changes.yml b/.github/workflows/sycl_detect_changes.yml similarity index 100% rename from .github/workflows/sycl-detect-changes.yml rename to .github/workflows/sycl_detect_changes.yml diff --git a/.github/workflows/sycl-linux-build.yml b/.github/workflows/sycl_linux_build.yml similarity index 100% rename from .github/workflows/sycl-linux-build.yml rename to .github/workflows/sycl_linux_build.yml diff --git a/.github/workflows/sycl-linux-matrix-e2e-on-nightly.yml b/.github/workflows/sycl_linux_matrix_e2e_on_nightly.yml similarity index 100% rename from .github/workflows/sycl-linux-matrix-e2e-on-nightly.yml rename to .github/workflows/sycl_linux_matrix_e2e_on_nightly.yml diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl_linux_precommit.yml similarity index 100% rename from .github/workflows/sycl-linux-precommit.yml rename to .github/workflows/sycl_linux_precommit.yml diff --git a/.github/workflows/sycl-linux-precommit-aws.yml b/.github/workflows/sycl_linux_precommit_aws.yml similarity index 100% rename from .github/workflows/sycl-linux-precommit-aws.yml rename to .github/workflows/sycl_linux_precommit_aws.yml diff --git a/.github/workflows/sycl-linux-run-tests.yml b/.github/workflows/sycl_linux_run_tests.yml similarity index 100% rename from .github/workflows/sycl-linux-run-tests.yml rename to .github/workflows/sycl_linux_run_tests.yml diff --git a/.github/workflows/sycl-macos-build-and-test.yml b/.github/workflows/sycl_macos_build_and_test.yml similarity index 100% rename from .github/workflows/sycl-macos-build-and-test.yml rename to .github/workflows/sycl_macos_build_and_test.yml diff --git a/.github/workflows/sycl-nightly.yml b/.github/workflows/sycl_nightly.yml similarity index 100% rename from .github/workflows/sycl-nightly.yml rename to .github/workflows/sycl_nightly.yml diff --git a/.github/workflows/sycl-post-commit.yml b/.github/workflows/sycl_post_commit.yml similarity index 100% rename from .github/workflows/sycl-post-commit.yml rename to .github/workflows/sycl_post_commit.yml diff --git a/.github/workflows/sycl-stale-issues.yml b/.github/workflows/sycl_stale_issues.yml similarity index 100% rename from .github/workflows/sycl-stale-issues.yml rename to .github/workflows/sycl_stale_issues.yml diff --git a/.github/workflows/sycl-update-gpu-driver.yml b/.github/workflows/sycl_update_gpu_driver.yml similarity index 100% rename from .github/workflows/sycl-update-gpu-driver.yml rename to .github/workflows/sycl_update_gpu_driver.yml diff --git a/.github/workflows/sycl-windows-build.yml b/.github/workflows/sycl_windows_build.yml similarity index 100% rename from .github/workflows/sycl-windows-build.yml rename to .github/workflows/sycl_windows_build.yml diff --git a/.github/workflows/sycl-windows-precommit.yml b/.github/workflows/sycl_windows_precommit.yml similarity index 100% rename from .github/workflows/sycl-windows-precommit.yml rename to .github/workflows/sycl_windows_precommit.yml diff --git a/.github/workflows/sycl-windows-run-tests.yml b/.github/workflows/sycl_windows_run_tests.yml similarity index 100% rename from .github/workflows/sycl-windows-run-tests.yml rename to .github/workflows/sycl_windows_run_tests.yml diff --git a/.github/workflows/sycl-sync-main.yml b/.github/workflows/sync-main.yml similarity index 100% rename from .github/workflows/sycl-sync-main.yml rename to .github/workflows/sync-main.yml From 7348207426ef239650af8e3a4b1ebf8a088f9c51 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Wed, 31 Jan 2024 19:28:39 +0000 Subject: [PATCH 04/14] [SYCL][ESIMD][E2E] Disable two LSC tests on DG2 (#12565) They started failing in the recent driver update. I can't reproduce it locally with the same driver version but the hardware we have is a little different, maybe that's why. I made an internal tracker for this. Signed-off-by: Sarnie, Nick --- sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke_cmpxchg.cpp | 2 +- sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke_cmpxchg.cpp b/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke_cmpxchg.cpp index 77f6a6c4122d9..83bb92eca5440 100644 --- a/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke_cmpxchg.cpp +++ b/sycl/test-e2e/ESIMD/lsc/local_accessor_atomic_smoke_cmpxchg.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // This test checks local accessor cmpxchg atomic operations. //===----------------------------------------------------------------------===// -// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 +// REQUIRES: gpu-intel-pvc // RUN: %{build} -o %t.out // RUN: %{run} %t.out // diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp index 25bce1b2b2244..4fde6446bdff1 100644 --- a/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp +++ b/sycl/test-e2e/ESIMD/lsc/lsc_slm_atomic_smoke.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// // This test checks LSC SLM atomic operations. //===----------------------------------------------------------------------===// -// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 +// REQUIRES: gpu-intel-pvc // RUN: %{build} -o %t.out // RUN: %{run} %t.out From b478d2fac950e88c0049844418b5daedc1c9df89 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Wed, 31 Jan 2024 12:08:09 -0800 Subject: [PATCH 05/14] [SYCL] Fix resource leak related to SYCL_FALLBACK_ASSERT (#12532) https://github.com/intel/llvm/pull/6837 enabled asynchronous buffer destruction for buffers constructed without host data. However, initial fallback assert implementation in https://github.com/intel/llvm/pull/3767 predates it and as such had to place the buffer inside `queue_impl` to avoid unintended synchronization point. I don't know if there was the same crash observed on the end-to-end test added as part of this PR prior to https://github.com/intel/llvm/pull/3767, but it doesn't even matter because the "new" implementation is both simpler and doesn't result in a crash. I suspect that without it (with the buffer for fallback assert implementation being a data member of `sycl::queue_impl`) we had a cyclic dependency somewhere leading to resource leak and ultimately to the assert in `DeviceGlobalUSMMem::~DeviceGlobalUSMMem()`. --- sycl/include/sycl/queue.hpp | 6 ++-- sycl/source/detail/queue_impl.hpp | 13 +++++++- sycl/source/queue.cpp | 2 ++ sycl/test-e2e/Assert/check_resource_leak.cpp | 32 ++++++++++++++++++++ 4 files changed, 49 insertions(+), 4 deletions(-) create mode 100644 sycl/test-e2e/Assert/check_resource_leak.cpp diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 4706639127022..b693cbaae62c8 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -2965,7 +2965,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { Rest...); } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES buffer &getAssertHappenedBuffer(); +#endif event memcpyToDeviceGlobal(void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, @@ -3019,9 +3021,7 @@ class AssertInfoCopier; */ event submitAssertCapture(queue &Self, event &Event, queue *SecondaryQueue, const detail::code_location &CodeLoc) { - using AHBufT = buffer; - - AHBufT &Buffer = Self.getAssertHappenedBuffer(); + buffer Buffer{1}; event CopierEv, CheckerEv, PostCheckerEv; auto CopierCGF = [&](handler &CGH) { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 810f991d6667f..ddd6a71d7db80 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -108,7 +108,9 @@ class queue_impl { const async_handler &AsyncHandler, const property_list &PropList) : MDevice(Device), MContext(Context), MAsyncHandler(AsyncHandler), MPropList(PropList), MHostQueue(MDevice->is_host()), +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES MAssertHappenedBuffer(range<1>{1}), +#endif MIsInorder(has_property()), MDiscardEvents( has_property()), @@ -283,7 +285,9 @@ class queue_impl { queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler) : MContext(Context), MAsyncHandler(AsyncHandler), MHostQueue(false), +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES MAssertHappenedBuffer(range<1>{1}), +#endif MIsInorder(has_property()), MDiscardEvents( has_property()), @@ -305,7 +309,10 @@ class queue_impl { queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList) : MContext(Context), MAsyncHandler(AsyncHandler), MPropList(PropList), - MHostQueue(false), MAssertHappenedBuffer(range<1>{1}), + MHostQueue(false), +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + MAssertHappenedBuffer(range<1>{1}), +#endif MIsInorder(has_property()), MDiscardEvents( has_property()), @@ -673,9 +680,11 @@ class queue_impl { /// \return a native handle. pi_native_handle getNative(int32_t &NativeHandleDesc) const; +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES buffer &getAssertHappenedBuffer() { return MAssertHappenedBuffer; } +#endif void registerStreamServiceEvent(const EventImplPtr &Event) { std::lock_guard Lock(MMutex); @@ -918,8 +927,10 @@ class queue_impl { /// need to emulate it with multiple native in-order queues. bool MEmulateOOO = false; +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES // Buffer to store assert failure descriptor buffer MAssertHappenedBuffer; +#endif // This event is employed for enhanced dependency tracking with in-order queue // Access to the event should be guarded with MMutex diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 47b6b29b89bab..1b877a31da4e0 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -298,9 +298,11 @@ pi_native_handle queue::getNative(int32_t &NativeHandleDesc) const { return impl->getNative(NativeHandleDesc); } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES buffer &queue::getAssertHappenedBuffer() { return impl->getAssertHappenedBuffer(); } +#endif event queue::memcpyToDeviceGlobal(void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, diff --git a/sycl/test-e2e/Assert/check_resource_leak.cpp b/sycl/test-e2e/Assert/check_resource_leak.cpp new file mode 100644 index 0000000000000..252d2ed9e0c49 --- /dev/null +++ b/sycl/test-e2e/Assert/check_resource_leak.cpp @@ -0,0 +1,32 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Device globals aren't supported on opencl:gpu yet. +// UNSUPPORTED: opencl && gpu + +// TODO: Fails at JIT compilation for some reason. +// UNSUPPORTED: hip +#define SYCL_FALLBACK_ASSERT 1 + +#include + +// DeviceGlobalUSMMem::~DeviceGlobalUSMMem() has asserts to ensure some +// resources have been cleaned up when it's executed. Those asserts used to fail +// when "AssertHappened" buffer used in fallback implementation of the device +// assert was a data member of the queue_impl. +sycl::ext::oneapi::experimental::device_global dg; + +int main() { + sycl::queue q; + q.submit([&](sycl::handler &cgh) { + sycl::range<1> R{16}; + cgh.parallel_for(sycl::nd_range<1>{R, R}, [=](sycl::nd_item<1> ndi) { + if (ndi.get_global_linear_id() == 0) + dg.get() = 42; + auto sg = sycl::ext::oneapi::experimental::this_sub_group(); + auto active = sycl::ext::oneapi::group_ballot(sg, 1); + }); + }).wait(); + + return 0; +} From 85e461e5688dae634b3c2f1026f525cffd0182b3 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Wed, 31 Jan 2024 12:23:27 -0800 Subject: [PATCH 06/14] [SYCL][E2E] Disable USM/usm_pooling.cpp on gpu-intel-dg2 (#12564) See https://github.com/intel/llvm/issues/12397, the test is flaky in post-commit. --- sycl/test-e2e/USM/usm_pooling.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sycl/test-e2e/USM/usm_pooling.cpp b/sycl/test-e2e/USM/usm_pooling.cpp index 4a9d16ec5a34e..2f2d4009dce1e 100644 --- a/sycl/test-e2e/USM/usm_pooling.cpp +++ b/sycl/test-e2e/USM/usm_pooling.cpp @@ -1,6 +1,9 @@ // REQUIRES: level_zero // RUN: %{build} -o %t.out +// https://github.com/intel/llvm/issues/12397 +// UNSUPPORTED: gpu-intel-dg2 + // Allocate 2 items of 2MB. Free 2. Allocate 3 more of 2MB. // With no pooling: 1,2,3,4,5 allocs lead to ZE call. From 435845bc23f68ad146915018aa5dd5b451d8bc5e Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Wed, 31 Jan 2024 15:13:43 -0800 Subject: [PATCH 07/14] [CI][NFC] Unify naming scheme for SYCL workflows. (#12568) All GitHub Actions workflows added by intel/llvm project are expected to use following naming notation: 1. Name starts with `sycl` prefix. 2. Use dash `-` to separate words (instead of underscore `_`). This patches fixes naming of workflows which do not follow this notation. --- .../workflows/{sycl_aws.yml => sycl-aws.yml} | 0 ...cl_containers.yaml => sycl-containers.yaml} | 4 ++-- ...ect_changes.yml => sycl-detect-changes.yml} | 0 .../workflows/{gh_pages.yml => sycl-docs.yml} | 2 +- ...cl_linux_build.yml => sycl-linux-build.yml} | 0 ...ml => sycl-linux-matrix-e2e-on-nightly.yml} | 8 ++++---- ...it_aws.yml => sycl-linux-precommit-aws.yml} | 2 +- ..._precommit.yml => sycl-linux-precommit.yml} | 12 ++++++------ ..._run_tests.yml => sycl-linux-run-tests.yml} | 0 ..._test.yml => sycl-macos-build-and-test.yml} | 0 .../{sycl_nightly.yml => sycl-nightly.yml} | 8 ++++---- ...cl_post_commit.yml => sycl-post-commit.yml} | 18 +++++++++--------- ..._stale_issues.yml => sycl-stale-issues.yml} | 0 .../{sync-main.yml => sycl-sync-main.yml} | 0 ...u_driver.yml => sycl-update-gpu-driver.yml} | 0 ...indows_build.yml => sycl-windows-build.yml} | 0 ...recommit.yml => sycl-windows-precommit.yml} | 12 ++++++------ ...un_tests.yml => sycl-windows-run-tests.yml} | 0 README.md | 4 ++-- 19 files changed, 35 insertions(+), 35 deletions(-) rename .github/workflows/{sycl_aws.yml => sycl-aws.yml} (100%) rename .github/workflows/{sycl_containers.yaml => sycl-containers.yaml} (98%) rename .github/workflows/{sycl_detect_changes.yml => sycl-detect-changes.yml} (100%) rename .github/workflows/{gh_pages.yml => sycl-docs.yml} (97%) rename .github/workflows/{sycl_linux_build.yml => sycl-linux-build.yml} (100%) rename .github/workflows/{sycl_linux_matrix_e2e_on_nightly.yml => sycl-linux-matrix-e2e-on-nightly.yml} (94%) rename .github/workflows/{sycl_linux_precommit_aws.yml => sycl-linux-precommit-aws.yml} (98%) rename .github/workflows/{sycl_linux_precommit.yml => sycl-linux-precommit.yml} (94%) rename .github/workflows/{sycl_linux_run_tests.yml => sycl-linux-run-tests.yml} (100%) rename .github/workflows/{sycl_macos_build_and_test.yml => sycl-macos-build-and-test.yml} (100%) rename .github/workflows/{sycl_nightly.yml => sycl-nightly.yml} (96%) rename .github/workflows/{sycl_post_commit.yml => sycl-post-commit.yml} (88%) rename .github/workflows/{sycl_stale_issues.yml => sycl-stale-issues.yml} (100%) rename .github/workflows/{sync-main.yml => sycl-sync-main.yml} (100%) rename .github/workflows/{sycl_update_gpu_driver.yml => sycl-update-gpu-driver.yml} (100%) rename .github/workflows/{sycl_windows_build.yml => sycl-windows-build.yml} (100%) rename .github/workflows/{sycl_windows_precommit.yml => sycl-windows-precommit.yml} (80%) rename .github/workflows/{sycl_windows_run_tests.yml => sycl-windows-run-tests.yml} (100%) diff --git a/.github/workflows/sycl_aws.yml b/.github/workflows/sycl-aws.yml similarity index 100% rename from .github/workflows/sycl_aws.yml rename to .github/workflows/sycl-aws.yml diff --git a/.github/workflows/sycl_containers.yaml b/.github/workflows/sycl-containers.yaml similarity index 98% rename from .github/workflows/sycl_containers.yaml rename to .github/workflows/sycl-containers.yaml index 011d45ee9fee5..bb2e086929847 100644 --- a/.github/workflows/sycl_containers.yaml +++ b/.github/workflows/sycl-containers.yaml @@ -12,14 +12,14 @@ on: - 'devops/dependencies.json' - 'devops/scripts/install_drivers.sh' - 'devops/scripts/install_build_tools.sh' - - '.github/workflows/sycl_containers.yaml' + - '.github/workflows/sycl-containers.yaml' pull_request: paths: - 'devops/containers/**' - 'devops/dependencies.json' - 'devops/scripts/install_drivers.sh' - 'devops/scripts/install_build_tools.sh' - - '.github/workflows/sycl_containers.yaml' + - '.github/workflows/sycl-containers.yaml' jobs: base_image_ubuntu2204: diff --git a/.github/workflows/sycl_detect_changes.yml b/.github/workflows/sycl-detect-changes.yml similarity index 100% rename from .github/workflows/sycl_detect_changes.yml rename to .github/workflows/sycl-detect-changes.yml diff --git a/.github/workflows/gh_pages.yml b/.github/workflows/sycl-docs.yml similarity index 97% rename from .github/workflows/gh_pages.yml rename to .github/workflows/sycl-docs.yml index d0d4a130d6f78..73062642535cf 100644 --- a/.github/workflows/gh_pages.yml +++ b/.github/workflows/sycl-docs.yml @@ -7,7 +7,7 @@ on: branches: - sycl paths: - - '.github/workflows/gh_pages.yml' + - '.github/workflows/sycl-docs.yml' - 'clang/docs/**' - 'sycl/doc/**' diff --git a/.github/workflows/sycl_linux_build.yml b/.github/workflows/sycl-linux-build.yml similarity index 100% rename from .github/workflows/sycl_linux_build.yml rename to .github/workflows/sycl-linux-build.yml diff --git a/.github/workflows/sycl_linux_matrix_e2e_on_nightly.yml b/.github/workflows/sycl-linux-matrix-e2e-on-nightly.yml similarity index 94% rename from .github/workflows/sycl_linux_matrix_e2e_on_nightly.yml rename to .github/workflows/sycl-linux-matrix-e2e-on-nightly.yml index 50dc1ca700035..06a5f13674768 100644 --- a/.github/workflows/sycl_linux_matrix_e2e_on_nightly.yml +++ b/.github/workflows/sycl-linux-matrix-e2e-on-nightly.yml @@ -48,7 +48,7 @@ jobs: image: ghcr.io/intel/llvm/sycl_ubuntu2204_nightly:build image_options: -u 1001 --gpus all --cap-add SYS_ADMIN target_devices: ext_oneapi_cuda:gpu - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: ${{ matrix.name }} runner: ${{ matrix. runner }} @@ -62,7 +62,7 @@ jobs: aws_start: name: AWS Start - uses: ./.github/workflows/sycl_aws.yml + uses: ./.github/workflows/sycl-aws.yml secrets: inherit with: mode: start @@ -71,7 +71,7 @@ jobs: linux_e2e_on_nightly_aws: name: '[AWS][CUDA] E2E on Nightly' needs: [aws_start] - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: CUDA runner: '["aws-cuda_${{ github.run_id }}-${{ github.run_attempt }}"]' @@ -86,7 +86,7 @@ jobs: name: AWS Stop needs: [aws_start, linux_e2e_on_nightly_aws] if: always() - uses: ./.github/workflows/sycl_aws.yml + uses: ./.github/workflows/sycl-aws.yml secrets: inherit with: mode: stop diff --git a/.github/workflows/sycl_linux_precommit_aws.yml b/.github/workflows/sycl-linux-precommit-aws.yml similarity index 98% rename from .github/workflows/sycl_linux_precommit_aws.yml rename to .github/workflows/sycl-linux-precommit-aws.yml index 3fce40107d90e..8ff68e725e447 100644 --- a/.github/workflows/sycl_linux_precommit_aws.yml +++ b/.github/workflows/sycl-linux-precommit-aws.yml @@ -54,7 +54,7 @@ jobs: e2e-cuda: needs: [aws-start] - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: CUDA E2E runner: '["aws_cuda-${{ github.event.workflow_run.id }}-${{ github.event.workflow_run.run_attempt }}"]' diff --git a/.github/workflows/sycl_linux_precommit.yml b/.github/workflows/sycl-linux-precommit.yml similarity index 94% rename from .github/workflows/sycl_linux_precommit.yml rename to .github/workflows/sycl-linux-precommit.yml index 8df5237d63d04..eb3f30aebcfe6 100644 --- a/.github/workflows/sycl_linux_precommit.yml +++ b/.github/workflows/sycl-linux-precommit.yml @@ -18,8 +18,8 @@ on: - 'clang/docs/**' - '**.md' - '**.rst' - - '.github/workflows/sycl_windows_*.yml' - - '.github/workflows/sycl_macos_*.yml' + - '.github/workflows/sycl-windows-*.yml' + - '.github/workflows/sycl-macos-*.yml' - 'devops/containers/**' - 'devops/actions/build_container/**' @@ -30,12 +30,12 @@ concurrency: jobs: detect_changes: - uses: ./.github/workflows/sycl_detect_changes.yml + uses: ./.github/workflows/sycl-detect-changes.yml build: needs: [detect_changes] if: always() && success() - uses: ./.github/workflows/sycl_linux_build.yml + uses: ./.github/workflows/sycl-linux-build.yml with: build_ref: ${{ github.sha }} merge_ref: '' @@ -75,7 +75,7 @@ jobs: install_drivers: ${{ contains(needs.detect_changes.outputs.filters, 'drivers') }} extra_lit_opts: --param matrix-xmx8=True --param gpu-intel-dg2=True env: '{"LIT_FILTER":"Matrix/"}' - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: ${{ matrix.name }} runner: ${{ matrix. runner }} @@ -111,7 +111,7 @@ jobs: runner: '["Linux", "gen12"]' - name: Perf tests on Intel Arc A-Series Graphics system runner: '["Linux", "arc"]' - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: ${{ matrix.name }} runner: ${{ matrix. runner }} diff --git a/.github/workflows/sycl_linux_run_tests.yml b/.github/workflows/sycl-linux-run-tests.yml similarity index 100% rename from .github/workflows/sycl_linux_run_tests.yml rename to .github/workflows/sycl-linux-run-tests.yml diff --git a/.github/workflows/sycl_macos_build_and_test.yml b/.github/workflows/sycl-macos-build-and-test.yml similarity index 100% rename from .github/workflows/sycl_macos_build_and_test.yml rename to .github/workflows/sycl-macos-build-and-test.yml diff --git a/.github/workflows/sycl_nightly.yml b/.github/workflows/sycl-nightly.yml similarity index 96% rename from .github/workflows/sycl_nightly.yml rename to .github/workflows/sycl-nightly.yml index e802af89a29b4..28df4a8b8de13 100644 --- a/.github/workflows/sycl_nightly.yml +++ b/.github/workflows/sycl-nightly.yml @@ -8,7 +8,7 @@ on: jobs: ubuntu2204_build: if: github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_linux_build.yml + uses: ./.github/workflows/sycl-linux-build.yml secrets: inherit with: build_cache_root: "/__w/" @@ -59,7 +59,7 @@ jobs: image: ghcr.io/intel/llvm/ubuntu2204_build:latest image_options: -u 1001 --gpus all --cap-add SYS_ADMIN target_devices: ext_oneapi_cuda:gpu - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: ${{ matrix.name }} runner: ${{ matrix.runner }} @@ -74,7 +74,7 @@ jobs: sycl_toolchain_decompress_command: ${{ needs.ubuntu2204_build.outputs.artifact_decompress_command }} build-win: - uses: ./.github/workflows/sycl_windows_build.yml + uses: ./.github/workflows/sycl-windows-build.yml if: github.repository == 'intel/llvm' with: retention-days: 90 @@ -89,7 +89,7 @@ jobs: always() && !cancelled() && needs.build.outputs.build_conclusion == 'success' - uses: ./.github/workflows/sycl_windows_run_tests.yml + uses: ./.github/workflows/sycl-windows-run-tests.yml with: name: Intel GEN12 Graphics with Level Zero runner: '["Windows","gen12"]' diff --git a/.github/workflows/sycl_post_commit.yml b/.github/workflows/sycl-post-commit.yml similarity index 88% rename from .github/workflows/sycl_post_commit.yml rename to .github/workflows/sycl-post-commit.yml index 76dd3ac88e3d2..00205a52b03e1 100644 --- a/.github/workflows/sycl_post_commit.yml +++ b/.github/workflows/sycl-post-commit.yml @@ -12,10 +12,10 @@ on: - sycl - sycl-devops-pr/** paths: - - .github/workflows/sycl_post_commit.yml - - .github/workflows/sycl_linux_build.yml - - .github/workflows/sycl_linux_run_tests.yml - - .github/workflows/sycl_macos_build_and_test.yml + - .github/workflows/sycl-post-commit.yml + - .github/workflows/sycl-linux-build.yml + - .github/workflows/sycl-linux-run-tests.yml + - .github/workflows/sycl-macos-build-and-test.yml - ./devops/actions/cleanup - ./devops/actions/cached_checkout @@ -23,7 +23,7 @@ jobs: build-lin: name: Linux (Self build + shared libraries + no-assertions) if: github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_linux_build.yml + uses: ./.github/workflows/sycl-linux-build.yml with: build_cache_root: "/__w/llvm" build_cache_suffix: sprod_shared @@ -63,7 +63,7 @@ jobs: env: '{"LIT_FILTER":"PerformanceTests/"}' extra_lit_opts: -a -j 1 --param enable-perf-tests=True target_devices: all - uses: ./.github/workflows/sycl_linux_run_tests.yml + uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: ${{ matrix.name }} runner: ${{ matrix. runner }} @@ -87,7 +87,7 @@ jobs: always() && success() && github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_windows_build.yml + uses: ./.github/workflows/sycl-windows-build.yml e2e-win: needs: build-win @@ -96,7 +96,7 @@ jobs: always() && !cancelled() && needs.build-win.outputs.build_conclusion == 'success' - uses: ./.github/workflows/sycl_windows_run_tests.yml + uses: ./.github/workflows/sycl-windows-run-tests.yml with: name: Intel GEN12 Graphics with Level Zero runner: '["Windows","gen12"]' @@ -105,4 +105,4 @@ jobs: macos_default: name: macOS if: github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_macos_build_and_test.yml + uses: ./.github/workflows/sycl-macos-build-and-test.yml diff --git a/.github/workflows/sycl_stale_issues.yml b/.github/workflows/sycl-stale-issues.yml similarity index 100% rename from .github/workflows/sycl_stale_issues.yml rename to .github/workflows/sycl-stale-issues.yml diff --git a/.github/workflows/sync-main.yml b/.github/workflows/sycl-sync-main.yml similarity index 100% rename from .github/workflows/sync-main.yml rename to .github/workflows/sycl-sync-main.yml diff --git a/.github/workflows/sycl_update_gpu_driver.yml b/.github/workflows/sycl-update-gpu-driver.yml similarity index 100% rename from .github/workflows/sycl_update_gpu_driver.yml rename to .github/workflows/sycl-update-gpu-driver.yml diff --git a/.github/workflows/sycl_windows_build.yml b/.github/workflows/sycl-windows-build.yml similarity index 100% rename from .github/workflows/sycl_windows_build.yml rename to .github/workflows/sycl-windows-build.yml diff --git a/.github/workflows/sycl_windows_precommit.yml b/.github/workflows/sycl-windows-precommit.yml similarity index 80% rename from .github/workflows/sycl_windows_precommit.yml rename to .github/workflows/sycl-windows-precommit.yml index 2b7883895d97c..3b96b463560a9 100644 --- a/.github/workflows/sycl_windows_precommit.yml +++ b/.github/workflows/sycl-windows-precommit.yml @@ -16,9 +16,9 @@ on: - 'clang/docs/**' - '**.md' - '**.rst' - - '.github/workflows/sycl_linux_*.yml' - - '.github/workflows/sycl_precommit_aws.yml' - - '.github/workflows/sycl_macos_*.yml' + - '.github/workflows/sycl-linux-*.yml' + - '.github/workflows/sycl-precommit-aws.yml' + - '.github/workflows/sycl-macos-*.yml' - 'devops/containers/**' - 'devops/actions/build_container/**' @@ -32,14 +32,14 @@ concurrency: jobs: detect_changes: - uses: ./.github/workflows/sycl_detect_changes.yml + uses: ./.github/workflows/sycl-detect-changes.yml build: needs: [detect_changes] if: | always() && success() && github.repository == 'intel/llvm' - uses: ./.github/workflows/sycl_windows_build.yml + uses: ./.github/workflows/sycl-windows-build.yml with: changes: ${{ needs.detect_changes.outputs.filters }} @@ -50,7 +50,7 @@ jobs: always() && !cancelled() && needs.build.outputs.build_conclusion == 'success' - uses: ./.github/workflows/sycl_windows_run_tests.yml + uses: ./.github/workflows/sycl-windows-run-tests.yml with: name: Intel GEN12 Graphics with Level Zero runner: '["Windows","gen12"]' diff --git a/.github/workflows/sycl_windows_run_tests.yml b/.github/workflows/sycl-windows-run-tests.yml similarity index 100% rename from .github/workflows/sycl_windows_run_tests.yml rename to .github/workflows/sycl-windows-run-tests.yml diff --git a/README.md b/README.md index bf72abab3e686..8ab5cda1621bb 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,8 @@ For general contribution process see [CONTRIBUTING.md](./CONTRIBUTING.md) [![oneAPI logo](https://spec.oneapi.io/oneapi-logo-white-scaled.jpg)](https://www.oneapi.io/) -[![SYCL Post Commit](https://github.com/intel/llvm/actions/workflows/sycl_post_commit.yml/badge.svg?branch=sycl)](https://github.com/intel/llvm/actions/workflows/sycl_post_commit.yml) -[![Generate Doxygen documentation](https://github.com/intel/llvm/actions/workflows/gh_pages.yml/badge.svg?branch=sycl)](https://github.com/intel/llvm/actions/workflows/gh_pages.yml) +[![SYCL Post Commit](https://github.com/intel/llvm/actions/workflows/sycl-post-commit.yml/badge.svg?branch=sycl)](https://github.com/intel/llvm/actions/workflows/sycl-post-commit.yml) +[![Generate Doxygen documentation](https://github.com/intel/llvm/actions/workflows/sycl-docs.yml/badge.svg?branch=sycl)](https://github.com/intel/llvm/actions/workflows/sycl-docs.yml) The DPC++ is a LLVM-based compiler project that implements compiler and runtime support for the SYCL\* language. The project is hosted in the From af448b06107a41b409987b36a7d1bd6f6bb50411 Mon Sep 17 00:00:00 2001 From: Victor Perez Date: Thu, 1 Feb 2024 12:43:49 +0100 Subject: [PATCH 08/14] [SYCL][Fusion] Handle fusion leading to synchronization issues (#12538) Do not allow fusion when one of the kernels has an explicit local size and it requires ID remapping, i.e., it has a different number of dimensions w.r.t. the fused ND-range or different global size in dimensions [2, N). In this case, two work-items belonging to the same work-group may not belong to the same work-group in the fused ND-range. Signed-off-by: Victor Perez --------- Signed-off-by: Victor Perez --- sycl-fusion/common/lib/NDRangesHelper.cpp | 12 ++++++++ sycl/doc/design/KernelFusionJIT.md | 3 +- sycl/test-e2e/KernelFusion/abort_fusion.cpp | 33 +++++++++++++++++---- 3 files changed, 41 insertions(+), 7 deletions(-) diff --git a/sycl-fusion/common/lib/NDRangesHelper.cpp b/sycl-fusion/common/lib/NDRangesHelper.cpp index 96f26d96a4ea5..7c418022ef4ee 100644 --- a/sycl-fusion/common/lib/NDRangesHelper.cpp +++ b/sycl-fusion/common/lib/NDRangesHelper.cpp @@ -170,6 +170,18 @@ jit_compiler::FusedNDRange::get(ArrayRef NDRanges) { "Cannot fuse kernels whose fusion would " "yield non-uniform work-group sizes"); } + + // Work-items in the same work-group in the original ND-ranges must be in + // the same work-group in the fused one. + if (LocalSize && any_of(NDRanges, [&Fused](const NDRange &NDR) { + return NDR.hasSpecificLocalSize() && requireIDRemapping(Fused, NDR); + })) { + return createStringError( + inconvertibleErrorCode(), + "Cannot fuse kernels when any of the fused kernels with a specific " + "local size has different global sizes in dimensions [2, N) or " + "different number of dimensions"); + } } return FusedNDRange{Fused, IsHeterogeneousList, NDRanges}; diff --git a/sycl/doc/design/KernelFusionJIT.md b/sycl/doc/design/KernelFusionJIT.md index b83a8a26eeae0..224a1984d2902 100644 --- a/sycl/doc/design/KernelFusionJIT.md +++ b/sycl/doc/design/KernelFusionJIT.md @@ -212,7 +212,8 @@ These restrictions can be simplified to: - No two local sizes specified by the nd-ranges will be different; - No global id remapping is needed ([see](#work-item-remapping)) or all input offsets are 0; -- All the fused nd-ranges must have the same offset. +- All the fused nd-ranges must have the same offset; +- No global id remapping is needed for kernels specifying a local size. As we can see, there is no restrictions in the number of dimensions or global sizes of the input nd-ranges. diff --git a/sycl/test-e2e/KernelFusion/abort_fusion.cpp b/sycl/test-e2e/KernelFusion/abort_fusion.cpp index 709befa514915..930fcc12eff86 100644 --- a/sycl/test-e2e/KernelFusion/abort_fusion.cpp +++ b/sycl/test-e2e/KernelFusion/abort_fusion.cpp @@ -15,14 +15,19 @@ enum class Internalization { None, Local, Private }; template size_t getSize(Range r); -template <> size_t getSize(range<1> r) { return r.size(); } -template <> size_t getSize(nd_range<1> r) { +template size_t getSize(range r) { + return r.size(); +} +template size_t getSize(nd_range r) { return r.get_global_range().size(); } template void performFusion(queue &q, Range1 R1, Range2 R2) { + using IndexTy1 = item; + using IndexTy2 = item; + int in[dataSize], tmp[dataSize], out[dataSize]; for (size_t i = 0; i < dataSize; ++i) { @@ -43,15 +48,19 @@ void performFusion(queue &q, Range1 R1, Range2 R2) { q.submit([&](handler &cgh) { auto accIn = bIn.get_access(cgh); auto accTmp = bTmp.get_access(cgh); - cgh.parallel_for( - R1, [=](item<1> i) { accTmp[i] = accIn[i] + 5; }); + cgh.parallel_for(R1, [=](IndexTy1 i) { + size_t j = i.get_linear_id(); + accTmp[j] = accIn[j] + 5; + }); }); q.submit([&](handler &cgh) { auto accTmp = bTmp.get_access(cgh); auto accOut = bOut.get_access(cgh); - cgh.parallel_for( - R2, [=](id<1> i) { accOut[i] = accTmp[i] * 2; }); + cgh.parallel_for(R2, [=](IndexTy2 i) { + size_t j = i.get_linear_id(); + accOut[j] = accTmp[j] * 2; + }); }); fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}}); @@ -117,5 +126,17 @@ int main() { // CHECK-NEXT: Cannot fuse kernels whose fusion would yield non-uniform work-group sizes // CHECK: COMPUTATION OK + // Scenario: Fusing two kernels that may lead to synchronization issues as two + // work-items in the same work-group may not be in the same work-group in the + // fused ND-range. + performFusion( + q, nd_range<2>{range<2>{2, 2}, range<2>{2, 2}}, + nd_range<2>{range<2>{4, 4}, range<2>{2, 2}}); + // CHECK: ERROR: JIT compilation for kernel fusion failed with message: + // CHECK-NEXT: Illegal ND-range combination + // CHECK-NEXT: Detailed information: + // CHECK-NEXT: Cannot fuse kernels when any of the fused kernels with a specific local size has different global sizes in dimensions [2, N) or different number of dimensions + // CHECK: COMPUTATION OK + return 0; } From e402523e67e9bb91ec44d714ab4d57a9ecf2effe Mon Sep 17 00:00:00 2001 From: Pierre-Andre Saulais Date: Thu, 1 Feb 2024 12:33:29 +0000 Subject: [PATCH 09/14] [UR][CUDA] Use new variant of the enableCUDATracing function (#12521) https://github.com/oneapi-src/unified-runtime/pull/1070 and https://github.com/intel/llvm/pull/11952 introduced a new variant of the `enableCUDATracing` function that takes a context pointer parameter, replacing the parameterless variant of that function. The older variant will be removed from UR once this PR is merged. --- sycl/plugins/cuda/pi_cuda.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 4b8163b03efbd..6d50d120aa2f9 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -17,7 +17,12 @@ #include // Forward declarations -void enableCUDATracing(); +struct cuda_tracing_context_t_; + +void enableCUDATracing(cuda_tracing_context_t_ *ctx); +void disableCUDATracing(cuda_tracing_context_t_ *ctx); +cuda_tracing_context_t_ *createCUDATracingContext(); +void freeCUDATracingContext(cuda_tracing_context_t_ *Ctx); //-- PI API implementation extern "C" { @@ -1237,7 +1242,8 @@ pi_result piPluginInit(pi_plugin *PluginInit) { std::memset(&(PluginInit->PiFunctionTable), 0, sizeof(PluginInit->PiFunctionTable)); - enableCUDATracing(); + cuda_tracing_context_t_ *Ctx = createCUDATracingContext(); + enableCUDATracing(Ctx); // Forward calls to CUDA RT. #define _PI_API(api) \ From f9e4f10662c15fdd5d77313b9fb1ca8614976c2c Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Thu, 1 Feb 2024 12:35:25 +0000 Subject: [PATCH 10/14] [SYCL][CUDA] Improved joint_matrix layout test coverage. (#12483) Improved joint_matrix layout test coverage. The test framework that the cuda backend tests use has been updated to support all possible `joint_matrix` gemm API combinations, including all matrix layouts. the gemm header is backend agnostic; hence all backends could use this test framework in the future. This test framework can also act as an example to show how to deal with different layout combinations when computing a general GEMM. Signed-off-by: JackAKirk --- .../Matrix/joint_matrix_gemm_cuda.hpp | 105 ++++++++++++------ .../Matrix/joint_matrix_tensorcores_sm70.cpp | 15 ++- .../Matrix/joint_matrix_tensorcores_sm72.cpp | 19 +++- .../Matrix/joint_matrix_tensorcores_sm80.cpp | 22 +++- 4 files changed, 118 insertions(+), 43 deletions(-) diff --git a/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp b/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp index fe5b110864e6b..9fd4f184692be 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp @@ -11,7 +11,7 @@ using namespace sycl::ext::oneapi; using namespace sycl::ext::oneapi::experimental::matrix; constexpr float bf16_eps = 0.00390625; -// Example usage of Nvidia matrix multiply. +// Example usage of joint_matrix matrix multiply. // Optimizations such as memory paddings for avoiding bank conflicts are not // included in this test which aids clarity for what is going on. This example // forms a "Big matrix" corresponding to a single "TILE" using cuda example @@ -30,37 +30,47 @@ constexpr float bf16_eps = 0.00390625; constexpr int N_THREADS_PER_MATRIX_OP = 32; // number of submatrices per row of accumulator ("C", "D") matrices. -constexpr int SUB_TILES_M = 3; +constexpr int SUB_TILES_M = 2; // number of submatrices per col of accumulator matrices. constexpr int SUB_TILES_N = 2; // number of submatrices per col of "A"/per row of "B", matrices. -constexpr int SUB_TILES_K = 1; +constexpr int SUB_TILES_K = 2; -template +template class TypeHelper; -template -using KernelName = class TypeHelper; +template +using KernelName = + class TypeHelper; -template +template Tc matrix_ref_mn(const int &m, const int &n, Tm *A, Tm *B, Tc *C) { Tc res = C[m * Big_N + n]; - if constexpr (std::is_same::value) { - for (int k = 0; k < Big_K; k++) - res += A[m * Big_K + k] * B[k * Big_N + n]; - } else { - for (int k = 0; k < Big_K; k++) - res += - static_cast(A[m * Big_K + k]) * static_cast(B[k * Big_N + n]); + for (int k = 0; k < Big_K; k++) { + auto index_a = + layout_A == layout::row_major ? m * Big_K + k : m + k * Big_M; + auto index_b = + layout_B == layout::row_major ? k * Big_N + n : k + n * Big_K; + + if constexpr (std::is_same::value) { + res += A[index_a] * B[index_b]; + } else { + res += static_cast(A[index_a]) * static_cast(B[index_b]); + } } return res; } -template > +template < + typename Tm, typename Tc, typename Td, size_t Sub_Tiles_M, + size_t Sub_Tiles_K, size_t Sub_Tiles_N, size_t M, size_t K, size_t N, + layout layout_A = layout::row_major, layout layout_B = layout::row_major, + layout layout_C = layout::row_major, typename T3 = std::remove_const_t> void test(queue &q) { // total number of M dimension matrix elements for the "Big matrix". constexpr auto Big_M = Sub_Tiles_M * M; @@ -97,7 +107,8 @@ void test(queue &q) { accessor accA(bufA, cgh); - cgh.parallel_for>( + cgh.parallel_for>( range<1>(Big_M * Big_K), [=](item<1> item) { auto i = item.get_linear_id(); accA[i] = 0.1f * (i % 10); @@ -107,7 +118,8 @@ void test(queue &q) { accessor accB(bufB, cgh); - cgh.parallel_for>( + cgh.parallel_for>( range<1>(Big_K * Big_N), [=](item<1> item) { auto i = item.get_linear_id(); accB[i] = 0.1f * (i % 10); @@ -130,7 +142,8 @@ void test(queue &q) { range<2> GlobalRange = {Sub_Tiles_M, Sub_Tiles_N * N_THREADS_PER_MATRIX_OP}; - cgh.parallel_for>( + cgh.parallel_for< + KernelName>( nd_range<2>(GlobalRange, LocalRange), [=](nd_item<2> item) { sycl::sub_group sg = item.get_sub_group(); // row id of current submatrix of BIG C matrix @@ -138,33 +151,46 @@ void test(queue &q) { // column id of current submatrix of BIG C matrix const auto n = item.get_group().get_group_id()[1]; - joint_matrix - sub_a; - joint_matrix - sub_b; + joint_matrix sub_a; + joint_matrix sub_b; joint_matrix, use::accumulator, M, N> sub_c; joint_matrix sub_d; + auto stride_C = layout_C == layout::row_major ? Big_N : Big_M; + auto load_stride_C = layout_C == layout::row_major + ? (m * M) * Big_N + n * N + : (m * M) + n * N * Big_M; joint_matrix_load( sg, sub_c, accC.template get_multi_ptr() + - (m * M) * Big_N + n * N, - Big_N, layout::row_major); + load_stride_C, + stride_C, layout_C); + + auto stride_A = layout_A == layout::row_major ? Big_K : Big_M; + auto stride_B = layout_B == layout::row_major ? Big_N : Big_K; + // k = row/col id of current submatrix of BIG A/B matrices for (int k = 0; k < Sub_Tiles_K; k++) { + auto load_stride_A = layout_A == layout::row_major + ? (k * K) + (m * M * Big_K) + : (k * K * Big_M) + (m * M); + auto load_stride_B = layout_B == layout::row_major + ? (k * K * Big_N) + (n * N) + : (k * K) + (n * N * Big_K); + joint_matrix_load( sg, sub_a, accA.template get_multi_ptr() + - (k * K) + (m * M * Big_K), - Big_K); + load_stride_A, + stride_A); joint_matrix_load( sg, sub_b, accB.template get_multi_ptr() + - (k * K * Big_N) + (n * N), - Big_N); + load_stride_B, + stride_B); // round values to correct precision if using tf32 if constexpr (std::is_same::value) { @@ -174,12 +200,13 @@ void test(queue &q) { } joint_matrix_mad(sg, sub_d, sub_a, sub_b, sub_c); + joint_matrix_copy(sg, sub_d, sub_c); } joint_matrix_store( sg, sub_d, accD.template get_multi_ptr() + - (m * M) * Big_N + n * N, - Big_N, layout::row_major); + load_stride_C, + stride_C, layout_C); }); }); q.wait(); @@ -187,14 +214,18 @@ void test(queue &q) { for (int m = 0; m < Big_M; m++) { for (int n = 0; n < Big_N; n++) { + auto index_D = + layout_C == layout::row_major ? m * Big_N + n : m + n * Big_M; if constexpr (std::is_same, bfloat16>::value) { - auto res_device = matrix_ref_mn(m, n, A, B, C); - assert(fabs(2 * (D[m * Big_N + n] - res_device)) / - (D[m * Big_N + n] + res_device) < + auto res_device = + matrix_ref_mn(m, n, A, B, + C); + assert(fabs(2 * (D[index_D] - res_device)) / (D[index_D] + res_device) < bf16_eps * 2); } else { - assert( - (D[m * Big_N + n] == matrix_ref_mn(m, n, A, B, C))); + assert((D[index_D] == + matrix_ref_mn(m, n, A, + B, C))); } } } diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp index f28372b6277dc..a558600ad390c 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp @@ -80,12 +80,23 @@ int main() { test(Q); + // test different layout combinations for one case + + test(Q); + test(Q); + test(Q); + test(Q); + + // joint_matrix_apply tests + auto apply_add = [](auto &x) { x = x + 2; }; float D[MATRIX_M][MATRIX_N]; big_matrix MD_f((float *)&D); - // joint_matrix_apply tests - matrix_verify_lambda(Q, MD_f, 0.0, apply_add); } diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp index cea15392408cc..1dea8c879b5eb 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm72.cpp @@ -50,13 +50,28 @@ int main() { test(Q); + // test different layout combinations for one case + + test(Q); + test(Q); + test(Q); + test(Q); + + // joint_matrix_apply tests + auto apply_add = [](auto &x) { x = x + 2; }; int32_t D_i[MATRIX_M][MATRIX_N]; big_matrix MD_i((int32_t *)&D_i); - // joint_matrix_apply tests - matrix_verify_lambda(Q, MD_i, 0, apply_add); matrix_verify_lambda(Q, MD_i, 0, apply_add); } diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp index 2a0731d9b988e..ca823161b6197 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm80.cpp @@ -43,9 +43,28 @@ int main() { // A/B tf32 test(Q); test(Q); + 16, 8, 16, layout::row_major, layout::row_major, layout::row_major, + precision::tf32>(Q); + + // test different layout combinations for one case + + test(Q); + test(Q); + test(Q); + test(Q); + + // joint_matrix_apply tests float D[MATRIX_M][MATRIX_N]; big_matrix MD_f((float *)&D); @@ -54,7 +73,6 @@ int main() { big_matrix MD_d((double *)&D_d); auto apply_add = [](auto &x) { x = x + 2; }; - // joint_matrix_apply tests matrix_verify_lambda(Q, MD_f, 0.0, apply_add); matrix_verify_lambda(Q, MD_d, -60.0, apply_add); From f7a360deecc69ce11629f5bc540a236ea38f29a5 Mon Sep 17 00:00:00 2001 From: jinge90 Date: Fri, 2 Feb 2024 00:14:44 +0800 Subject: [PATCH 11/14] [SYCL][NFC] Fix some 'startswith/endswith' related to SYCL (#12573) Replace some deprecated 'startswith' and 'endswith' with 'starts_with' and 'ends_with' to clear some warnings when building SYCL compiler. --------- Signed-off-by: jinge90 --- clang/lib/CodeGen/CodeGenTypes.cpp | 4 ++-- clang/lib/Driver/Driver.cpp | 4 ++-- clang/lib/Driver/ToolChains/Linux.cpp | 2 +- clang/lib/Driver/ToolChains/SYCL.cpp | 9 +++++---- clang/lib/Driver/ToolChains/SYCL.h | 2 +- clang/lib/Sema/SemaExpr.cpp | 10 +++++----- clang/lib/Sema/SemaSYCL.cpp | 2 +- clang/tools/clang-offload-deps/ClangOffloadDeps.cpp | 2 +- llvm/lib/SYCLLowerIR/ESIMD/ESIMDUtils.cpp | 4 ++-- llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp | 10 +++++----- llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp | 2 +- llvm/lib/Support/PropertySetIO.cpp | 2 +- llvm/lib/Support/SimpleTable.cpp | 4 ++-- llvm/lib/TargetParser/Triple.cpp | 2 +- llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp | 2 +- .../Transforms/Instrumentation/SPIRITTAnnotations.cpp | 6 +++--- llvm/tools/sycl-post-link/ModuleSplitter.cpp | 6 +++--- llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp | 2 +- llvm/tools/sycl-post-link/SpecConstants.cpp | 8 ++++---- llvm/tools/sycl-post-link/sycl-post-link.cpp | 2 +- 20 files changed, 43 insertions(+), 42 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index 05fc203797f25..7642d3d79a666 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -342,8 +342,8 @@ llvm::Type *CodeGenTypes::ConvertSYCLJointMatrixINTELType(RecordDecl *RD) { if (CompTy->isStructTy()) { StringRef LlvmTyName = CompTy->getStructName(); // Emit half/int16/float for sycl[::*]::{half,bfloat16,tf32} - if (LlvmTyName.startswith("class.sycl::") || - LlvmTyName.startswith("class.__sycl_internal::")) + if (LlvmTyName.starts_with("class.sycl::") || + LlvmTyName.starts_with("class.__sycl_internal::")) LlvmTyName = LlvmTyName.rsplit("::").second; if (LlvmTyName == "half") { CompTy = llvm::Type::getHalfTy(getLLVMContext()); diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 21a563ce99c45..6265280877e9f 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -3518,7 +3518,7 @@ getLinkerArgs(Compilation &C, DerivedArgList &Args, bool IncludeObj = false) { // manner than the OpenMP processing. We should try and refactor this // to use the OpenMP flow (adding -l to the llvm-link step) auto resolveStaticLib = [&](StringRef LibName, bool IsStatic) -> bool { - if (!LibName.startswith("-l")) + if (!LibName.starts_with("-l")) return false; for (auto &LPath : LibPaths) { if (!IsStatic) { @@ -3663,7 +3663,7 @@ static bool IsSYCLDeviceLibObj(std::string ObjFilePath, bool isMSVCEnv) { StringRef ObjFileName = llvm::sys::path::filename(ObjFilePath); StringRef ObjSuffix = isMSVCEnv ? ".obj" : ".o"; bool Ret = - (ObjFileName.startswith("libsycl-") && ObjFileName.endswith(ObjSuffix)) + (ObjFileName.starts_with("libsycl-") && ObjFileName.ends_with(ObjSuffix)) ? true : false; return Ret; diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index ae33f79198692..01887706d8757 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -344,7 +344,7 @@ Linux::Linux(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) // The deprecated -DLLVM_ENABLE_PROJECTS=libcxx configuration installs // libc++.so in D.Dir+"/../lib/". Detect this path. // TODO Remove once LLVM_ENABLE_PROJECTS=libcxx is unsupported. - if (StringRef(D.Dir).startswith(SysRoot) && + if (StringRef(D.Dir).starts_with(SysRoot) && (Args.hasArg(options::OPT_fsycl) || D.getVFS().exists(D.Dir + "/../lib/libsycl.so"))) addPathIfExists(D, D.Dir + "/../lib", Paths); diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp index 32ad99ba384b0..ca68b97858027 100644 --- a/clang/lib/Driver/ToolChains/SYCL.cpp +++ b/clang/lib/Driver/ToolChains/SYCL.cpp @@ -401,8 +401,9 @@ const char *SYCL::Linker::constructLLVMLinkCommand( LibPostfix = ".cubin"; } StringRef LibSyclPrefix("libsycl-"); - if (!InputFilename.startswith(LibSyclPrefix) || - !InputFilename.endswith(LibPostfix) || (InputFilename.count('-') < 2)) + if (!InputFilename.starts_with(LibSyclPrefix) || + !InputFilename.ends_with(LibPostfix) || + (InputFilename.count('-') < 2)) return false; // Skip the prefix "libsycl-" std::string PureLibName = @@ -419,7 +420,7 @@ const char *SYCL::Linker::constructLLVMLinkCommand( PureLibName.substr(0, FinalDashPos) + PureLibName.substr(DotPos); } for (const auto &L : SYCLDeviceLibList) { - if (StringRef(PureLibName).startswith(L)) + if (StringRef(PureLibName).starts_with(L)) return true; } return false; @@ -1354,7 +1355,7 @@ void SYCLToolChain::AddImpliedTargetArgs(const llvm::Triple &Triple, for (auto *A : Args) { if (!A->getOption().matches(options::OPT_Xsycl_backend_EQ)) continue; - if (StringRef(A->getValue()).startswith("intel_gpu")) + if (StringRef(A->getValue()).starts_with("intel_gpu")) TargArgs.push_back(A->getValue(1)); } if (llvm::find_if(TargArgs, [&](auto Cur) { diff --git a/clang/lib/Driver/ToolChains/SYCL.h b/clang/lib/Driver/ToolChains/SYCL.h index 8e1c97dbbb546..2e1a30be76eda 100644 --- a/clang/lib/Driver/ToolChains/SYCL.h +++ b/clang/lib/Driver/ToolChains/SYCL.h @@ -125,7 +125,7 @@ constexpr char AmdGPU[] = "amd_gpu_"; template std::optional isGPUTarget(StringRef Target) { // Handle target specifications that resemble '(intel, nvidia, amd)_gpu_*' // here. - if (Target.startswith(GPUArh)) { + if (Target.starts_with(GPUArh)) { return resolveGenDevice(Target); } return std::nullopt; diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index dcd567d32b698..2c5f907b4797e 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -282,11 +282,11 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, ArrayRef Locs, .Default(false); }; if ((getEmissionReason(FDecl) == Sema::DeviceDiagnosticReason::Sycl) && - Id && !Id->getName().startswith("__spirv_") && - !Id->getName().startswith("__sycl_") && - !Id->getName().startswith("__devicelib_ConvertBF16ToFINTEL") && - !Id->getName().startswith("__devicelib_ConvertFToBF16INTEL") && - !Id->getName().startswith("__assert_fail") && + Id && !Id->getName().starts_with("__spirv_") && + !Id->getName().starts_with("__sycl_") && + !Id->getName().starts_with("__devicelib_ConvertBF16ToFINTEL") && + !Id->getName().starts_with("__devicelib_ConvertFToBF16INTEL") && + !Id->getName().starts_with("__assert_fail") && !isMsvcMathFn(Id->getName())) { SYCLDiagIfDeviceCode( *Locs.begin(), diag::err_sycl_device_function_is_called_from_esimd, diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index e2a8d10ded1ab..d0f8ddbe7193c 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -4295,7 +4295,7 @@ static void CheckSYCL2020SubGroupSizes(Sema &S, FunctionDecl *SYCLKernel, // No need to validate __spirv routines here since they // are mapped to the equivalent SPIRV operations. const IdentifierInfo *II = FD->getIdentifier(); - if (II && II->getName().startswith("__spirv_")) + if (II && II->getName().starts_with("__spirv_")) return; // Else we need to figure out why they don't match. diff --git a/clang/tools/clang-offload-deps/ClangOffloadDeps.cpp b/clang/tools/clang-offload-deps/ClangOffloadDeps.cpp index c9fa2f87a072f..5dbf044675106 100644 --- a/clang/tools/clang-offload-deps/ClangOffloadDeps.cpp +++ b/clang/tools/clang-offload-deps/ClangOffloadDeps.cpp @@ -173,7 +173,7 @@ int main(int argc, const char **argv) { // possibly reusing ClangOffloadBundler's 'OffloadTargetInfo'. for (const std::string &Target : Targets) { std::string Prefix = Target + "."; - if (Symbol.startswith(Prefix)) + if (Symbol.starts_with(Prefix)) Target2Symbols[Target].insert( Symbol.substr(Prefix.size(), Len - Prefix.size())); } diff --git a/llvm/lib/SYCLLowerIR/ESIMD/ESIMDUtils.cpp b/llvm/lib/SYCLLowerIR/ESIMD/ESIMDUtils.cpp index e104ab115e970..480ba1944ebba 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/ESIMDUtils.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/ESIMDUtils.cpp @@ -34,13 +34,13 @@ constexpr char SLM_ALLOCATOR_DTOR_SUFFIX[] = "EED2Ev"; bool isSlmAllocatorConstructor(const Function &F) { auto Name = F.getName(); return Name.starts_with(SLM_ALLOCATOR_CTOR_DTOR_PREFIX) && - Name.endswith(SLM_ALLOCATOR_CTOR_SUFFIX); + Name.ends_with(SLM_ALLOCATOR_CTOR_SUFFIX); } bool isSlmAllocatorDestructor(const Function &F) { auto Name = F.getName(); return Name.starts_with(SLM_ALLOCATOR_CTOR_DTOR_PREFIX) && - Name.endswith(SLM_ALLOCATOR_DTOR_SUFFIX); + Name.ends_with(SLM_ALLOCATOR_DTOR_SUFFIX); } bool isSlmInit(const Function &F) { diff --git a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp index 9a3aa8437a590..6ec1102f402ba 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp @@ -1596,15 +1596,15 @@ SmallPtrSet collectGenXVolatileTypes(Module &M) { // TODO FIXME relying on type name in LLVM IR is fragile, needs rework if (!GTy || !GTy->getName() .rtrim(".0123456789") - .endswith("sycl::_V1::ext::intel::esimd::simd")) + .ends_with("sycl::_V1::ext::intel::esimd::simd")) continue; assert(GTy->getNumContainedTypes() == 1); auto VTy = GTy->getContainedType(0); if ((GTy = dyn_cast(VTy))) { - assert( - GTy->getName() - .rtrim(".0123456789") - .endswith("sycl::_V1::ext::intel::esimd::detail::simd_obj_impl")); + assert(GTy->getName() + .rtrim(".0123456789") + .ends_with( + "sycl::_V1::ext::intel::esimd::detail::simd_obj_impl")); VTy = GTy->getContainedType(0); } assert(VTy->isVectorTy()); diff --git a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp index 264a8b4bc817a..2624e26116823 100644 --- a/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp +++ b/llvm/lib/SYCLLowerIR/PrepareSYCLNativeCPU.cpp @@ -283,7 +283,7 @@ static Function *addGetFunc(Module &M, StringRef Name, Type *StateType) { static Function *addReplaceFunc(Module &M, StringRef Name, Type *StateType) { Function *Res; const char GetPrefix[] = "__dpcpp_nativecpu_get"; - if (Name.startswith(GetPrefix)) { + if (Name.starts_with(GetPrefix)) { Res = addGetFunc(M, Name, StateType); } else if (Name == NativeCPUSetLocalId) { Res = addSetLocalIdFunc(M, Name, StateType); diff --git a/llvm/lib/Support/PropertySetIO.cpp b/llvm/lib/Support/PropertySetIO.cpp index 2a3586ba49339..ffb6394913fc9 100644 --- a/llvm/lib/Support/PropertySetIO.cpp +++ b/llvm/lib/Support/PropertySetIO.cpp @@ -37,7 +37,7 @@ PropertySetRegistry::read(const MemoryBuffer *Buf) { for (line_iterator LI(*Buf); !LI.is_at_end(); LI++) { // see if this line starts a new property set - if (LI->startswith("[")) { + if (LI->starts_with("[")) { // yes - parse the category (property name) auto EndPos = LI->rfind(']'); if (EndPos == StringRef::npos) diff --git a/llvm/lib/Support/SimpleTable.cpp b/llvm/lib/Support/SimpleTable.cpp index 434e7d7cb7670..ec47391450100 100644 --- a/llvm/lib/Support/SimpleTable.cpp +++ b/llvm/lib/Support/SimpleTable.cpp @@ -214,8 +214,8 @@ Expected SimpleTable::read(MemoryBuffer *Buf, return std::make_unique(); UPtrTy Res; - if (LI->startswith(COL_TITLE_LINE_OPEN)) { - if (!LI->endswith(COL_TITLE_LINE_CLOSE)) + if (LI->starts_with(COL_TITLE_LINE_OPEN)) { + if (!LI->ends_with(COL_TITLE_LINE_CLOSE)) return createStringError(errc::invalid_argument, "malformed title line"); // column titles present StringRef L = LI->substr(1, LI->size() - 2); // trim '[' and ']' diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index f0c4cd875f6fe..1e923ebae21f9 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -726,7 +726,7 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { (SubArchName.ends_with("r6el") || SubArchName.ends_with("r6"))) return Triple::MipsSubArch_r6; - if (SubArchName.startswith("spir")) { + if (SubArchName.starts_with("spir")) { StringRef SA(SubArchName); if (SA.consume_front("spir64_") || SA.consume_front("spir_")) { if (SA == "fpga") diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 1891e10679a0f..f37f1fdcfc2e9 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -838,7 +838,7 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) { // // extractelement <8 x i32> (bitcast <4 x i64> %X to <8 x i32>), i32 0 // ``` // can't be lowered by SPIR-V translator to "standard" format. - if (StringRef(Trunc.getModule()->getTargetTriple()).startswith("spir")) + if (StringRef(Trunc.getModule()->getTargetTriple()).starts_with("spir")) return nullptr; // Whenever an element is extracted from a vector, and then truncated, diff --git a/llvm/lib/Transforms/Instrumentation/SPIRITTAnnotations.cpp b/llvm/lib/Transforms/Instrumentation/SPIRITTAnnotations.cpp index b8dd7cda8883c..105643062aa90 100644 --- a/llvm/lib/Transforms/Instrumentation/SPIRITTAnnotations.cpp +++ b/llvm/lib/Transforms/Instrumentation/SPIRITTAnnotations.cpp @@ -243,7 +243,7 @@ bool insertAtomicInstrumentationCall(Module &M, StringRef Name, PreservedAnalyses SPIRITTAnnotationsPass::run(Module &M, ModuleAnalysisManager &MAM) { - assert(StringRef(M.getTargetTriple()).startswith("spir")); + assert(StringRef(M.getTargetTriple()).starts_with("spir")); bool IRModified = false; std::vector SPIRVCrossWGInstuctions = { SPIRV_CONTROL_BARRIER, SPIRV_GROUP_ALL, SPIRV_GROUP_ANY, @@ -299,7 +299,7 @@ PreservedAnalyses SPIRITTAnnotationsPass::run(Module &M, if (std::any_of(SPIRVCrossWGInstuctions.begin(), SPIRVCrossWGInstuctions.end(), [&CalleeName](StringRef Name) { - return CalleeName.startswith(Name); + return CalleeName.starts_with(Name); })) { Instruction *InstAfterBarrier = CI->getNextNode(); const DebugLoc &DL = CI->getDebugLoc(); @@ -307,7 +307,7 @@ PreservedAnalyses SPIRITTAnnotationsPass::run(Module &M, insertSimpleInstrumentationCall(M, ITT_ANNOTATION_WI_RESUME, InstAfterBarrier, DL); IRModified = true; - } else if (CalleeName.startswith(SPIRV_ATOMIC_INST)) { + } else if (CalleeName.starts_with(SPIRV_ATOMIC_INST)) { Instruction *InstAfterAtomic = CI->getNextNode(); IRModified |= insertAtomicInstrumentationCall( M, ITT_ANNOTATION_ATOMIC_START, CI, CI, CalleeName); diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.cpp b/llvm/tools/sycl-post-link/ModuleSplitter.cpp index a31ad613f9918..87cbf42da2df2 100644 --- a/llvm/tools/sycl-post-link/ModuleSplitter.cpp +++ b/llvm/tools/sycl-post-link/ModuleSplitter.cpp @@ -79,7 +79,7 @@ bool isSpirvSyclBuiltin(StringRef FName) { // now skip the digits FName = FName.drop_while([](char C) { return std::isdigit(C); }); - return FName.startswith("__spirv_") || FName.startswith("__sycl_"); + return FName.starts_with("__spirv_") || FName.starts_with("__sycl_"); } // Return true if the function is a ESIMD builtin @@ -91,12 +91,12 @@ bool isESIMDBuiltin(StringRef FName) { // now skip the digits FName = FName.drop_while([](char C) { return std::isdigit(C); }); - return FName.startswith("__esimd_"); + return FName.starts_with("__esimd_"); } // Return true if the function name starts with "__builtin_" bool isGenericBuiltin(StringRef FName) { - return FName.startswith("__builtin_"); + return FName.starts_with("__builtin_"); } bool isKernel(const Function &F) { diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp index 6ca581d8caf14..1b6cdefb9a541 100644 --- a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp +++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp @@ -696,7 +696,7 @@ uint32_t llvm::getSYCLDeviceLibReqMask(const Module &M) { return 0; uint32_t ReqMask = 0; for (const Function &SF : M) { - if (SF.getName().startswith(DEVICELIB_FUNC_PREFIX) && SF.isDeclaration()) { + if (SF.getName().starts_with(DEVICELIB_FUNC_PREFIX) && SF.isDeclaration()) { assert(SF.getCallingConv() == CallingConv::SPIR_FUNC); uint32_t DeviceLibBits = getDeviceLibBits(SF.getName().str()); ReqMask |= DeviceLibBits; diff --git a/llvm/tools/sycl-post-link/SpecConstants.cpp b/llvm/tools/sycl-post-link/SpecConstants.cpp index dc5f1b2fd3f2c..66c2cc5ef1818 100644 --- a/llvm/tools/sycl-post-link/SpecConstants.cpp +++ b/llvm/tools/sycl-post-link/SpecConstants.cpp @@ -801,8 +801,8 @@ PreservedAnalyses SpecConstantsPass::run(Module &M, if (!F.isDeclaration()) continue; - if (!F.getName().startswith(SYCL_GET_SCALAR_2020_SPEC_CONST_VAL) && - !F.getName().startswith(SYCL_GET_COMPOSITE_2020_SPEC_CONST_VAL)) + if (!F.getName().starts_with(SYCL_GET_SCALAR_2020_SPEC_CONST_VAL) && + !F.getName().starts_with(SYCL_GET_COMPOSITE_2020_SPEC_CONST_VAL)) continue; SmallVector SCIntrCalls; @@ -1014,8 +1014,8 @@ bool SpecConstantsPass::collectSpecConstantDefaultValuesMetadata( bool llvm::checkModuleContainsSpecConsts(const Module &M) { for (const Function &F : M.functions()) { - if (F.getName().startswith(SYCL_GET_SCALAR_2020_SPEC_CONST_VAL) || - F.getName().startswith(SYCL_GET_COMPOSITE_2020_SPEC_CONST_VAL)) + if (F.getName().starts_with(SYCL_GET_SCALAR_2020_SPEC_CONST_VAL) || + F.getName().starts_with(SYCL_GET_COMPOSITE_2020_SPEC_CONST_VAL)) return true; } diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp index 69f43b8b486ea..50cb225f6cb18 100644 --- a/llvm/tools/sycl-post-link/sycl-post-link.cpp +++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp @@ -340,7 +340,7 @@ std::string makeResultFileName(Twine Ext, int I, StringRef Suffix) { : sys::path::parent_path(OutputFilename); const StringRef Sep = sys::path::get_separator(); std::string Dir = Dir0.str(); - if (!Dir0.empty() && !Dir0.endswith(Sep)) + if (!Dir0.empty() && !Dir0.ends_with(Sep)) Dir += Sep.str(); return Dir + sys::path::stem(OutputFilename).str() + Suffix.str() + "_" + std::to_string(I) + Ext.str(); From 0af4ac7d2f38286c6e6ab32fe8f0c4437cb81d30 Mon Sep 17 00:00:00 2001 From: Michael Toguchi Date: Thu, 1 Feb 2024 08:54:29 -0800 Subject: [PATCH 12/14] [Driver] Allow for -O3 on Windows using clang-cl (#12504) We currently support -O3 for Linux compilations, expand this to also be available on Windows. This also better aligns with our existing product offerings. --- clang/include/clang/Driver/Options.td | 3 +++ clang/lib/Driver/ToolChains/MSVC.cpp | 9 +++++++-- clang/test/Driver/Xarch.c | 12 ++++++------ clang/test/Driver/cl-options.c | 4 ++++ 4 files changed, 20 insertions(+), 8 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 2d0f05f6ca35b..e32047292ab8a 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -8404,6 +8404,9 @@ def : CLFlag<"O1">, Alias<_SLASH_O>, AliasArgs<["1"]>, HelpText<"Optimize for size (like /Og /Os /Oy /Ob2 /GF /Gy)">; def : CLFlag<"O2">, Alias<_SLASH_O>, AliasArgs<["2"]>, HelpText<"Optimize for speed (like /Og /Oi /Ot /Oy /Ob2 /GF /Gy)">; +def : CLFlag<"O3">, Alias<_SLASH_O>, AliasArgs<["3"]>, + HelpText<"Optimize for maximum speed and enable more aggressive optimizations" + " that may not improve performance on some programs">; def : CLFlag<"Ob0">, Alias<_SLASH_O>, AliasArgs<["b0"]>, HelpText<"Disable function inlining">; def : CLFlag<"Ob1">, Alias<_SLASH_O>, AliasArgs<["b1"]>, diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index c1b912db596e8..52f811621ba82 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -930,6 +930,7 @@ static void TranslateOptArg(Arg *A, llvm::opt::DerivedArgList &DAL, break; case '1': case '2': + case '3': case 'x': case 'd': // Ignore /O[12xd] flags that aren't the last one on the command line. @@ -946,11 +947,14 @@ static void TranslateOptArg(Arg *A, llvm::opt::DerivedArgList &DAL, } else if (OptChar == '2' || OptChar == 'x') { DAL.AddFlagArg(A, Opts.getOption(options::OPT_fbuiltin)); DAL.AddJoinedArg(A, Opts.getOption(options::OPT_O), "2"); + } else if (OptChar == '3') { + DAL.AddFlagArg(A, Opts.getOption(options::OPT_fbuiltin)); + DAL.AddJoinedArg(A, Opts.getOption(options::OPT_O), "3"); } if (SupportsForcingFramePointer && !DAL.hasArgNoClaim(options::OPT_fno_omit_frame_pointer)) DAL.AddFlagArg(A, Opts.getOption(options::OPT_fomit_frame_pointer)); - if (OptChar == '1' || OptChar == '2') + if (OptChar == '1' || OptChar == '2' || OptChar == '3') DAL.AddFlagArg(A, Opts.getOption(options::OPT_ffunction_sections)); } break; @@ -1070,7 +1074,8 @@ MSVCToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, // OptChar does not expand; it's an argument to the previous char. continue; } - if (OptChar == '1' || OptChar == '2' || OptChar == 'x' || OptChar == 'd') + if (OptChar == '1' || OptChar == '2' || OptChar == 'x' || + OptChar == 'd' || OptChar == '3') ExpandChar = OptStr.data() + I; } } diff --git a/clang/test/Driver/Xarch.c b/clang/test/Driver/Xarch.c index f7693fb689d58..ec58211d6fba0 100644 --- a/clang/test/Driver/Xarch.c +++ b/clang/test/Driver/Xarch.c @@ -1,10 +1,10 @@ -// RUN: %clang -target i386-apple-darwin11 -m32 -Xarch_i386 -O3 %s -S -### 2>&1 | FileCheck -check-prefix=O3ONCE %s -// O3ONCE: "-O3" -// O3ONCE-NOT: "-O3" +// RUN: %clang -target i386-apple-darwin11 -m32 -Xarch_i386 -O5 %s -S -### 2>&1 | FileCheck -check-prefix=O5ONCE %s +// O5ONCE: "-O5" +// O5ONCE-NOT: "-O5" -// RUN: %clang -target i386-apple-darwin11 -m64 -Xarch_i386 -O3 %s -S -### 2>&1 | FileCheck -check-prefix=O3NONE %s -// O3NONE-NOT: "-O3" -// O3NONE: argument unused during compilation: '-Xarch_i386 -O3' +// RUN: %clang -target i386-apple-darwin11 -m64 -Xarch_i386 -O5 %s -S -### 2>&1 | FileCheck -check-prefix=O5NONE %s +// O5NONE-NOT: "-O5" +// O5NONE: argument unused during compilation: '-Xarch_i386 -O5' // RUN: not %clang -target i386-apple-darwin11 -m32 -Xarch_i386 -o -Xarch_i386 -S %s -S -Xarch_i386 -o 2>&1 | FileCheck -check-prefix=INVALID %s // INVALID: error: invalid Xarch argument: '-Xarch_i386 -o' diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c index 5b6dfe308a76e..2f60637f7c010 100644 --- a/clang/test/Driver/cl-options.c +++ b/clang/test/Driver/cl-options.c @@ -207,6 +207,10 @@ // RUN: %clang_cl --target=i686-pc-win32 -Werror -Wno-msvc-not-found /O2 /O2 -### -- %s 2>&1 | FileCheck -check-prefix=O2O2 %s // O2O2: "-O2" +// RUN: %clang_cl --target=i686-pc-win32 -Werror -Wno-msvc-not-found /O3 -### -- %s 2>&1 | FileCheck -check-prefix=O3 %s +// O3: -mframe-pointer=none +// O3: -O3 + // RUN: %clang_cl /Zs -Werror /Oy -- %s 2>&1 // RUN: %clang_cl --target=i686-pc-win32 -Werror -Wno-msvc-not-found /Oy- -### -- %s 2>&1 | FileCheck -check-prefix=Oy_ %s From 4fdcb5835a3f8a25d0a519d4bb56a26bf5d6f318 Mon Sep 17 00:00:00 2001 From: Zahira Ammarguellat Date: Thu, 1 Feb 2024 09:07:59 -0800 Subject: [PATCH 13/14] [SYCL] Fix compiler crash. (#12324) The compiler was crashing when the user requested fp-accuracy for the functions in a call of the form f1(f2(f3 ...), where f1, f2 and f3 were fpbuiltin but the innermost function didn't have an fpbuiltin. The current builtinID was used instead of getting the builtinID from the current function. that created a crash in the compiler. This patch fixes the issue and renames the function EmitFPBuiltinIndirectCall to MaybeEmitFPBuiltinofFD . --- clang/lib/CodeGen/CGBuiltin.cpp | 23 ++---- clang/lib/CodeGen/CGCall.cpp | 5 +- clang/lib/CodeGen/CodeGenFunction.h | 7 +- clang/test/CodeGen/fp-accuracy.c | 87 ++++++++++++++++++++- sycl/test/check_device_code/fp-accuracy.cpp | 29 +++++++ 5 files changed, 127 insertions(+), 24 deletions(-) create mode 100644 sycl/test/check_device_code/fp-accuracy.cpp diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ca1752b24b062..0a90073cf50d0 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -23271,21 +23271,11 @@ static bool hasFuncNameRequestedFPAccuracy(StringRef Name, return (FuncMapIt != LangOpts.FPAccuracyFuncMap.end()); } -llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall( +llvm::CallInst *CodeGenFunction::MaybeEmitFPBuiltinofFD( llvm::FunctionType *IRFuncTy, const SmallVectorImpl &IRArgs, - llvm::Value *FnPtr, const FunctionDecl *FD) { - llvm::Function *Func; + llvm::Value *FnPtr, StringRef Name, unsigned FDBuiltinID) { unsigned FPAccuracyIntrinsicID = 0; - StringRef Name; - if (CurrentBuiltinID == 0) { - // Even if the current function doesn't have a clang builtin, create - // an 'fpbuiltin-max-error' attribute for it; unless it's marked with - // an NoBuiltin attribute. - if (FD->hasAttr() || - !FD->getNameInfo().getName().isIdentifier()) - return nullptr; - - Name = FD->getName(); + if (FDBuiltinID == 0) { FPAccuracyIntrinsicID = llvm::StringSwitch(Name) .Case("fadd", llvm::Intrinsic::fpbuiltin_fadd) @@ -23300,9 +23290,7 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall( } else { // The function has a clang builtin. Create an attribute for it // only if it has an fpbuiltin intrinsic. - unsigned BuiltinID = getCurrentBuiltinID(); - Name = CGM.getContext().BuiltinInfo.getName(BuiltinID); - switch (BuiltinID) { + switch (FDBuiltinID) { default: // If the function has a clang builtin but doesn't have an // fpbuiltin, it will be generated with no 'fpbuiltin-max-error' @@ -23384,7 +23372,8 @@ llvm::CallInst *CodeGenFunction::EmitFPBuiltinIndirectCall( const LangOptions &LangOpts = getLangOpts(); if (hasFuncNameRequestedFPAccuracy(Name, LangOpts) || !LangOpts.FPAccuracyVal.empty()) { - Func = CGM.getIntrinsic(FPAccuracyIntrinsicID, IRArgs[0]->getType()); + llvm::Function *Func = + CGM.getIntrinsic(FPAccuracyIntrinsicID, IRArgs[0]->getType()); return CreateBuiltinCallWithAttr(*this, Name, Func, ArrayRef(IRArgs), FPAccuracyIntrinsicID); } diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index d55023875f2fc..442059fb03789 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -5707,8 +5707,9 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, if (!getLangOpts().FPAccuracyFuncMap.empty() || !getLangOpts().FPAccuracyVal.empty()) { const auto *FD = dyn_cast_if_present(TargetDecl); - if (FD) { - CI = EmitFPBuiltinIndirectCall(IRFuncTy, IRCallArgs, CalleePtr, FD); + if (FD && FD->getNameInfo().getName().isIdentifier()) { + CI = MaybeEmitFPBuiltinofFD(IRFuncTy, IRCallArgs, CalleePtr, + FD->getName(), FD->getBuiltinID()); if (CI) return RValue::get(CI); } diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 3ddd05cb53d8e..04c3a715da205 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4434,9 +4434,10 @@ class CodeGenFunction : public CodeGenTypeCache { RValue EmitIntelSYCLPtrAnnotationBuiltin(const CallExpr *E); llvm::CallInst * - EmitFPBuiltinIndirectCall(llvm::FunctionType *IRFuncTy, - const SmallVectorImpl &IRArgs, - llvm::Value *FnPtr, const FunctionDecl *FD); + MaybeEmitFPBuiltinofFD(llvm::FunctionType *IRFuncTy, + const SmallVectorImpl &IRArgs, + llvm::Value *FnPtr, StringRef Name, + unsigned FDBuiltinID); enum class MSVCIntrin; llvm::Value *EmitMSVCBuiltinExpr(MSVCIntrin BuiltinID, const CallExpr *E); diff --git a/clang/test/CodeGen/fp-accuracy.c b/clang/test/CodeGen/fp-accuracy.c index 74de6a1f72c80..3fdde4443b8c8 100644 --- a/clang/test/CodeGen/fp-accuracy.c +++ b/clang/test/CodeGen/fp-accuracy.c @@ -177,7 +177,7 @@ double rsqrt(double); // CHECK-F3: call double @llvm.fpbuiltin.atanh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] // CHECK-F3: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F3_HIGH]] // CHECK-F3: call double @llvm.fpbuiltin.cosh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] -// CHECk-F3: call double @llvm.fpbuiltin.erf.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.erf.f64(double {{.*}}) #[[ATTR_F3_HIGH]] // CHECK-F3: call double @llvm.fpbuiltin.erfc.f64(double {{.*}}) #[[ATTR_F3_HIGH]] // CHECK-F3: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F3_HIGH]] // CHECK-F3: call double @llvm.fpbuiltin.exp10.f64(double {{.*}}) #[[ATTR_F3_HIGH]] @@ -203,6 +203,19 @@ double rsqrt(double); // CHECK-F3: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F3_LOW:[0-9]+]] // CHECK-F3: call double @llvm.fpbuiltin.tanh.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3-LABEL: define dso_local void @f2 +// CHECK-F3: call float @llvm.fpbuiltin.cos.f32(float {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call float @llvm.fpbuiltin.sin.f32(float {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F3_LOW]] +// CHECK-F3: call double @llvm.fpbuiltin.log10.f64(double {{.*}}) #[[ATTR_F3_MEDIUM]] +// CHECK-F3: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F3_MEDIUM]] + +// CHECK-F3-LABEL: define dso_local float @fake_exp10 + +// CHECK-F3-LABEL: define dso_local void @f4 +// CHECK-F3: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F3_HIGH]] +// CHECK-F3: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F3_HIGH]] + // CHECK-F3: attributes #[[ATTR_F3_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" // CHECK-F3: attributes #[[ATTR_F3_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0" // CHECK-F3: attributes #[[ATTR_F3_LOW]] = {{.*}}"fpbuiltin-max-error"="67108864.0" @@ -414,7 +427,7 @@ void f1(float a, float b) { // CHECK-F2: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F2_MEDIUM]] // CHECK-F2: call float @tanf(float {{.*}}) // -// CHECK-LABEL-F4: define dso_local void @f2 +// CHECK-F4-LABEL: define dso_local void @f2 // CHECK-F4: call float @llvm.fpbuiltin.cos.f32(float {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call float @llvm.fpbuiltin.sin.f32(float {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call double @llvm.fpbuiltin.tan.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] @@ -422,6 +435,17 @@ void f1(float a, float b) { // CHECK-F4: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F4_MEDIUM]] // CHECK-F4: call float @tanf(float {{.*}}) // +// CHECK-F4-LABEL: define dso_local float @fake_exp10 + +// CHECK-F4-LABEL: define dso_local void @f3 +// CHECK-F4: call float @fake_exp10(float {{.*}}) + +// CHECK-F4-LABEL: define dso_local void @f4 +// CHECK-F4: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] +// CHECK-F4: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F4_MEDIUM]] + +// CHECK-F4: attributes #[[ATTR_F4_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0" + // CHECK-F5-LABEL: define dso_local void @f2 // CHECK-F5: call float @llvm.cos.f32(float {{.*}}) // CHECK-F5: call float @llvm.sin.f32(float {{.*}}) @@ -430,6 +454,15 @@ void f1(float a, float b) { // CHECK-F5: call i32 (double, ptr, ptr, ...) @sincos(double {{.*}}, ptr {{.*}}, ptr {{.*}}) // CHECK-F5: call float @tanf(float {{.*}}) // +// CHECK-F5-LABEL: define dso_local float @fake_exp10 + +// CHECK-F5-LABEL: define dso_local void @f3 +// CHECK-F5: call float @fake_exp10(float {{.*}}) + +// CHECK-F5-LABEL: define dso_local void @f4 +// CHECK-F5: call double @llvm.exp.f64(double {{.*}}) +// CHECK-F5: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F5_MEDIUM]] + // CHECK-F5: attributes #[[ATTR_F5_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0" // CHECK-F5: attributes #[[ATTR_F5_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" // @@ -441,6 +474,15 @@ void f1(float a, float b) { // CHECK-F6: call void @llvm.fpbuiltin.sincos.f64(double {{.*}}, ptr {{.*}}, ptr {{.*}}) #[[ATTR_F6_MEDIUM]] // CHECK-F6: call float @tanf(float {{.*}}) #[[ATTR8:[0-9]+]] // +// CHECK-F6-LABEL: define dso_local float @fake_exp10 +// +// CHECK-F6-LABEL: define dso_local void @f3 +// CHECK-F6: call float @fake_exp10(float {{.*}}) + +// CHECK-F6-LABEL: define dso_local void @f4 +// CHECK-F6: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F6_MEDIUM]] +// CHECK-F6: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F6_MEDIUM]] + // CHECK-F6: attributes #[[ATTR_F6_MEDIUM]] = {{.*}}"fpbuiltin-max-error"="4.0" // CHECK-F6: attributes #[[ATTR_F6_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" // @@ -454,12 +496,36 @@ void f1(float a, float b) { // CHECK-LABEL: define dso_local void @f3 // CHECK: call float @fake_exp10(float {{.*}}) + +// CHECK-LABEL: define dso_local void @f4 +// CHECK: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_HIGH]] + +// CHECK-F1-LABEL: define dso_local void @f3 // CHECK-F1: call float @fake_exp10(float {{.*}}) + +// CHECK-F1-LABEL: define dso_local void @f4 +// CHECK-F1: call double @llvm.exp.f64(double {{.*}}) +// CHECK-F1: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F1_HIGH]] + +// CHECK-F2-LABEL: define dso_local float @fake_exp10 + +// CHECK-F2-LABEL: define dso_local void @f3 // CHECK-F2: call float @fake_exp10(float {{.*}}) +// CHECK-F2-LABEL: define dso_local void @f4 +// CHECK-F2: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F2_MEDIUM]] +// CHECK-F2: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F2_CUDA]] + +// CHECK-SPIR-LABEL: define dso_local spir_func float @fake_exp10 +// // CHECK-SPIR-LABEL: define dso_local spir_func void @f3 // CHECK-SPIR: call spir_func float @fake_exp10(float {{.*}}) +// CHECK-SPIR-LABEL: define dso_local spir_func void @f4 +// CHECK-SPIR: call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_SYCL5]] +// CHECK-SPIR: call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_SYCL1]] + // CHECK: attributes #[[ATTR_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" // CHECK-F1: attributes #[[ATTR_F1_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" @@ -526,6 +592,10 @@ void f1(float a, float b) { // CHECK-DEFAULT-LABEL: define dso_local void @f3 // CHECK-DEFAULT: call float @fake_exp10(float {{.*}}) +// CHECK-DEFAULT-LABEL: define dso_local void @f4 +// CHECK-DEFAULT: call double @llvm.exp.f64(double {{.*}}) +// CHECK-DEFAULT: call double @llvm.cos.f64(double {{.*}}) + void f2(float a, float b) { float sin = 0.f, cos = 0.f; @@ -541,3 +611,16 @@ float fake_exp10(float a) __attribute__((no_builtin)){} void f3(float a, float b) { a = fake_exp10(b); } + +#define sz 2 +double in[sz]; +double out[sz]; + +double getInput(int i) { + return in[i]; +} + +void f4() { + for (int i = 0; i < sz; i++) + out[i] = cos(exp(getInput(i))); +} diff --git a/sycl/test/check_device_code/fp-accuracy.cpp b/sycl/test/check_device_code/fp-accuracy.cpp new file mode 100644 index 0000000000000..f5a42c2bbc436 --- /dev/null +++ b/sycl/test/check_device_code/fp-accuracy.cpp @@ -0,0 +1,29 @@ +// DEFINE: %{common_opts} = -fsycl -fsycl-device-only -fno-math-errno \ +// DEFINE: -ffp-accuracy=high -S -emit-llvm -o - %s + +// RUN: %clangxx %{common_opts} | FileCheck %s + +// RUN: %clangxx %{common_opts} -ffp-accuracy=low:exp \ +// RUN: | FileCheck %s --check-prefix=CHECK-LOW-EXP + +#include + +SYCL_EXTERNAL auto foo(double x) { + using namespace sycl; + return cos(exp(log(x))); +} + +// CHECK-LABEL: define {{.*}}food +// CHECK: tail call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR_HIGH:[0-9]+]] +// CHECK: tail call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_HIGH]] +// CHECK: tail call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_HIGH]] + +// CHECK: attributes #[[ATTR_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" + +// CHECK-LOW-EXP-LABEL: define {{.*}}food +// CHECK-LOW-EXP: tail call double @llvm.fpbuiltin.log.f64(double {{.*}}) #[[ATTR_F1_HIGH:[0-9]+]] +// CHECK-LOW-EXP: tail call double @llvm.fpbuiltin.exp.f64(double {{.*}}) #[[ATTR_F1_LOW:[0-9]+]] +// CHECK-LOW-EXP: tail call double @llvm.fpbuiltin.cos.f64(double {{.*}}) #[[ATTR_F1_HIGH]] + +// CHECK-F1: attributes #[[ATTR_F1_HIGH]] = {{.*}}"fpbuiltin-max-error"="1.0" +// CHECK-F1: attributes #[[ATTR_F1_LOW]] = {{.*}}"fpbuiltin-max-error"="67108864.0" From 8427bd224b3db2fa7df6138156620fec4b5ed323 Mon Sep 17 00:00:00 2001 From: Hugh Delaney <46290137+hdelan@users.noreply.github.com> Date: Thu, 1 Feb 2024 17:12:12 +0000 Subject: [PATCH 14/14] [SYCL][HIP][CUDA] Use new version of piMemGetNativeHandle and add test (#12297) We want to change the signature of `piMemGetNativeHandle` for reasons explained here https://github.com/oneapi-src/unified-runtime/pull/1199 Corresponding UR PR: https://github.com/oneapi-src/unified-runtime/pull/1226 A previous PR added a new entry point https://github.com/intel/llvm/pull/12199 but it was decided that it is better to modify the existing entry point --- sycl/include/sycl/detail/pi.h | 9 +- sycl/plugins/cuda/pi_cuda.cpp | 5 +- sycl/plugins/hip/pi_hip.cpp | 5 +- sycl/plugins/level_zero/pi_level_zero.cpp | 5 +- sycl/plugins/native_cpu/pi_native_cpu.cpp | 5 +- sycl/plugins/opencl/pi_opencl.cpp | 5 +- sycl/plugins/unified_runtime/CMakeLists.txt | 14 +- sycl/plugins/unified_runtime/pi2ur.hpp | 5 +- .../unified_runtime/pi_unified_runtime.cpp | 6 +- sycl/source/detail/buffer_impl.cpp | 6 +- sycl/source/detail/memory_manager.cpp | 12 +- sycl/source/interop_handle.cpp | 4 +- .../HostInteropTask/interop-task-hip.cpp | 136 ++++++++++++++++++ sycl/unittests/helpers/PiMockPlugin.hpp | 2 +- 14 files changed, 188 insertions(+), 31 deletions(-) create mode 100644 sycl/test-e2e/HostInteropTask/interop-task-hip.cpp diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 4d0da9995908a..31c6a86f3300d 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -149,9 +149,11 @@ // 14.40 Add HIP _pi_mem_advice alises to match the PI_MEM_ADVICE_CUDA* ones. // 14.41 Added piextCommandBufferMemBufferFill & piextCommandBufferFillUSM // 14.42 Added piextCommandBufferPrefetchUSM and piextCommandBufferAdviseUSM +// 15.43 Changed the signature of piextMemGetNativeHandle to also take a +// pi_device -#define _PI_H_VERSION_MAJOR 14 -#define _PI_H_VERSION_MINOR 42 +#define _PI_H_VERSION_MAJOR 15 +#define _PI_H_VERSION_MINOR 43 #define _PI_STRING_HELPER(a) #a #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) @@ -1424,8 +1426,9 @@ __SYCL_EXPORT pi_result piMemBufferPartition( /// Gets the native handle of a PI mem object. /// /// \param mem is the PI mem to get the native handle of. +/// \param dev is the PI device that the native allocation will be resident on /// \param nativeHandle is the native handle of mem. -__SYCL_EXPORT pi_result piextMemGetNativeHandle(pi_mem mem, +__SYCL_EXPORT pi_result piextMemGetNativeHandle(pi_mem mem, pi_device dev, pi_native_handle *nativeHandle); /// Creates PI mem object from a native handle. diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 6d50d120aa2f9..de715de0835fd 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -233,8 +233,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, HostPtr, RetImage); } -pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev, + pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle); } pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 266c72a3b3587..126ada92348f6 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -236,8 +236,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, HostPtr, RetImage); } -pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev, + pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle); } pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 79e047850af88..0fc36a231be6c 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -243,8 +243,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, HostPtr, RetImage); } -pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev, + pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle); } pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, diff --git a/sycl/plugins/native_cpu/pi_native_cpu.cpp b/sycl/plugins/native_cpu/pi_native_cpu.cpp index c7e71f9791d35..48ce104a94e90 100644 --- a/sycl/plugins/native_cpu/pi_native_cpu.cpp +++ b/sycl/plugins/native_cpu/pi_native_cpu.cpp @@ -240,8 +240,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, HostPtr, RetImage); } -pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev, + pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle); } pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp index 3e7f3aea4dfed..c09be92f89406 100644 --- a/sycl/plugins/opencl/pi_opencl.cpp +++ b/sycl/plugins/opencl/pi_opencl.cpp @@ -222,8 +222,9 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags, HostPtr, RetImage); } -pi_result piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev, + pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle); } pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle, diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index af56733f4b72f..c990359e39e96 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -56,14 +56,14 @@ endif() if(SYCL_PI_UR_USE_FETCH_CONTENT) include(FetchContent) - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime") - # commit 3225b822b5d8cbfa85d7fc1bd5a5bf96e5bb8c1a - # Merge: edb281f3 5fc41099 + set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") + # commit d216eb44d5c9fe3433eecdd09b10e3e79ac25bd7 + # Merge: 40517d2b fc1f3066 # Author: Kenneth Benzie (Benie) - # Date: Tue Jan 30 12:31:44 2024 +0000 - # Merge pull request #1168 from Seanst98/sean/unique-addr-mode-per-dim-adapters - # [Bindless][CUDA] Unique addressing modes per dimension - set(UNIFIED_RUNTIME_TAG 3225b822b5d8cbfa85d7fc1bd5a5bf96e5bb8c1a) + # Date: Wed Jan 31 10:38:07 2024 +0000 + # Merge pull request #1226 from hdelan/get-native-mem-on-device2 + # [UR] Add extra param to urMemGetNativeHandle + set(UNIFIED_RUNTIME_TAG d216eb44d5c9fe3433eecdd09b10e3e79ac25bd7) if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO) set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}") diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index df841b786bfb1..c19c93a6af53a 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -3081,13 +3081,14 @@ inline pi_result piMemBufferPartition(pi_mem Buffer, pi_mem_flags Flags, return PI_SUCCESS; } -inline pi_result piextMemGetNativeHandle(pi_mem Mem, +inline pi_result piextMemGetNativeHandle(pi_mem Mem, pi_device Dev, pi_native_handle *NativeHandle) { PI_ASSERT(Mem, PI_ERROR_INVALID_MEM_OBJECT); ur_mem_handle_t UrMem = reinterpret_cast(Mem); + ur_device_handle_t UrDev = reinterpret_cast(Dev); ur_native_handle_t NativeMem{}; - HANDLE_ERRORS(urMemGetNativeHandle(UrMem, &NativeMem)); + HANDLE_ERRORS(urMemGetNativeHandle(UrMem, UrDev, &NativeMem)); *NativeHandle = reinterpret_cast(NativeMem); diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index adbeb652bf613..b9742b8697fa8 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -235,9 +235,9 @@ __SYCL_EXPORT pi_result piMemBufferPartition( BufferCreateInfo, RetMem); } -__SYCL_EXPORT pi_result -piextMemGetNativeHandle(pi_mem Mem, pi_native_handle *NativeHandle) { - return pi2ur::piextMemGetNativeHandle(Mem, NativeHandle); +__SYCL_EXPORT pi_result piextMemGetNativeHandle( + pi_mem Mem, pi_device Dev, pi_native_handle *NativeHandle) { + return pi2ur::piextMemGetNativeHandle(Mem, Dev, NativeHandle); } __SYCL_EXPORT pi_result diff --git a/sycl/source/detail/buffer_impl.cpp b/sycl/source/detail/buffer_impl.cpp index 04c055465a9cf..835c732a40bf9 100644 --- a/sycl/source/detail/buffer_impl.cpp +++ b/sycl/source/detail/buffer_impl.cpp @@ -84,7 +84,11 @@ buffer_impl::getNativeVector(backend BackendName) const { } pi_native_handle Handle; - Plugin->call(NativeMem, &Handle); + // When doing buffer interop we don't know what device the memory should be + // resident on, so pass nullptr for Device param. Buffer interop may not be + // supported by all backends. + Plugin->call(NativeMem, /*Dev*/ nullptr, + &Handle); Handles.push_back(Handle); } diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 0daa53587ed4d..e59fb94a09f65 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -144,7 +144,11 @@ void memBufferCreateHelper(const PluginPtr &Plugin, pi_context Ctx, // Always use call_nocheck here, because call may throw an exception, // and this lambda will be called from destructor, which in combination // rewards us with UB. - Plugin->call_nocheck(*RetMem, &Ptr); + // When doing buffer interop we don't know what device the memory should + // be resident on, so pass nullptr for Device param. Buffer interop may + // not be supported by all backends. + Plugin->call_nocheck( + *RetMem, /*Dev*/ nullptr, &Ptr); emitMemAllocEndTrace(MemObjID, (uintptr_t)(Ptr), Size, 0 /* guard zone */, CorrID); }}; @@ -167,7 +171,11 @@ void memReleaseHelper(const PluginPtr &Plugin, pi_mem Mem) { // Do not make unnecessary PI calls without instrumentation enabled if (xptiTraceEnabled()) { pi_native_handle PtrHandle = 0; - Plugin->call(Mem, &PtrHandle); + // When doing buffer interop we don't know what device the memory should be + // resident on, so pass nullptr for Device param. Buffer interop may not be + // supported by all backends. + Plugin->call(Mem, /*Dev*/ nullptr, + &PtrHandle); Ptr = (uintptr_t)(PtrHandle); } #endif diff --git a/sycl/source/interop_handle.cpp b/sycl/source/interop_handle.cpp index 0b0ab39199370..cd479493bbae3 100644 --- a/sycl/source/interop_handle.cpp +++ b/sycl/source/interop_handle.cpp @@ -34,8 +34,8 @@ pi_native_handle interop_handle::getNativeMem(detail::Requirement *Req) const { auto Plugin = MQueue->getPlugin(); pi_native_handle Handle; - Plugin->call(Iter->second, - &Handle); + Plugin->call( + Iter->second, MDevice->getHandleRef(), &Handle); return Handle; } diff --git a/sycl/test-e2e/HostInteropTask/interop-task-hip.cpp b/sycl/test-e2e/HostInteropTask/interop-task-hip.cpp new file mode 100644 index 0000000000000..1f98b2da24c33 --- /dev/null +++ b/sycl/test-e2e/HostInteropTask/interop-task-hip.cpp @@ -0,0 +1,136 @@ +// FIXME: the rocm include path and link path are highly platform dependent, +// we should set this with some variable instead. +// RUN: %{build} -o %t.out -I/opt/rocm/include -L/opt/rocm/lib -lamdhip64 +// RUN: %{run} %t.out +// REQUIRES: hip + +#include +#include + +#define __HIP_PLATFORM_AMD__ + +#include + +using namespace sycl; +using namespace sycl::access; + +static constexpr size_t BUFFER_SIZE = 1024; + +template class Modifier; + +template class Init; + +template +void checkBufferValues(BufferT Buffer, ValueT Value) { + auto Acc = Buffer.get_host_access(); + for (size_t Idx = 0; Idx < Acc.get_count(); ++Idx) { + if (Acc[Idx] != Value) { + std::cerr << "buffer[" << Idx << "] = " << Acc[Idx] + << ", expected val = " << Value << '\n'; + exit(1); + } + } +} + +template +void copy(buffer &Src, buffer &Dst, queue &Q) { + Q.submit([&](handler &CGH) { + auto SrcA = Src.template get_access(CGH); + auto DstA = Dst.template get_access(CGH); + + auto Func = [=](interop_handle IH) { + auto HipStream = IH.get_native_queue(); + auto SrcMem = IH.get_native_mem(SrcA); + auto DstMem = IH.get_native_mem(DstA); + + if (hipMemcpyWithStream(DstMem, SrcMem, sizeof(DataT) * SrcA.get_count(), + hipMemcpyDefault, HipStream) != hipSuccess) { + throw; + } + + if (hipStreamSynchronize(HipStream) != hipSuccess) { + throw; + } + + if (Q.get_backend() != IH.get_backend()) + throw; + }; + CGH.host_task(Func); + }); +} + +template void modify(buffer &B, queue &Q) { + Q.submit([&](handler &CGH) { + auto Acc = B.template get_access(CGH); + + auto Kernel = [=](item<1> Id) { Acc[Id] += 1; }; + + CGH.parallel_for>(Acc.get_count(), Kernel); + }); +} + +template +void init(buffer &B1, buffer &B2, queue &Q) { + Q.submit([&](handler &CGH) { + auto Acc1 = B1.template get_access(CGH); + auto Acc2 = B2.template get_access(CGH); + + CGH.parallel_for>(BUFFER_SIZE, [=](item<1> Id) { + Acc1[Id] = B1Init; + Acc2[Id] = B2Init; + }); + }); +} + +// Check that a single host-interop-task with a buffer will work. +void test_ht_buffer(queue &Q) { + buffer Buffer{BUFFER_SIZE}; + + Q.submit([&](handler &CGH) { + auto Acc = Buffer.get_access(CGH); + auto Func = [=](interop_handle IH) { /*A no-op */ }; + CGH.host_task(Func); + }); +} + +// A test that uses HIP interop to copy data from buffer A to buffer B, by +// getting HIP ptrs and calling the hipMemcpyWithStream. Then run a SYCL +// kernel that modifies the data in place for B, e.g. increment one, then copy +// back to buffer A. Run it on a loop, to ensure the dependencies and the +// reference counting of the objects is not leaked. +void test_ht_kernel_dependencies(queue &Q) { + static constexpr int COUNT = 4; + buffer Buffer1{BUFFER_SIZE}; + buffer Buffer2{BUFFER_SIZE}; + + // Init the buffer with a'priori invalid data. + init(Buffer1, Buffer2, Q); + + // Repeat a couple of times. + for (size_t Idx = 0; Idx < COUNT; ++Idx) { + copy(Buffer1, Buffer2, Q); + modify(Buffer2, Q); + copy(Buffer2, Buffer1, Q); + } + + checkBufferValues(Buffer1, COUNT - 1); + checkBufferValues(Buffer2, COUNT - 1); +} + +void tests(queue &Q) { + test_ht_buffer(Q); + test_ht_kernel_dependencies(Q); +} + +int main() { + queue Q([](sycl::exception_list ExceptionList) { + if (ExceptionList.size() != 1) { + std::cerr << "Should be one exception in exception list" << std::endl; + std::abort(); + } + std::rethrow_exception(*ExceptionList.begin()); + }); + tests(Q); + std::cout << "Test PASSED" << std::endl; + return 0; +} diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index c1f0a58f82274..31eac5598f588 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -613,7 +613,7 @@ mock_piMemBufferPartition(pi_mem buffer, pi_mem_flags flags, return PI_SUCCESS; } -inline pi_result mock_piextMemGetNativeHandle(pi_mem mem, +inline pi_result mock_piextMemGetNativeHandle(pi_mem mem, pi_device dev, pi_native_handle *nativeHandle) { *nativeHandle = reinterpret_cast(mem); return PI_SUCCESS;