From 90866aa059e0a6ad37ebbce1655291f47d3d6215 Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Mon, 29 Jan 2024 17:58:18 -0800 Subject: [PATCH] Implement gather(lacc) accepting compile time properties --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 313 +++++++++++++++++- .../unified_memory_api/Inputs/gather.hpp | 253 +++++++++++++- .../ESIMD/unified_memory_api/gather_lacc.cpp | 37 +++ .../gather_lacc_dg2_pvc.cpp | 40 +++ sycl/test/esimd/memory_properties.cpp | 73 +++- 5 files changed, 710 insertions(+), 6 deletions(-) create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc.cpp create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc_dg2_pvc.cpp diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 7af684ee99cfb..4683aeeaf097c 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -7209,6 +7209,317 @@ __ESIMD_API flags); } +/// Variant of gather that uses local accessor as a parameter +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, simd pass_thru, +/// PropertyListT props = {}); // (lacc-ga-1) +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-ga-2) +/// simd gather(AccessorT acc, simd byte_offsets, +/// PropertyListT props = {}); // (lacc-ga-3) +/// +/// The next 3 functions are similar to (lacc-ga-1,2,3), but they don't have +/// the template parameter 'VS'. These functions are added for convenience and +/// to make it possible for user to omit the template parameters T and N, +/// e.g. 'auto res = gather(acc, byte_offsets); +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, simd pass_thru, +/// PropertyListT props = {}); // (lacc-ga-4) +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, PropertyListT props = {});//(lacc-ga-5) +/// simd gather(AccessorT acc, simd byte_offsets, +/// PropertyListT props = {}); // (lacc-ga-6) +/// +/// The next 3 functions are similar to (lacc-ga-1,2,3), but accept the +/// \p byte_offsets as a \c simd_view argument: +/// template +/// simd gather(AccessorT acc, OffsetSimdViewT byte_offsets, +/// simd_mask mask, simd pass_thru, +/// PropertyListT props = {}); // (lacc-ga-7) +/// simd gather(AccessorT acc, OffsetSimdViewT byte_offsets, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-ga-8) +/// simd gather(AccessorT acc, OffsetSimdViewT byte_offsets, +/// PropertyListT props = {}); // (lacc-ga-9) + +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, simd pass_thru, +/// PropertyListT props = {}); // (lacc-ga-1) +/// Supported platforms: DG2, PVC only - Temporary restriction for the variant +/// with pass_thru operand. The only exception: DG2/PVC is not required if +/// the __ESIMD_GATHER_SCATTER_LLVM_IR macro is used. +/// +/// Loads ("gathers") elements of the type 'T' from memory locations addressed +/// by the local accessor \p acc and byte offsets \p byte_offsets, and returns +/// the loaded elements. +/// Access to any element's memory location can be disabled via the input vector +/// of predicates \p mask. If mask[i] is unset, then the load from +/// (acc + byte_offsets[i]) is skipped and the corresponding i-th element from +/// \p pass_thru operand is returned. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. 
It can also be read as the number of reads per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param acc Accessor referencing the data to load. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param mask The access mask. +/// @param pass_thru The vector pass through values. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +/// @return A vector of elements read. +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, simd byte_offsets, + simd_mask mask, simd pass_thru, PropertyListT props = {}) { + return slm_gather(byte_offsets + + __ESIMD_DNS::localAccessorToOffset(acc), + mask, pass_thru, props); +} + +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-ga-2) +/// Supported platforms: DG2, PVC in most cases. The DG2/PVC is not required if +/// VS == 1 and the __ESIMD_GATHER_SCATTER_LLVM_IR macro is used or sizeof(T) <= +/// 4 and N = {1,2,4,8,16,32} +/// +/// Loads ("gathers") elements of the type 'T' from memory locations addressed +/// by the local accessor \p acc and byte offsets \p byte_offsets, and returns +/// the loaded elements. +/// Access to any element's memory location can be disabled via the input vector +/// of predicates \p mask. If mask[i] is unset, then the load from +/// (acc + byte_offsets[i]) is skipped and the corresponding i-th element of +/// the returned vector is undefined. +/// @tparam T Element type. +/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of reads per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param acc Accessor referencing the data to load. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param mask The access mask. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +/// @return A vector of elements read. +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, simd byte_offsets, + simd_mask mask, PropertyListT props = {}) { + return slm_gather( + byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), mask, props); +} + +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// PropertyListT props = {}); // (lacc-ga-3) +/// Supported platforms: DG2, PVC in most cases. The DG2/PVC is not required if +/// VS == 1 and the __ESIMD_GATHER_SCATTER_LLVM_IR macro is used or sizeof(T) <= +/// 4 and N = {1,2,4,8,16,32} +/// +/// Loads ("gathers") elements of the type 'T' from memory locations addressed +/// by the local accessor \p acc and byte offsets \p byte_offsets, and returns +/// the loaded elements. +/// @tparam T Element type. 
+/// @tparam N Number of elements to read. +/// @tparam VS Vector size. It can also be read as the number of reads per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param acc Accessor referencing the data to load. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param props The optional compile-time properties. Only 'alignment' +/// and cache hint properties are used. +/// @return A vector of elements read. +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, simd byte_offsets, + PropertyListT props = {}) { + return slm_gather( + byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), props); +} + +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, simd pass_thru, +/// PropertyListT props = {}); // (lacc-ga-4) +/// This function is identical to (lacc-ga-1) except that vector size is fixed +/// to 1. This variant is added for convenience and let user omit the template +/// arguments and call the function as +/// 'gather(acc, byte_offsets, mask, pass_thru);'. +// Dev note: the mask type was turned into template parameter `MaskT` to +// avoid the conflicts of this prototype with the old gather() function +// accepting a 'global_offset' parameter and avoid 'ambiguous call' errors +// for calls like this: gather(acc, byte_offsets_simd, 0, mask); +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + std::is_same_v> && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, simd byte_offsets, MaskT mask, + simd pass_thru, PropertyListT props = {}) { + return slm_gather(byte_offsets + + __ESIMD_DNS::localAccessorToOffset(acc), + mask, pass_thru, props); +} + +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// simd_mask mask, PropertyListT props // (lacc-ga-5) +/// This function is identical to (lacc-ga-2) except that vector size is fixed +/// to 1. This variant is added for convenience and let user omit the template +/// arguments and call the function as 'gather(acc, byte_offsets, mask);'. +// Dev note: the mask type was turned into template parameter `MaskT` to +// avoid the conflicts of this prototype with the old gather() function +// accepting a 'global_offset' parameter and avoid 'ambiguous call' errors +// for calls like this: gather(acc, byte_offsets_simd, 0); +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + std::is_same_v> && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, simd byte_offsets, MaskT mask, + PropertyListT props = {}) { + return slm_gather( + byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), mask, props); +} + +/// template +/// simd gather(AccessorT acc, simd byte_offsets, +/// PropertyListT props = {}); // (lacc-ga-6) +/// This function is identical to (lacc-ga-3) except that vector size is fixed +/// to 1. This variant is added for convenience and let user omit the template +/// arguments and call the function as 'gather(acc, byte_offsets);'. 
+template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, simd byte_offsets, + PropertyListT props = {}) { + return slm_gather( + byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), props); +} + +/// template +/// simd gather(AccessorT acc, OffsetSimdViewT byte_offsets, +/// simd_mask mask, simd pass_thru, +/// PropertyListT props = {}); // (lacc-ga-7) +/// This function is identical to (lacc-ga-1) except that the \p byte_offsets +/// is represented as \c simd_view. +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask mask, + simd pass_thru, PropertyListT props = {}) { + return gather(acc, byte_offsets.read(), mask, pass_thru, props); +} + +/// template +/// simd gather(AccessorT acc, OffsetSimdViewT byte_offsets, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-ga-8) +/// This function is identical to (lacc-ga-2) except that the \p byte_offsets +/// is represented as \c simd_view. +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, OffsetSimdViewT byte_offsets, simd_mask mask, + PropertyListT props = {}) { + return gather(acc, byte_offsets.read(), mask, props); +} + +/// template +/// simd gather(AccessorT acc, OffsetSimdViewT byte_offsets, +/// PropertyListT props = {}); // (lacc-ga-9) +/// This function is identical to (lacc-ga-3) except that the \p byte_offsets +/// is represented as \c simd_view. +template +__ESIMD_API std::enable_if_t< + (detail::is_local_accessor_with_v && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v), + simd> +gather(AccessorT acc, OffsetSimdViewT byte_offsets, PropertyListT props = {}) { + return gather(acc, byte_offsets.read(), props); +} + /// Variant of gather that uses local accessor as a parameter /// /// Collects elements located at given offsets in an accessor and returns them @@ -7231,7 +7542,7 @@ __ESIMD_API std::enable_if_t, simd> - gather(AccessorTy acc, simd offsets, uint32_t glob_offset = 0, + gather(AccessorTy acc, simd offsets, uint32_t glob_offset, simd_mask mask = 1) { return slm_gather( offsets + glob_offset + __ESIMD_DNS::localAccessorToOffset(acc), mask); diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp index 03ef9bc46c483..c82d69df4e1c3 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp @@ -508,7 +508,7 @@ bool testSLM(queue Q, uint32_t MaskStride, PropertiesT) { uint32_t GlobalElemOffset = GlobalID * N; uint32_t LocalElemOffset = LocalID * N; - // Allocate a bit more to safely initialize it with 4-element chunks. + // Allocate a bit more to safely initialize it with 8-element chunks. constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T); slm_init(); @@ -761,3 +761,254 @@ template bool testACC(queue Q) { } return Passed; } + +template +bool testLACC(queue Q, uint32_t MaskStride, PropertiesT) { + + static_assert(VS > 0 && N % VS == 0, + "Incorrect VS parameter. 
N must be divisible by VS."); + constexpr int NOffsets = N / VS; + static_assert(!UsePassThru || UseMask, + "PassThru cannot be used without using mask"); + + constexpr uint32_t Groups = 8; + constexpr uint32_t Threads = 16; + + std::cout << "Running case: T=" << esimd_test::type_name() << ", N=" << N + << ", VS=" << VS << ", MaskStride=" << MaskStride + << ", Groups=" << Groups << ", Threads=" << Threads + << ", use_mask=" << UseMask << ", use_pass_thru=" << UsePassThru + << ", use_properties=" << UseProperties << std::endl; + + uint16_t Size = Groups * Threads * N; + using Tuint = esimd_test::uint_type_t; + + sycl::range<1> GlobalRange{Groups}; + sycl::range<1> LocalRange{Threads}; + sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange}; + + T *Out = sycl::malloc_shared(Size, Q); + std::memset(Out, 0, Size * sizeof(T)); + + T *In = sycl::malloc_shared(Size * 2, Q); + for (int I = 0; I < Size; I++) + In[I] = esimd_test::getRandomValue(); + + try { + buffer InBuf(In, Size * 2); + Q.submit([&](handler &CGH) { + // Allocate a bit more to safely initialize it with 8-element chunks. + constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T); + + auto InAcc = local_accessor(SLMSize, CGH); + + CGH.parallel_for(Range, [=](sycl::nd_item<1> NDI) SYCL_ESIMD_KERNEL { + uint16_t GlobalID = NDI.get_global_id(0); + uint16_t LocalID = NDI.get_local_id(0); + uint32_t GlobalElemOffset = GlobalID * N; + uint32_t LocalElemOffset = LocalID * N; + + if (LocalID == 0) { + for (int I = 0; I < Threads * N; I += 8) { + simd InVec(In + GlobalElemOffset + I); + simd offsets(I * sizeof(T), sizeof(T)); + scatter(InAcc, offsets, InVec); + } + } + barrier(); + PropertiesT Props{}; + + simd ByteOffsets(LocalElemOffset * sizeof(T), + VS * sizeof(T)); + simd_view ByteOffsetsView = ByteOffsets.template select(); + + simd_mask Pred; + for (int I = 0; I < NOffsets; I++) + Pred[I] = (I % MaskStride == 0) ? 
1 : 0; + + using Tuint = esimd_test::uint_type_t; + simd PassThruInt(GlobalElemOffset, 1); + simd PassThru = PassThruInt.template bit_cast_view(); + auto PassThruView = PassThru.template select(0); + + simd Vals; + if constexpr (VS > 1) { // VS > 1 requires specifying + if constexpr (UsePassThru) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd + Vals = gather(InAcc, ByteOffsets, Pred, PassThru, + Props); + else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view + Vals = gather(InAcc, ByteOffsets, Pred, PassThruView, + Props); + else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd + Vals = gather(InAcc, ByteOffsetsView, Pred, PassThru, + Props); + else // ByteOffset - view, PassThru - view + Vals = gather(InAcc, ByteOffsetsView, Pred, + PassThruView, Props); + } else { // UseProperties is false + if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd + Vals = gather(InAcc, ByteOffsets, Pred, PassThru); + else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view + Vals = + gather(InAcc, ByteOffsets, Pred, PassThruView); + else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd + Vals = + gather(InAcc, ByteOffsetsView, Pred, PassThru); + else // ByteOffset - view, PassThru - view + Vals = gather(InAcc, ByteOffsetsView, Pred, + PassThruView); + } + } else if constexpr (UseMask) { // UsePassThru is false + if constexpr (UseProperties) { + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets, Pred, Props); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView, Pred, Props); + } else { // UseProperties is false + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets, Pred); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView, Pred); + } + } else { // UseMask is false, UsePassThru is false + if constexpr (UseProperties) { + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets, Props); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView, Props); + } else { // UseProperties is false + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView); + } + } + } else { + // if (VS == 1) then can often be omitted - test it here. + // The variants accepting simd_view for 'PassThru' operand though + // still require to be specified explicitly to help + // C++ FE do simd to simd_view matching. 
+ if constexpr (UsePassThru) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd + Vals = gather(InAcc, ByteOffsets, Pred, PassThru, Props); + else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view + Vals = gather(InAcc, ByteOffsets, Pred, PassThruView, + Props); + else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd + Vals = gather(InAcc, ByteOffsetsView, Pred, PassThru, Props); + else // ByteOffset - view, PassThru - view + Vals = gather(InAcc, ByteOffsetsView, Pred, PassThruView, + Props); + } else { // UseProperties is false + if (GlobalID % 4 == 0) // ByteOffset - simd, PassThru - simd + Vals = gather(InAcc, ByteOffsets, Pred, PassThru); + else if (GlobalID % 4 == 1) // ByteOffset - simd, PassThru - view + Vals = gather(InAcc, ByteOffsets, Pred, PassThruView); + else if (GlobalID % 4 == 2) // ByteOffset - view, PassThru - simd + Vals = gather(InAcc, ByteOffsetsView, Pred, PassThru); + else // ByteOffset - view, PassThru - view + Vals = + gather(InAcc, ByteOffsetsView, Pred, PassThruView); + } + } else if constexpr (UseMask) { // UsePassThru is false + if constexpr (UseProperties) { + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets, Pred, Props); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView, Pred, Props); + } else { // UseProperties is false + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets, Pred); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView, Pred); + } + } else { // UsePassThru is false, UseMask is false + if constexpr (UseProperties) { + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets, Props); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView, Props); + } else { + if (GlobalID % 2 == 0) // ByteOffset - simd + Vals = gather(InAcc, ByteOffsets); + else // ByteOffset - simd_view + Vals = gather(InAcc, ByteOffsetsView); + } + } + } // end if (VS == 1) + Vals.copy_to(Out + GlobalID * N); + }); + }).wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + sycl::free(In, Q); + sycl::free(Out, Q); + return false; + } + + bool Passed = verify(In, Out, N, Size, VS, MaskStride, UseMask, UsePassThru); + if (!Passed) + std::cout << "Case FAILED" << std::endl; + + sycl::free(In, Q); + sycl::free(Out, Q); + return Passed; +} + +template bool testLACC(queue Q) { + constexpr bool UseMask = true; + constexpr bool UsePassThru = true; + constexpr bool UseProperties = true; + + properties AlignElemProps{alignment}; + + bool Passed = true; + Passed &= testLACC( + Q, 2, AlignElemProps); + Passed &= testLACC( + Q, 2, AlignElemProps); + Passed &= testLACC( + Q, 2, AlignElemProps); + Passed &= testLACC( + Q, 3, AlignElemProps); + Passed &= testLACC( + Q, 2, AlignElemProps); + Passed &= testLACC( + Q, 3, AlignElemProps); + + if constexpr (Features == TestFeatures::PVC || + Features == TestFeatures::DG2) { + properties LSCProps{alignment}; + Passed &= testLACC( + Q, 2, LSCProps); + Passed &= + testLACC(Q, 2, LSCProps); + Passed &= + testLACC(Q, 2, LSCProps); + Passed &= + testLACC(Q, 3, LSCProps); + + Passed &= + testLACC(Q, 2, LSCProps); + + // Check VS > 1. GPU supports only dwords and qwords in this mode. + if constexpr (sizeof(T) >= 4) { + // TODO: This test case causes flaky fail. Enable it after the issue + // in GPU driver is fixed. 
+ // Passed &= testACC( + // Q, 3, AlignElemProps); + + Passed &= testLACC( + Q, 3, AlignElemProps); + Passed &= testLACC( + Q, 3, AlignElemProps); + Passed &= testLACC( + Q, 3, AlignElemProps); + } + } + return Passed; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc.cpp new file mode 100644 index 0000000000000..fde14ee287b75 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc.cpp @@ -0,0 +1,37 @@ +//==------- gather_lacc.cpp - DPC++ ESIMD on-device test +//--------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Use per-kernel compilation to have more information about failing cases. +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::gather() functions accepting Local ACCESSOR +// and optional compile-time esimd::properties. +// The gather() calls in this test do not use VS > 1 (number of loads per +// offset) to not impose using DG2/PVC features. + +#include "Inputs/gather.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::Generic; + bool Passed = true; + + Passed &= testLACC(Q); + Passed &= testLACC(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testLACC(Q); + Passed &= testLACC(Q); + Passed &= testLACC(Q); + Passed &= + testLACC(Q); + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc_dg2_pvc.cpp new file mode 100644 index 0000000000000..503f8bf71a8d6 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/gather_lacc_dg2_pvc.cpp @@ -0,0 +1,40 @@ +//==------- gather_lacc_dg2_pvc.cpp - DPC++ ESIMD on-device test +//------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: gpu-intel-dg2 || gpu-intel-pvc + +// Use per-kernel compilation to have more information about failing cases. +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::gather() functions accepting Local ACCESSOR +// and optional compile-time esimd::properties. +// The gather() calls in this test can use VS > 1 (number of loads per offset). + +#include "Inputs/gather.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::DG2; + bool Passed = true; + + Passed &= testLACC(Q); + Passed &= testLACC(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testLACC(Q); + Passed &= testLACC(Q); + Passed &= testLACC(Q); + Passed &= + testLACC(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 
0 : 1; +} diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index 737d7b4fabfad..a02b250924f8c 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -31,7 +31,7 @@ test_block_store(AccType &, LocalAccType &local_acc, float *, int byte_offset32, size_t byte_offset64); SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void -test_gather_scatter(AccType &, float *, int byte_offset32, +test_gather_scatter(AccType &, LocalAccType &, float *, int byte_offset32, size_t byte_offset64); SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void test_slm_gather_scatter(int byte_offset32); @@ -48,7 +48,7 @@ class EsimdFunctor { test_block_load(acc, local_acc, ptr, byte_offset32, byte_offset64); test_atomic_update(acc, local_acc_int, ptr, byte_offset32, byte_offset64); test_block_store(acc, local_acc, ptr, byte_offset32, byte_offset64); - test_gather_scatter(acc, ptr, byte_offset32, byte_offset64); + test_gather_scatter(acc, local_acc, ptr, byte_offset32, byte_offset64); test_slm_gather_scatter(byte_offset32); } }; @@ -939,8 +939,8 @@ test_block_store(AccType &acc, LocalAccType &local_acc, float *ptrf, // CHECK-LABEL: define {{.*}} @_Z19test_gather_scatter{{.*}} SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void -test_gather_scatter(AccType &acc, float *ptrf, int byte_offset32, - size_t byte_offset64) { +test_gather_scatter(AccType &acc, LocalAccType &local_acc, float *ptrf, + int byte_offset32, size_t byte_offset64) { properties props_cache_load{cache_hint_L1, cache_hint_L2, alignment<8>}; @@ -978,6 +978,10 @@ test_gather_scatter(AccType &acc, float *ptrf, int byte_offset32, // 6) gather(acc, offsets, mask): offsets is simd or simd_view // 7) gather(acc, offsets, mask, pass_thru) // 8) gather(acc, ...): same as (5), (6), (7) above, but with VS > 1. + // 9) gather(lacc, offsets): offsets is simd or simd_view + // 10) gather(lacc, offsets, mask): offsets is simd or simd_view + // 11) gather(lacc, offsets, mask, pass_thru) + // 12) gather(lacc, ...): same as (9), (10), (11) above, but with VS > 1. 
// 1) gather(usm, offsets): offsets is simd or simd_view // CHECK-COUNT-4: call <32 x float> @llvm.masked.gather.v32f32.v32p4(<32 x ptr addrspace(4)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}, <32 x float> {{[^)]+}}) @@ -1151,6 +1155,67 @@ test_gather_scatter(AccType &acc, float *ptrf, int byte_offset32, props_align4); acc_res = gather(acc, ioffset_n16_view, mask_n16, pass_thru_view, props_align4); + + // 9) gather(lacc, offsets): offsets is simd or simd_view + // CHECK-COUNT-16: call <32 x float> @llvm.masked.gather.v32f32.v32p3(<32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}, <32 x float> {{[^)]+}}) + acc_res = gather(local_acc, ioffset_n32); + acc_res = gather(local_acc, ioffset_n32_view); + acc_res = gather(local_acc, ioffset_n32, props_align4); + acc_res = gather(local_acc, ioffset_n32_view, props_align4); + + // 10) gather(lacc, offsets, mask): offsets is simd or simd_view + acc_res = gather(local_acc, ioffset_n32, mask_n32); + acc_res = gather(local_acc, ioffset_n32_view, mask_n32); + acc_res = gather(local_acc, ioffset_n32, mask_n32, props_align4); + acc_res = + gather(local_acc, ioffset_n32_view, mask_n32, props_align4); + + // 11) gather(lacc, offsets, mask, pass_thru) + acc_res = gather(local_acc, ioffset_n32, mask_n32, pass_thru); + acc_res = gather(local_acc, ioffset_n32_view, mask_n32, pass_thru); + acc_res = + gather(local_acc, ioffset_n32, mask_n32, pass_thru, props_align4); + acc_res = gather(local_acc, ioffset_n32_view, mask_n32, pass_thru, + props_align4); + + acc_res = gather(local_acc, ioffset_n32, mask_n32, pass_thru_view); + acc_res = + gather(local_acc, ioffset_n32_view, mask_n32, pass_thru_view); + acc_res = gather(local_acc, ioffset_n32, mask_n32, pass_thru_view, + props_align4); + acc_res = gather(local_acc, ioffset_n32_view, mask_n32, + pass_thru_view, props_align4); + + // 12) gather(lacc, ...): same as (9), (10), (11) above, but with VS > 1. + // CHECK-COUNT-16: call <32 x i32> @llvm.genx.lsc.load.merge.slm.v32i32.v16i1.v16i32(<16 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, i32 0, <32 x i32> {{[^)]+}}) + acc_res = gather(local_acc, ioffset_n16); + acc_res = gather(local_acc, ioffset_n16_view); + acc_res = gather(local_acc, ioffset_n16, props_align4); + acc_res = gather(local_acc, ioffset_n16_view, props_align4); + + acc_res = gather(local_acc, ioffset_n16, mask_n16); + acc_res = gather(local_acc, ioffset_n16_view, mask_n16); + acc_res = + gather(local_acc, ioffset_n16, mask_n16, props_align4); + acc_res = + gather(local_acc, ioffset_n16_view, mask_n16, props_align4); + + acc_res = gather(local_acc, ioffset_n16, mask_n16, pass_thru); + acc_res = + gather(local_acc, ioffset_n16_view, mask_n16, pass_thru); + acc_res = gather(local_acc, ioffset_n16, mask_n16, pass_thru, + props_align4); + acc_res = gather(local_acc, ioffset_n16_view, mask_n16, + pass_thru, props_align4); + + acc_res = + gather(local_acc, ioffset_n16, mask_n16, pass_thru_view); + acc_res = gather(local_acc, ioffset_n16_view, mask_n16, + pass_thru_view); + acc_res = gather(local_acc, ioffset_n16, mask_n16, + pass_thru_view, props_align4); + acc_res = gather(local_acc, ioffset_n16_view, mask_n16, + pass_thru_view, props_align4); } // CHECK-LABEL: define {{.*}} @_Z23test_slm_gather_scatter{{.*}}
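
Usage note (not part of the patch): the new lacc-ga-* overloads simply translate the local-accessor base via __ESIMD_DNS::localAccessorToOffset(acc) and forward to slm_gather(), as the diff above shows. The sketch below illustrates how a kernel might call the added overloads from the user's side. It is a minimal, hedged example: the kernel shape, buffer names (In, Out, LAcc) and sizes are illustrative only, and the namespace/property spellings (properties{alignment<...>}, scatter/barrier on a local accessor) follow the patterns already used in the patch's own tests rather than any additional API.

// Minimal usage sketch of the new gather(local_accessor, ...) overloads.
// Assumes the patched sycl/ext/intel/esimd headers; names and sizes are
// illustrative, not taken from the patch.
#include <sycl/ext/intel/esimd.hpp>
#include <sycl/sycl.hpp>

using namespace sycl;
using namespace sycl::ext::intel::esimd;

int main() {
  constexpr int N = 32;       // elements gathered per work-item
  constexpr int Threads = 16; // work-items per work-group
  queue Q{gpu_selector_v};

  float *In = malloc_shared<float>(Threads * N, Q);
  float *Out = malloc_shared<float>(Threads * N, Q);
  for (int I = 0; I < Threads * N; I++)
    In[I] = static_cast<float>(I);

  Q.submit([&](handler &CGH) {
     // SLM backing the local accessor: one chunk of N floats per work-item.
     auto LAcc = local_accessor<float, 1>(Threads * N, CGH);
     CGH.parallel_for(
         nd_range<1>{Threads, Threads},
         [=](nd_item<1> NDI) SYCL_ESIMD_KERNEL {
           uint32_t LID = NDI.get_local_id(0);

           // Byte offsets of this work-item's chunk inside the SLM buffer.
           simd<uint32_t, N> ByteOffsets(LID * N * sizeof(float),
                                         sizeof(float));

           // Stage the input into SLM through the local accessor.
           simd<float, N> Src(In + LID * N);
           scatter(LAcc, ByteOffsets, Src);
           barrier();

           // (lacc-ga-6): plain gather, element-size alignment assumed.
           simd<float, N> A = gather<float>(LAcc, ByteOffsets);

           // (lacc-ga-5): masked gather plus an explicit alignment property.
           simd_mask<N> Mask = 1;
           properties Props{alignment<sizeof(float)>};
           simd<float, N> B = gather<float>(LAcc, ByteOffsets, Mask, Props);

           // (lacc-ga-4): masked gather with a pass-thru vector supplying the
           // result for lanes whose predicate is 0.
           simd<float, N> PassThru(0.0f);
           simd<float, N> C =
               gather<float>(LAcc, ByteOffsets, Mask, PassThru, Props);

           (A + B + C).copy_to(Out + LID * N);
         });
   }).wait();

  sycl::free(In, Q);
  sycl::free(Out, Q);
  return 0;
}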