From 0fef8d9202d1b0dfdf307afafa4b05e9437869be Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Thu, 8 Feb 2024 19:20:45 -0800 Subject: [PATCH 01/16] Implement scatter for loacal accessors accepting compile time properties --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 174 +++++++++++++- .../unified_memory_api/Inputs/gather.hpp | 1 - .../unified_memory_api/Inputs/scatter.hpp | 223 ++++++++++++++++++ .../ESIMD/unified_memory_api/scatter_lacc.cpp | 36 +++ .../scatter_lacc_dg2_pvc.cpp | 38 +++ sycl/test/esimd/memory_properties.cpp | 61 +++++ 6 files changed, 528 insertions(+), 5 deletions(-) create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc_dg2_pvc.cpp diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 7f4833017d38a..7ec85c0a75ab6 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -4300,9 +4300,8 @@ slm_scatter(OffsetSimdViewT byte_offsets, simd vals, /// void slm_scatter( /// OffsetSimdViewT byte_offsets, simd vals, /// PropertyListT props = {}); // (slm-sc-4) -/// Loads ("gathers") elements of the type 'T' from Shared Local Memory -/// locations addressed by byte offsets \p byte_offsets, and returns the loaded -/// elements. +/// Stores ("scatters") elements of the type 'T' to Shared Local Memory +/// locations addressed by byte offsets \p byte_offsets. /// @tparam T Element type. /// @tparam N Number of elements to read. /// @tparam VS Vector size. It can also be read as the number of reads per each @@ -7898,6 +7897,173 @@ __ESIMD_API offsets + glob_offset + __ESIMD_DNS::localAccessorToOffset(acc), mask); } +/// Variant of scatter that uses local accessor as a parameter +/// template +/// void scatter(AccessorT acc, simd byte_offsets, simd +/// vals, simd_mask mask, PropertyListT props = {}); // +/// (lacc-sc-1) + +/// template +/// void scatter(AccessorT acc, simd byte_offsets, simd +/// vals, PropertyListT props = {}); // (lacc-sc-2) + +/// The next two functions are similar to lacc-sc-{1,2} with the 'byte_offsets' +/// parameter represerented as 'simd_view'. + +/// template +/// void scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd vals, +/// simd_mask mask, PropertyListT props = {}); // (lacc-sc-3) + +/// template +/// void scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd vals, +/// PropertyListT props = {}); // (lacc-sc-4) + +/// template +/// void scatter(AccessorT acc, simd byte_offsets, simd +/// vals, simd_mask mask, PropertyListT props = {}); // (lacc-sc-1) +/// +/// Writes ("scatters") elements of the input vector to memory locations +/// addressed by the local accessor \p acc and byte offsets \p byte_offsets. +/// Access to any element's memory location can be disabled via +/// the input mask. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param acc The accessor to scatter to. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector to scatter. +/// @param mask The access mask. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +template +__ESIMD_API std::enable_if_t< + detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v> +scatter(AccessorT acc, simd byte_offsets, simd vals, + simd_mask mask, PropertyListT props = {}) { + slm_scatter(byte_offsets + __ESIMD_DNS::localAccessorToOffset(acc), + vals, mask, props); +} + +/// template +/// void scatter(AccessorT acc, simd byte_offsets, simd +/// vals, PropertyListT props = {}); // (lacc-sc-2) +/// +/// Writes ("scatters") elements of the input vector to memory locations +/// addressed by the local accessor \p acc and byte offsets \p byte_offsets. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param acc The accessor to scatter to. +/// @param byte_offsets the vector of 32-bit offsets in bytes. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector to scatter. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +template +__ESIMD_API std::enable_if_t< + detail::is_local_accessor_with_v && + ext::oneapi::experimental::is_property_list_v> +scatter(AccessorT acc, simd byte_offsets, simd vals, + PropertyListT props = {}) { + simd_mask Mask = 1; + scatter(acc, byte_offsets, vals, Mask, props); +} + +// template +// void scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd vals, +// simd_mask mask, PropertyListT props = {}); // (lacc-sc-3) +/// +/// Writes ("scatters") elements of the input vector to memory locations +/// addressed by the local accessor \p acc and byte offsets \p byte_offsets. +/// Access to any element's memory location can be disabled via the input mask. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param acc The accessor to scatter to. +/// @param byte_offsets the vector of 32-bit offsets in bytes +/// represented as a 'simd_view' object. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector to scatter. +/// @param mask The access mask. +/// @param props The optional compile-time properties. Only 'alignment' +/// and cache hint properties are used. +template +__ESIMD_API std::enable_if_t< + detail::is_local_accessor_with_v && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd vals, + simd_mask mask, PropertyListT props = {}) { + scatter(acc, byte_offsets.read(), vals, mask, props); +} + +/// template +/// void scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd vals, +/// PropertyListT props = {}); // (lacc-sc-4) +/// +/// Writes ("scatters") elements of the input vector to memory locations +/// addressed by the local accessor \p acc and byte offsets \p byte_offsets. +/// @tparam T Element type. +/// @tparam N Number of elements to write. +/// @tparam VS Vector size. It can also be read as the number of writes per each +/// address. The parameter 'N' must be divisible by 'VS'. (VS > 1) is supported +/// only on DG2 and PVC and only for 4- and 8-byte element vectors. +/// @param acc The accessor to scatter to. +/// @param byte_offsets the vector of 32-bit offsets in bytes +/// represented as a 'simd_view' object. +/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned. +/// If the alignment property is not passed, then it is assumed that each +/// accessed address is aligned by element-size. +/// @param vals The vector to scatter. +/// @param props The optional compile-time properties. Only 'alignment' +/// property is used. +template +__ESIMD_API std::enable_if_t< + detail::is_local_accessor_with_v && + detail::is_simd_view_type_v && + ext::oneapi::experimental::is_property_list_v> +scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd vals, + PropertyListT props = {}) { + simd_mask Mask = 1; + scatter(acc, byte_offsets.read(), vals, Mask, props); +} + /// Variant of scatter that uses local accessor as a parameter /// /// Writes elements of a \ref simd object into an accessor at given offsets. @@ -7920,7 +8086,7 @@ template __ESIMD_API std::enable_if_t> scatter(AccessorTy acc, simd offsets, simd vals, - uint32_t glob_offset = 0, simd_mask mask = 1) { + uint32_t glob_offset, simd_mask mask = 1) { slm_scatter(offsets + glob_offset + __ESIMD_DNS::localAccessorToOffset(acc), vals, mask); diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp index e34f259c093ec..e85e16e03d3e1 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/gather.hpp @@ -796,7 +796,6 @@ bool testLACC(queue Q, uint32_t MaskStride, PropertiesT) { In[I] = esimd_test::getRandomValue(); try { - buffer InBuf(In, Size * 2); Q.submit([&](handler &CGH) { // Allocate a bit more to safely initialize it with 8-element chunks. constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T); diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index 8ea1fcf4a08ad..b55479b64879a 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -449,3 +449,226 @@ template bool testSLM(queue Q) { return Passed; } + +template +bool testLACC(queue Q, uint32_t MaskStride, + ScatterPropertiesT ScatterProperties) { + constexpr uint32_t Groups = 8; + constexpr uint32_t Threads = 1; + constexpr size_t Size = Groups * Threads * N; + static_assert(VS > 0 && N % VS == 0, + "Incorrect VS parameter. N must be divisible by VS."); + constexpr int NOffsets = N / VS; + using Tuint = sycl::_V1::ext::intel::esimd::detail::uint_type_t; + + std::cout << "Local Accessor case: T=" << esimd_test::type_name() + << ",N=" << N << ", VS=" << VS << ",UseMask=" << UseMask + << ",UseProperties=" << UseProperties << std::endl; + + sycl::range<1> GlobalRange{Groups}; + sycl::range<1> LocalRange{Threads}; + sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange}; + + T *Out = static_cast(sycl::malloc_shared(Size * sizeof(T), Q)); + for (size_t i = 0; i < Size; i++) + Out[i] = i; + + try { + Q.submit([&](handler &cgh) { + constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T); + auto InAcc = local_accessor(SLMSize, cgh); + + cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { + ScatterPropertiesT Props{}; + uint16_t GlobalID = ndi.get_global_id(0); + uint16_t LocalID = ndi.get_local_id(0); + uint32_t GlobalElemOffset = GlobalID * N; + uint32_t LocalElemOffset = LocalID * N; + + slm_init(); + + if (LocalID == 0) { + for (int I = 0; I < Threads * N; I += 8) { + simd InVec(Out + GlobalElemOffset + I); + simd Offsets(I * sizeof(T), sizeof(T)); + scatter(InAcc, Offsets, InVec); + } + } + barrier(); + + simd ByteOffsets(LocalElemOffset * sizeof(T), + VS * sizeof(T)); + auto ByteOffsetsView = ByteOffsets.template select(); + + simd Vals = gather(InAcc, ByteOffsets, Props); + Vals *= 2; + + auto ValsView = Vals.template select(); + simd_mask Pred = 0; + for (int I = 0; I < NOffsets; I++) + Pred[I] = (I % MaskStride == 0) ? 1 : 0; + if constexpr (VS > 1) { // VS > 1 requires specifying + if constexpr (UseMask) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(InAcc, ByteOffsets, Vals, Pred, Props); + else if (GlobalID % 4 == 1) + scatter(InAcc, ByteOffsetsView, Vals, Pred, Props); + else if (GlobalID % 4 == 2) + scatter(InAcc, ByteOffsets, ValsView, Pred, Props); + else if (GlobalID % 4 == 3) + scatter(InAcc, ByteOffsetsView, ValsView, Pred, + Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(InAcc, ByteOffsets, Vals, Pred); + else if (GlobalID % 4 == 1) + scatter(InAcc, ByteOffsetsView, Vals, Pred); + else if (GlobalID % 4 == 2) + scatter(InAcc, ByteOffsets, ValsView, Pred); + else if (GlobalID % 4 == 3) + scatter(InAcc, ByteOffsetsView, ValsView, Pred); + } + } else { // UseMask == false + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(InAcc, ByteOffsets, Vals, Props); + else if (GlobalID % 4 == 1) + scatter(InAcc, ByteOffsetsView, Vals, Props); + else if (GlobalID % 4 == 2) + scatter(InAcc, ByteOffsets, ValsView, Props); + else if (GlobalID % 4 == 3) + scatter(InAcc, ByteOffsetsView, ValsView, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(InAcc, ByteOffsets, Vals); + else if (GlobalID % 4 == 1) + scatter(InAcc, ByteOffsetsView, Vals); + else if (GlobalID % 4 == 2) + scatter(InAcc, ByteOffsets, ValsView); + else if (GlobalID % 4 == 3) + scatter(InAcc, ByteOffsetsView, ValsView); + } + } + } else { // VS == 1 + if constexpr (UseMask) { + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(InAcc, ByteOffsets, Vals, Pred, Props); + else if (GlobalID % 4 == 1) + scatter(InAcc, ByteOffsetsView, Vals, Pred, Props); + else if (GlobalID % 4 == 2) + scatter(InAcc, ByteOffsets, ValsView, Pred, Props); + else if (GlobalID % 4 == 3) + scatter(InAcc, ByteOffsetsView, ValsView, Pred, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(InAcc, ByteOffsets, Vals, Pred); + else if (GlobalID % 4 == 1) + scatter(InAcc, ByteOffsetsView, Vals, Pred); + else if (GlobalID % 4 == 2) + scatter(InAcc, ByteOffsets, ValsView, Pred); + else if (GlobalID % 4 == 3) + scatter(InAcc, ByteOffsetsView, ValsView, Pred); + } + } else { // UseMask == false + if constexpr (UseProperties) { + if (GlobalID % 4 == 0) + scatter(InAcc, ByteOffsets, Vals, Props); + else if (GlobalID % 4 == 1) + scatter(InAcc, ByteOffsetsView, Vals, Props); + else if (GlobalID % 4 == 2) + scatter(InAcc, ByteOffsets, ValsView, Props); + else if (GlobalID % 4 == 3) + scatter(InAcc, ByteOffsetsView, ValsView, Props); + } else { // UseProperties == false + if (GlobalID % 4 == 0) + scatter(InAcc, ByteOffsets, Vals); + else if (GlobalID % 4 == 1) + scatter(InAcc, ByteOffsetsView, Vals); + else if (GlobalID % 4 == 2) + scatter(InAcc, ByteOffsets, ValsView); + else if (GlobalID % 4 == 3) + scatter(InAcc, ByteOffsetsView, ValsView); + } + } + } + barrier(); + if (LocalID == 0) { + for (int I = 0; I < Threads * N; I++) { + simd Offsets(I * sizeof(T), sizeof(T)); + simd OutVec = gather(InAcc, Offsets); + OutVec.copy_to(Out + GlobalElemOffset + I); + } + } + }); + }).wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + sycl::free(Out, Q); + return false; + } + + bool Passed = verify(Out, N, Size, VS, MaskStride, UseMask); + + sycl::free(Out, Q); + + return Passed; +} + +template bool testLACC(queue Q) { + constexpr bool CheckMask = true; + constexpr bool CheckProperties = true; + properties EmptyProps; + properties AlignElemProps{alignment}; + + bool Passed = true; + + // Test scatter() that is available on Gen12 and PVC. + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 1, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 1, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + + // // Test scatter() without passing compile-time properties argument. + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + Passed &= testLACC(Q, 2, EmptyProps); + + if constexpr (Features == TestFeatures::PVC || + Features == TestFeatures::DG2) { + properties LSCProps{alignment}; + Passed &= testLACC(Q, 2, LSCProps); + Passed &= testLACC(Q, 2, LSCProps); + Passed &= testLACC(Q, 2, LSCProps); + Passed &= testLACC(Q, 2, LSCProps); + + Passed &= testLACC(Q, 2, LSCProps); + + // Check VS > 1. GPU supports only dwords and qwords in this mode. + if constexpr (sizeof(T) >= 4) { + // TODO: This test case causes flaky fail. Enable it after the issue + // in GPU driver is fixed. + // Passed &= + // testLACC(Q, 2, + // AlignElemProps) + Passed &= + testLACC(Q, 2, AlignElemProps); + Passed &= + testLACC(Q, 2, AlignElemProps); + Passed &= + testLACC(Q, 2, AlignElemProps); + } + } // TestPVCFeatures + + return Passed; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp new file mode 100644 index 0000000000000..2ca5b867749b6 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp @@ -0,0 +1,36 @@ +//==------- scatter_lacc.cpp - DPC++ ESIMD on-device test -----------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Use per-kernel compilation to have more information about failing cases. +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::scatter() functions accepting local accessor +// and optional compile-time esimd::properties. +// The scatter() calls in this test do not use VS > 1 (number of loads per +// offset) to not impose using PVC features. + +#include "Inputs/scatter.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::Generic; + bool Passed = true; + + Passed &= testLACC(Q); + Passed &= testLACC(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testLACC(Q); + Passed &= testLACC(Q); + Passed &= testLACC(Q); + Passed &= + testLACC(Q); + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc_dg2_pvc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc_dg2_pvc.cpp new file mode 100644 index 0000000000000..da358621df927 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc_dg2_pvc.cpp @@ -0,0 +1,38 @@ +//==------- scatter_lacc_dg2_pvc.cpp - DPC++ ESIMD on-device test------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===------------------------------------------------------------------===// +// REQUIRES: gpu-intel-pvc || gpu-intel-dg2 +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::scatter() functions accepting local accessor +// and optional compile-time esimd::properties. +// The scatter() calls in this test use VS > 1 (number of loads per +// offset) and requires DG2 or PVC. + +#include "Inputs/scatter.hpp" + +int main() { + auto Q = queue{gpu_selector_v}; + esimd_test::printTestLabel(Q); + + constexpr auto TestFeatures = TestFeatures::PVC; + bool Passed = true; + + Passed &= testLACC(Q); + Passed &= testLACC(Q); + if (Q.get_device().has(sycl::aspect::fp16)) + Passed &= testLACC(Q); + Passed &= testLACC(Q); + Passed &= testLACC(Q); + Passed &= testLACC(Q); + if (Q.get_device().has(sycl::aspect::fp64)) + Passed &= testLACC(Q); + + std::cout << (Passed ? "Passed\n" : "FAILED\n"); + return Passed ? 0 : 1; +} diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index eb629935347db..79ade124a3e01 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -1281,6 +1281,67 @@ test_gather_scatter(AccType &acc, LocalAccType &local_acc, float *ptrf, scatter(ptrf, ioffset_n16_view, usm_view, mask_n16); scatter(ptrf, ioffset_n16_view, usm_view); + + // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + scatter(local_acc, ioffset_n32, usm, mask_n32); + + scatter(local_acc, ioffset_n32, usm); + + scatter(local_acc, ioffset_n32, usm, mask_n32, props_align4); + + scatter(local_acc, ioffset_n32, usm, props_align4); + + // CHECK-COUNT-8: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + scatter(local_acc, ioffset_n32, usm, mask_n32, props_cache_load); + scatter(local_acc, ioffset_n32, usm, props_cache_load); + + scatter(local_acc, ioffset_n32_view, usm, mask_n32, props_cache_load); + scatter(local_acc, ioffset_n32_view, usm, props_cache_load); + + scatter(local_acc, ioffset_n32, usm_view, mask_n32, + props_cache_load); + scatter(local_acc, ioffset_n32, usm_view, props_cache_load); + + scatter(local_acc, ioffset_n32_view, usm_view, mask_n32, + props_cache_load); + scatter(local_acc, ioffset_n32_view, usm_view, props_cache_load); + + // VS > 1 + // CHECK-COUNT-8: call void @llvm.genx.lsc.store.slm.v16i1.v16i32.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, <32 x i32>{{[^)]+}}, i32 0) + scatter(local_acc, ioffset_n16, usm, mask_n16, + props_cache_load); + + scatter(local_acc, ioffset_n16, usm, props_cache_load); + + scatter(local_acc, ioffset_n16_view, usm, mask_n16, + props_cache_load); + scatter(local_acc, ioffset_n16_view, usm, props_cache_load); + + scatter(local_acc, ioffset_n16, usm_view, mask_n16, + props_cache_load); + scatter(local_acc, ioffset_n16, usm_view, props_cache_load); + + scatter(local_acc, ioffset_n16_view, usm_view, mask_n16, + props_cache_load); + scatter(local_acc, ioffset_n16_view, usm_view, + props_cache_load); + + // CHECK-COUNT-8: call void @llvm.genx.lsc.store.slm.v16i1.v16i32.v32i32(<16 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i32> {{[^)]+}}, <32 x i32>{{[^)]+}}, i32 0) + scatter(local_acc, ioffset_n16, usm, mask_n16); + + scatter(local_acc, ioffset_n16, usm); + + scatter(local_acc, ioffset_n16_view, usm, mask_n16); + + scatter(local_acc, ioffset_n16_view, usm); + + scatter(local_acc, ioffset_n16, usm_view, mask_n16); + + scatter(local_acc, ioffset_n16, usm_view); + + scatter(local_acc, ioffset_n16_view, usm_view, mask_n16); + + scatter(local_acc, ioffset_n16_view, usm_view); } // CHECK-LABEL: define {{.*}} @_Z23test_slm_gather_scatter{{.*}} From 594e8e4dd834cfb099f96a8c5efedf81d94d1063 Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Thu, 8 Feb 2024 20:44:09 -0800 Subject: [PATCH 02/16] Remove test for unsupported type --- sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp index 2ca5b867749b6..0cfc8374c342a 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp @@ -29,8 +29,7 @@ int main() { Passed &= testLACC(Q); Passed &= testLACC(Q); Passed &= testLACC(Q); - Passed &= - testLACC(Q); + std::cout << (Passed ? "Passed\n" : "FAILED\n"); return Passed ? 0 : 1; } From 615c80aee76cc5f2846b3f4e8bc55e875feab23d Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Mon, 12 Feb 2024 12:37:02 -0800 Subject: [PATCH 03/16] Address PR comments --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 75 +++++++++++------- .../unified_memory_api/Inputs/scatter.hpp | 77 ++++++++++--------- sycl/test/esimd/memory_properties.cpp | 4 + 3 files changed, 92 insertions(+), 64 deletions(-) diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index cea9745816edd..ccf02f132e35e 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -7914,33 +7914,47 @@ __ESIMD_API /// Variant of scatter that uses local accessor as a parameter /// template -/// void scatter(AccessorT acc, simd byte_offsets, simd -/// vals, simd_mask mask, PropertyListT props = {}); // -/// (lacc-sc-1) +/// typename PropertyListT = empty_properties_t> +/// void scatter(AccessorT acc, +/// simd byte_offsets, +/// simd vals, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-sc-1) /// template -/// void scatter(AccessorT acc, simd byte_offsets, simd -/// vals, PropertyListT props = {}); // (lacc-sc-2) +/// typename PropertyListT = empty_properties_t> +/// void scatter(AccessorT acc, +/// simd byte_offsets, +/// simd vals, +/// PropertyListT props = {}); // (lacc-sc-2) /// The next two functions are similar to lacc-sc-{1,2} with the 'byte_offsets' /// parameter represerented as 'simd_view'. /// template -/// void scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd vals, -/// simd_mask mask, PropertyListT props = {}); // (lacc-sc-3) +/// typename OffsetSimdViewT, +/// typename PropertyListT = empty_properties_t> +/// void scatter(AccessorT acc, +/// OffsetSimdViewT byte_offsets, +/// simd vals, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-sc-3) /// template -/// void scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd vals, -/// PropertyListT props = {}); // (lacc-sc-4) +/// typename AccessorT, +/// typename PropertyListT = empty_properties_t> +/// void scatter(AccessorT acc, +/// OffsetSimdViewT byte_offsets, +/// simd vals, +/// PropertyListT props = {}); // (lacc-sc-4) /// template -/// void scatter(AccessorT acc, simd byte_offsets, simd -/// vals, simd_mask mask, PropertyListT props = {}); // (lacc-sc-1) +/// typename PropertyListT = empty_properties_t> +/// void scatter(AccessorT acc, +/// simd byte_offsets, +/// simd vals, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-sc-1) /// /// Writes ("scatters") elements of the input vector to memory locations /// addressed by the local accessor \p acc and byte offsets \p byte_offsets. @@ -7974,9 +7988,11 @@ scatter(AccessorT acc, simd byte_offsets, simd vals, } /// template -/// void scatter(AccessorT acc, simd byte_offsets, simd -/// vals, PropertyListT props = {}); // (lacc-sc-2) +/// typename PropertyListT = empty_properties_t> +/// void scatter(AccessorT acc, +/// simd byte_offsets, +/// simd vals, +/// PropertyListT props = {}); // (lacc-sc-2) /// /// Writes ("scatters") elements of the input vector to memory locations /// addressed by the local accessor \p acc and byte offsets \p byte_offsets. @@ -8006,10 +8022,14 @@ scatter(AccessorT acc, simd byte_offsets, simd vals, scatter(acc, byte_offsets, vals, Mask, props); } -// template -// void scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd vals, -// simd_mask mask, PropertyListT props = {}); // (lacc-sc-3) +/// template +/// void scatter(AccessorT acc, +/// OffsetSimdViewT byte_offsets, +/// simd vals, +/// simd_mask mask, +/// PropertyListT props = {}); // (lacc-sc-3) /// /// Writes ("scatters") elements of the input vector to memory locations /// addressed by the local accessor \p acc and byte offsets \p byte_offsets. @@ -8044,9 +8064,12 @@ scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd vals, } /// template -/// void scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd vals, -/// PropertyListT props = {}); // (lacc-sc-4) +/// typename AccessorT, +/// typename PropertyListT = empty_properties_t> +/// void scatter(AccessorT acc, +/// OffsetSimdViewT byte_offsets, +/// simd vals, +/// PropertyListT props = {}); // (lacc-sc-4) /// /// Writes ("scatters") elements of the input vector to memory locations /// addressed by the local accessor \p acc and byte offsets \p byte_offsets. diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index b55479b64879a..a030428b62f7d 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -477,7 +477,7 @@ bool testLACC(queue Q, uint32_t MaskStride, try { Q.submit([&](handler &cgh) { constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T); - auto InAcc = local_accessor(SLMSize, cgh); + auto LocalAcc = local_accessor(SLMSize, cgh); cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { ScatterPropertiesT Props{}; @@ -486,13 +486,11 @@ bool testLACC(queue Q, uint32_t MaskStride, uint32_t GlobalElemOffset = GlobalID * N; uint32_t LocalElemOffset = LocalID * N; - slm_init(); - if (LocalID == 0) { for (int I = 0; I < Threads * N; I += 8) { simd InVec(Out + GlobalElemOffset + I); simd Offsets(I * sizeof(T), sizeof(T)); - scatter(InAcc, Offsets, InVec); + scatter(LocalAcc, Offsets, InVec); } } barrier(); @@ -501,7 +499,7 @@ bool testLACC(queue Q, uint32_t MaskStride, VS * sizeof(T)); auto ByteOffsetsView = ByteOffsets.template select(); - simd Vals = gather(InAcc, ByteOffsets, Props); + simd Vals = gather(LocalAcc, ByteOffsets, Props); Vals *= 2; auto ValsView = Vals.template select(); @@ -512,85 +510,88 @@ bool testLACC(queue Q, uint32_t MaskStride, if constexpr (UseMask) { if constexpr (UseProperties) { if (GlobalID % 4 == 0) - scatter(InAcc, ByteOffsets, Vals, Pred, Props); + scatter(LocalAcc, ByteOffsets, Vals, Pred, Props); else if (GlobalID % 4 == 1) - scatter(InAcc, ByteOffsetsView, Vals, Pred, Props); + scatter(LocalAcc, ByteOffsetsView, Vals, Pred, + Props); else if (GlobalID % 4 == 2) - scatter(InAcc, ByteOffsets, ValsView, Pred, Props); + scatter(LocalAcc, ByteOffsets, ValsView, Pred, + Props); else if (GlobalID % 4 == 3) - scatter(InAcc, ByteOffsetsView, ValsView, Pred, + scatter(LocalAcc, ByteOffsetsView, ValsView, Pred, Props); } else { // UseProperties == false if (GlobalID % 4 == 0) - scatter(InAcc, ByteOffsets, Vals, Pred); + scatter(LocalAcc, ByteOffsets, Vals, Pred); else if (GlobalID % 4 == 1) - scatter(InAcc, ByteOffsetsView, Vals, Pred); + scatter(LocalAcc, ByteOffsetsView, Vals, Pred); else if (GlobalID % 4 == 2) - scatter(InAcc, ByteOffsets, ValsView, Pred); + scatter(LocalAcc, ByteOffsets, ValsView, Pred); else if (GlobalID % 4 == 3) - scatter(InAcc, ByteOffsetsView, ValsView, Pred); + scatter(LocalAcc, ByteOffsetsView, ValsView, Pred); } } else { // UseMask == false if constexpr (UseProperties) { if (GlobalID % 4 == 0) - scatter(InAcc, ByteOffsets, Vals, Props); + scatter(LocalAcc, ByteOffsets, Vals, Props); else if (GlobalID % 4 == 1) - scatter(InAcc, ByteOffsetsView, Vals, Props); + scatter(LocalAcc, ByteOffsetsView, Vals, Props); else if (GlobalID % 4 == 2) - scatter(InAcc, ByteOffsets, ValsView, Props); + scatter(LocalAcc, ByteOffsets, ValsView, Props); else if (GlobalID % 4 == 3) - scatter(InAcc, ByteOffsetsView, ValsView, Props); + scatter(LocalAcc, ByteOffsetsView, ValsView, Props); } else { // UseProperties == false if (GlobalID % 4 == 0) - scatter(InAcc, ByteOffsets, Vals); + scatter(LocalAcc, ByteOffsets, Vals); else if (GlobalID % 4 == 1) - scatter(InAcc, ByteOffsetsView, Vals); + scatter(LocalAcc, ByteOffsetsView, Vals); else if (GlobalID % 4 == 2) - scatter(InAcc, ByteOffsets, ValsView); + scatter(LocalAcc, ByteOffsets, ValsView); else if (GlobalID % 4 == 3) - scatter(InAcc, ByteOffsetsView, ValsView); + scatter(LocalAcc, ByteOffsetsView, ValsView); } } } else { // VS == 1 if constexpr (UseMask) { if constexpr (UseProperties) { if (GlobalID % 4 == 0) - scatter(InAcc, ByteOffsets, Vals, Pred, Props); + scatter(LocalAcc, ByteOffsets, Vals, Pred, Props); else if (GlobalID % 4 == 1) - scatter(InAcc, ByteOffsetsView, Vals, Pred, Props); + scatter(LocalAcc, ByteOffsetsView, Vals, Pred, Props); else if (GlobalID % 4 == 2) - scatter(InAcc, ByteOffsets, ValsView, Pred, Props); + scatter(LocalAcc, ByteOffsets, ValsView, Pred, Props); else if (GlobalID % 4 == 3) - scatter(InAcc, ByteOffsetsView, ValsView, Pred, Props); + scatter(LocalAcc, ByteOffsetsView, ValsView, Pred, + Props); } else { // UseProperties == false if (GlobalID % 4 == 0) - scatter(InAcc, ByteOffsets, Vals, Pred); + scatter(LocalAcc, ByteOffsets, Vals, Pred); else if (GlobalID % 4 == 1) - scatter(InAcc, ByteOffsetsView, Vals, Pred); + scatter(LocalAcc, ByteOffsetsView, Vals, Pred); else if (GlobalID % 4 == 2) - scatter(InAcc, ByteOffsets, ValsView, Pred); + scatter(LocalAcc, ByteOffsets, ValsView, Pred); else if (GlobalID % 4 == 3) - scatter(InAcc, ByteOffsetsView, ValsView, Pred); + scatter(LocalAcc, ByteOffsetsView, ValsView, Pred); } } else { // UseMask == false if constexpr (UseProperties) { if (GlobalID % 4 == 0) - scatter(InAcc, ByteOffsets, Vals, Props); + scatter(LocalAcc, ByteOffsets, Vals, Props); else if (GlobalID % 4 == 1) - scatter(InAcc, ByteOffsetsView, Vals, Props); + scatter(LocalAcc, ByteOffsetsView, Vals, Props); else if (GlobalID % 4 == 2) - scatter(InAcc, ByteOffsets, ValsView, Props); + scatter(LocalAcc, ByteOffsets, ValsView, Props); else if (GlobalID % 4 == 3) - scatter(InAcc, ByteOffsetsView, ValsView, Props); + scatter(LocalAcc, ByteOffsetsView, ValsView, Props); } else { // UseProperties == false if (GlobalID % 4 == 0) - scatter(InAcc, ByteOffsets, Vals); + scatter(LocalAcc, ByteOffsets, Vals); else if (GlobalID % 4 == 1) - scatter(InAcc, ByteOffsetsView, Vals); + scatter(LocalAcc, ByteOffsetsView, Vals); else if (GlobalID % 4 == 2) - scatter(InAcc, ByteOffsets, ValsView); + scatter(LocalAcc, ByteOffsets, ValsView); else if (GlobalID % 4 == 3) - scatter(InAcc, ByteOffsetsView, ValsView); + scatter(LocalAcc, ByteOffsetsView, ValsView); } } } @@ -598,7 +599,7 @@ bool testLACC(queue Q, uint32_t MaskStride, if (LocalID == 0) { for (int I = 0; I < Threads * N; I++) { simd Offsets(I * sizeof(T), sizeof(T)); - simd OutVec = gather(InAcc, Offsets); + simd OutVec = gather(LocalAcc, Offsets); OutVec.copy_to(Out + GlobalElemOffset + I); } } diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index fd4831f9ab64d..e00f8605bb2d4 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -1342,6 +1342,10 @@ test_gather_scatter(AccType &acc, LocalAccType &local_acc, float *ptrf, simd ioffset_n10(byte_offset32, 8); simd usm_n10; + // Check special case involving glbal offset and mask + // CHECK-COUNT-1: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}) + scatter(local_acc, ioffset_n32, usm, 0, 1); + // Check special case to verify that for cases when N is not power of 2 llvm // intrinsic is used // CHECK-COUNT-1: call void @llvm.masked.scatter.v10f32.v10p4(<10 x float> {{[^)]+}}, <10 x ptr addrspace(4)> {{[^)]+}}, i32 4, <10 x i1> {{[^)]+}}) From b0f6864def7c16ec6025c6ca11ce822030cc6abf Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Mon, 12 Feb 2024 13:31:03 -0800 Subject: [PATCH 04/16] Address PR comments --- sycl/include/sycl/ext/intel/esimd/memory.hpp | 12 ++++++------ sycl/test/esimd/memory_properties.cpp | 3 ++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index ccf02f132e35e..c26eb1ceb3204 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -7914,7 +7914,7 @@ __ESIMD_API /// Variant of scatter that uses local accessor as a parameter /// template +/// typename PropertyListT = empty_properties_t> /// void scatter(AccessorT acc, /// simd byte_offsets, /// simd vals, @@ -7922,7 +7922,7 @@ __ESIMD_API /// PropertyListT props = {}); // (lacc-sc-1) /// template +/// typename PropertyListT = empty_properties_t> /// void scatter(AccessorT acc, /// simd byte_offsets, /// simd vals, @@ -7941,7 +7941,7 @@ __ESIMD_API /// PropertyListT props = {}); // (lacc-sc-3) /// template /// void scatter(AccessorT acc, /// OffsetSimdViewT byte_offsets, @@ -7949,7 +7949,7 @@ __ESIMD_API /// PropertyListT props = {}); // (lacc-sc-4) /// template +/// typename PropertyListT = empty_properties_t> /// void scatter(AccessorT acc, /// simd byte_offsets, /// simd vals, @@ -7988,7 +7988,7 @@ scatter(AccessorT acc, simd byte_offsets, simd vals, } /// template +/// typename PropertyListT = empty_properties_t> /// void scatter(AccessorT acc, /// simd byte_offsets, /// simd vals, @@ -8064,7 +8064,7 @@ scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd vals, } /// template /// void scatter(AccessorT acc, /// OffsetSimdViewT byte_offsets, diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index e00f8605bb2d4..f9a1057a3550a 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -1343,8 +1343,9 @@ test_gather_scatter(AccType &acc, LocalAccType &local_acc, float *ptrf, simd usm_n10; // Check special case involving glbal offset and mask - // CHECK-COUNT-1: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}) + // CHECK-COUNT-2: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}) scatter(local_acc, ioffset_n32, usm, 0, 1); + scatter(local_acc, ioffset_n32, usm, 0); // Check special case to verify that for cases when N is not power of 2 llvm // intrinsic is used From 9b8ff920ed9a55ca98ef68f0a3c07ba6de19322d Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Mon, 12 Feb 2024 18:45:43 -0800 Subject: [PATCH 05/16] Fix a test failure --- sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index a030428b62f7d..5479f783734b3 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -476,15 +476,14 @@ bool testLACC(queue Q, uint32_t MaskStride, try { Q.submit([&](handler &cgh) { - constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T); - auto LocalAcc = local_accessor(SLMSize, cgh); - - cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { + cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { ScatterPropertiesT Props{}; uint16_t GlobalID = ndi.get_global_id(0); uint16_t LocalID = ndi.get_local_id(0); uint32_t GlobalElemOffset = GlobalID * N; uint32_t LocalElemOffset = LocalID * N; + constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T); + auto LocalAcc = local_accessor(SLMSize, cgh); if (LocalID == 0) { for (int I = 0; I < Threads * N; I += 8) { From 4277bb80c1756ef3f19e00bb086385b3e704c061 Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Mon, 12 Feb 2024 19:09:38 -0800 Subject: [PATCH 06/16] Fix clang-format --- sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index 5479f783734b3..cf5d02323d3f8 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -476,7 +476,7 @@ bool testLACC(queue Q, uint32_t MaskStride, try { Q.submit([&](handler &cgh) { - cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { + cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { ScatterPropertiesT Props{}; uint16_t GlobalID = ndi.get_global_id(0); uint16_t LocalID = ndi.get_local_id(0); From 48bb10673644099354b044af64ebd90b0f9bb0b4 Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Mon, 12 Feb 2024 20:19:08 -0800 Subject: [PATCH 07/16] Revert "Fix clang-format" This reverts commit 4277bb80c1756ef3f19e00bb086385b3e704c061. --- sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index cf5d02323d3f8..5479f783734b3 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -476,7 +476,7 @@ bool testLACC(queue Q, uint32_t MaskStride, try { Q.submit([&](handler &cgh) { - cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { + cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { ScatterPropertiesT Props{}; uint16_t GlobalID = ndi.get_global_id(0); uint16_t LocalID = ndi.get_local_id(0); From b513aa7d43c2a640642f46ec531b1391d22380b7 Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Mon, 12 Feb 2024 20:19:55 -0800 Subject: [PATCH 08/16] Revert "Fix a test failure" This reverts commit 9b8ff920ed9a55ca98ef68f0a3c07ba6de19322d. --- sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index 5479f783734b3..a030428b62f7d 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -476,14 +476,15 @@ bool testLACC(queue Q, uint32_t MaskStride, try { Q.submit([&](handler &cgh) { - cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { + constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T); + auto LocalAcc = local_accessor(SLMSize, cgh); + + cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { ScatterPropertiesT Props{}; uint16_t GlobalID = ndi.get_global_id(0); uint16_t LocalID = ndi.get_local_id(0); uint32_t GlobalElemOffset = GlobalID * N; uint32_t LocalElemOffset = LocalID * N; - constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T); - auto LocalAcc = local_accessor(SLMSize, cgh); if (LocalID == 0) { for (int I = 0; I < Threads * N; I += 8) { From fa6a137848e39fa8c797902c612eec423a6e3d81 Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Mon, 12 Feb 2024 20:25:05 -0800 Subject: [PATCH 09/16] Fix test failure --- sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index a030428b62f7d..af0c13d30aafd 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -476,7 +476,7 @@ bool testLACC(queue Q, uint32_t MaskStride, try { Q.submit([&](handler &cgh) { - constexpr uint32_t SLMSize = (Threads * N + 8) * sizeof(T); + constexpr uint32_t SLMSize = (Threads * N) * sizeof(T); auto LocalAcc = local_accessor(SLMSize, cgh); cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { @@ -487,9 +487,9 @@ bool testLACC(queue Q, uint32_t MaskStride, uint32_t LocalElemOffset = LocalID * N; if (LocalID == 0) { - for (int I = 0; I < Threads * N; I += 8) { - simd InVec(Out + GlobalElemOffset + I); - simd Offsets(I * sizeof(T), sizeof(T)); + for (int I = 0; I < Threads * N; I++) { + simd InVec(Out + GlobalElemOffset + I); + simd Offsets(I * sizeof(T), sizeof(T)); scatter(LocalAcc, Offsets, InVec); } } From c0218b7929dfc538c93b0a6143ef6a1868ed82d5 Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Tue, 13 Feb 2024 10:33:35 -0800 Subject: [PATCH 10/16] Eliminate use of scatter to initialize the memory in the test --- sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index af0c13d30aafd..2f787db23c098 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -489,8 +489,7 @@ bool testLACC(queue Q, uint32_t MaskStride, if (LocalID == 0) { for (int I = 0; I < Threads * N; I++) { simd InVec(Out + GlobalElemOffset + I); - simd Offsets(I * sizeof(T), sizeof(T)); - scatter(LocalAcc, Offsets, InVec); + LocalAcc[I] = InVec[0]; } } barrier(); From 837355e8b51df84fd5ae0d4cccfdc64019a887c2 Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Tue, 13 Feb 2024 18:46:32 -0800 Subject: [PATCH 11/16] Refactor the test --- .../unified_memory_api/Inputs/scatter.hpp | 24 +++++++------------ 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index 2f787db23c098..e4998358a0f26 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -455,7 +455,7 @@ template 0 && N % VS == 0, "Incorrect VS parameter. N must be divisible by VS."); @@ -476,7 +476,7 @@ bool testLACC(queue Q, uint32_t MaskStride, try { Q.submit([&](handler &cgh) { - constexpr uint32_t SLMSize = (Threads * N) * sizeof(T); + constexpr uint32_t SLMSize = Size; auto LocalAcc = local_accessor(SLMSize, cgh); cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { @@ -486,19 +486,16 @@ bool testLACC(queue Q, uint32_t MaskStride, uint32_t GlobalElemOffset = GlobalID * N; uint32_t LocalElemOffset = LocalID * N; - if (LocalID == 0) { - for (int I = 0; I < Threads * N; I++) { - simd InVec(Out + GlobalElemOffset + I); - LocalAcc[I] = InVec[0]; - } + for (int I = 0; I < Size; I++) { + LocalAcc[I] = Out[I]; } - barrier(); - simd ByteOffsets(LocalElemOffset * sizeof(T), + simd ByteOffsets(GlobalElemOffset * sizeof(T), VS * sizeof(T)); auto ByteOffsetsView = ByteOffsets.template select(); simd Vals = gather(LocalAcc, ByteOffsets, Props); + barrier(); Vals *= 2; auto ValsView = Vals.template select(); @@ -595,12 +592,9 @@ bool testLACC(queue Q, uint32_t MaskStride, } } barrier(); - if (LocalID == 0) { - for (int I = 0; I < Threads * N; I++) { - simd Offsets(I * sizeof(T), sizeof(T)); - simd OutVec = gather(LocalAcc, Offsets); - OutVec.copy_to(Out + GlobalElemOffset + I); - } + + for (int I = 0; I < N; I++) { + Out[GlobalElemOffset + I] = LocalAcc[GlobalElemOffset + I]; } }); }).wait(); From 4a62ab3802912bc0efd5879244d4181b95f10bc3 Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Wed, 14 Feb 2024 09:13:50 -0800 Subject: [PATCH 12/16] Fix test failure --- .../ESIMD/unified_memory_api/Inputs/scatter.hpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index e4998358a0f26..7f59299769e02 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -486,14 +486,13 @@ bool testLACC(queue Q, uint32_t MaskStride, uint32_t GlobalElemOffset = GlobalID * N; uint32_t LocalElemOffset = LocalID * N; - for (int I = 0; I < Size; I++) { - LocalAcc[I] = Out[I]; - } + simd InVec(Out + GlobalElemOffset); simd ByteOffsets(GlobalElemOffset * sizeof(T), VS * sizeof(T)); + slm_scatter(ByteOffsets, InVec); + barrier(); auto ByteOffsetsView = ByteOffsets.template select(); - simd Vals = gather(LocalAcc, ByteOffsets, Props); barrier(); Vals *= 2; @@ -593,9 +592,8 @@ bool testLACC(queue Q, uint32_t MaskStride, } barrier(); - for (int I = 0; I < N; I++) { - Out[GlobalElemOffset + I] = LocalAcc[GlobalElemOffset + I]; - } + simd OutVec = gather(LocalAcc, ByteOffsets, Props); + OutVec.copy_to(Out + GlobalElemOffset); }); }).wait(); } catch (sycl::exception const &e) { From d5f51f0ec24a7e46853265037444caeb190e742f Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Wed, 14 Feb 2024 09:53:24 -0800 Subject: [PATCH 13/16] Reduce size of used SLM memory --- .../unified_memory_api/Inputs/scatter.hpp | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index 7f59299769e02..42119fd163018 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -455,7 +455,7 @@ template 0 && N % VS == 0, "Incorrect VS parameter. N must be divisible by VS."); @@ -476,7 +476,7 @@ bool testLACC(queue Q, uint32_t MaskStride, try { Q.submit([&](handler &cgh) { - constexpr uint32_t SLMSize = Size; + constexpr uint32_t SLMSize = N; auto LocalAcc = local_accessor(SLMSize, cgh); cgh.parallel_for(Range, [=](sycl::nd_item<1> ndi) SYCL_ESIMD_KERNEL { @@ -486,18 +486,14 @@ bool testLACC(queue Q, uint32_t MaskStride, uint32_t GlobalElemOffset = GlobalID * N; uint32_t LocalElemOffset = LocalID * N; - simd InVec(Out + GlobalElemOffset); + simd Vals(GlobalElemOffset, 1); - simd ByteOffsets(GlobalElemOffset * sizeof(T), - VS * sizeof(T)); - slm_scatter(ByteOffsets, InVec); - barrier(); - auto ByteOffsetsView = ByteOffsets.template select(); - simd Vals = gather(LocalAcc, ByteOffsets, Props); - barrier(); + simd ByteOffsets(0, VS * sizeof(T)); Vals *= 2; auto ValsView = Vals.template select(); + auto ByteOffsetsView = ByteOffsets.template select(); + simd_mask Pred = 0; for (int I = 0; I < NOffsets; I++) Pred[I] = (I % MaskStride == 0) ? 1 : 0; @@ -590,10 +586,13 @@ bool testLACC(queue Q, uint32_t MaskStride, } } } - barrier(); simd OutVec = gather(LocalAcc, ByteOffsets, Props); - OutVec.copy_to(Out + GlobalElemOffset); + if constexpr (UseMask) { + scatter(Out + GlobalElemOffset, ByteOffsets, OutVec, Pred); + } else { + OutVec.copy_to(Out + GlobalElemOffset); + } }); }).wait(); } catch (sycl::exception const &e) { From 085f6512e4ed64010ced081e790c2abcbb8a9c58 Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Wed, 14 Feb 2024 14:21:53 -0800 Subject: [PATCH 14/16] Refactor the test --- .../ESIMD/unified_memory_api/Inputs/scatter.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index 42119fd163018..0678bec72f307 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -486,13 +486,17 @@ bool testLACC(queue Q, uint32_t MaskStride, uint32_t GlobalElemOffset = GlobalID * N; uint32_t LocalElemOffset = LocalID * N; - simd Vals(GlobalElemOffset, 1); + simd InVec(GlobalElemOffset, 1); simd ByteOffsets(0, VS * sizeof(T)); + scatter(LocalAcc, ByteOffsets, InVec); + barrier(); + auto ByteOffsetsView = ByteOffsets.template select(); + simd Vals = gather(LocalAcc, ByteOffsets, Props); + Vals *= 2; auto ValsView = Vals.template select(); - auto ByteOffsetsView = ByteOffsets.template select(); simd_mask Pred = 0; for (int I = 0; I < NOffsets; I++) @@ -588,11 +592,7 @@ bool testLACC(queue Q, uint32_t MaskStride, } simd OutVec = gather(LocalAcc, ByteOffsets, Props); - if constexpr (UseMask) { - scatter(Out + GlobalElemOffset, ByteOffsets, OutVec, Pred); - } else { - OutVec.copy_to(Out + GlobalElemOffset); - } + OutVec.copy_to(Out + GlobalElemOffset); }); }).wait(); } catch (sycl::exception const &e) { From 69d37063e2b0d124a94bc3de8235993b67be5c5b Mon Sep 17 00:00:00 2001 From: "Fine, Gregory" Date: Wed, 14 Feb 2024 14:37:42 -0800 Subject: [PATCH 15/16] Add drriver requirements for the test --- sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp index 0cfc8374c342a..301392a247381 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp @@ -5,6 +5,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// +// REQUIRES-INTEL-DRIVER: lin: 26816, win: 101.51086 // Use per-kernel compilation to have more information about failing cases. // RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out // RUN: %{run} %t.out From 7d6ea3f0d54ce14adc6ce975c1381592dc9dcd44 Mon Sep 17 00:00:00 2001 From: gregory Date: Thu, 15 Feb 2024 11:14:30 -0800 Subject: [PATCH 16/16] Address PR comments --- sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp index 0678bec72f307..b3b3d498276f4 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/scatter.hpp @@ -490,7 +490,6 @@ bool testLACC(queue Q, uint32_t MaskStride, simd ByteOffsets(0, VS * sizeof(T)); scatter(LocalAcc, ByteOffsets, InVec); - barrier(); auto ByteOffsetsView = ByteOffsets.template select(); simd Vals = gather(LocalAcc, ByteOffsets, Props);