From 7ee7e906e858c229cc256185560968754b773091 Mon Sep 17 00:00:00 2001 From: fineg74 <61437305+fineg74@users.noreply.github.com> Date: Thu, 8 Feb 2024 11:50:46 -0800 Subject: [PATCH] [SYCL][ESIMD] Use LLVM IR for USM/SLM scatter (#12628) --- llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp | 38 +++++++++++++++++++ .../ext/intel/esimd/detail/memory_intrin.hpp | 14 +++++++ sycl/include/sycl/ext/intel/esimd/memory.hpp | 19 +++++++++- .../ESIMD/unified_memory_api/scatter_usm.cpp | 6 +-- .../unified_memory_api/scatter_usm_legacy.cpp | 21 ++++++++++ .../ESIMD/unified_memory_api/slm_scatter.cpp | 8 ++-- .../unified_memory_api/slm_scatter_legacy.cpp | 20 ++++++++++ sycl/test/esimd/memory_properties.cpp | 25 +++++++++--- 8 files changed, 137 insertions(+), 14 deletions(-) create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_legacy.cpp create mode 100644 sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_legacy.cpp diff --git a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp index 6ec1102f402ba..15c5e7c9a625e 100644 --- a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp +++ b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp @@ -1004,6 +1004,37 @@ static void translateGatherLoad(CallInst &CI, bool IsSLM) { CI.replaceAllUsesWith(LI); } +static void translateScatterStore(CallInst &CI, bool IsSLM) { + IRBuilder<> Builder(&CI); + constexpr int AlignmentTemplateArgIdx = 2; + APInt Val = parseTemplateArg(CI, AlignmentTemplateArgIdx, + ESIMDIntrinDesc::GenXArgConversion::TO_I64); + Align AlignValue(Val.getZExtValue()); + + auto ValsOp = CI.getArgOperand(0); + auto OffsetsOp = CI.getArgOperand(1); + auto MaskOp = CI.getArgOperand(2); + auto DataType = ValsOp->getType(); + + // Convert the mask from to . + Value *Zero = ConstantInt::get(MaskOp->getType(), 0); + MaskOp = Builder.CreateICmp(ICmpInst::ICMP_NE, MaskOp, Zero); + + // The address space may be 3-SLM, 1-global or private. + // At the moment of calling 'scatter()' operation the pointer passed to it + // is already 4-generic. Thus, simply use 4-generic for global and private + // and let GPU BE deduce the actual address space from the use-def graph. + unsigned AS = IsSLM ? 3 : 4; + auto ElemType = DataType->getScalarType(); + auto NumElems = (cast(DataType))->getElementCount(); + auto VPtrType = VectorType::get(PointerType::get(ElemType, AS), NumElems); + auto VPtrOp = Builder.CreateIntToPtr(OffsetsOp, VPtrType); + + auto SI = Builder.CreateMaskedScatter(ValsOp, VPtrOp, AlignValue, MaskOp); + SI->setDebugLoc(CI.getDebugLoc()); + CI.replaceAllUsesWith(SI); +} + // TODO Specify document behavior for slm_init and nbarrier_init when: // 1) they are called not from kernels // 2) there are multiple such calls reachable from a kernel @@ -1987,6 +2018,13 @@ size_t SYCLLowerESIMDPass::runOnFunction(Function &F, continue; } + if (Name.starts_with("__esimd_scatter_st") || + Name.starts_with("__esimd_slm_scatter_st")) { + translateScatterStore(*CI, Name.starts_with("__esimd_slm_scatter_st")); + ToErase.push_back(CI); + continue; + } + if (Name.starts_with("__esimd_nbarrier_init")) { translateNbarrierInit(*CI); ToErase.push_back(CI); diff --git a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp index d723d03353cb2..c9d0d6cc4d410 100644 --- a/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp +++ b/sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp @@ -213,6 +213,20 @@ __ESIMD_INTRIN __ESIMD_DNS::vector_type_t __esimd_slm_gather_ld( __ESIMD_DNS::simd_mask_storage_t pred, __ESIMD_DNS::vector_type_t pass_thru) __ESIMD_INTRIN_END; +// Scatter data to given global or private addresses. +template +__ESIMD_INTRIN void +__esimd_scatter_st(__ESIMD_DNS::vector_type_t vals, + __ESIMD_DNS::vector_type_t vptr, + __ESIMD_DNS::simd_mask_storage_t pred) __ESIMD_INTRIN_END; + +// Scatter data to given SLM addresses. +template +__ESIMD_INTRIN void __esimd_slm_scatter_st( + __ESIMD_DNS::vector_type_t vals, + __ESIMD_DNS::vector_type_t vptr, + __ESIMD_DNS::simd_mask_storage_t pred) __ESIMD_INTRIN_END; + /// Surface-based gather. /// Supported platforms: DG2, PVC /// diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp index 7f4833017d38a..4ffd5c418138c 100644 --- a/sycl/include/sycl/ext/intel/esimd/memory.hpp +++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp @@ -724,11 +724,20 @@ scatter(T *p, simd byte_offsets, simd vals, // Use LSC lowering if L1/L2 or VS > 1. if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none || - VS > 1 || !__ESIMD_DNS::isPowerOf2(N, 32)) { + VS > 1 || + (!__ESIMD_DNS::isPowerOf2(N, 32) && + !detail::isMaskedGatherScatterLLVMAvailable())) { static_assert(VS == 1 || sizeof(T) >= 4, "VS > 1 is supprted only for 4- and 8-byte elements"); return detail::scatter_impl(p, byte_offsets, vals, mask); + } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) { + simd Addrs(reinterpret_cast(p)); + Addrs = Addrs + convert(byte_offsets); + using MsgT = detail::__raw_t; + __esimd_scatter_st( + sycl::bit_cast<__ESIMD_DNS::vector_type_t>(vals.data()), + Addrs.data(), mask.data()); } else { using Tx = detail::__raw_t; simd byte_offsets_i = convert(byte_offsets); @@ -4227,9 +4236,15 @@ slm_scatter(simd byte_offsets, simd vals, "slm_scatter() requires at least element-size alignment"); // Use LSC lowering if VS > 1. - if constexpr (VS > 1 || !(detail::isPowerOf2(N, 32) && sizeof(T) <= 4)) { + if constexpr (VS > 1 || (!(detail::isPowerOf2(N, 32) && sizeof(T) <= 4) && + !detail::isMaskedGatherScatterLLVMAvailable())) { __ESIMD_DNS::slm_scatter_impl( byte_offsets, vals, mask); + } else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) { + using MsgT = detail::__raw_t; + __esimd_slm_scatter_st( + sycl::bit_cast<__ESIMD_DNS::vector_type_t>(vals.data()), + byte_offsets.data(), mask.data()); } else { detail::LocalAccessorMarker acc; detail::scatter_impl(acc, vals, byte_offsets, 0, mask); diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp index 929d3c6fc04f7..ff331a421ccef 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp @@ -5,13 +5,13 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===------------------------------------------------------------------===// -// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{build} -fsycl-device-code-split=per_kernel -D__ESIMD_GATHER_SCATTER_LLVM_IR -o %t.out // RUN: %{run} %t.out // The test verifies esimd::scatter() functions accepting USM pointer // and optional compile-time esimd::properties. -// The scatter() calls in this test do not use cache-hint -// properties to not impose using DG2/PVC features. +// The scatter() calls in this test do not use cache-hint properties +// or VS > 1 (number of stores per offset) to not impose using PVC features. #include "Inputs/scatter.hpp" diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_legacy.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_legacy.cpp new file mode 100644 index 0000000000000..08ac29ba6b605 --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_legacy.cpp @@ -0,0 +1,21 @@ +//==------- scatter_usm_legacy.cpp - DPC++ ESIMD on-device test -----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Use per-kernel compilation to have more information about failing cases. +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::scatter() functions accepting USM pointer +// and optional compile-time esimd::properties. +// The scatter() calls in this test do not use cache-hint properties +// or VS > 1 (number of stores per offset) to not impose using PVC features. +// +// TODO: Remove this test when GPU driver issue with llvm.masked.scatter is +// resolved and ESIMD starts using llvm.masked.scatter by default. +// "-D__ESIMD_GATHER_SCATTER_LLVM_IR" is not used here. + +#include "scatter_usm.cpp" diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp index 96c49fcdae8af..ffa0a718e7689 100644 --- a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp +++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter.cpp @@ -5,12 +5,12 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===------------------------------------------------------------------===// -// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{build} -fsycl-device-code-split=per_kernel -D__ESIMD_GATHER_SCATTER_LLVM_IR -o %t.out // RUN: %{run} %t.out -// The test verifies esimd::slm_scatter() functions accepting -// optional compile-time esimd::properties. -// The scatter() calls in this test do not use DG2/PVC features. +// The test verifies esimd::slm_scatter() functions accepting optional +// compile-time esimd::properties. The slm_scatter() calls in this test do not +// use VS > 1 (number of stores per offset) to not impose using PVC features. #include "Inputs/scatter.hpp" diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_legacy.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_legacy.cpp new file mode 100644 index 0000000000000..0aca53311ef6b --- /dev/null +++ b/sycl/test-e2e/ESIMD/unified_memory_api/slm_scatter_legacy.cpp @@ -0,0 +1,20 @@ +//==------- slm_scatter_legacy.cpp - DPC++ ESIMD on-device test -----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Use per-kernel compilation to have more information about failing cases. +// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out +// RUN: %{run} %t.out + +// The test verifies esimd::slm_scatter() functions accepting optional +// compile-time esimd::properties. The slm_scatter() calls in this test do not +// use VS > 1 (number of stores per offset) to not impose using PVC features. +// +// TODO: Remove this test when GPU driver issue with llvm.masked.scatter is +// resolved and ESIMD starts using llvm.masked.scatter by default. +// "-D__ESIMD_GATHER_SCATTER_LLVM_IR" is not used here. + +#include "slm_scatter.cpp" diff --git a/sycl/test/esimd/memory_properties.cpp b/sycl/test/esimd/memory_properties.cpp index eb629935347db..8305bd9b83b18 100644 --- a/sycl/test/esimd/memory_properties.cpp +++ b/sycl/test/esimd/memory_properties.cpp @@ -1224,7 +1224,7 @@ test_gather_scatter(AccType &acc, LocalAccType &local_acc, float *ptrf, acc_res = gather(local_acc, ioffset_n32, 0); acc_res = gather(local_acc, ioffset_n32, 0, mask_n32); - // CHECK-COUNT-4: call void @llvm.genx.svm.scatter.v32i1.v32i64.v32f32(<32 x i1> {{[^)]+}}, i32 0, <32 x i64> {{[^)]+}}, <32 x float> {{[^)]+}}) + // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p4(<32 x float> {{[^)]+}}, <32 x ptr addrspace(4)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}) scatter(ptrf, ioffset_n32, usm, mask_n32); scatter(ptrf, ioffset_n32, usm); @@ -1281,6 +1281,14 @@ test_gather_scatter(AccType &acc, LocalAccType &local_acc, float *ptrf, scatter(ptrf, ioffset_n16_view, usm_view, mask_n16); scatter(ptrf, ioffset_n16_view, usm_view); + + simd ioffset_n10(byte_offset32, 8); + simd usm_n10; + + // Check special case to verify that for cases when N is not power of 2 llvm + // intrinsic is used + // CHECK-COUNT-1: call void @llvm.masked.scatter.v10f32.v10p4(<10 x float> {{[^)]+}}, <10 x ptr addrspace(4)> {{[^)]+}}, i32 4, <10 x i1> {{[^)]+}}) + scatter(ptrf, ioffset_n10, usm_n10); } // CHECK-LABEL: define {{.*}} @_Z23test_slm_gather_scatter{{.*}} @@ -1381,26 +1389,26 @@ test_slm_gather_scatter(int byte_offset32) { // 3) slm_scatter(...): same as (1), (2) above, but with VS > 1. // 1) slm_scatter(offsets): offsets is simd or simd_view - // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}) slm_scatter(ioffset_n32, slm); slm_scatter(ioffset_n32_view, slm); slm_scatter(ioffset_n32, slm_view); slm_scatter(ioffset_n32_view, slm_view); - // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 8, <32 x i1> {{[^)]+}}) slm_scatter(ioffset_n32, slm, props_align8); slm_scatter(ioffset_n32_view, slm, props_align8); slm_scatter(ioffset_n32, slm_view, props_align8); slm_scatter(ioffset_n32_view, slm_view, props_align8); // 2) slm_gather(offsets, mask): offsets is simd or simd_view - // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}}) slm_scatter(ioffset_n32, slm, mask_n32); slm_scatter(ioffset_n32_view, slm, mask_n32); slm_scatter(ioffset_n32, slm_view, mask_n32); slm_scatter(ioffset_n32_view, slm_view, mask_n32); - // CHECK-COUNT-4: call void @llvm.genx.scatter.scaled.v32i1.v32i32.v32f32(<32 x i1> {{[^)]+}}, i32 2, i16 0, i32 {{[^)]+}}, i32 {{[^)]+}}, <32 x i32> {{[^)]+}}, <32 x float> {{[^)]+}}) + // CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p3(<32 x float> {{[^)]+}}, <32 x ptr addrspace(3)> {{[^)]+}}, i32 8, <32 x i1> {{[^)]+}}) slm_scatter(ioffset_n32, slm, mask_n32, props_align8); slm_scatter(ioffset_n32_view, slm, mask_n32, props_align8); slm_scatter(ioffset_n32, slm_view, mask_n32, props_align8); @@ -1429,4 +1437,11 @@ test_slm_gather_scatter(int byte_offset32) { slm_scatter(ioffset_n16_view, slm, mask_n16, props_align4); slm_scatter(ioffset_n16, slm_view, mask_n16, props_align4); slm_scatter(ioffset_n16_view, slm_view, mask_n16, props_align4); + + simd ioffset_n10(byte_offset32, 8); + simd usm_n10; + // Check special case to verify that for cases when N is not power of 2 llvm + // intrinsic is used + // CHECK-COUNT-1: call void @llvm.masked.scatter.v10f32.v10p3(<10 x float> {{[^)]+}}, <10 x ptr addrspace(3)> {{[^)]+}}, i32 4, <10 x i1> {{[^)]+}}) + slm_scatter(ioffset_n10, usm_n10); }