Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL][ESIMD] Use LLVM IR for USM/SLM scatter #12628

Merged
merged 11 commits into from
Feb 8, 2024
38 changes: 38 additions & 0 deletions llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1004,6 +1004,37 @@ static void translateGatherLoad(CallInst &CI, bool IsSLM) {
CI.replaceAllUsesWith(LI);
}

static void translateScatterStore(CallInst &CI, bool IsSLM) {
IRBuilder<> Builder(&CI);
constexpr int AlignmentTemplateArgIdx = 2;
APInt Val = parseTemplateArg(CI, AlignmentTemplateArgIdx,
ESIMDIntrinDesc::GenXArgConversion::TO_I64);
Align AlignValue(Val.getZExtValue());

auto OffsetsOp = CI.getArgOperand(1);
auto MaskOp = CI.getArgOperand(2);
auto ValsOp = CI.getArgOperand(0);
v-klochkov marked this conversation as resolved.
Show resolved Hide resolved
auto DataType = ValsOp->getType();

// Convert the mask from <N x i16> to <N x i1>.
Value *Zero = ConstantInt::get(MaskOp->getType(), 0);
MaskOp = Builder.CreateICmp(ICmpInst::ICMP_NE, MaskOp, Zero);

// The address space may be 3-SLM, 1-global or private.
// At the moment of calling 'scatter()' operation the pointer passed to it
// is already 4-generic. Thus, simply use 4-generic for global and private
// and let GPU BE deduce the actual address space from the use-def graph.
unsigned AS = IsSLM ? 3 : 4;
auto ElemType = DataType->getScalarType();
auto NumElems = (cast<VectorType>(DataType))->getElementCount();
auto VPtrType = VectorType::get(PointerType::get(ElemType, AS), NumElems);
auto VPtrOp = Builder.CreateIntToPtr(OffsetsOp, VPtrType);

auto SI = Builder.CreateMaskedScatter(ValsOp, VPtrOp, AlignValue, MaskOp);
SI->setDebugLoc(CI.getDebugLoc());
CI.replaceAllUsesWith(SI);
}

// TODO Specify document behavior for slm_init and nbarrier_init when:
// 1) they are called not from kernels
// 2) there are multiple such calls reachable from a kernel
Expand Down Expand Up @@ -1987,6 +2018,13 @@ size_t SYCLLowerESIMDPass::runOnFunction(Function &F,
continue;
}

if (Name.starts_with("__esimd_scatter_st") ||
Name.starts_with("__esimd_slm_scatter_st")) {
translateScatterStore(*CI, Name.starts_with("__esimd_slm_scatter_st"));
ToErase.push_back(CI);
continue;
}

if (Name.starts_with("__esimd_nbarrier_init")) {
translateNbarrierInit(*CI);
ToErase.push_back(CI);
Expand Down
23 changes: 18 additions & 5 deletions sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,20 @@ __ESIMD_INTRIN __ESIMD_DNS::vector_type_t<T, N> __esimd_slm_gather_ld(
__ESIMD_DNS::simd_mask_storage_t<N> pred,
__ESIMD_DNS::vector_type_t<T, N> pass_thru) __ESIMD_INTRIN_END;

// Scatter data to given global or private addresses.
template <typename T, int N, size_t Align>
__ESIMD_INTRIN void
__esimd_scatter_st(__ESIMD_DNS::vector_type_t<T, N> vals,
__ESIMD_DNS::vector_type_t<uint64_t, N> vptr,
__ESIMD_DNS::simd_mask_storage_t<N> pred) __ESIMD_INTRIN_END;

// Scatter data to given SLM addresses.
template <typename T, int N, size_t Align>
__ESIMD_INTRIN void __esimd_slm_scatter_st(
__ESIMD_DNS::vector_type_t<T, N> vals,
__ESIMD_DNS::vector_type_t<uint32_t, N> vptr,
__ESIMD_DNS::simd_mask_storage_t<N> pred) __ESIMD_INTRIN_END;

/// Surface-based gather.
/// Supported platforms: DG2, PVC
///
Expand Down Expand Up @@ -262,11 +276,10 @@ __esimd_lsc_load_bti(__ESIMD_DNS::simd_mask_storage_t<N> pred,

// flat_read4 does flat-address gather4
template <typename Ty, int N, __ESIMD_NS::rgba_channel_mask Mask>
__ESIMD_DNS::vector_type_t<Ty,
N * get_num_channels_enabled(Mask)> __ESIMD_INTRIN
__esimd_svm_gather4_scaled(__ESIMD_DNS::vector_type_t<uint64_t, N> addrs,
__ESIMD_DNS::simd_mask_storage_t<N> pred = 1)
__ESIMD_INTRIN_END;
__ESIMD_DNS::vector_type_t<Ty, N * get_num_channels_enabled(Mask)>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this just a whitespace change? if so can we revert it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done but whoever touches this file will get this again

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh sorry if this is done by clang-format we should keep it, I thought it was a manual typo

__ESIMD_INTRIN __esimd_svm_gather4_scaled(
__ESIMD_DNS::vector_type_t<uint64_t, N> addrs,
__ESIMD_DNS::simd_mask_storage_t<N> pred = 1) __ESIMD_INTRIN_END;

// flat_write does flat-address scatter
template <typename Ty, int N, __ESIMD_NS::rgba_channel_mask Mask>
Expand Down
11 changes: 10 additions & 1 deletion sycl/include/sycl/ext/intel/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -724,11 +724,20 @@ scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,

// Use LSC lowering if L1/L2 or VS > 1.
if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
VS > 1 || !__ESIMD_DNS::isPowerOf2(N, 32)) {
VS > 1 ||
(!__ESIMD_DNS::isPowerOf2(N, 32) &&
!detail::isMaskedGatherScatterLLVMAvailable())) {
static_assert(VS == 1 || sizeof(T) >= 4,
"VS > 1 is supprted only for 4- and 8-byte elements");
return detail::scatter_impl<T, VS, detail::lsc_data_size::default_size,
L1Hint, L2Hint>(p, byte_offsets, vals, mask);
} else if constexpr (detail::isMaskedGatherScatterLLVMAvailable()) {
simd<uint64_t, N> Addrs(reinterpret_cast<uint64_t>(p));
Addrs = Addrs + convert<uint64_t>(byte_offsets);
using MsgT = detail::__raw_t<T>;
__esimd_scatter_st<MsgT, N, Alignment>(
sycl::bit_cast<__ESIMD_DNS::vector_type_t<MsgT, N>>(vals.data()),
Addrs.data(), mask.data());
} else {
using Tx = detail::__raw_t<T>;
simd<uint64_t, N> byte_offsets_i = convert<uint64_t>(byte_offsets);
Expand Down
2 changes: 1 addition & 1 deletion sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===------------------------------------------------------------------===//
// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
// RUN: %{build} -fsycl-device-code-split=per_kernel -D__ESIMD_GATHER_SCATTER_LLVM_IR -o %t.out
// RUN: %{run} %t.out

// The test verifies esimd::scatter() functions accepting USM pointer
Expand Down
22 changes: 22 additions & 0 deletions sycl/test-e2e/ESIMD/unified_memory_api/scatter_usm_legacy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//==------- scatter_usm_legacy.cpp - DPC++ ESIMD on-device test
v-klochkov marked this conversation as resolved.
Show resolved Hide resolved
//-------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Use per-kernel compilation to have more information about failing cases.
// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
// RUN: %{run} %t.out

// The test verifies esimd::scatter() functions accepting USM pointer
// and optional compile-time esimd::properties.
// The scatter() calls in this test do not use cache-hint properties
// or VS > 1 (number of loads per offset) to not impose using PVC features.
v-klochkov marked this conversation as resolved.
Show resolved Hide resolved
//
// TODO: Remove this test when GPU driver issue with llvm.masked.scatter is
// resolved and ESIMD starts using llvm.masked.scatter by default.
// "-D__ESIMD_GATHER_SCATTER_LLVM_IR" is not used here.

#include "scatter_usm.cpp"
2 changes: 1 addition & 1 deletion sycl/test/esimd/memory_properties.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1224,7 +1224,7 @@ test_gather_scatter(AccType &acc, LocalAccType &local_acc, float *ptrf,
acc_res = gather<float, 32>(local_acc, ioffset_n32, 0);
acc_res = gather<float, 32>(local_acc, ioffset_n32, 0, mask_n32);

// CHECK-COUNT-4: call void @llvm.genx.svm.scatter.v32i1.v32i64.v32f32(<32 x i1> {{[^)]+}}, i32 0, <32 x i64> {{[^)]+}}, <32 x float> {{[^)]+}})
// CHECK-COUNT-4: call void @llvm.masked.scatter.v32f32.v32p4(<32 x float> {{[^)]+}}, <32 x ptr addrspace(4)> {{[^)]+}}, i32 4, <32 x i1> {{[^)]+}})
sarnex marked this conversation as resolved.
Show resolved Hide resolved
scatter(ptrf, ioffset_n32, usm, mask_n32);

scatter(ptrf, ioffset_n32, usm);
Expand Down
Loading