From 470e3789573fb74ade5e2b6dd8a38917ca2347df Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Thu, 1 Feb 2024 03:48:37 +1300 Subject: [PATCH] [SYCL][Fusion] Handle GEPs that were canonicalized to byte offsets (#12557) Upstream now canonicalizes constant GEPs to represent byte offsets, i.e. using `i8` as source element type. This PR adapts the internalization pass to this change by also remapping GEPs with a constant offset, if that offset is a multiple of the internalized accessor's element size. Signed-off-by: Julian Oppermann --- .../internalization/Internalization.cpp | 56 ++++++++++++++++--- .../promote-private-non-unit.ll | 15 +++-- .../KernelFusion/internalize_vfunc.cpp | 1 - 3 files changed, 58 insertions(+), 14 deletions(-) diff --git a/sycl-fusion/passes/internalization/Internalization.cpp b/sycl-fusion/passes/internalization/Internalization.cpp index dd3e6be43e405..8a36106c0d48f 100644 --- a/sycl-fusion/passes/internalization/Internalization.cpp +++ b/sycl-fusion/passes/internalization/Internalization.cpp @@ -188,6 +188,21 @@ static void updateInternalizationMD(Function *F, StringRef Kind, } } +/// +/// If \p GEPI represents a constant offset in bytes, return it, otherwise +/// return an empty value. +static std::optional getConstantByteOffset(GetElementPtrInst *GEPI, + const DataLayout &DL) { + MapVector VariableOffsets; + auto IW = DL.getIndexSizeInBits(GEPI->getPointerAddressSpace()); + APInt ConstantOffset = APInt::getZero(IW); + if (GEPI->collectOffset(DL, IW, VariableOffsets, ConstantOffset) && + VariableOffsets.empty()) { + return ConstantOffset.getZExtValue(); + } + return {}; +} + /// /// When performing internalization, GEP instructions must be remapped, as the /// address space has changed from N to N / LocalSize. @@ -200,6 +215,25 @@ static void remap(GetElementPtrInst *GEPI, const PromotionInfo &PromInfo) { return; } + // GEPs with constant offset may be marked for remapping even if their element + // size differs from the accessor's element size. However we know that the + // offset is a multiple of the latter. Rewrite the instruction to represent a + // number of _elements_ to make it compatible with other GEPs in the current + // chain. + auto &DL = GEPI->getModule()->getDataLayout(); + auto SrcElemTySz = DL.getTypeAllocSize(GEPI->getSourceElementType()); + if (SrcElemTySz != PromInfo.ElemSize) { + auto COff = getConstantByteOffset(GEPI, DL); + // This is special case #2 in `getGEPKind`. + assert(COff.has_value() && *COff % PromInfo.ElemSize == 0 && + GEPI->getNumIndices() == 1); + auto *IntTypeWithSameWidthAsAccessorElementType = + Builder.getIntNTy(PromInfo.ElemSize * 8); + GEPI->setSourceElementType(IntTypeWithSameWidthAsAccessorElementType); + GEPI->setResultElementType(IntTypeWithSameWidthAsAccessorElementType); + GEPI->idx_begin()->set(Builder.getInt64(*COff / PromInfo.ElemSize)); + } + // An individual `GEP(ptr, offset)` is rewritten as // `GEP(ptr, offset % LocalSize)`. // @@ -302,15 +336,19 @@ static int getGEPKind(GetElementPtrInst *GEPI, const PromotionInfo &PromInfo) { return Kind; } - // Check whether `GEPI` adds a constant offset, e.g. a byte offset to address - // into a padded structure, smaller than the element size. - MapVector VariableOffsets; - auto IW = DL.getIndexSizeInBits(GEPI->getPointerAddressSpace()); - APInt ConstantOffset = APInt::getZero(IW); - if (GEPI->collectOffset(DL, IW, VariableOffsets, ConstantOffset) && - VariableOffsets.empty() && - ConstantOffset.getZExtValue() < PromInfo.ElemSize) { - return ADDRESSES_INTO_AGGREGATE; + // We can handle a mismatch between `GEPI`'s element size and the accessors + // element size if `GEPI` represents a constant offset. + if (auto COff = getConstantByteOffset(GEPI, DL)) { + if (*COff < PromInfo.ElemSize) { + // Special case #1: The offset is less than the element size, hence we're + // addressing into an aggregrate and no remapping is required. + return ADDRESSES_INTO_AGGREGATE; + } + if (*COff % PromInfo.ElemSize == 0 && GEPI->getNumIndices() == 1) { + // Special case #2: The offset is a multiple of the element size, meaning + // `GEPI` selects an element and is subject to remapping. + return NEEDS_REMAPPING; + } } // We don't know what `GEPI` addresses; bail out. diff --git a/sycl-fusion/test/internalization/promote-private-non-unit.ll b/sycl-fusion/test/internalization/promote-private-non-unit.ll index b4b73aa9881b9..ac95f0606c628 100644 --- a/sycl-fusion/test/internalization/promote-private-non-unit.ll +++ b/sycl-fusion/test/internalization/promote-private-non-unit.ll @@ -36,12 +36,17 @@ define spir_kernel void @fused_0(ptr addrspace(1) nocapture align 16 %KernelOne_ ; CHECK: [[TMP4:%.*]] = tail call spir_func i64 @_Z33__spirv_BuiltInGlobalInvocationIdi(i32 0) ; CHECK: [[MUL:%.*]] = mul nuw nsw i64 [[TMP4]], 3 ; CHECK: [[ADD:%.*]] = add nuw nsw i64 [[MUL]], 1 -; CHECK: [[TMP6:%.*]] = add i64 [[TMP2]], [[ADD]] +; CHECK: [[TMP6:%.*]] = add i64 [[TMP2]], [[MUL]] ; CHECK: [[TMP7:%.*]] = urem i64 [[TMP6]], 3 ; CHECK: [[ARRAYIDX_1:%.*]] = getelementptr inbounds %struct.MyStruct, ptr [[TMP1]], i64 [[TMP7]] +; CHECK: [[ADDA:%.*]] = add i64 [[TMP7]], 1 +; CHECK: [[TMP7A:%.*]] = urem i64 [[ADDA]], 3 + +; COM: This constant i8-GEP was rewritten to encode an _element_ offset, and subsequently remapped. +; CHECK: [[ARRAYIDX_1A:%.*]] = getelementptr inbounds i256, ptr [[TMP1]], i64 [[TMP7A]] ; COM: This i8-GEP _was_ not remapped because it addresses into a single MyStruct element -; CHECK: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX_1]], i64 20 +; CHECK: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX_1A]], i64 20 ; CHECK: store i32 {{.*}}, ptr [[ARRAYIDX_2]], align 4 ; CHECK: [[TMP8:%.*]] = add i64 [[TMP3]], [[ADD]] @@ -66,8 +71,10 @@ entry: %add.j2 = add nuw nsw i64 %mul.j2, 1 %arrayidx.j2 = getelementptr inbounds i32, ptr addrspace(1) %add.ptr.i35.i, i64 %add.j2 %1 = load i32, ptr addrspace(1) %arrayidx.j2, align 4 - %arrayidx.i54.i = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %add.ptr.j2, i64 %add.j2 - %arrayidx.j3 = getelementptr inbounds i8, ptr addrspace(1) %arrayidx.i54.i, i64 20 + %arrayidx.i54.i = getelementptr inbounds %struct.MyStruct, ptr addrspace(1) %add.ptr.j2, i64 %mul.j2 + ; Mimic %add.j2 by artificially representing it as a constant byte offset (sizeof(MyStruct)==32 byte) + %arrayidx.plus.one.element = getelementptr inbounds i8, ptr addrspace(1) %arrayidx.i54.i, i64 32 + %arrayidx.j3 = getelementptr inbounds i8, ptr addrspace(1) %arrayidx.plus.one.element, i64 20 store i32 %1, ptr addrspace(1) %arrayidx.j3, align 4 %conv.j2 = trunc i32 %1 to i8 %arrayidx.i70.i = getelementptr inbounds i8, ptr addrspace(1) %add.ptr.i44.i, i64 %add.j2 diff --git a/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp b/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp index a538099a2aa8f..278395f2a90fd 100644 --- a/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp +++ b/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp @@ -1,6 +1,5 @@ // RUN: %{build} -fsycl-embed-ir -O2 -o %t.out // RUN: %{run} %t.out -// XFAIL: hip,cuda // Test complete fusion with private internalization specified on the // accessors for a device kernel with sycl::vec::load and sycl::vec::store.