Skip to content

Commit

Permalink
[SeparateConstOffsetFromGEP] Reland: Reorder trivial GEP chains to se…
Browse files Browse the repository at this point in the history
…parate constants (llvm#81671)

Actually update tests w.r.t
llvm@9e5a77f
and reland llvm#73056

(cherry picked from commit 7180c23)
Change-Id: I9cfcd36ff5586dffea1df9fd222c98b0bba3f8a8
  • Loading branch information
jrbyrnes authored and bcahoon committed Sep 6, 2024
1 parent 1c7a195 commit 77cf9ad
Show file tree
Hide file tree
Showing 8 changed files with 683 additions and 158 deletions.
73 changes: 70 additions & 3 deletions llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,11 @@ class SeparateConstOffsetFromGEP {
/// and returns true if the splitting succeeds.
bool splitGEP(GetElementPtrInst *GEP);

/// Tries to reorder the given GEP with the GEP that produces the base if
/// doing so results in producing a constant offset as the outermost
/// index.
bool reorderGEP(GetElementPtrInst *GEP, TargetTransformInfo &TTI);

/// Lower a GEP with multiple indices into multiple GEPs with a single index.
/// Function splitGEP already split the original GEP into a variadic part and
/// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
Expand Down Expand Up @@ -970,6 +975,66 @@ SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
Variadic->eraseFromParent();
}

// Swap a trivial GEP chain (%p2 = gep %p1, X; %gep = gep %p2, Y) so that the
// index that folds to a constant byte offset ends up on the outermost GEP,
// allowing the backend to fold it into the addressing mode.
// Returns true if the chain was rewritten.
bool SeparateConstOffsetFromGEP::reorderGEP(GetElementPtrInst *GEP,
                                            TargetTransformInfo &TTI) {
  Type *GEPType = GEP->getResultElementType();
  // TODO: support reordering for non-trivial GEP chains
  if (GEPType->isAggregateType() || GEP->getNumIndices() != 1)
    return false;

  // The base must itself be a single-index, non-aggregate GEP.
  auto PtrGEP = dyn_cast<GetElementPtrInst>(GEP->getPointerOperand());
  if (!PtrGEP)
    return false;
  Type *PtrGEPType = PtrGEP->getResultElementType();
  // TODO: support reordering for non-trivial GEP chains
  if (PtrGEPType->isAggregateType() || PtrGEP->getNumIndices() != 1)
    return false;

  // TODO: support reordering for non-trivial GEP chains
  if (PtrGEPType != GEPType ||
      PtrGEP->getSourceElementType() != GEP->getSourceElementType())
    return false;

  // Only reorder when the nested GEP actually contributes a constant byte
  // offset that splitGEP would want to extract.
  bool NestedNeedsExtraction;
  int64_t NestedByteOffset =
      accumulateByteOffset(PtrGEP, NestedNeedsExtraction);
  if (!NestedNeedsExtraction)
    return false;

  // Bail out unless the target can fold the hoisted constant offset into its
  // addressing mode; otherwise the swap buys nothing.
  unsigned AddrSpace = PtrGEP->getPointerAddressSpace();
  if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
                                 /*BaseGV=*/nullptr, NestedByteOffset,
                                 /*HasBaseReg=*/true, /*Scale=*/0, AddrSpace))
    return false;

  IRBuilder<> Builder(GEP);
  Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
  bool GEPInBounds = GEP->isInBounds();
  bool PtrGEPInBounds = PtrGEP->isInBounds();
  // The swapped chain may only stay inbounds if both GEPs were inbounds and
  // both indices are provably non-negative (a negative intermediate index
  // could step outside the object after reordering).
  bool IsChainInBounds = GEPInBounds && PtrGEPInBounds;
  if (IsChainInBounds) {
    auto GEPIdx = GEP->indices().begin();
    auto KnownGEPIdx = computeKnownBits(GEPIdx->get(), *DL);
    IsChainInBounds &= KnownGEPIdx.isNonNegative();
    if (IsChainInBounds) {
      // Fix: inspect the nested GEP's index here. The original code read
      // GEP->indices() again, checking the outer index twice and never the
      // inner one, which could wrongly preserve `inbounds`.
      auto PtrGEPIdx = PtrGEP->indices().begin();
      auto KnownPtrGEPIdx = computeKnownBits(PtrGEPIdx->get(), *DL);
      IsChainInBounds &= KnownPtrGEPIdx.isNonNegative();
    }
  }

  // For trivial GEP chains, we can swap the indices.
  auto NewSrc = Builder.CreateGEP(PtrGEPType, PtrGEP->getPointerOperand(),
                                  SmallVector<Value *, 4>(GEP->indices()));
  cast<GetElementPtrInst>(NewSrc)->setIsInBounds(IsChainInBounds);
  auto NewGEP = Builder.CreateGEP(GEPType, NewSrc,
                                  SmallVector<Value *, 4>(PtrGEP->indices()));
  cast<GetElementPtrInst>(NewGEP)->setIsInBounds(IsChainInBounds);
  GEP->replaceAllUsesWith(NewGEP);
  RecursivelyDeleteTriviallyDeadInstructions(GEP);
  return true;
}

bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
// Skip vector GEPs.
if (GEP->getType()->isVectorTy())
Expand All @@ -985,11 +1050,13 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
bool NeedsExtraction;
int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);

if (!NeedsExtraction)
return Changed;

TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());

if (!NeedsExtraction) {
Changed |= reorderGEP(GEP, TTI);
return Changed;
}

// If LowerGEP is disabled, before really splitting the GEP, check whether the
// backend supports the addressing mode we are about to produce. If no, this
// splitting probably won't be beneficial.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -143,11 +143,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_mov_b32_e32 v47, 0
; CHECK-NEXT: s_mov_b32 s49, 0
; CHECK-NEXT: s_branch .LBB0_7
; CHECK-NEXT: .LBB0_5: ; %Flow41
; CHECK-NEXT: .LBB0_5: ; %Flow43
; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1
; CHECK-NEXT: s_inst_prefetch 0x2
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s49
; CHECK-NEXT: .LBB0_6: ; %Flow42
; CHECK-NEXT: .LBB0_6: ; %Flow44
; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s48, v45
; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47
Expand Down Expand Up @@ -304,7 +304,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: ds_write_b32 v0, v58
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55
; CHECK-NEXT: s_branch .LBB0_9
; CHECK-NEXT: .LBB0_18: ; %Flow43
; CHECK-NEXT: .LBB0_18: ; %Flow45
; CHECK-NEXT: ; in Loop: Header=BB0_7 Depth=1
; CHECK-NEXT: v_mov_b32_e32 v57, v0
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s52
Expand Down Expand Up @@ -357,7 +357,7 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: ds_write_b32 v0, v57
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s53
; CHECK-NEXT: s_branch .LBB0_21
; CHECK-NEXT: .LBB0_24: ; %Flow47
; CHECK-NEXT: .LBB0_24: ; %Flow49
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s42
; CHECK-NEXT: .LBB0_25:
; CHECK-NEXT: v_mov_b32_e32 v31, v40
Expand All @@ -382,13 +382,11 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_cmov_b32 exec_lo, vcc_lo
; CHECK-NEXT: s_cbranch_scc0 .LBB0_34
; CHECK-NEXT: ; %bb.26:
; CHECK-NEXT: s_add_u32 s42, s44, 8
; CHECK-NEXT: s_addc_u32 s43, s45, 0
; CHECK-NEXT: s_mov_b32 s44, 0
; CHECK-NEXT: s_mov_b32 s42, 0
; CHECK-NEXT: s_branch .LBB0_29
; CHECK-NEXT: .LBB0_27: ; %Flow38
; CHECK-NEXT: .LBB0_27: ; %Flow40
; CHECK-NEXT: ; in Loop: Header=BB0_29 Depth=1
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s45
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s43
; CHECK-NEXT: .LBB0_28: ; in Loop: Header=BB0_29 Depth=1
; CHECK-NEXT: v_mov_b32_e32 v31, v40
; CHECK-NEXT: v_mov_b32_e32 v0, 0
Expand All @@ -405,13 +403,13 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_swappc_b64 s[30:31], s[6:7]
; CHECK-NEXT: v_add_co_u32 v41, vcc_lo, v0, v41
; CHECK-NEXT: v_cmp_le_u32_e32 vcc_lo, v47, v41
; CHECK-NEXT: s_or_b32 s44, vcc_lo, s44
; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s44
; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s44
; CHECK-NEXT: s_or_b32 s42, vcc_lo, s42
; CHECK-NEXT: s_andn2_b32 s4, exec_lo, s42
; CHECK-NEXT: s_cselect_b32 exec_lo, s4, s42
; CHECK-NEXT: s_cbranch_scc0 .LBB0_34
; CHECK-NEXT: .LBB0_29: ; =>This Inner Loop Header: Depth=1
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v41
; CHECK-NEXT: s_mov_b32 s45, exec_lo
; CHECK-NEXT: s_mov_b32 s43, exec_lo
; CHECK-NEXT: ds_read_b32 v0, v0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_lshrrev_b32_e32 v63, 10, v0
Expand All @@ -420,15 +418,15 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x180, v63
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 5, v62
; CHECK-NEXT: v_lshlrev_b32_e32 v4, 5, v72
; CHECK-NEXT: v_add_co_u32 v2, s4, s42, v1
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s43, 0, s4
; CHECK-NEXT: v_add_co_u32 v2, s4, s44, v1
; CHECK-NEXT: v_add_co_ci_u32_e64 v3, null, s45, 0, s4
; CHECK-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
; CHECK-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v3, vcc_lo
; CHECK-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4
; CHECK-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[0:1], off
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off
; CHECK-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:8
; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:8
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_xor_b32_e32 v46, v9, v5
; CHECK-NEXT: v_xor_b32_e32 v45, v8, v4
Expand All @@ -442,8 +440,8 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no
; CHECK-NEXT: s_cbranch_scc0 .LBB0_28
; CHECK-NEXT: ; %bb.30: ; in Loop: Header=BB0_29 Depth=1
; CHECK-NEXT: s_clause 0x1
; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:16
; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:16
; CHECK-NEXT: global_load_dwordx2 v[58:59], v[2:3], off offset:24
; CHECK-NEXT: global_load_dwordx2 v[60:61], v[0:1], off offset:24
; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v45
; CHECK-NEXT: v_alignbit_b32 v1, v46, v45, 12
; CHECK-NEXT: v_and_b32_e32 v2, 0xf0000, v45
Expand Down
Loading

0 comments on commit 77cf9ad

Please sign in to comment.