Skip to content

Commit

Permalink
[VPlan] Add initial loop-invariant code motion transform. (llvm#107894)
Browse files Browse the repository at this point in the history
Add initial transform to move out loop-invariant recipes.

This also helps to fix a divergence between legacy and VPlan-based cost
model due to legacy using ScalarEvolution::isLoopInvariant in some
cases.

Fixes llvm#107501.

PR: llvm#107894
  • Loading branch information
fhahn committed Sep 20, 2024
1 parent 37e5319 commit a861ed4
Show file tree
Hide file tree
Showing 37 changed files with 746 additions and 574 deletions.
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2378,7 +2378,8 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
AC->registerAssumption(II);

// End if-block.
bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
if (IfPredicateInstr)
PredicatedInstructions.push_back(Cloned);
}
Expand Down
36 changes: 36 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -971,6 +971,41 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
return R.getVPSingleValue()->replaceAllUsesWith(A);
}

/// Move loop-invariant recipes out of the vector loop region in \p Plan.
static void licm(VPlan &Plan) {
VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
VPBasicBlock *Preheader =
cast<VPBasicBlock>(LoopRegion->getSinglePredecessor());

// Return true if we do not know how to (mechanically) hoist a given recipe
// out of a loop region. Does not address legality concerns such as aliasing
// or speculation safety.
auto CannotHoistRecipe = [](VPRecipeBase &R) {
// Allocas cannot be hoisted.
auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
return RepR && RepR->getOpcode() == Instruction::Alloca;
};

// Hoist any loop invariant recipes from the vector loop region to the
// preheader. Preform a shallow traversal of the vector loop region, to
// exclude recipes in replicate regions.
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_shallow(LoopRegion->getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
if (CannotHoistRecipe(R))
continue;
// TODO: Relax checks in the future, e.g. we could also hoist reads, if
// their memory location is not modified in the vector loop.
if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
any_of(R.operands(), [](VPValue *Op) {
return !Op->isDefinedOutsideLoopRegions();
}))
continue;
R.moveBefore(*Preheader, Preheader->end());
}
}
}

/// Try to simplify the recipes in \p Plan.
static void simplifyRecipes(VPlan &Plan) {
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
Expand Down Expand Up @@ -1123,6 +1158,7 @@ void VPlanTransforms::optimize(VPlan &Plan) {
removeRedundantInductionCasts(Plan);

simplifyRecipes(Plan);
licm(Plan);
legalizeAndOptimizeInductions(Plan);
removeDeadRecipes(Plan);

Expand Down
295 changes: 75 additions & 220 deletions llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
; CHECK-NEXT: [[TMP18:%.*]] = sdiv i64 [[M]], [[CONV6]]
; CHECK-NEXT: [[TMP19:%.*]] = sdiv i64 [[M]], [[CONV6]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
Expand All @@ -35,8 +37,6 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], 0
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 1
; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = sdiv i64 [[M]], [[CONV6]]
; CHECK-NEXT: [[TMP19:%.*]] = sdiv i64 [[M]], [[CONV6]]
; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP18]] to i32
; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP19]] to i32
; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP18]], [[CONV61]]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,23 @@

; Check that the extractvalue operands are actually free in vector code.

; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; FORCED-NEXT: %0 = add i32 %index, 0
; FORCED-NEXT: %1 = extractvalue { i64, i64 } %sv, 0
; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x i64> poison, i64 %1, i64 0
; FORCED: [[E1:%.+]] = extractvalue { i64, i64 } %sv, 0
; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x i64> poison, i64 [[E1]], i64 0
; FORCED-NEXT: %broadcast.splat = shufflevector <2 x i64> %broadcast.splatinsert, <2 x i64> poison, <2 x i32> zeroinitializer
; FORCED-NEXT: %2 = extractvalue { i64, i64 } %sv, 1
; FORCED-NEXT: %broadcast.splatinsert1 = insertelement <2 x i64> poison, i64 %2, i64 0
; FORCED-NEXT: [[E2:%.+]] = extractvalue { i64, i64 } %sv, 1
; FORCED-NEXT: %broadcast.splatinsert1 = insertelement <2 x i64> poison, i64 [[E2]], i64 0
; FORCED-NEXT: %broadcast.splat2 = shufflevector <2 x i64> %broadcast.splatinsert1, <2 x i64> poison, <2 x i32> zeroinitializer
; FORCED-NEXT: %3 = getelementptr i64, ptr %dst, i32 %0
; FORCED-NEXT: %4 = add <2 x i64> %broadcast.splat, %broadcast.splat2
; FORCED-NEXT: %5 = getelementptr i64, ptr %3, i32 0
; FORCED-NEXT: store <2 x i64> %4, ptr %5, align 4

; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; FORCED-NEXT: [[IV_0:%.]] = add i32 %index, 0
; FORCED-NEXT: [[GEP:%.+]] = getelementptr i64, ptr %dst, i32 [[IV_0]]
; FORCED-NEXT: [[ADD:%.+]] = add <2 x i64> %broadcast.splat, %broadcast.splat2
; FORCED-NEXT: [[GEP2:%.+]] = getelementptr i64, ptr [[GEP]], i32 0
; FORCED-NEXT: store <2 x i64> [[ADD]], ptr [[GEP2]], align 4
; FORCED-NEXT: %index.next = add nuw i32 %index, 2
; FORCED-NEXT: %6 = icmp eq i32 %index.next, 1000
; FORCED-NEXT: br i1 %6, label %middle.block, label %vector.body
; FORCED-NEXT: [[C:%.+]] = icmp eq i32 %index.next, 1000
; FORCED-NEXT: br i1 [[C]], label %middle.block, label %vector.body

define void @test1(ptr %dst, {i64, i64} %sv) {
entry:
Expand Down Expand Up @@ -66,22 +67,23 @@ declare float @powf(float, float) readnone nounwind

; FORCED-LABEL: define void @test_getVectorCallCost

; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; FORCED-NEXT: %0 = add i32 %index, 0
; FORCED-NEXT: %1 = extractvalue { float, float } %sv, 0
; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x float> poison, float %1, i64 0
; FORCED: [[E1:%.+]] = extractvalue { float, float } %sv, 0
; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x float> poison, float [[E1]], i64 0
; FORCED-NEXT: %broadcast.splat = shufflevector <2 x float> %broadcast.splatinsert, <2 x float> poison, <2 x i32> zeroinitializer
; FORCED-NEXT: %2 = extractvalue { float, float } %sv, 1
; FORCED-NEXT: %broadcast.splatinsert1 = insertelement <2 x float> poison, float %2, i64 0
; FORCED-NEXT: [[E2:%.+]] = extractvalue { float, float } %sv, 1
; FORCED-NEXT: %broadcast.splatinsert1 = insertelement <2 x float> poison, float [[E2]], i64 0
; FORCED-NEXT: %broadcast.splat2 = shufflevector <2 x float> %broadcast.splatinsert1, <2 x float> poison, <2 x i32> zeroinitializer
; FORCED-NEXT: %3 = getelementptr float, ptr %dst, i32 %0
; FORCED-NEXT: %4 = call <2 x float> @llvm.pow.v2f32(<2 x float> %broadcast.splat, <2 x float> %broadcast.splat2)
; FORCED-NEXT: %5 = getelementptr float, ptr %3, i32 0
; FORCED-NEXT: store <2 x float> %4, ptr %5, align 4

; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; FORCED-NEXT: [[IV0:%.+]] = add i32 %index, 0
; FORCED-NEXT: [[GEP1:%.+]] = getelementptr float, ptr %dst, i32 [[IV0]]
; FORCED-NEXT: [[POW:%.+]] = call <2 x float> @llvm.pow.v2f32(<2 x float> %broadcast.splat, <2 x float> %broadcast.splat2)
; FORCED-NEXT: [[GEP2:%.+]] = getelementptr float, ptr [[GEP1]], i32 0
; FORCED-NEXT: store <2 x float> [[POW]], ptr [[GEP2]], align 4
; FORCED-NEXT: %index.next = add nuw i32 %index, 2
; FORCED-NEXT: %6 = icmp eq i32 %index.next, 1000
; FORCED-NEXT: br i1 %6, label %middle.block, label %vector.body
; FORCED-NEXT: [[C:%.+]] = icmp eq i32 %index.next, 1000
; FORCED-NEXT: br i1 [[C]], label %middle.block, label %vector.body

define void @test_getVectorCallCost(ptr %dst, {float, float} %sv) {
entry:
Expand Down
Loading

0 comments on commit a861ed4

Please sign in to comment.