[VPlan] Add initial loop-invariant code motion transform. (llvm#107894)

Add initial transform to move out loop-invariant recipes. This also helps to fix a divergence between legacy and VPlan-based cost model due to legacy using ScalarEvolution::isLoopInvariant in some cases. Fixes llvm#107501. PR: llvm#107894
ROCm · Sep 20, 2024 · a861ed4 · a861ed4
1 parent 37e5319
commit a861ed4
Show file tree

Hide file tree

Showing 37 changed files with 746 additions and 574 deletions.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2378,7 +2378,8 @@ void InnerLoopVectorizer::scalarizeInstruction(const Instruction *Instr,
  AC->registerAssumption(II);
 
  // End if-block.
- bool IfPredicateInstr = RepRecipe->getParent()->getParent()->isReplicator();
+ VPRegionBlock *Parent = RepRecipe->getParent()->getParent();
+ bool IfPredicateInstr = Parent ? Parent->isReplicator() : false;
  if (IfPredicateInstr)
  PredicatedInstructions.push_back(Cloned);
 }

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -971,6 +971,41 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) {
  return R.getVPSingleValue()->replaceAllUsesWith(A);
 }
 
+/// Move loop-invariant recipes out of the vector loop region in \p Plan.
+static void licm(VPlan &Plan) {
+ VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+ VPBasicBlock *Preheader =
+ cast<VPBasicBlock>(LoopRegion->getSinglePredecessor());
+
+ // Return true if we do not know how to (mechanically) hoist a given recipe
+ // out of a loop region. Does not address legality concerns such as aliasing
+ // or speculation safety.
+ auto CannotHoistRecipe = [](VPRecipeBase &R) {
+ // Allocas cannot be hoisted.
+ auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+ return RepR && RepR->getOpcode() == Instruction::Alloca;
+ };
+
+ // Hoist any loop-invariant recipes from the vector loop region to the
+ // preheader. Perform a shallow traversal of the vector loop region, to
+ // exclude recipes in replicate regions.
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(LoopRegion->getEntry()))) {
+ for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+ if (CannotHoistRecipe(R))
+ continue;
+ // TODO: Relax checks in the future, e.g. we could also hoist reads, if
+ // their memory location is not modified in the vector loop.
+ if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi() ||
+ any_of(R.operands(), [](VPValue *Op) {
+ return !Op->isDefinedOutsideLoopRegions();
+ }))
+ continue;
+ R.moveBefore(*Preheader, Preheader->end());
+ }
+ }
+}
+
 /// Try to simplify the recipes in \p Plan.
 static void simplifyRecipes(VPlan &Plan) {
  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
@@ -1123,6 +1158,7 @@ void VPlanTransforms::optimize(VPlan &Plan) {
  removeRedundantInductionCasts(Plan);
 
  simplifyRecipes(Plan);
+ licm(Plan);
  legalizeAndOptimizeInductions(Plan);
  removeDeadRecipes(Plan);
 

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
@@ -26,6 +26,8 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
 ; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
+; CHECK-NEXT: [[TMP18:%.*]] = sdiv i64 [[M]], [[CONV6]]
+; CHECK-NEXT: [[TMP19:%.*]] = sdiv i64 [[M]], [[CONV6]]
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -35,8 +37,6 @@ define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
 ; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], 0
 ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 1
 ; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = sdiv i64 [[M]], [[CONV6]]
-; CHECK-NEXT: [[TMP19:%.*]] = sdiv i64 [[M]], [[CONV6]]
 ; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[TMP18]] to i32
 ; CHECK-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP19]] to i32
 ; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP18]], [[CONV61]]

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll b/llvm/test/Transforms/LoopVectorize/AArch64/extractvalue-no-scalarization-required.ll
@@ -17,22 +17,23 @@
 
 ; Check that the extractvalue operands are actually free in vector code.
 
-; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
-; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; FORCED-NEXT: %0 = add i32 %index, 0
-; FORCED-NEXT: %1 = extractvalue { i64, i64 } %sv, 0
-; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x i64> poison, i64 %1, i64 0
+; FORCED: [[E1:%.+]] = extractvalue { i64, i64 } %sv, 0
+; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x i64> poison, i64 [[E1]], i64 0
 ; FORCED-NEXT: %broadcast.splat = shufflevector <2 x i64> %broadcast.splatinsert, <2 x i64> poison, <2 x i32> zeroinitializer
-; FORCED-NEXT: %2 = extractvalue { i64, i64 } %sv, 1
-; FORCED-NEXT: %broadcast.splatinsert1 = insertelement <2 x i64> poison, i64 %2, i64 0
+; FORCED-NEXT: [[E2:%.+]] = extractvalue { i64, i64 } %sv, 1
+; FORCED-NEXT: %broadcast.splatinsert1 = insertelement <2 x i64> poison, i64 [[E2]], i64 0
 ; FORCED-NEXT: %broadcast.splat2 = shufflevector <2 x i64> %broadcast.splatinsert1, <2 x i64> poison, <2 x i32> zeroinitializer
-; FORCED-NEXT: %3 = getelementptr i64, ptr %dst, i32 %0
-; FORCED-NEXT: %4 = add <2 x i64> %broadcast.splat, %broadcast.splat2
-; FORCED-NEXT: %5 = getelementptr i64, ptr %3, i32 0
-; FORCED-NEXT: store <2 x i64> %4, ptr %5, align 4
+
+; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
+; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; FORCED-NEXT: [[IV_0:%.+]] = add i32 %index, 0
+; FORCED-NEXT: [[GEP:%.+]] = getelementptr i64, ptr %dst, i32 [[IV_0]]
+; FORCED-NEXT: [[ADD:%.+]] = add <2 x i64> %broadcast.splat, %broadcast.splat2
+; FORCED-NEXT: [[GEP2:%.+]] = getelementptr i64, ptr [[GEP]], i32 0
+; FORCED-NEXT: store <2 x i64> [[ADD]], ptr [[GEP2]], align 4
 ; FORCED-NEXT: %index.next = add nuw i32 %index, 2
-; FORCED-NEXT: %6 = icmp eq i32 %index.next, 1000
-; FORCED-NEXT: br i1 %6, label %middle.block, label %vector.body
+; FORCED-NEXT: [[C:%.+]] = icmp eq i32 %index.next, 1000
+; FORCED-NEXT: br i1 [[C]], label %middle.block, label %vector.body
 
 define void @test1(ptr %dst, {i64, i64} %sv) {
 entry:
@@ -66,22 +67,23 @@ declare float @powf(float, float) readnone nounwind
 
 ; FORCED-LABEL: define void @test_getVectorCallCost
 
-; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
-; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; FORCED-NEXT: %0 = add i32 %index, 0
-; FORCED-NEXT: %1 = extractvalue { float, float } %sv, 0
-; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x float> poison, float %1, i64 0
+; FORCED: [[E1:%.+]] = extractvalue { float, float } %sv, 0
+; FORCED-NEXT: %broadcast.splatinsert = insertelement <2 x float> poison, float [[E1]], i64 0
 ; FORCED-NEXT: %broadcast.splat = shufflevector <2 x float> %broadcast.splatinsert, <2 x float> poison, <2 x i32> zeroinitializer
-; FORCED-NEXT: %2 = extractvalue { float, float } %sv, 1
-; FORCED-NEXT: %broadcast.splatinsert1 = insertelement <2 x float> poison, float %2, i64 0
+; FORCED-NEXT: [[E2:%.+]] = extractvalue { float, float } %sv, 1
+; FORCED-NEXT: %broadcast.splatinsert1 = insertelement <2 x float> poison, float [[E2]], i64 0
 ; FORCED-NEXT: %broadcast.splat2 = shufflevector <2 x float> %broadcast.splatinsert1, <2 x float> poison, <2 x i32> zeroinitializer
-; FORCED-NEXT: %3 = getelementptr float, ptr %dst, i32 %0
-; FORCED-NEXT: %4 = call <2 x float> @llvm.pow.v2f32(<2 x float> %broadcast.splat, <2 x float> %broadcast.splat2)
-; FORCED-NEXT: %5 = getelementptr float, ptr %3, i32 0
-; FORCED-NEXT: store <2 x float> %4, ptr %5, align 4
+
+; FORCED-LABEL: vector.body: ; preds = %vector.body, %vector.ph
+; FORCED-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; FORCED-NEXT: [[IV0:%.+]] = add i32 %index, 0
+; FORCED-NEXT: [[GEP1:%.+]] = getelementptr float, ptr %dst, i32 [[IV0]]
+; FORCED-NEXT: [[POW:%.+]] = call <2 x float> @llvm.pow.v2f32(<2 x float> %broadcast.splat, <2 x float> %broadcast.splat2)
+; FORCED-NEXT: [[GEP2:%.+]] = getelementptr float, ptr [[GEP1]], i32 0
+; FORCED-NEXT: store <2 x float> [[POW]], ptr [[GEP2]], align 4
 ; FORCED-NEXT: %index.next = add nuw i32 %index, 2
-; FORCED-NEXT: %6 = icmp eq i32 %index.next, 1000
-; FORCED-NEXT: br i1 %6, label %middle.block, label %vector.body
+; FORCED-NEXT: [[C:%.+]] = icmp eq i32 %index.next, 1000
+; FORCED-NEXT: br i1 [[C]], label %middle.block, label %vector.body
 
 define void @test_getVectorCallCost(ptr %dst, {float, float} %sv) {
 entry: