From 37a94b7eddeab0ae42207699fb9be37587172a9e Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 8 Aug 2024 16:31:20 +0200 Subject: [PATCH] [LICM][MustExec] Make must-exec logic for IV condition commutative (#93150) MustExec has special logic to determine whether the first loop iteration will always be executed, by simplifying the IV comparison with the start value. Currently, this code assumes that the IV is on the LHS of the comparison, but this is not guaranteed. Make sure it handles the commuted variant as well. The changed PhaseOrdering test previously performed peeling to make the loads dereferenceable -- as a side effect, this also reduced the exit count by one, avoiding the awkward <= MAX case. Now we know up-front the the loads are dereferenceable and can be simply hoisted. As such, we retain the original exit count and now have to handle it by widening the exit count calculation to i128. This is a regression, but at least it preserves the vectorization, which was the original goal. I'm not sure what else can be done about that test. --- llvm/lib/Analysis/MustExecute.cpp | 19 +- llvm/test/Transforms/LICM/hoist-mustexec.ll | 3 +- ...ple-unreachable-exits-for-vectorization.ll | 287 ++++++++---------- 3 files changed, 148 insertions(+), 161 deletions(-) diff --git a/llvm/lib/Analysis/MustExecute.cpp b/llvm/lib/Analysis/MustExecute.cpp index 904d30d0544654..caed62679a683c 100644 --- a/llvm/lib/Analysis/MustExecute.cpp +++ b/llvm/lib/Analysis/MustExecute.cpp @@ -135,16 +135,21 @@ static bool CanProveNotTakenFirstIteration(const BasicBlock *ExitBlock, // todo: this would be a lot more powerful if we used scev, but all the // plumbing is currently missing to pass a pointer in from the pass // Check for cmp (phi [x, preheader] ...), y where (pred x, y is known + ICmpInst::Predicate Pred = Cond->getPredicate(); auto *LHS = dyn_cast(Cond->getOperand(0)); auto *RHS = Cond->getOperand(1); - if (!LHS || LHS->getParent() != CurLoop->getHeader()) - return false; - auto DL = ExitBlock->getDataLayout(); + if (!LHS || LHS->getParent() != CurLoop->getHeader()) { + Pred = Cond->getSwappedPredicate(); + LHS = dyn_cast(Cond->getOperand(1)); + RHS = Cond->getOperand(0); + if (!LHS || LHS->getParent() != CurLoop->getHeader()) + return false; + } + + auto DL = ExitBlock->getModule()->getDataLayout(); auto *IVStart = LHS->getIncomingValueForBlock(CurLoop->getLoopPreheader()); - auto *SimpleValOrNull = simplifyCmpInst(Cond->getPredicate(), - IVStart, RHS, - {DL, /*TLI*/ nullptr, - DT, /*AC*/ nullptr, BI}); + auto *SimpleValOrNull = simplifyCmpInst( + Pred, IVStart, RHS, {DL, /*TLI*/ nullptr, DT, /*AC*/ nullptr, BI}); auto *SimpleCst = dyn_cast_or_null(SimpleValOrNull); if (!SimpleCst) return false; diff --git a/llvm/test/Transforms/LICM/hoist-mustexec.ll b/llvm/test/Transforms/LICM/hoist-mustexec.ll index 81e0815053ffe5..a6f5a2be05ee41 100644 --- a/llvm/test/Transforms/LICM/hoist-mustexec.ll +++ b/llvm/test/Transforms/LICM/hoist-mustexec.ll @@ -218,7 +218,6 @@ fail: } ; Same as previous case, with commuted icmp. -; FIXME: The load should get hoisted here as well. define i32 @test3_commuted(ptr noalias nocapture readonly %a) nounwind uwtable { ; CHECK-LABEL: define i32 @test3_commuted( ; CHECK-SAME: ptr noalias nocapture readonly [[A:%.*]]) #[[ATTR1]] { @@ -227,6 +226,7 @@ define i32 @test3_commuted(ptr noalias nocapture readonly %a) nounwind uwtable { ; CHECK-NEXT: [[IS_ZERO:%.*]] = icmp eq i32 [[LEN]], 0 ; CHECK-NEXT: br i1 [[IS_ZERO]], label [[FAIL:%.*]], label [[PREHEADER:%.*]] ; CHECK: preheader: +; CHECK-NEXT: [[I1:%.*]] = load i32, ptr [[A]], align 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[PREHEADER]] ], [ [[INC:%.*]], [[CONTINUE:%.*]] ] @@ -234,7 +234,6 @@ define i32 @test3_commuted(ptr noalias nocapture readonly %a) nounwind uwtable { ; CHECK-NEXT: [[R_CHK:%.*]] = icmp uge i32 [[LEN]], [[IV]] ; CHECK-NEXT: br i1 [[R_CHK]], label [[CONTINUE]], label [[FAIL_LOOPEXIT:%.*]] ; CHECK: continue: -; CHECK-NEXT: [[I1:%.*]] = load i32, ptr [[A]], align 4 ; CHECK-NEXT: [[ADD]] = add nsw i32 [[I1]], [[ACC]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000 diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll index 8fc5189e8bc79e..cc4890e27f2bda 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll @@ -9,95 +9,87 @@ define i64 @sum_2_at_with_int_conversion(ptr %A, ptr %B, i64 %N) { ; CHECK-LABEL: @sum_2_at_with_int_conversion( -; CHECK-NEXT: at_with_int_conversion.exit11.peel: +; CHECK-NEXT: entry: ; CHECK-NEXT: [[START_I:%.*]] = load ptr, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[GEP_END_I:%.*]] = getelementptr i8, ptr [[A]], i64 8 ; CHECK-NEXT: [[END_I:%.*]] = load ptr, ptr [[GEP_END_I]], align 8 ; CHECK-NEXT: [[START_INT_I:%.*]] = ptrtoint ptr [[START_I]] to i64 ; CHECK-NEXT: [[END_INT_I:%.*]] = ptrtoint ptr [[END_I]] to i64 ; CHECK-NEXT: [[SUB_I:%.*]] = sub i64 [[END_INT_I]], [[START_INT_I]] +; CHECK-NEXT: [[START_I1:%.*]] = load ptr, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr i8, ptr [[B]], i64 8 +; CHECK-NEXT: [[END_I3:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8 +; CHECK-NEXT: [[START_INT_I4:%.*]] = ptrtoint ptr [[START_I1]] to i64 +; CHECK-NEXT: [[END_INT_I5:%.*]] = ptrtoint ptr [[END_I3]] to i64 +; CHECK-NEXT: [[SUB_I6:%.*]] = sub i64 [[END_INT_I5]], [[START_INT_I4]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i64 [[SUB_I]] to i128 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i128 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[SUB_I6]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i128 [[TMP2]], 1 +; CHECK-NEXT: [[UMIN:%.*]] = tail call i128 @llvm.umin.i128(i128 [[TMP3]], i128 [[TMP1]]) ; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 0) -; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 8 -; CHECK-NEXT: [[START_I1_PEEL:%.*]] = load ptr, ptr [[B]], align 8 -; CHECK-NEXT: [[END_I3_PEEL:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8 -; CHECK-NEXT: [[START_INT_I4_PEEL:%.*]] = ptrtoint ptr [[START_I1_PEEL]] to i64 -; CHECK-NEXT: [[END_INT_I5_PEEL:%.*]] = ptrtoint ptr [[END_I3_PEEL]] to i64 -; CHECK-NEXT: [[SUB_I6_PEEL:%.*]] = sub i64 [[END_INT_I5_PEEL]], [[START_INT_I4_PEEL]] -; CHECK-NEXT: [[LV_I_PEEL:%.*]] = load i64, ptr [[START_I]], align 8 -; CHECK-NEXT: [[LV_I9_PEEL:%.*]] = load i64, ptr [[START_I1_PEEL]], align 8 -; CHECK-NEXT: [[SUM_NEXT_PEEL:%.*]] = add i64 [[LV_I_PEEL]], [[LV_I9_PEEL]] -; CHECK-NEXT: [[EXITCOND_PEEL_NOT:%.*]] = icmp slt i64 [[N]], 1 -; CHECK-NEXT: br i1 [[EXITCOND_PEEL_NOT]], label [[EXIT:%.*]], label [[LOOP_PREHEADER:%.*]] +; CHECK-NEXT: [[TMP4:%.*]] = zext nneg i64 [[SMAX]] to i128 +; CHECK-NEXT: [[UMIN12:%.*]] = tail call i128 @llvm.umin.i128(i128 [[UMIN]], i128 [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i128 [[TMP1]], [[UMIN12]] +; CHECK-NEXT: br i1 [[TMP5]], label [[ERROR_I:%.*]], label [[ENTRY_SPLIT:%.*]] +; CHECK: entry.split: +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i128 [[TMP3]], [[UMIN12]] +; CHECK-NEXT: br i1 [[TMP6]], label [[ERROR_I10:%.*]], label [[LOOP_PREHEADER:%.*]] ; CHECK: loop.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 -; CHECK-NEXT: [[UMIN:%.*]] = tail call i64 @llvm.umin.i64(i64 [[SUB_I6_PEEL]], i64 [[TMP0]]) -; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[UMIN]] -; CHECK-NEXT: [[UMIN15:%.*]] = tail call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SUB_I]]) -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[UMIN15]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 5 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[LOOP_PREHEADER20:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = add nuw i64 [[SMAX]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[LOOP_PREHEADER17:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP2]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> , i64 [[SUM_NEXT_PEEL]], i64 0 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP7]], -4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or disjoint i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI13:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[WIDE_LOAD17]], [[VEC_PHI16]] -; CHECK-NEXT: [[TMP12]] = add <2 x i64> [[TMP10]], [[WIDE_LOAD18]] -; CHECK-NEXT: [[TMP13]] = add <2 x i64> [[TMP11]], [[WIDE_LOAD19]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[START_I1]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <2 x i64>, ptr [[TMP10]], align 8 +; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <2 x i64>, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i64> [[WIDE_LOAD14]], [[VEC_PHI13]] +; CHECK-NEXT: [[TMP14]] = add <2 x i64> [[TMP12]], [[WIDE_LOAD15]] +; CHECK-NEXT: [[TMP15]] = add <2 x i64> [[TMP13]], [[WIDE_LOAD16]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP13]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) -; CHECK-NEXT: br label [[LOOP_PREHEADER20]] -; CHECK: loop.preheader20: -; CHECK-NEXT: [[IV_PH:%.*]] = phi i64 [ 1, [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[LOOP_PREHEADER]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP7]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[LOOP_PREHEADER17]] +; CHECK: loop.preheader17: +; CHECK-NEXT: [[IV_PH:%.*]] = phi i64 [ 0, [[LOOP_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ 0, [[LOOP_PREHEADER]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT11:%.*]] ], [ [[IV_PH]], [[LOOP_PREHEADER20]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[SUM_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT11]] ], [ [[SUM_PH]], [[LOOP_PREHEADER20]] ] -; CHECK-NEXT: [[INRANGE_I:%.*]] = icmp ult i64 [[SUB_I]], [[IV]] -; CHECK-NEXT: br i1 [[INRANGE_I]], label [[ERROR_I:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT:%.*]] -; CHECK: error.i: -; CHECK-NEXT: tail call void @error() -; CHECK-NEXT: unreachable -; CHECK: at_with_int_conversion.exit: -; CHECK-NEXT: [[INRANGE_I7:%.*]] = icmp ult i64 [[SUB_I6_PEEL]], [[IV]] -; CHECK-NEXT: br i1 [[INRANGE_I7]], label [[ERROR_I10:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT11]] -; CHECK: error.i10: -; CHECK-NEXT: tail call void @error() -; CHECK-NEXT: unreachable -; CHECK: at_with_int_conversion.exit11: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[IV_PH]], [[LOOP_PREHEADER17]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[SUM_NEXT:%.*]], [[LOOP]] ], [ [[SUM_PH]], [[LOOP_PREHEADER17]] ] ; CHECK-NEXT: [[GEP_IDX_I:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[IV]] ; CHECK-NEXT: [[LV_I:%.*]] = load i64, ptr [[GEP_IDX_I]], align 8 -; CHECK-NEXT: [[GEP_IDX_I8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[IV]] +; CHECK-NEXT: [[GEP_IDX_I8:%.*]] = getelementptr i64, ptr [[START_I1]], i64 [[IV]] ; CHECK-NEXT: [[LV_I9:%.*]] = load i64, ptr [[GEP_IDX_I8]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = add i64 [[LV_I]], [[SUM]] ; CHECK-NEXT: [[SUM_NEXT]] = add i64 [[ADD]], [[LV_I9]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[SMAX]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: error.i: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable +; CHECK: error.i10: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable ; CHECK: exit: -; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[AT_WITH_INT_CONVERSION_EXIT11_PEEL:%.*]] ], [ [[SUM_NEXT]], [[AT_WITH_INT_CONVERSION_EXIT11]] ] +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[SUM_NEXT]], [[LOOP]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; entry: @@ -120,120 +112,111 @@ exit: define i64 @sum_3_at_with_int_conversion(ptr %A, ptr %B, ptr %C, i64 %N) { ; CHECK-LABEL: @sum_3_at_with_int_conversion( -; CHECK-NEXT: at_with_int_conversion.exit22.peel: +; CHECK-NEXT: entry: ; CHECK-NEXT: [[START_I:%.*]] = load ptr, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[GEP_END_I:%.*]] = getelementptr i8, ptr [[A]], i64 8 ; CHECK-NEXT: [[END_I:%.*]] = load ptr, ptr [[GEP_END_I]], align 8 ; CHECK-NEXT: [[START_INT_I:%.*]] = ptrtoint ptr [[START_I]] to i64 ; CHECK-NEXT: [[END_INT_I:%.*]] = ptrtoint ptr [[END_I]] to i64 ; CHECK-NEXT: [[SUB_I:%.*]] = sub i64 [[END_INT_I]], [[START_INT_I]] -; CHECK-NEXT: [[GEP_END_I13:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 8 +; CHECK-NEXT: [[START_I1:%.*]] = load ptr, ptr [[B:%.*]], align 8 +; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr i8, ptr [[B]], i64 8 +; CHECK-NEXT: [[END_I3:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8 +; CHECK-NEXT: [[START_INT_I4:%.*]] = ptrtoint ptr [[START_I1]] to i64 +; CHECK-NEXT: [[END_INT_I5:%.*]] = ptrtoint ptr [[END_I3]] to i64 +; CHECK-NEXT: [[SUB_I6:%.*]] = sub i64 [[END_INT_I5]], [[START_INT_I4]] +; CHECK-NEXT: [[START_I12:%.*]] = load ptr, ptr [[C:%.*]], align 8 +; CHECK-NEXT: [[GEP_END_I13:%.*]] = getelementptr i8, ptr [[C]], i64 8 +; CHECK-NEXT: [[END_I14:%.*]] = load ptr, ptr [[GEP_END_I13]], align 8 +; CHECK-NEXT: [[START_INT_I15:%.*]] = ptrtoint ptr [[START_I12]] to i64 +; CHECK-NEXT: [[END_INT_I16:%.*]] = ptrtoint ptr [[END_I14]] to i64 +; CHECK-NEXT: [[SUB_I17:%.*]] = sub i64 [[END_INT_I16]], [[START_INT_I15]] +; CHECK-NEXT: [[TMP0:%.*]] = zext i64 [[SUB_I]] to i128 +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i128 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext i64 [[SUB_I6]] to i128 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i128 [[TMP2]], 1 +; CHECK-NEXT: [[UMIN:%.*]] = tail call i128 @llvm.umin.i128(i128 [[TMP3]], i128 [[TMP1]]) +; CHECK-NEXT: [[TMP4:%.*]] = zext i64 [[SUB_I17]] to i128 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i128 [[TMP4]], 1 +; CHECK-NEXT: [[UMIN23:%.*]] = tail call i128 @llvm.umin.i128(i128 [[UMIN]], i128 [[TMP5]]) ; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 0) -; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 8 -; CHECK-NEXT: [[LV_I_PEEL:%.*]] = load i64, ptr [[START_I]], align 8 -; CHECK-NEXT: [[START_I1_PEEL:%.*]] = load ptr, ptr [[B]], align 8 -; CHECK-NEXT: [[END_I3_PEEL:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8 -; CHECK-NEXT: [[START_INT_I4_PEEL:%.*]] = ptrtoint ptr [[START_I1_PEEL]] to i64 -; CHECK-NEXT: [[END_I3_PEEL_FR:%.*]] = freeze ptr [[END_I3_PEEL]] -; CHECK-NEXT: [[END_INT_I5_PEEL:%.*]] = ptrtoint ptr [[END_I3_PEEL_FR]] to i64 -; CHECK-NEXT: [[SUB_I6_PEEL:%.*]] = sub i64 [[END_INT_I5_PEEL]], [[START_INT_I4_PEEL]] -; CHECK-NEXT: [[START_I12_PEEL:%.*]] = load ptr, ptr [[C]], align 8 -; CHECK-NEXT: [[END_I14_PEEL:%.*]] = load ptr, ptr [[GEP_END_I13]], align 8 -; CHECK-NEXT: [[START_INT_I15_PEEL:%.*]] = ptrtoint ptr [[START_I12_PEEL]] to i64 -; CHECK-NEXT: [[END_INT_I16_PEEL:%.*]] = ptrtoint ptr [[END_I14_PEEL]] to i64 -; CHECK-NEXT: [[SUB_I17_PEEL:%.*]] = sub i64 [[END_INT_I16_PEEL]], [[START_INT_I15_PEEL]] -; CHECK-NEXT: [[LV_I9_PEEL:%.*]] = load i64, ptr [[START_I1_PEEL]], align 8 -; CHECK-NEXT: [[LV_I20_PEEL:%.*]] = load i64, ptr [[START_I12_PEEL]], align 8 -; CHECK-NEXT: [[ADD_2_PEEL:%.*]] = add i64 [[LV_I_PEEL]], [[LV_I9_PEEL]] -; CHECK-NEXT: [[SUM_NEXT_PEEL:%.*]] = add i64 [[ADD_2_PEEL]], [[LV_I20_PEEL]] -; CHECK-NEXT: [[EXITCOND_PEEL_NOT:%.*]] = icmp slt i64 [[N]], 1 -; CHECK-NEXT: br i1 [[EXITCOND_PEEL_NOT]], label [[EXIT:%.*]], label [[LOOP_PREHEADER:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = zext nneg i64 [[SMAX]] to i128 +; CHECK-NEXT: [[UMIN24:%.*]] = tail call i128 @llvm.umin.i128(i128 [[UMIN23]], i128 [[TMP6]]) +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i128 [[TMP1]], [[UMIN24]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i128 [[TMP5]], [[UMIN24]] +; CHECK-NEXT: br i1 [[TMP7]], label [[ERROR_I:%.*]], label [[ENTRY_SPLIT:%.*]] +; CHECK: entry.split: +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i128 [[TMP3]], [[UMIN24]] +; CHECK-NEXT: br i1 [[TMP9]], label [[ERROR_I10:%.*]], label [[ENTRY_SPLIT_SPLIT:%.*]] +; CHECK: entry.split.split: +; CHECK-NEXT: br i1 [[TMP8]], label [[ERROR_I21:%.*]], label [[LOOP_PREHEADER:%.*]] ; CHECK: loop.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1 -; CHECK-NEXT: [[UMIN:%.*]] = tail call i64 @llvm.umin.i64(i64 [[SUB_I17_PEEL]], i64 [[TMP0]]) -; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[UMIN]] -; CHECK-NEXT: [[UMIN26:%.*]] = tail call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SUB_I6_PEEL]]) -; CHECK-NEXT: [[UMIN27:%.*]] = tail call i64 @llvm.umin.i64(i64 [[UMIN26]], i64 [[SUB_I]]) -; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[UMIN27]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 5 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[LOOP_PREHEADER34:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = add nuw i64 [[SMAX]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp slt i64 [[N]], 3 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[LOOP_PREHEADER31:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP2]], 3 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> , i64 [[SUM_NEXT_PEEL]], i64 0 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP10]], -4 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP5]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI28:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or disjoint i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8 -; CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 -; CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[START_I12_PEEL]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <2 x i64>, ptr [[TMP10]], align 8 -; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <2 x i64>, ptr [[TMP11]], align 8 -; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i64> [[WIDE_LOAD29]], [[VEC_PHI28]] -; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i64> [[TMP12]], [[WIDE_LOAD30]] -; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i64> [[TMP13]], [[WIDE_LOAD31]] -; CHECK-NEXT: [[TMP16]] = add <2 x i64> [[TMP14]], [[WIDE_LOAD32]] -; CHECK-NEXT: [[TMP17]] = add <2 x i64> [[TMP15]], [[WIDE_LOAD33]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI25:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP11]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP11]], align 8 +; CHECK-NEXT: [[WIDE_LOAD26:%.*]] = load <2 x i64>, ptr [[TMP12]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i64, ptr [[START_I1]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD27:%.*]] = load <2 x i64>, ptr [[TMP13]], align 8 +; CHECK-NEXT: [[WIDE_LOAD28:%.*]] = load <2 x i64>, ptr [[TMP14]], align 8 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[START_I12]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[TMP15]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <2 x i64>, ptr [[TMP15]], align 8 +; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x i64>, ptr [[TMP16]], align 8 +; CHECK-NEXT: [[TMP17:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP18:%.*]] = add <2 x i64> [[WIDE_LOAD26]], [[VEC_PHI25]] +; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i64> [[TMP17]], [[WIDE_LOAD27]] +; CHECK-NEXT: [[TMP20:%.*]] = add <2 x i64> [[TMP18]], [[WIDE_LOAD28]] +; CHECK-NEXT: [[TMP21]] = add <2 x i64> [[TMP19]], [[WIDE_LOAD29]] +; CHECK-NEXT: [[TMP22]] = add <2 x i64> [[TMP20]], [[WIDE_LOAD30]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP17]], [[TMP16]] -; CHECK-NEXT: [[TMP19:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) -; CHECK-NEXT: br label [[LOOP_PREHEADER34]] -; CHECK: loop.preheader34: -; CHECK-NEXT: [[IV_PH:%.*]] = phi i64 [ 1, [[LOOP_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[LOOP_PREHEADER]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP24:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP10]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[LOOP_PREHEADER31]] +; CHECK: loop.preheader31: +; CHECK-NEXT: [[IV_PH:%.*]] = phi i64 [ 0, [[LOOP_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[SUM_PH:%.*]] = phi i64 [ 0, [[LOOP_PREHEADER]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT22:%.*]] ], [ [[IV_PH]], [[LOOP_PREHEADER34]] ] -; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[SUM_NEXT:%.*]], [[AT_WITH_INT_CONVERSION_EXIT22]] ], [ [[SUM_PH]], [[LOOP_PREHEADER34]] ] -; CHECK-NEXT: [[INRANGE_I:%.*]] = icmp ult i64 [[SUB_I]], [[IV]] -; CHECK-NEXT: br i1 [[INRANGE_I]], label [[ERROR_I:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT:%.*]] -; CHECK: error.i: -; CHECK-NEXT: tail call void @error() -; CHECK-NEXT: unreachable -; CHECK: at_with_int_conversion.exit: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[IV_PH]], [[LOOP_PREHEADER31]] ] +; CHECK-NEXT: [[SUM:%.*]] = phi i64 [ [[SUM_NEXT:%.*]], [[LOOP]] ], [ [[SUM_PH]], [[LOOP_PREHEADER31]] ] ; CHECK-NEXT: [[GEP_IDX_I:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[IV]] ; CHECK-NEXT: [[LV_I:%.*]] = load i64, ptr [[GEP_IDX_I]], align 8 -; CHECK-NEXT: [[INRANGE_I7:%.*]] = icmp ult i64 [[SUB_I6_PEEL]], [[IV]] -; CHECK-NEXT: br i1 [[INRANGE_I7]], label [[ERROR_I10:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT11:%.*]] -; CHECK: error.i10: -; CHECK-NEXT: tail call void @error() -; CHECK-NEXT: unreachable -; CHECK: at_with_int_conversion.exit11: -; CHECK-NEXT: [[INRANGE_I18:%.*]] = icmp ult i64 [[SUB_I17_PEEL]], [[IV]] -; CHECK-NEXT: br i1 [[INRANGE_I18]], label [[ERROR_I21:%.*]], label [[AT_WITH_INT_CONVERSION_EXIT22]] -; CHECK: error.i21: -; CHECK-NEXT: tail call void @error() -; CHECK-NEXT: unreachable -; CHECK: at_with_int_conversion.exit22: -; CHECK-NEXT: [[GEP_IDX_I8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[IV]] +; CHECK-NEXT: [[GEP_IDX_I8:%.*]] = getelementptr i64, ptr [[START_I1]], i64 [[IV]] ; CHECK-NEXT: [[LV_I9:%.*]] = load i64, ptr [[GEP_IDX_I8]], align 8 -; CHECK-NEXT: [[GEP_IDX_I19:%.*]] = getelementptr i64, ptr [[START_I12_PEEL]], i64 [[IV]] +; CHECK-NEXT: [[GEP_IDX_I19:%.*]] = getelementptr i64, ptr [[START_I12]], i64 [[IV]] ; CHECK-NEXT: [[LV_I20:%.*]] = load i64, ptr [[GEP_IDX_I19]], align 8 ; CHECK-NEXT: [[ADD_1:%.*]] = add i64 [[LV_I]], [[SUM]] ; CHECK-NEXT: [[ADD_2:%.*]] = add i64 [[ADD_1]], [[LV_I9]] ; CHECK-NEXT: [[SUM_NEXT]] = add i64 [[ADD_2]], [[LV_I20]] ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], [[SMAX]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: error.i: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable +; CHECK: error.i10: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable +; CHECK: error.i21: +; CHECK-NEXT: tail call void @error() +; CHECK-NEXT: unreachable ; CHECK: exit: -; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT_PEEL]], [[AT_WITH_INT_CONVERSION_EXIT22_PEEL:%.*]] ], [ [[SUM_NEXT]], [[AT_WITH_INT_CONVERSION_EXIT22]] ] +; CHECK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[TMP24]], [[MIDDLE_BLOCK]] ], [ [[SUM_NEXT]], [[LOOP]] ] ; CHECK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; entry: