From f0df4fbd0c7b6bb369ceaa1fd6f9e0c88d781ae5 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Sun, 11 Aug 2024 20:38:36 +0200
Subject: [PATCH] [LV] Support generating masks for switch terminators.
 (#99808)

Update createEdgeMask to create masks where the terminator in Src is a
switch. We need to handle 2 separate cases:

1. Dst is not the default destination. Dst is reached if any of the
   cases with destination == Dst are taken. Join the conditions for
   each case where destination == Dst using a logical OR.
2. Dst is the default destination. Dst is reached if none of the cases
   with destination != Dst are taken. Join the conditions for each case
   where the destination != Dst using a logical OR and negate it.

Edge masks are created for every destination of cases and/or default
when requesting a mask where the source is a switch.

Fixes https://github.com/llvm/llvm-project/issues/48188.

PR: https://github.com/llvm/llvm-project/pull/99808
---
 .../Vectorize/LoopVectorizationLegality.cpp   |  21 +-
 .../Transforms/Vectorize/LoopVectorize.cpp    |  74 +-
 .../Transforms/Vectorize/VPRecipeBuilder.h    |   3 +
 .../LoopVectorize/X86/predicate-switch.ll     | 874 +++++++++++++++++-
 .../Transforms/LoopVectorize/no_switch.ll     |  12 +-
 .../LoopVectorize/predicate-switch.ll         | 264 +++++-
 .../LoopVectorize/vplan-predicate-switch.ll   |  93 +-
 .../X86/pr48844-br-to-switch-vectorization.ll |  60 +-
 8 files changed, 1349 insertions(+), 52 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 465d0df30e3f75..0417916546bb10 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1340,12 +1340,21 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
 
   // Collect the blocks that need predication.
   for (BasicBlock *BB : TheLoop->blocks()) {
-    // We don't support switch statements inside loops.
-    if (!isa<BranchInst>(BB->getTerminator())) {
-      reportVectorizationFailure("Loop contains a switch statement",
-                                 "loop contains a switch statement",
-                                 "LoopContainsSwitch", ORE, TheLoop,
-                                 BB->getTerminator());
+    // We support only branches and switch statements as terminators inside the
+    // loop.
+    if (isa<SwitchInst>(BB->getTerminator())) {
+      if (TheLoop->isLoopExiting(BB)) {
+        reportVectorizationFailure("Loop contains an unsupported switch",
+                                   "loop contains an unsupported switch",
+                                   "LoopContainsUnsupportedSwitch", ORE,
+                                   TheLoop, BB->getTerminator());
+        return false;
+      }
+    } else if (!isa<BranchInst>(BB->getTerminator())) {
+      reportVectorizationFailure("Loop contains an unsupported terminator",
+                                 "loop contains an unsupported terminator",
+                                 "LoopContainsUnsupportedTerminator", ORE,
+                                 TheLoop, BB->getTerminator());
       return false;
     }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 180eb43bb71815..6ac34ef6375ce1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6453,6 +6453,17 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     // a predicated block since it will become a fall-through, although we
     // may decide in the future to call TTI for all branches.
   }
+  case Instruction::Switch: {
+    if (VF.isScalar())
+      return TTI.getCFInstrCost(Instruction::Switch, CostKind);
+    auto *Switch = cast<SwitchInst>(I);
+    return Switch->getNumCases() *
+           TTI.getCmpSelInstrCost(
+               Instruction::ICmp,
+               ToVectorTy(Switch->getCondition()->getType(), VF),
+               ToVectorTy(Type::getInt1Ty(I->getContext()), VF),
+               CmpInst::ICMP_EQ, CostKind);
+  }
   case Instruction::PHI: {
     auto *Phi = cast<PHINode>(I);
@@ -7841,6 +7852,62 @@ VPRecipeBuilder::mapToVPValues(User::op_range Operands) {
   return map_range(Operands, Fn);
 }
 
+void VPRecipeBuilder::createSwitchEdgeMasks(SwitchInst *SI) {
+  BasicBlock *Src = SI->getParent();
+  assert(!OrigLoop->isLoopExiting(Src) &&
+         all_of(successors(Src),
+                [this](BasicBlock *Succ) {
+                  return OrigLoop->getHeader() != Succ;
+                }) &&
+         "unsupported switch either exiting loop or continuing to header");
+  // Create masks where the terminator in Src is a switch. We create masks for
+  // all edges at the same time. This is more efficient, as we can create and
+  // collect compares for all cases once.
+  VPValue *Cond = getVPValueOrAddLiveIn(SI->getCondition(), Plan);
+  BasicBlock *DefaultDst = SI->getDefaultDest();
+  MapVector<BasicBlock *, SmallVector<VPValue *>> Dst2Compares;
+  for (auto &C : SI->cases()) {
+    BasicBlock *Dst = C.getCaseSuccessor();
+    assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
+    // Cases whose destination is the same as default are redundant and can be
+    // ignored - they will get there anyhow.
+    if (Dst == DefaultDst)
+      continue;
+    auto I = Dst2Compares.insert({Dst, {}});
+    VPValue *V = getVPValueOrAddLiveIn(C.getCaseValue(), Plan);
+    I.first->second.push_back(Builder.createICmp(CmpInst::ICMP_EQ, Cond, V));
+  }
+
+  // We need to handle 2 separate cases below for all entries in Dst2Compares,
+  // which excludes destinations matching the default destination.
+  VPValue *SrcMask = getBlockInMask(Src);
+  VPValue *DefaultMask = nullptr;
+  for (const auto &[Dst, Conds] : Dst2Compares) {
+    // 1. Dst is not the default destination. Dst is reached if any of the
+    // cases with destination == Dst are taken. Join the conditions for each
+    // case whose destination == Dst using an OR.
+    VPValue *Mask = Conds[0];
+    for (VPValue *V : ArrayRef(Conds).drop_front())
+      Mask = Builder.createOr(Mask, V);
+    if (SrcMask)
+      Mask = Builder.createLogicalAnd(SrcMask, Mask);
+    EdgeMaskCache[{Src, Dst}] = Mask;
+
+    // 2. Create the mask for the default destination, which is reached if
+    // none of the cases with destination != default destination are taken.
+    // Join the conditions for each case where the destination is not the
+    // default destination using an OR and negate it.
+    DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;
+  }
+
+  if (DefaultMask) {
+    DefaultMask = Builder.createNot(DefaultMask);
+    if (SrcMask)
+      DefaultMask = Builder.createLogicalAnd(SrcMask, DefaultMask);
+  }
+  EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
+}
+
 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
 
@@ -7850,12 +7917,17 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
   if (ECEntryIt != EdgeMaskCache.end())
     return ECEntryIt->second;
 
+  if (auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
+    createSwitchEdgeMasks(SI);
+    assert(EdgeMaskCache.contains(Edge) && "Mask for Edge not created?");
+    return EdgeMaskCache[Edge];
+  }
+
   VPValue *SrcMask = getBlockInMask(Src);
 
   // The terminator has to be a branch inst!
BranchInst *BI = dyn_cast(Src->getTerminator()); assert(BI && "Unexpected terminator found"); - if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1)) return EdgeMaskCache[Edge] = SrcMask; diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index b4c7ab02f928f0..544000059f8954 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -134,6 +134,9 @@ class VPRecipeBuilder { /// Returns the *entry* mask for the block \p BB. VPValue *getBlockInMask(BasicBlock *BB) const; + /// Create an edge mask for every destination of cases and/or default. + void createSwitchEdgeMasks(SwitchInst *SI); + /// A helper function that computes the predicate of the edge between SRC /// and DST. VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst); diff --git a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll index 46c62e1ea77411..4720eb254f84dc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/predicate-switch.ll @@ -6,9 +6,43 @@ define void @switch_default_to_latch_common_dest(ptr %start, ptr %end) { ; COST-LABEL: define void @switch_default_to_latch_common_dest( ; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0:[0-9]+]] { ; COST-NEXT: [[ENTRY:.*]]: +; COST-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; COST-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; COST-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; COST-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; COST-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; COST-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; COST-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 +; COST-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; COST: [[VECTOR_PH]]: +; COST-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; COST-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; COST-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; COST-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; COST-NEXT: br label %[[VECTOR_BODY:.*]] +; COST: [[VECTOR_BODY]]: +; COST-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; COST-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; COST-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; COST-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; COST-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; COST-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 1 +; COST-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; COST-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; COST-NEXT: [[TMP9:%.*]] = or <4 x i1> [[TMP7]], [[TMP8]] +; COST-NEXT: [[TMP10:%.*]] = or <4 x i1> [[TMP9]], [[TMP9]] +; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP6]], i32 1, <4 x i1> [[TMP10]]) +; COST-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; COST-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; COST-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; COST: [[MIDDLE_BLOCK]]: +; COST-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; COST-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; COST: [[SCALAR_PH]]: +; COST-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ 
[[START]], %[[ENTRY]] ] ; COST-NEXT: br label %[[LOOP_HEADER:.*]] ; COST: [[LOOP_HEADER]]: -; COST-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; COST-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; COST-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; COST-NEXT: switch i64 [[L]], label %[[LOOP_LATCH]] [ ; COST-NEXT: i64 -12, label %[[IF_THEN:.*]] @@ -20,16 +54,59 @@ define void @switch_default_to_latch_common_dest(ptr %start, ptr %end) { ; COST: [[LOOP_LATCH]]: ; COST-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; COST-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; COST-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; COST-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; COST: [[EXIT]]: ; COST-NEXT: ret void ; ; FORCED-LABEL: define void @switch_default_to_latch_common_dest( ; FORCED-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0:[0-9]+]] { ; FORCED-NEXT: [[ENTRY:.*]]: +; FORCED-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; FORCED-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; FORCED-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; FORCED-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; FORCED-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; FORCED-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; FORCED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; FORCED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; FORCED: [[VECTOR_PH]]: +; FORCED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 8 +; FORCED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; FORCED-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; FORCED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; FORCED-NEXT: br label %[[VECTOR_BODY:.*]] +; FORCED: [[VECTOR_BODY]]: +; FORCED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; FORCED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; FORCED-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; FORCED-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 +; FORCED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; FORCED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] +; FORCED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; FORCED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4 +; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1 +; FORCED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1 +; FORCED-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP11:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP12:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP9]], [[TMP11]] +; FORCED-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP10]], [[TMP12]] +; FORCED-NEXT: [[TMP15:%.*]] = or <4 x i1> [[TMP13]], [[TMP13]] +; FORCED-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP14]], [[TMP14]] +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP7]], i32 1, <4 x i1> [[TMP15]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP8]], i32 1, <4 x i1> [[TMP16]]) +; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FORCED-NEXT: [[TMP17:%.*]] = icmp eq i64 
[[INDEX_NEXT]], [[N_VEC]] +; FORCED-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FORCED: [[MIDDLE_BLOCK]]: +; FORCED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; FORCED-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; FORCED: [[SCALAR_PH]]: +; FORCED-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; FORCED-NEXT: br label %[[LOOP_HEADER:.*]] ; FORCED: [[LOOP_HEADER]]: -; FORCED-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; FORCED-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; FORCED-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; FORCED-NEXT: switch i64 [[L]], label %[[LOOP_LATCH]] [ ; FORCED-NEXT: i64 -12, label %[[IF_THEN:.*]] @@ -41,7 +118,7 @@ define void @switch_default_to_latch_common_dest(ptr %start, ptr %end) { ; FORCED: [[LOOP_LATCH]]: ; FORCED-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; FORCED-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; FORCED-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; FORCED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; FORCED: [[EXIT]]: ; FORCED-NEXT: ret void ; @@ -69,6 +146,159 @@ exit: ret void } +define void @switch_default_to_latch_common_dest_using_branches(ptr %start, ptr %end) { +; COST-LABEL: define void @switch_default_to_latch_common_dest_using_branches( +; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { +; COST-NEXT: [[ENTRY:.*]]: +; COST-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; COST-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; COST-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; COST-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; COST-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; COST-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; COST-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 +; COST-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; COST: [[VECTOR_PH]]: +; COST-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; COST-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; COST-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; COST-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; COST-NEXT: br label %[[VECTOR_BODY:.*]] +; COST: [[VECTOR_BODY]]: +; COST-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; COST-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; COST-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; COST-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; COST-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; COST-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 1 +; COST-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; COST-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; COST-NEXT: [[TMP9:%.*]] = xor <4 x i1> [[TMP7]], +; COST-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer +; COST-NEXT: [[TMP11:%.*]] = or <4 x i1> [[TMP10]], [[TMP7]] +; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP6]], i32 1, <4 x i1> [[TMP11]]) +; COST-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; COST-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; COST-NEXT: br i1 [[TMP12]], 
label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; COST: [[MIDDLE_BLOCK]]: +; COST-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; COST-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; COST: [[SCALAR_PH]]: +; COST-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] +; COST-NEXT: br label %[[LOOP_HEADER:.*]] +; COST: [[LOOP_HEADER]]: +; COST-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; COST-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 +; COST-NEXT: [[EQ_1:%.*]] = icmp eq i64 [[L]], -12 +; COST-NEXT: [[EQ_2:%.*]] = icmp eq i64 [[L]], 13 +; COST-NEXT: br i1 [[EQ_1]], label %[[IF_THEN:.*]], label %[[ELSE:.*]] +; COST: [[IF_THEN]]: +; COST-NEXT: store i64 42, ptr [[PTR_IV]], align 1 +; COST-NEXT: br label %[[LOOP_LATCH]] +; COST: [[ELSE]]: +; COST-NEXT: br i1 [[EQ_2]], label %[[IF_THEN]], label %[[LOOP_LATCH]] +; COST: [[LOOP_LATCH]]: +; COST-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 +; COST-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] +; COST-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] +; COST: [[EXIT]]: +; COST-NEXT: ret void +; +; FORCED-LABEL: define void @switch_default_to_latch_common_dest_using_branches( +; FORCED-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { +; FORCED-NEXT: [[ENTRY:.*]]: +; FORCED-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; FORCED-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; FORCED-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; FORCED-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; FORCED-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; FORCED-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; FORCED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; FORCED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; FORCED: [[VECTOR_PH]]: +; FORCED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 8 +; FORCED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; FORCED-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; FORCED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; FORCED-NEXT: br label %[[VECTOR_BODY:.*]] +; FORCED: [[VECTOR_BODY]]: +; FORCED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; FORCED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; FORCED-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; FORCED-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 +; FORCED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; FORCED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] +; FORCED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; FORCED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4 +; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1 +; FORCED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1 +; FORCED-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP11:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP12:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP13:%.*]] = xor <4 x i1> [[TMP9]], +; FORCED-NEXT: [[TMP14:%.*]] = xor <4 x i1> [[TMP10]], +; FORCED-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP13]], <4 x i1> [[TMP11]], <4 x i1> 
zeroinitializer +; FORCED-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP14]], <4 x i1> [[TMP12]], <4 x i1> zeroinitializer +; FORCED-NEXT: [[TMP17:%.*]] = or <4 x i1> [[TMP15]], [[TMP9]] +; FORCED-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP10]] +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP7]], i32 1, <4 x i1> [[TMP17]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP8]], i32 1, <4 x i1> [[TMP18]]) +; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FORCED-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FORCED-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FORCED: [[MIDDLE_BLOCK]]: +; FORCED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; FORCED-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; FORCED: [[SCALAR_PH]]: +; FORCED-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] +; FORCED-NEXT: br label %[[LOOP_HEADER:.*]] +; FORCED: [[LOOP_HEADER]]: +; FORCED-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; FORCED-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 +; FORCED-NEXT: [[EQ_1:%.*]] = icmp eq i64 [[L]], -12 +; FORCED-NEXT: [[EQ_2:%.*]] = icmp eq i64 [[L]], 13 +; FORCED-NEXT: br i1 [[EQ_1]], label %[[IF_THEN:.*]], label %[[ELSE:.*]] +; FORCED: [[IF_THEN]]: +; FORCED-NEXT: store i64 42, ptr [[PTR_IV]], align 1 +; FORCED-NEXT: br label %[[LOOP_LATCH]] +; FORCED: [[ELSE]]: +; FORCED-NEXT: br i1 [[EQ_2]], label %[[IF_THEN]], label %[[LOOP_LATCH]] +; FORCED: [[LOOP_LATCH]]: +; FORCED-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 +; FORCED-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] +; FORCED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] +; FORCED: [[EXIT]]: +; FORCED-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ] + %l = load i64, ptr %ptr.iv, align 1 + %eq.1 = icmp eq i64 %l, -12 + %eq.2 = icmp eq i64 %l, 13 + br i1 %eq.1, label %if.then, label %else + +if.then: + store i64 42, ptr %ptr.iv, align 1 + br label %loop.latch + +else: + br i1 %eq.2, label %if.then, label %loop.latch + +loop.latch: + %ptr.iv.next = getelementptr inbounds i64, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv.next, %end + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + +; TODO: Instead of using masked stores, the store can be sunk, executed +; unconditionally and fed by selects. 
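+; A rough illustration of that select-based form (not output produced by this
+; patch; the mask and value names below are made up): the per-destination edge
+; masks would feed a chain of selects that picks the value each lane stores,
+; followed by a single unconditional wide store, along the lines of
+;   %v.1 = select <4 x i1> %mask.if.then.1, <4 x i64> <i64 42, i64 42, i64 42, i64 42>, <4 x i64> %wide.load
+;   %v.2 = select <4 x i1> %mask.if.then.2, <4 x i64> zeroinitializer, <4 x i64> %v.1
+;   %v.3 = select <4 x i1> %mask.if.then.3, <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i64> %v.2
+;   %v.4 = select <4 x i1> %mask.default, <4 x i64> <i64 2, i64 2, i64 2, i64 2>, <4 x i64> %v.3
+;   store <4 x i64> %v.4, ptr %gep, align 1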
define void @switch_all_dests_distinct(ptr %start, ptr %end) { ; COST-LABEL: define void @switch_all_dests_distinct( ; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { @@ -104,9 +334,62 @@ define void @switch_all_dests_distinct(ptr %start, ptr %end) { ; FORCED-LABEL: define void @switch_all_dests_distinct( ; FORCED-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { ; FORCED-NEXT: [[ENTRY:.*]]: +; FORCED-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; FORCED-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; FORCED-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; FORCED-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; FORCED-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; FORCED-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; FORCED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; FORCED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; FORCED: [[VECTOR_PH]]: +; FORCED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 8 +; FORCED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; FORCED-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; FORCED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; FORCED-NEXT: br label %[[VECTOR_BODY:.*]] +; FORCED: [[VECTOR_BODY]]: +; FORCED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; FORCED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; FORCED-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; FORCED-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 +; FORCED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; FORCED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] +; FORCED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; FORCED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4 +; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1 +; FORCED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1 +; FORCED-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP11:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP12:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP13:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], zeroinitializer +; FORCED-NEXT: [[TMP14:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], zeroinitializer +; FORCED-NEXT: [[TMP15:%.*]] = or <4 x i1> [[TMP9]], [[TMP11]] +; FORCED-NEXT: [[TMP16:%.*]] = or <4 x i1> [[TMP10]], [[TMP12]] +; FORCED-NEXT: [[TMP17:%.*]] = or <4 x i1> [[TMP15]], [[TMP13]] +; FORCED-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP14]] +; FORCED-NEXT: [[TMP19:%.*]] = xor <4 x i1> [[TMP17]], +; FORCED-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[TMP18]], +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP7]], i32 1, <4 x i1> [[TMP13]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP8]], i32 1, <4 x i1> [[TMP14]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 1, <4 x i1> [[TMP11]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP12]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP7]], i32 1, <4 x i1> [[TMP9]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP8]], i32 1, <4 x i1> [[TMP10]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP7]], i32 
1, <4 x i1> [[TMP19]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP8]], i32 1, <4 x i1> [[TMP20]]) +; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FORCED-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FORCED-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; FORCED: [[MIDDLE_BLOCK]]: +; FORCED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; FORCED-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; FORCED: [[SCALAR_PH]]: +; FORCED-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; FORCED-NEXT: br label %[[LOOP_HEADER:.*]] ; FORCED: [[LOOP_HEADER]]: -; FORCED-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; FORCED-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; FORCED-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; FORCED-NEXT: switch i64 [[L]], label %[[DEFAULT:.*]] [ ; FORCED-NEXT: i64 -12, label %[[IF_THEN_1:.*]] @@ -128,7 +411,7 @@ define void @switch_all_dests_distinct(ptr %start, ptr %end) { ; FORCED: [[LOOP_LATCH]]: ; FORCED-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; FORCED-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; FORCED-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; FORCED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]] ; FORCED: [[EXIT]]: ; FORCED-NEXT: ret void ; @@ -169,6 +452,213 @@ exit: ret void } +define void @switch_all_dests_distinct_variant_using_branches(ptr %start, ptr %end) { +; COST-LABEL: define void @switch_all_dests_distinct_variant_using_branches( +; COST-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { +; COST-NEXT: [[ENTRY:.*]]: +; COST-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; COST-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; COST-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; COST-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; COST-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; COST-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; COST-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 4 +; COST-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; COST: [[VECTOR_PH]]: +; COST-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4 +; COST-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; COST-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; COST-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; COST-NEXT: br label %[[VECTOR_BODY:.*]] +; COST: [[VECTOR_BODY]]: +; COST-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; COST-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; COST-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; COST-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; COST-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; COST-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP6]], align 1 +; COST-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; COST-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; COST-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], zeroinitializer +; COST-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], +; COST-NEXT: [[TMP11:%.*]] = xor <4 x i1> [[TMP8]], +; COST-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> 
[[TMP11]], <4 x i1> zeroinitializer +; COST-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP12]], <4 x i1> [[TMP9]], <4 x i1> zeroinitializer +; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP6]], i32 1, <4 x i1> [[TMP13]]) +; COST-NEXT: [[TMP14:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer +; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP6]], i32 1, <4 x i1> [[TMP14]]) +; COST-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP6]], i32 1, <4 x i1> [[TMP7]]) +; COST-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; COST-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; COST-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; COST: [[MIDDLE_BLOCK]]: +; COST-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; COST-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; COST: [[SCALAR_PH]]: +; COST-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] +; COST-NEXT: br label %[[LOOP_HEADER:.*]] +; COST: [[LOOP_HEADER]]: +; COST-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; COST-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 +; COST-NEXT: [[EQ_0:%.*]] = icmp eq i64 [[L]], -12 +; COST-NEXT: [[EQ_1:%.*]] = icmp eq i64 [[L]], 13 +; COST-NEXT: [[EQ_2:%.*]] = icmp eq i64 [[L]], 0 +; COST-NEXT: br i1 [[EQ_0]], label %[[IF_THEN_1:.*]], label %[[ELSE_1:.*]] +; COST: [[ELSE_1]]: +; COST-NEXT: br i1 [[EQ_1]], label %[[IF_THEN_2:.*]], label %[[ELSE_2:.*]] +; COST: [[ELSE_2]]: +; COST-NEXT: br i1 [[EQ_2]], label %[[IF_THEN_3:.*]], label %[[LOOP_LATCH]] +; COST: [[IF_THEN_1]]: +; COST-NEXT: store i64 42, ptr [[PTR_IV]], align 1 +; COST-NEXT: br label %[[LOOP_LATCH]] +; COST: [[IF_THEN_2]]: +; COST-NEXT: store i64 0, ptr [[PTR_IV]], align 1 +; COST-NEXT: br label %[[LOOP_LATCH]] +; COST: [[IF_THEN_3]]: +; COST-NEXT: store i64 1, ptr [[PTR_IV]], align 1 +; COST-NEXT: br label %[[LOOP_LATCH]] +; COST: [[DEFAULT:.*:]] +; COST-NEXT: store i64 2, ptr poison, align 1 +; COST-NEXT: unreachable +; COST: [[LOOP_LATCH]]: +; COST-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 +; COST-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] +; COST-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP7:![0-9]+]] +; COST: [[EXIT]]: +; COST-NEXT: ret void +; +; FORCED-LABEL: define void @switch_all_dests_distinct_variant_using_branches( +; FORCED-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { +; FORCED-NEXT: [[ENTRY:.*]]: +; FORCED-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; FORCED-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; FORCED-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; FORCED-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; FORCED-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; FORCED-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; FORCED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; FORCED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; FORCED: [[VECTOR_PH]]: +; FORCED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 8 +; FORCED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; FORCED-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; FORCED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; FORCED-NEXT: br label %[[VECTOR_BODY:.*]] +; FORCED: [[VECTOR_BODY]]: 
+; FORCED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; FORCED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; FORCED-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; FORCED-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 +; FORCED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; FORCED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] +; FORCED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; FORCED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4 +; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1 +; FORCED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1 +; FORCED-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP11:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP12:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP13:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], zeroinitializer +; FORCED-NEXT: [[TMP14:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], zeroinitializer +; FORCED-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP9]], +; FORCED-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP10]], +; FORCED-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP11]], +; FORCED-NEXT: [[TMP18:%.*]] = xor <4 x i1> [[TMP12]], +; FORCED-NEXT: [[TMP19:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP17]], <4 x i1> zeroinitializer +; FORCED-NEXT: [[TMP20:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP18]], <4 x i1> zeroinitializer +; FORCED-NEXT: [[TMP21:%.*]] = select <4 x i1> [[TMP19]], <4 x i1> [[TMP13]], <4 x i1> zeroinitializer +; FORCED-NEXT: [[TMP22:%.*]] = select <4 x i1> [[TMP20]], <4 x i1> [[TMP14]], <4 x i1> zeroinitializer +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP7]], i32 1, <4 x i1> [[TMP21]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP8]], i32 1, <4 x i1> [[TMP22]]) +; FORCED-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP15]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer +; FORCED-NEXT: [[TMP24:%.*]] = select <4 x i1> [[TMP16]], <4 x i1> [[TMP12]], <4 x i1> zeroinitializer +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 1, <4 x i1> [[TMP23]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP24]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP7]], i32 1, <4 x i1> [[TMP9]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP8]], i32 1, <4 x i1> [[TMP10]]) +; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FORCED-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FORCED-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; FORCED: [[MIDDLE_BLOCK]]: +; FORCED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; FORCED-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; FORCED: [[SCALAR_PH]]: +; FORCED-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] +; FORCED-NEXT: br label %[[LOOP_HEADER:.*]] +; FORCED: [[LOOP_HEADER]]: +; FORCED-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; FORCED-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 +; FORCED-NEXT: [[EQ_0:%.*]] = icmp eq i64 [[L]], -12 +; FORCED-NEXT: 
[[EQ_1:%.*]] = icmp eq i64 [[L]], 13 +; FORCED-NEXT: [[EQ_2:%.*]] = icmp eq i64 [[L]], 0 +; FORCED-NEXT: br i1 [[EQ_0]], label %[[IF_THEN_1:.*]], label %[[ELSE_1:.*]] +; FORCED: [[ELSE_1]]: +; FORCED-NEXT: br i1 [[EQ_1]], label %[[IF_THEN_2:.*]], label %[[ELSE_2:.*]] +; FORCED: [[ELSE_2]]: +; FORCED-NEXT: br i1 [[EQ_2]], label %[[IF_THEN_3:.*]], label %[[LOOP_LATCH]] +; FORCED: [[IF_THEN_1]]: +; FORCED-NEXT: store i64 42, ptr [[PTR_IV]], align 1 +; FORCED-NEXT: br label %[[LOOP_LATCH]] +; FORCED: [[IF_THEN_2]]: +; FORCED-NEXT: store i64 0, ptr [[PTR_IV]], align 1 +; FORCED-NEXT: br label %[[LOOP_LATCH]] +; FORCED: [[IF_THEN_3]]: +; FORCED-NEXT: store i64 1, ptr [[PTR_IV]], align 1 +; FORCED-NEXT: br label %[[LOOP_LATCH]] +; FORCED: [[DEFAULT:.*:]] +; FORCED-NEXT: store i64 2, ptr poison, align 1 +; FORCED-NEXT: unreachable +; FORCED: [[LOOP_LATCH]]: +; FORCED-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 +; FORCED-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] +; FORCED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP9:![0-9]+]] +; FORCED: [[EXIT]]: +; FORCED-NEXT: ret void +; +entry: + br label %loop.header + +loop.header: + %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop.latch ] + %l = load i64, ptr %ptr.iv, align 1 + %eq.0 = icmp eq i64 %l, -12 + %eq.1 = icmp eq i64 %l, 13 + %eq.2 = icmp eq i64 %l, 0 + br i1 %eq.0, label %if.then.1, label %else.1 + +else.1: + br i1 %eq.1, label %if.then.2, label %else.2 + +else.2: + br i1 %eq.2, label %if.then.3, label %loop.latch + +if.then.1: + store i64 42, ptr %ptr.iv, align 1 + br label %loop.latch + +if.then.2: + store i64 0, ptr %ptr.iv, align 1 + br label %loop.latch + +if.then.3: + store i64 1, ptr %ptr.iv, align 1 + br label %loop.latch + +default: + store i64 2, ptr %ptr.iv, align 1 + br label %loop.latch + +loop.latch: + %ptr.iv.next = getelementptr inbounds i64, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv.next, %end + br i1 %ec, label %exit, label %loop.header + +exit: + ret void +} + + define void @switch_multiple_common_dests(ptr %start, ptr %end) { ; COST-LABEL: define void @switch_multiple_common_dests( @@ -204,9 +694,74 @@ define void @switch_multiple_common_dests(ptr %start, ptr %end) { ; FORCED-LABEL: define void @switch_multiple_common_dests( ; FORCED-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { ; FORCED-NEXT: [[ENTRY:.*]]: +; FORCED-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; FORCED-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; FORCED-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; FORCED-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; FORCED-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; FORCED-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; FORCED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; FORCED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; FORCED: [[VECTOR_PH]]: +; FORCED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 8 +; FORCED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; FORCED-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; FORCED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; FORCED-NEXT: br label %[[VECTOR_BODY:.*]] +; FORCED: [[VECTOR_BODY]]: +; FORCED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; FORCED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; FORCED-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; FORCED-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 +; 
FORCED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; FORCED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] +; FORCED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; FORCED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4 +; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1 +; FORCED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1 +; FORCED-NEXT: [[TMP23:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP24:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP25:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], zeroinitializer +; FORCED-NEXT: [[TMP26:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], zeroinitializer +; FORCED-NEXT: [[TMP13:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP14:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP17:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP18:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP15:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP16:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP27:%.*]] = or <4 x i1> [[TMP23]], [[TMP25]] +; FORCED-NEXT: [[TMP28:%.*]] = or <4 x i1> [[TMP24]], [[TMP26]] +; FORCED-NEXT: [[TMP21:%.*]] = or <4 x i1> [[TMP13]], [[TMP17]] +; FORCED-NEXT: [[TMP22:%.*]] = or <4 x i1> [[TMP14]], [[TMP18]] +; FORCED-NEXT: [[TMP35:%.*]] = or <4 x i1> [[TMP21]], [[TMP15]] +; FORCED-NEXT: [[TMP36:%.*]] = or <4 x i1> [[TMP22]], [[TMP16]] +; FORCED-NEXT: [[TMP37:%.*]] = or <4 x i1> [[TMP27]], [[TMP35]] +; FORCED-NEXT: [[TMP38:%.*]] = or <4 x i1> [[TMP28]], [[TMP36]] +; FORCED-NEXT: [[TMP39:%.*]] = xor <4 x i1> [[TMP37]], +; FORCED-NEXT: [[TMP40:%.*]] = xor <4 x i1> [[TMP38]], +; FORCED-NEXT: [[TMP29:%.*]] = or <4 x i1> [[TMP35]], [[TMP35]] +; FORCED-NEXT: [[TMP30:%.*]] = or <4 x i1> [[TMP36]], [[TMP36]] +; FORCED-NEXT: [[TMP31:%.*]] = or <4 x i1> [[TMP29]], [[TMP35]] +; FORCED-NEXT: [[TMP32:%.*]] = or <4 x i1> [[TMP30]], [[TMP36]] +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 1, <4 x i1> [[TMP31]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP32]]) +; FORCED-NEXT: [[TMP33:%.*]] = or <4 x i1> [[TMP27]], [[TMP27]] +; FORCED-NEXT: [[TMP34:%.*]] = or <4 x i1> [[TMP28]], [[TMP28]] +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP7]], i32 1, <4 x i1> [[TMP33]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP8]], i32 1, <4 x i1> [[TMP34]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP7]], i32 1, <4 x i1> [[TMP39]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP8]], i32 1, <4 x i1> [[TMP40]]) +; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FORCED-NEXT: [[TMP41:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FORCED-NEXT: br i1 [[TMP41]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; FORCED: [[MIDDLE_BLOCK]]: +; FORCED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; FORCED-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; FORCED: [[SCALAR_PH]]: +; FORCED-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; FORCED-NEXT: br label %[[LOOP_HEADER:.*]] ; FORCED: [[LOOP_HEADER]]: -; FORCED-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], 
%[[LOOP_LATCH:.*]] ] +; FORCED-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; FORCED-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; FORCED-NEXT: switch i64 [[L]], label %[[DEFAULT:.*]] [ ; FORCED-NEXT: i64 -12, label %[[IF_THEN_1:.*]] @@ -227,7 +782,7 @@ define void @switch_multiple_common_dests(ptr %start, ptr %end) { ; FORCED: [[LOOP_LATCH]]: ; FORCED-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; FORCED-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; FORCED-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; FORCED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP11:![0-9]+]] ; FORCED: [[EXIT]]: ; FORCED-NEXT: ret void ; @@ -298,9 +853,58 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; FORCED-LABEL: define void @switch4_default_common_dest_with_case( ; FORCED-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { ; FORCED-NEXT: [[ENTRY:.*]]: +; FORCED-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; FORCED-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; FORCED-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; FORCED-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; FORCED-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; FORCED-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; FORCED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; FORCED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; FORCED: [[VECTOR_PH]]: +; FORCED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 8 +; FORCED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; FORCED-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; FORCED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; FORCED-NEXT: br label %[[VECTOR_BODY:.*]] +; FORCED: [[VECTOR_BODY]]: +; FORCED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; FORCED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; FORCED-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; FORCED-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 +; FORCED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; FORCED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] +; FORCED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; FORCED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4 +; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1 +; FORCED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1 +; FORCED-NEXT: [[TMP15:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP16:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP11:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP12:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP17:%.*]] = or <4 x i1> [[TMP15]], [[TMP11]] +; FORCED-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP12]] +; FORCED-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[TMP17]], +; FORCED-NEXT: [[TMP21:%.*]] = xor <4 x i1> [[TMP18]], +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 1, <4 x i1> [[TMP11]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP12]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP7]], i32 1, <4 x i1> [[TMP15]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x 
i64> , ptr [[TMP8]], i32 1, <4 x i1> [[TMP16]]) +; FORCED-NEXT: [[TMP22:%.*]] = or <4 x i1> [[TMP20]], [[TMP20]] +; FORCED-NEXT: [[TMP23:%.*]] = or <4 x i1> [[TMP21]], [[TMP21]] +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP7]], i32 1, <4 x i1> [[TMP22]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP8]], i32 1, <4 x i1> [[TMP23]]) +; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FORCED-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FORCED-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; FORCED: [[MIDDLE_BLOCK]]: +; FORCED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; FORCED-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; FORCED: [[SCALAR_PH]]: +; FORCED-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; FORCED-NEXT: br label %[[LOOP_HEADER:.*]] ; FORCED: [[LOOP_HEADER]]: -; FORCED-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; FORCED-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; FORCED-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; FORCED-NEXT: switch i64 [[L]], label %[[DEFAULT:.*]] [ ; FORCED-NEXT: i64 -12, label %[[IF_THEN_1:.*]] @@ -319,7 +923,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; FORCED: [[LOOP_LATCH]]: ; FORCED-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; FORCED-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; FORCED-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; FORCED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP13:![0-9]+]] ; FORCED: [[EXIT]]: ; FORCED-NEXT: ret void ; @@ -391,9 +995,68 @@ define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end, ; FORCED-LABEL: define void @switch_under_br_default_common_dest_with_case( ; FORCED-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i64 [[X:%.*]]) #[[ATTR0]] { ; FORCED-NEXT: [[ENTRY:.*]]: +; FORCED-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; FORCED-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; FORCED-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; FORCED-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; FORCED-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; FORCED-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; FORCED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; FORCED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; FORCED: [[VECTOR_PH]]: +; FORCED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 8 +; FORCED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; FORCED-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; FORCED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; FORCED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X]], i64 0 +; FORCED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FORCED-NEXT: br label %[[VECTOR_BODY:.*]] +; FORCED: [[VECTOR_BODY]]: +; FORCED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; FORCED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; FORCED-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; FORCED-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 +; 
FORCED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; FORCED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] +; FORCED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; FORCED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4 +; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1 +; FORCED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1 +; FORCED-NEXT: [[TMP9:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; FORCED-NEXT: [[TMP10:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD4]], [[BROADCAST_SPLAT]] +; FORCED-NEXT: [[TMP11:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP12:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP13:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP14:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP11]], <4 x i1> zeroinitializer +; FORCED-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP12]], <4 x i1> zeroinitializer +; FORCED-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP13]], <4 x i1> zeroinitializer +; FORCED-NEXT: [[TMP26:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP14]], <4 x i1> zeroinitializer +; FORCED-NEXT: [[TMP27:%.*]] = or <4 x i1> [[TMP15]], [[TMP25]] +; FORCED-NEXT: [[TMP20:%.*]] = or <4 x i1> [[TMP16]], [[TMP26]] +; FORCED-NEXT: [[TMP21:%.*]] = xor <4 x i1> [[TMP27]], +; FORCED-NEXT: [[TMP22:%.*]] = xor <4 x i1> [[TMP20]], +; FORCED-NEXT: [[TMP23:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP21]], <4 x i1> zeroinitializer +; FORCED-NEXT: [[TMP24:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP22]], <4 x i1> zeroinitializer +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 1, <4 x i1> [[TMP25]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP26]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP7]], i32 1, <4 x i1> [[TMP15]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP8]], i32 1, <4 x i1> [[TMP16]]) +; FORCED-NEXT: [[TMP17:%.*]] = or <4 x i1> [[TMP23]], [[TMP23]] +; FORCED-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP24]], [[TMP24]] +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP7]], i32 1, <4 x i1> [[TMP17]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP8]], i32 1, <4 x i1> [[TMP18]]) +; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FORCED-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FORCED-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; FORCED: [[MIDDLE_BLOCK]]: +; FORCED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; FORCED-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; FORCED: [[SCALAR_PH]]: +; FORCED-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; FORCED-NEXT: br label %[[LOOP_HEADER:.*]] ; FORCED: [[LOOP_HEADER]]: -; FORCED-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; FORCED-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; FORCED-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; FORCED-NEXT: [[C:%.*]] = icmp ule i64 [[L]], [[X]] ; FORCED-NEXT: br i1 [[C]], 
label %[[THEN:.*]], label %[[LOOP_LATCH]] @@ -415,7 +1078,7 @@ define void @switch_under_br_default_common_dest_with_case(ptr %start, ptr %end, ; FORCED: [[LOOP_LATCH]]: ; FORCED-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; FORCED-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; FORCED-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; FORCED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP15:![0-9]+]] ; FORCED: [[EXIT]]: ; FORCED-NEXT: ret void ; @@ -491,9 +1154,72 @@ define void @br_under_switch_default_common_dest_with_case(ptr %start, ptr %end, ; FORCED-LABEL: define void @br_under_switch_default_common_dest_with_case( ; FORCED-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i64 [[X:%.*]]) #[[ATTR0]] { ; FORCED-NEXT: [[ENTRY:.*]]: +; FORCED-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; FORCED-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; FORCED-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; FORCED-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; FORCED-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; FORCED-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; FORCED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; FORCED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; FORCED: [[VECTOR_PH]]: +; FORCED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 8 +; FORCED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; FORCED-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; FORCED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; FORCED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X]], i64 0 +; FORCED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FORCED-NEXT: br label %[[VECTOR_BODY:.*]] +; FORCED: [[VECTOR_BODY]]: +; FORCED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; FORCED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; FORCED-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; FORCED-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 +; FORCED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; FORCED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] +; FORCED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; FORCED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4 +; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1 +; FORCED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1 +; FORCED-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP25:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP26:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP9]], [[TMP25]] +; FORCED-NEXT: [[TMP14:%.*]] = or <4 x i1> [[TMP10]], [[TMP26]] +; FORCED-NEXT: [[TMP15:%.*]] = xor <4 x i1> [[TMP13]], +; FORCED-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP14]], +; FORCED-NEXT: [[TMP17:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; FORCED-NEXT: [[TMP18:%.*]] = icmp ule <4 x i64> [[WIDE_LOAD4]], [[BROADCAST_SPLAT]] +; FORCED-NEXT: [[TMP19:%.*]] = xor <4 x i1> [[TMP17]], +; FORCED-NEXT: [[TMP20:%.*]] = xor <4 x i1> [[TMP18]], +; FORCED-NEXT: [[TMP27:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP19]], <4 x i1> zeroinitializer +; 
FORCED-NEXT: [[TMP28:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP20]], <4 x i1> zeroinitializer +; FORCED-NEXT: [[TMP29:%.*]] = or <4 x i1> [[TMP27]], [[TMP25]] +; FORCED-NEXT: [[TMP30:%.*]] = or <4 x i1> [[TMP28]], [[TMP26]] +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP7]], i32 1, <4 x i1> [[TMP29]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> zeroinitializer, ptr [[TMP8]], i32 1, <4 x i1> [[TMP30]]) +; FORCED-NEXT: [[TMP32:%.*]] = select <4 x i1> [[TMP9]], <4 x i1> [[TMP17]], <4 x i1> zeroinitializer +; FORCED-NEXT: [[TMP33:%.*]] = select <4 x i1> [[TMP10]], <4 x i1> [[TMP18]], <4 x i1> zeroinitializer +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP7]], i32 1, <4 x i1> [[TMP32]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP8]], i32 1, <4 x i1> [[TMP33]]) +; FORCED-NEXT: [[TMP34:%.*]] = or <4 x i1> [[TMP32]], [[TMP15]] +; FORCED-NEXT: [[TMP35:%.*]] = or <4 x i1> [[TMP33]], [[TMP16]] +; FORCED-NEXT: [[TMP36:%.*]] = or <4 x i1> [[TMP34]], [[TMP15]] +; FORCED-NEXT: [[TMP37:%.*]] = or <4 x i1> [[TMP35]], [[TMP16]] +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP7]], i32 1, <4 x i1> [[TMP36]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP8]], i32 1, <4 x i1> [[TMP37]]) +; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FORCED-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FORCED-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; FORCED: [[MIDDLE_BLOCK]]: +; FORCED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; FORCED-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; FORCED: [[SCALAR_PH]]: +; FORCED-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; FORCED-NEXT: br label %[[LOOP_HEADER:.*]] ; FORCED: [[LOOP_HEADER]]: -; FORCED-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; FORCED-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; FORCED-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; FORCED-NEXT: switch i64 [[L]], label %[[DEFAULT:.*]] [ ; FORCED-NEXT: i64 -12, label %[[IF_THEN_1:.*]] @@ -515,7 +1241,7 @@ define void @br_under_switch_default_common_dest_with_case(ptr %start, ptr %end, ; FORCED: [[LOOP_LATCH]]: ; FORCED-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; FORCED-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; FORCED-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; FORCED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP17:![0-9]+]] ; FORCED: [[EXIT]]: ; FORCED-NEXT: ret void ; @@ -588,9 +1314,94 @@ define void @large_number_of_cases(ptr %start, ptr %end) { ; FORCED-LABEL: define void @large_number_of_cases( ; FORCED-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) #[[ATTR0]] { ; FORCED-NEXT: [[ENTRY:.*]]: +; FORCED-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; FORCED-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; FORCED-NEXT: [[TMP0:%.*]] = add i64 [[END1]], -8 +; FORCED-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]] +; FORCED-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 3 +; FORCED-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1 +; FORCED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 8 +; FORCED-NEXT: br i1 
[[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; FORCED: [[VECTOR_PH]]: +; FORCED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 8 +; FORCED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; FORCED-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 8 +; FORCED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; FORCED-NEXT: br label %[[VECTOR_BODY:.*]] +; FORCED: [[VECTOR_BODY]]: +; FORCED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; FORCED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 +; FORCED-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0 +; FORCED-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 32 +; FORCED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; FORCED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] +; FORCED-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 0 +; FORCED-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[NEXT_GEP]], i32 4 +; FORCED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP7]], align 1 +; FORCED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP8]], align 1 +; FORCED-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP10:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP11:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP12:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP13:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP14:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP15:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP16:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP17:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP18:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP19:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP20:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP21:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP22:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP23:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP24:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP25:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], +; FORCED-NEXT: [[TMP26:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD4]], +; FORCED-NEXT: [[TMP27:%.*]] = or <4 x i1> [[TMP9]], [[TMP11]] +; FORCED-NEXT: [[TMP28:%.*]] = or <4 x i1> [[TMP10]], [[TMP12]] +; FORCED-NEXT: [[TMP29:%.*]] = or <4 x i1> [[TMP27]], [[TMP13]] +; FORCED-NEXT: [[TMP30:%.*]] = or <4 x i1> [[TMP28]], [[TMP14]] +; FORCED-NEXT: [[TMP31:%.*]] = or <4 x i1> [[TMP29]], [[TMP15]] +; FORCED-NEXT: [[TMP32:%.*]] = or <4 x i1> [[TMP30]], [[TMP16]] +; FORCED-NEXT: [[TMP33:%.*]] = or <4 x i1> [[TMP31]], [[TMP17]] +; FORCED-NEXT: [[TMP34:%.*]] = or <4 x i1> [[TMP32]], [[TMP18]] +; FORCED-NEXT: [[TMP35:%.*]] = or <4 x i1> [[TMP33]], [[TMP19]] +; FORCED-NEXT: [[TMP36:%.*]] = or <4 x i1> [[TMP34]], [[TMP20]] +; FORCED-NEXT: [[TMP37:%.*]] = or <4 x i1> [[TMP35]], [[TMP21]] +; FORCED-NEXT: [[TMP38:%.*]] = or <4 x i1> [[TMP36]], [[TMP22]] +; FORCED-NEXT: [[TMP39:%.*]] = or <4 x i1> [[TMP37]], [[TMP23]] +; FORCED-NEXT: [[TMP40:%.*]] = or <4 x i1> [[TMP38]], [[TMP24]] +; FORCED-NEXT: [[TMP41:%.*]] = or <4 x i1> [[TMP39]], [[TMP25]] +; FORCED-NEXT: [[TMP42:%.*]] = or <4 x i1> [[TMP40]], [[TMP26]] +; FORCED-NEXT: [[TMP43:%.*]] = or <4 x i1> [[TMP41]], [[TMP41]] +; FORCED-NEXT: [[TMP44:%.*]] = or <4 x i1> [[TMP42]], [[TMP42]] +; FORCED-NEXT: [[TMP45:%.*]] = or <4 x i1> 
[[TMP43]], [[TMP41]] +; FORCED-NEXT: [[TMP46:%.*]] = or <4 x i1> [[TMP44]], [[TMP42]] +; FORCED-NEXT: [[TMP47:%.*]] = or <4 x i1> [[TMP45]], [[TMP41]] +; FORCED-NEXT: [[TMP48:%.*]] = or <4 x i1> [[TMP46]], [[TMP42]] +; FORCED-NEXT: [[TMP49:%.*]] = or <4 x i1> [[TMP47]], [[TMP41]] +; FORCED-NEXT: [[TMP50:%.*]] = or <4 x i1> [[TMP48]], [[TMP42]] +; FORCED-NEXT: [[TMP51:%.*]] = or <4 x i1> [[TMP49]], [[TMP41]] +; FORCED-NEXT: [[TMP52:%.*]] = or <4 x i1> [[TMP50]], [[TMP42]] +; FORCED-NEXT: [[TMP53:%.*]] = or <4 x i1> [[TMP51]], [[TMP41]] +; FORCED-NEXT: [[TMP54:%.*]] = or <4 x i1> [[TMP52]], [[TMP42]] +; FORCED-NEXT: [[TMP55:%.*]] = or <4 x i1> [[TMP53]], [[TMP41]] +; FORCED-NEXT: [[TMP56:%.*]] = or <4 x i1> [[TMP54]], [[TMP42]] +; FORCED-NEXT: [[TMP57:%.*]] = or <4 x i1> [[TMP55]], [[TMP41]] +; FORCED-NEXT: [[TMP58:%.*]] = or <4 x i1> [[TMP56]], [[TMP42]] +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP7]], i32 1, <4 x i1> [[TMP57]]) +; FORCED-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> , ptr [[TMP8]], i32 1, <4 x i1> [[TMP58]]) +; FORCED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; FORCED-NEXT: [[TMP59:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FORCED-NEXT: br i1 [[TMP59]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; FORCED: [[MIDDLE_BLOCK]]: +; FORCED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; FORCED-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; FORCED: [[SCALAR_PH]]: +; FORCED-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; FORCED-NEXT: br label %[[LOOP_HEADER:.*]] ; FORCED: [[LOOP_HEADER]]: -; FORCED-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; FORCED-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; FORCED-NEXT: [[L:%.*]] = load i64, ptr [[PTR_IV]], align 1 ; FORCED-NEXT: switch i64 [[L]], label %[[LOOP_LATCH]] [ ; FORCED-NEXT: i64 1, label %[[IF_THEN:.*]] @@ -609,7 +1420,7 @@ define void @large_number_of_cases(ptr %start, ptr %end) { ; FORCED: [[LOOP_LATCH]]: ; FORCED-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i64, ptr [[PTR_IV]], i64 1 ; FORCED-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; FORCED-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; FORCED-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP19:![0-9]+]] ; FORCED: [[EXIT]]: ; FORCED-NEXT: ret void ; @@ -644,3 +1455,34 @@ exit: ret void } +;. +; COST: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; COST: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; COST: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; COST: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; COST: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; COST: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; COST: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; COST: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +;. 
+; FORCED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; FORCED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; FORCED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; FORCED: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; FORCED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; FORCED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; FORCED: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; FORCED: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; FORCED: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; FORCED: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; FORCED: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; FORCED: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +; FORCED: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]], [[META2]]} +; FORCED: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META1]]} +; FORCED: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]], [[META2]]} +; FORCED: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META1]]} +; FORCED: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]], [[META2]]} +; FORCED: [[LOOP17]] = distinct !{[[LOOP17]], [[META2]], [[META1]]} +; FORCED: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]} +; FORCED: [[LOOP19]] = distinct !{[[LOOP19]], [[META2]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/no_switch.ll b/llvm/test/Transforms/LoopVectorize/no_switch.ll index c62826f9554e63..118a15e63fe99f 100644 --- a/llvm/test/Transforms/LoopVectorize/no_switch.ll +++ b/llvm/test/Transforms/LoopVectorize/no_switch.ll @@ -2,18 +2,16 @@ ; RUN: opt < %s -passes=loop-vectorize,transform-warning -force-vector-width=1 -S 2>&1 | FileCheck %s -check-prefix=NOANALYSIS ; RUN: opt < %s -passes=loop-vectorize,transform-warning -force-vector-width=4 -pass-remarks-missed='loop-vectorize' -S 2>&1 | FileCheck %s -check-prefix=MOREINFO -; CHECK: remark: source.cpp:4:5: loop not vectorized: loop contains a switch statement -; CHECK: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering +; CHECK-NOT: loop not vectorized: loop contains a switch statement +; CHECK-NOT: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering ; NOANALYSIS-NOT: remark: {{.*}} -; NOANALYSIS: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering +; NOANALYSIS: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering -; MOREINFO: remark: source.cpp:4:5: loop not vectorized: loop contains a switch statement -; MOREINFO: remark: source.cpp:4:5: loop not vectorized (Force=true, Vector Width=4) -; MOREINFO: warning: source.cpp:4:5: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering +; MOREINFO-NOT: remark ; CHECK: _Z11test_switchPii -; CHECK-NOT: x i32> +; CHECK: vector.body: ; CHECK: ret target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" diff 
--git a/llvm/test/Transforms/LoopVectorize/predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/predicate-switch.ll index 71d8ca931760a2..56b55cf5d3b398 100644 --- a/llvm/test/Transforms/LoopVectorize/predicate-switch.ll +++ b/llvm/test/Transforms/LoopVectorize/predicate-switch.ll @@ -6,9 +6,76 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; IC1-LABEL: define void @switch4_default_common_dest_with_case( ; IC1-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) { ; IC1-NEXT: [[ENTRY:.*]]: +; IC1-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC1-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC1-NEXT: [[TMP0:%.*]] = sub i64 [[END1]], [[START2]] +; IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2 +; IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC1: [[VECTOR_PH]]: +; IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2 +; IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; IC1-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] +; IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1: [[VECTOR_BODY]]: +; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE13:.*]] ] +; IC1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; IC1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 +; IC1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP1]] +; IC1-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP2]] +; IC1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; IC1-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP3]], align 1 +; IC1-NEXT: [[TMP7:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD]], +; IC1-NEXT: [[TMP4:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD]], +; IC1-NEXT: [[TMP11:%.*]] = or <2 x i1> [[TMP7]], [[TMP4]] +; IC1-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP11]], +; IC1-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; IC1-NEXT: br i1 [[TMP5]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; IC1: [[PRED_STORE_IF]]: +; IC1-NEXT: store i8 0, ptr [[NEXT_GEP]], align 1 +; IC1-NEXT: br label %[[PRED_STORE_CONTINUE]] +; IC1: [[PRED_STORE_CONTINUE]]: +; IC1-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; IC1-NEXT: br i1 [[TMP6]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]] +; IC1: [[PRED_STORE_IF4]]: +; IC1-NEXT: store i8 0, ptr [[NEXT_GEP3]], align 1 +; IC1-NEXT: br label %[[PRED_STORE_CONTINUE5]] +; IC1: [[PRED_STORE_CONTINUE5]]: +; IC1-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; IC1-NEXT: br i1 [[TMP8]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7:.*]] +; IC1: [[PRED_STORE_IF6]]: +; IC1-NEXT: store i8 42, ptr [[NEXT_GEP]], align 1 +; IC1-NEXT: br label %[[PRED_STORE_CONTINUE7]] +; IC1: [[PRED_STORE_CONTINUE7]]: +; IC1-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; IC1-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF8:.*]], label %[[PRED_STORE_CONTINUE9:.*]] +; IC1: [[PRED_STORE_IF8]]: +; IC1-NEXT: store i8 42, ptr [[NEXT_GEP3]], align 1 +; IC1-NEXT: br label %[[PRED_STORE_CONTINUE9]] +; IC1: [[PRED_STORE_CONTINUE9]]: +; IC1-NEXT: [[TMP12:%.*]] = or <2 x i1> [[TMP10]], [[TMP10]] +; IC1-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0 +; IC1-NEXT: br i1 [[TMP13]], label %[[PRED_STORE_IF10:.*]], label %[[PRED_STORE_CONTINUE11:.*]] +; IC1: [[PRED_STORE_IF10]]: +; IC1-NEXT: store i8 2, ptr [[NEXT_GEP]], align 1 +; IC1-NEXT: br label %[[PRED_STORE_CONTINUE11]] +; IC1: [[PRED_STORE_CONTINUE11]]: +; 
IC1-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP12]], i32 1 +; IC1-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF12:.*]], label %[[PRED_STORE_CONTINUE13]] +; IC1: [[PRED_STORE_IF12]]: +; IC1-NEXT: store i8 2, ptr [[NEXT_GEP3]], align 1 +; IC1-NEXT: br label %[[PRED_STORE_CONTINUE13]] +; IC1: [[PRED_STORE_CONTINUE13]]: +; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; IC1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC1-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IC1: [[MIDDLE_BLOCK]]: +; IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; IC1-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC1: [[SCALAR_PH]]: +; IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; IC1-NEXT: br label %[[LOOP_HEADER:.*]] ; IC1: [[LOOP_HEADER]]: -; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC1-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC1-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 1 ; IC1-NEXT: switch i8 [[L]], label %[[DEFAULT:.*]] [ ; IC1-NEXT: i8 -12, label %[[IF_THEN_1:.*]] @@ -27,16 +94,130 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; IC1: [[LOOP_LATCH]]: ; IC1-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i8, ptr [[PTR_IV]], i64 1 ; IC1-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; IC1-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; IC1: [[EXIT]]: ; IC1-NEXT: ret void ; ; IC2-LABEL: define void @switch4_default_common_dest_with_case( ; IC2-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) { ; IC2-NEXT: [[ENTRY:.*]]: +; IC2-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; IC2-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; IC2-NEXT: [[TMP0:%.*]] = sub i64 [[END1]], [[START2]] +; IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC2: [[VECTOR_PH]]: +; IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; IC2-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]] +; IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; IC2: [[VECTOR_BODY]]: +; IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE28:.*]] ] +; IC2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0 +; IC2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 +; IC2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 2 +; IC2-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 3 +; IC2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP1]] +; IC2-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP2]] +; IC2-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP3]] +; IC2-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] +; IC2-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 +; IC2-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 2 +; IC2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, ptr [[TMP5]], align 1 +; IC2-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x i8>, ptr [[TMP6]], align 1 +; IC2-NEXT: [[TMP13:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD]], +; IC2-NEXT: [[TMP14:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD6]], +; IC2-NEXT: 
[[TMP7:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD]], +; IC2-NEXT: [[TMP8:%.*]] = icmp eq <2 x i8> [[WIDE_LOAD6]], +; IC2-NEXT: [[TMP21:%.*]] = or <2 x i1> [[TMP13]], [[TMP7]] +; IC2-NEXT: [[TMP22:%.*]] = or <2 x i1> [[TMP14]], [[TMP8]] +; IC2-NEXT: [[TMP19:%.*]] = xor <2 x i1> [[TMP21]], +; IC2-NEXT: [[TMP20:%.*]] = xor <2 x i1> [[TMP22]], +; IC2-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; IC2-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; IC2: [[PRED_STORE_IF]]: +; IC2-NEXT: store i8 0, ptr [[NEXT_GEP]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE]] +; IC2: [[PRED_STORE_CONTINUE]]: +; IC2-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; IC2-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] +; IC2: [[PRED_STORE_IF7]]: +; IC2-NEXT: store i8 0, ptr [[NEXT_GEP3]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE8]] +; IC2: [[PRED_STORE_CONTINUE8]]: +; IC2-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0 +; IC2-NEXT: br i1 [[TMP11]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; IC2: [[PRED_STORE_IF9]]: +; IC2-NEXT: store i8 0, ptr [[NEXT_GEP4]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; IC2: [[PRED_STORE_CONTINUE10]]: +; IC2-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1 +; IC2-NEXT: br i1 [[TMP12]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] +; IC2: [[PRED_STORE_IF11]]: +; IC2-NEXT: store i8 0, ptr [[NEXT_GEP5]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; IC2: [[PRED_STORE_CONTINUE12]]: +; IC2-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP13]], i32 0 +; IC2-NEXT: br i1 [[TMP15]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]] +; IC2: [[PRED_STORE_IF13]]: +; IC2-NEXT: store i8 42, ptr [[NEXT_GEP]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE14]] +; IC2: [[PRED_STORE_CONTINUE14]]: +; IC2-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP13]], i32 1 +; IC2-NEXT: br i1 [[TMP16]], label %[[PRED_STORE_IF15:.*]], label %[[PRED_STORE_CONTINUE16:.*]] +; IC2: [[PRED_STORE_IF15]]: +; IC2-NEXT: store i8 42, ptr [[NEXT_GEP3]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE16]] +; IC2: [[PRED_STORE_CONTINUE16]]: +; IC2-NEXT: [[TMP17:%.*]] = extractelement <2 x i1> [[TMP14]], i32 0 +; IC2-NEXT: br i1 [[TMP17]], label %[[PRED_STORE_IF17:.*]], label %[[PRED_STORE_CONTINUE18:.*]] +; IC2: [[PRED_STORE_IF17]]: +; IC2-NEXT: store i8 42, ptr [[NEXT_GEP4]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE18]] +; IC2: [[PRED_STORE_CONTINUE18]]: +; IC2-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[TMP14]], i32 1 +; IC2-NEXT: br i1 [[TMP18]], label %[[PRED_STORE_IF19:.*]], label %[[PRED_STORE_CONTINUE20:.*]] +; IC2: [[PRED_STORE_IF19]]: +; IC2-NEXT: store i8 42, ptr [[NEXT_GEP5]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE20]] +; IC2: [[PRED_STORE_CONTINUE20]]: +; IC2-NEXT: [[TMP23:%.*]] = or <2 x i1> [[TMP19]], [[TMP19]] +; IC2-NEXT: [[TMP24:%.*]] = or <2 x i1> [[TMP20]], [[TMP20]] +; IC2-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP23]], i32 0 +; IC2-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF21:.*]], label %[[PRED_STORE_CONTINUE22:.*]] +; IC2: [[PRED_STORE_IF21]]: +; IC2-NEXT: store i8 2, ptr [[NEXT_GEP]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE22]] +; IC2: [[PRED_STORE_CONTINUE22]]: +; IC2-NEXT: [[TMP26:%.*]] = extractelement <2 x i1> [[TMP23]], i32 1 +; IC2-NEXT: br i1 [[TMP26]], label 
%[[PRED_STORE_IF23:.*]], label %[[PRED_STORE_CONTINUE24:.*]] +; IC2: [[PRED_STORE_IF23]]: +; IC2-NEXT: store i8 2, ptr [[NEXT_GEP3]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE24]] +; IC2: [[PRED_STORE_CONTINUE24]]: +; IC2-NEXT: [[TMP27:%.*]] = extractelement <2 x i1> [[TMP24]], i32 0 +; IC2-NEXT: br i1 [[TMP27]], label %[[PRED_STORE_IF25:.*]], label %[[PRED_STORE_CONTINUE26:.*]] +; IC2: [[PRED_STORE_IF25]]: +; IC2-NEXT: store i8 2, ptr [[NEXT_GEP4]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE26]] +; IC2: [[PRED_STORE_CONTINUE26]]: +; IC2-NEXT: [[TMP28:%.*]] = extractelement <2 x i1> [[TMP24]], i32 1 +; IC2-NEXT: br i1 [[TMP28]], label %[[PRED_STORE_IF27:.*]], label %[[PRED_STORE_CONTINUE28]] +; IC2: [[PRED_STORE_IF27]]: +; IC2-NEXT: store i8 2, ptr [[NEXT_GEP5]], align 1 +; IC2-NEXT: br label %[[PRED_STORE_CONTINUE28]] +; IC2: [[PRED_STORE_CONTINUE28]]: +; IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC2-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IC2-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IC2: [[MIDDLE_BLOCK]]: +; IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; IC2-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC2: [[SCALAR_PH]]: +; IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] ; IC2-NEXT: br label %[[LOOP_HEADER:.*]] ; IC2: [[LOOP_HEADER]]: -; IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[START]], %[[ENTRY]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC2-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC2-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 1 ; IC2-NEXT: switch i8 [[L]], label %[[DEFAULT:.*]] [ ; IC2-NEXT: i8 -12, label %[[IF_THEN_1:.*]] @@ -55,7 +236,7 @@ define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { ; IC2: [[LOOP_LATCH]]: ; IC2-NEXT: [[PTR_IV_NEXT]] = getelementptr inbounds i8, ptr [[PTR_IV]], i64 1 ; IC2-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV_NEXT]], [[END]] -; IC2-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC2-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] ; IC2: [[EXIT]]: ; IC2-NEXT: ret void ; @@ -162,13 +343,13 @@ define void @switch_to_header(ptr %start) { ; IC1-NEXT: [[ENTRY:.*]]: ; IC1-NEXT: br label %[[LOOP_HEADER:.*]] ; IC1: [[LOOP_HEADER]]: -; IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[IF_THEN1:.*]] ] +; IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_HEADER_BACKEDGE:.*]] ] ; IC1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; IC1-NEXT: switch i64 [[IV]], label %[[LOOP_LATCH:.*]] [ -; IC1-NEXT: i64 120, label %[[IF_THEN1]] +; IC1-NEXT: i64 120, label %[[LOOP_HEADER_BACKEDGE]] ; IC1-NEXT: i64 100, label %[[LOOP_LATCH]] ; IC1-NEXT: ] -; IC1: [[IF_THEN1]]: +; IC1: [[LOOP_HEADER_BACKEDGE]]: ; IC1-NEXT: br label %[[LOOP_HEADER]] ; IC1: [[IF_THEN:.*:]] ; IC1-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 poison @@ -176,7 +357,7 @@ define void @switch_to_header(ptr %start) { ; IC1-NEXT: unreachable ; IC1: [[LOOP_LATCH]]: ; IC1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 100 -; IC1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[IF_THEN1]] +; IC1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP_HEADER_BACKEDGE]] ; IC1: [[EXIT]]: ; IC1-NEXT: ret void ; @@ -185,13 +366,13 @@ define void @switch_to_header(ptr %start) { ; 
IC2-NEXT: [[ENTRY:.*]]: ; IC2-NEXT: br label %[[LOOP_HEADER:.*]] ; IC2: [[LOOP_HEADER]]: -; IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[IF_THEN1:.*]] ] +; IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_HEADER_BACKEDGE:.*]] ] ; IC2-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; IC2-NEXT: switch i64 [[IV]], label %[[LOOP_LATCH:.*]] [ -; IC2-NEXT: i64 120, label %[[IF_THEN1]] +; IC2-NEXT: i64 120, label %[[LOOP_HEADER_BACKEDGE]] ; IC2-NEXT: i64 100, label %[[LOOP_LATCH]] ; IC2-NEXT: ] -; IC2: [[IF_THEN1]]: +; IC2: [[LOOP_HEADER_BACKEDGE]]: ; IC2-NEXT: br label %[[LOOP_HEADER]] ; IC2: [[IF_THEN:.*:]] ; IC2-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 poison @@ -199,7 +380,7 @@ define void @switch_to_header(ptr %start) { ; IC2-NEXT: unreachable ; IC2: [[LOOP_LATCH]]: ; IC2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 100 -; IC2-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[IF_THEN1]] +; IC2-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP_HEADER_BACKEDGE]] ; IC2: [[EXIT]]: ; IC2-NEXT: ret void ; @@ -231,9 +412,25 @@ define void @switch_all_to_default(ptr %start) { ; IC1-LABEL: define void @switch_all_to_default( ; IC1-SAME: ptr [[START:%.*]]) { ; IC1-NEXT: [[ENTRY:.*]]: +; IC1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC1: [[VECTOR_PH]]: +; IC1-NEXT: br label %[[VECTOR_BODY:.*]] +; IC1: [[VECTOR_BODY]]: +; IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 [[TMP0]] +; IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; IC1-NEXT: store <2 x i64> , ptr [[TMP2]], align 1 +; IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; IC1-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; IC1-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IC1: [[MIDDLE_BLOCK]]: +; IC1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC1: [[SCALAR_PH]]: +; IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; IC1-NEXT: br label %[[LOOP_HEADER:.*]] ; IC1: [[LOOP_HEADER]]: -; IC1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; IC1-NEXT: switch i64 [[IV]], label %[[LOOP_LATCH]] [ ; IC1-NEXT: i64 120, label %[[LOOP_LATCH]] @@ -243,16 +440,36 @@ define void @switch_all_to_default(ptr %start) { ; IC1-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 [[IV]] ; IC1-NEXT: store i64 42, ptr [[GEP]], align 1 ; IC1-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 100 -; IC1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC1-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] ; IC1: [[EXIT]]: ; IC1-NEXT: ret void ; ; IC2-LABEL: define void @switch_all_to_default( ; IC2-SAME: ptr [[START:%.*]]) { ; IC2-NEXT: [[ENTRY:.*]]: +; IC2-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; IC2: [[VECTOR_PH]]: +; IC2-NEXT: br label %[[VECTOR_BODY:.*]] +; IC2: [[VECTOR_BODY]]: +; IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; IC2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; IC2-NEXT: [[TMP1:%.*]] = 
add i64 [[INDEX]], 2 +; IC2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 [[TMP0]] +; IC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 [[TMP1]] +; IC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 +; IC2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 +; IC2-NEXT: store <2 x i64> , ptr [[TMP4]], align 1 +; IC2-NEXT: store <2 x i64> , ptr [[TMP5]], align 1 +; IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; IC2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100 +; IC2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; IC2: [[MIDDLE_BLOCK]]: +; IC2-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]] +; IC2: [[SCALAR_PH]]: +; IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 100, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] ; IC2-NEXT: br label %[[LOOP_HEADER:.*]] ; IC2: [[LOOP_HEADER]]: -; IC2-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; IC2-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] ; IC2-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; IC2-NEXT: switch i64 [[IV]], label %[[LOOP_LATCH]] [ ; IC2-NEXT: i64 120, label %[[LOOP_LATCH]] @@ -262,7 +479,7 @@ define void @switch_all_to_default(ptr %start) { ; IC2-NEXT: [[GEP:%.*]] = getelementptr inbounds i64, ptr [[START]], i64 [[IV]] ; IC2-NEXT: store i64 42, ptr [[GEP]], align 1 ; IC2-NEXT: [[CMP:%.*]] = icmp eq i64 [[IV_NEXT]], 100 -; IC2-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; IC2-NEXT: br i1 [[CMP]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]] ; IC2: [[EXIT]]: ; IC2-NEXT: ret void ; @@ -286,3 +503,18 @@ loop.latch: exit: ret void } +;. +; IC1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; IC1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; IC1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; IC1: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; IC1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; IC1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. +; IC2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; IC2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; IC2: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; IC2: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; IC2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; IC2: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +;. 
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll index 6e3e55a319ec20..7d8611405e2264 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-predicate-switch.ll @@ -2,7 +2,98 @@ ; RUN: opt -p loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -debug -disable-output %s 2>&1 | FileCheck %s define void @switch4_default_common_dest_with_case(ptr %start, ptr %end) { -; CHECK-NOT: VPlan +; CHECK: VPlan 'Final VPlan for VF={2},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VTC:%.+]]> = vector-trip-count +; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV ((-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<%12> +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> +; CHECK-NEXT: EMIT vp<[[PTR:%.+]]> = ptradd ir<%start>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[WIDE_PTR:%.+]]> = vector-pointer vp<[[PTR]]> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[WIDE_PTR]]> +; CHECK-NEXT: EMIT vp<[[C1:%.+]]> = icmp eq ir<%l>, ir<-12> +; CHECK-NEXT: EMIT vp<[[C2:%.+]]> = icmp eq ir<%l>, ir<13> +; CHECK-NEXT: EMIT vp<[[OR_CASES:%.+]]> = or vp<[[C1]]>, vp<[[C2]]> +; CHECK-NEXT: EMIT vp<[[DEFAULT_MASK:%.+]]> = not vp<[[OR_CASES]]> +; CHECK-NEXT: Successor(s): pred.store +; CHECK-EMPTY: +; CHECK-NEXT: pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<[[C2]]> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE store ir<0>, vp<[[PTR]]> +; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): if.then.2.0 +; CHECK-EMPTY: +; CHECK-NEXT: if.then.2.0: +; CHECK-NEXT: Successor(s): pred.store +; CHECK-EMPTY: +; CHECK-NEXT: pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<[[C1]]> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE store ir<42>, vp<[[PTR]]> +; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): if.then.1.1 +; CHECK-EMPTY: +; CHECK-NEXT: if.then.1.1: +; CHECK-NEXT: EMIT vp<[[C3:%.+]]> = or vp<[[DEFAULT_MASK]]>, vp<[[DEFAULT_MASK]]> +; CHECK-NEXT: Successor(s): pred.store +; CHECK-EMPTY: +; CHECK-NEXT: pred.store: { +; CHECK-NEXT: pred.store.entry: +; CHECK-NEXT: BRANCH-ON-MASK vp<[[C3]]> +; CHECK-NEXT: Successor(s): pred.store.if, pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.if: +; CHECK-NEXT: REPLICATE store ir<2>, vp<[[PTR]]> +; CHECK-NEXT: Successor(s): pred.store.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.store.continue: +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): default.2 +; CHECK-EMPTY: +; CHECK-NEXT: default.2: +; CHECK-NEXT: EMIT vp<[[CAN_CMP:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count 
vp<[[CAN_CMP]]>, vp<[[VTC]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+; CHECK-EMPTY:
+; CHECK-NEXT: middle.block:
+; CHECK-NEXT: EMIT vp<[[MIDDLE_CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VTC]]>
+; CHECK-NEXT: EMIT branch-on-cond vp<[[MIDDLE_CMP]]>
+; CHECK-NEXT: Successor(s): ir-bb, scalar.ph
+; CHECK-EMPTY:
+; CHECK-NEXT: ir-bb:
+; CHECK-NEXT: No successors
+; CHECK-EMPTY:
+; CHECK-NEXT: scalar.ph:
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
;
entry:
br label %loop.header
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll
index 94e5f7feb53a1c..3dfa68ff96ddce 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll
@@ -5,9 +5,9 @@
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
-; FIXME: The br -> switch conversion blocks vectorization.
-
+; Make sure we vectorize when branches are converted to a switch.
define dso_local void @test(ptr %start, ptr %end) #0 {
+;
; AVX-LABEL: @test(
; AVX-NEXT: entry:
; AVX-NEXT: [[I11_NOT1:%.*]] = icmp eq ptr [[START:%.*]], [[END:%.*]]
; AVX-NEXT: br i1 [[I11_NOT1]], label [[EXIT:%.*]], label [[BB12:%.*]]
@@ -32,9 +32,59 @@ define dso_local void @test(ptr %start, ptr %end) #0 {
; AVX2-LABEL: @test(
; AVX2-NEXT: entry:
; AVX2-NEXT: [[I11_NOT1:%.*]] = icmp eq ptr [[START:%.*]], [[END:%.*]]
-; AVX2-NEXT: br i1 [[I11_NOT1]], label [[EXIT:%.*]], label [[BB12:%.*]]
+; AVX2-NEXT: br i1 [[I11_NOT1]], label [[EXIT:%.*]], label [[BB12_PREHEADER:%.*]]
+; AVX2: bb12.preheader:
+; AVX2-NEXT: [[END3:%.*]] = ptrtoint ptr [[END]] to i64
+; AVX2-NEXT: [[START4:%.*]] = ptrtoint ptr [[START]] to i64
+; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[END3]], -4
+; AVX2-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START4]]
+; AVX2-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 2
+; AVX2-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
+; AVX2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 124
+; AVX2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[BB12_PREHEADER11:%.*]], label [[VECTOR_PH:%.*]]
+; AVX2: vector.ph:
+; AVX2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775776
+; AVX2-NEXT: [[TMP4:%.*]] = shl i64 [[N_VEC]], 2
+; AVX2-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
+; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
+; AVX2: vector.body:
+; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX2-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 2
+; AVX2-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[OFFSET_IDX]]
+; AVX2-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 32
+; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 64
+; AVX2-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 96
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[NEXT_GEP]], align 4
+; AVX2-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4
+; AVX2-NEXT: [[WIDE_LOAD9:%.*]] = load <8 x i32>, ptr [[TMP6]], align 4
+; AVX2-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i32>, ptr [[TMP7]], align 4
+; AVX2-NEXT: [[TMP8:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD]],
+; AVX2-NEXT: [[TMP9:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD8]],
+; AVX2-NEXT: [[TMP10:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD9]],
+; AVX2-NEXT: [[TMP11:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD10]],
+; AVX2-NEXT: [[TMP12:%.*]] = icmp eq <8 x i32>
[[WIDE_LOAD]], +; AVX2-NEXT: [[TMP13:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD8]], +; AVX2-NEXT: [[TMP14:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD9]], +; AVX2-NEXT: [[TMP15:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD10]], +; AVX2-NEXT: [[TMP16:%.*]] = or <8 x i1> [[TMP8]], [[TMP12]] +; AVX2-NEXT: [[TMP17:%.*]] = or <8 x i1> [[TMP9]], [[TMP13]] +; AVX2-NEXT: [[TMP18:%.*]] = or <8 x i1> [[TMP10]], [[TMP14]] +; AVX2-NEXT: [[TMP19:%.*]] = or <8 x i1> [[TMP11]], [[TMP15]] +; AVX2-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> , ptr [[NEXT_GEP]], i32 4, <8 x i1> [[TMP16]]) +; AVX2-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> , ptr [[TMP5]], i32 4, <8 x i1> [[TMP17]]) +; AVX2-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> , ptr [[TMP6]], i32 4, <8 x i1> [[TMP18]]) +; AVX2-NEXT: tail call void @llvm.masked.store.v8i32.p0(<8 x i32> , ptr [[TMP7]], i32 4, <8 x i1> [[TMP19]]) +; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; AVX2-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX2-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; AVX2: middle.block: +; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; AVX2-NEXT: br i1 [[CMP_N]], label [[EXIT]], label [[BB12_PREHEADER11]] +; AVX2: bb12.preheader11: +; AVX2-NEXT: [[PTR2_PH:%.*]] = phi ptr [ [[START]], [[BB12_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] +; AVX2-NEXT: br label [[BB12:%.*]] ; AVX2: bb12: -; AVX2-NEXT: [[PTR2:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LATCH:%.*]] ], [ [[START]], [[ENTRY:%.*]] ] +; AVX2-NEXT: [[PTR2:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LATCH:%.*]] ], [ [[PTR2_PH]], [[BB12_PREHEADER11]] ] ; AVX2-NEXT: [[VAL:%.*]] = load i32, ptr [[PTR2]], align 4 ; AVX2-NEXT: switch i32 [[VAL]], label [[LATCH]] [ ; AVX2-NEXT: i32 -12, label [[STORE:%.*]] @@ -46,7 +96,7 @@ define dso_local void @test(ptr %start, ptr %end) #0 { ; AVX2: latch: ; AVX2-NEXT: [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR2]], i64 4 ; AVX2-NEXT: [[I11_NOT:%.*]] = icmp eq ptr [[PTR_NEXT]], [[END]] -; AVX2-NEXT: br i1 [[I11_NOT]], label [[EXIT]], label [[BB12]] +; AVX2-NEXT: br i1 [[I11_NOT]], label [[EXIT]], label [[BB12]], !llvm.loop [[LOOP3:![0-9]+]] ; AVX2: exit: ; AVX2-NEXT: ret void ;