From 96b3166602cbe3dc1240bc3189cf1581273928a2 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka@google.com>
Date: Thu, 22 Aug 2024 22:21:20 -0700
Subject: [PATCH] Revert "[SLP]Improve/fix subvectors in gather/buildvector
 nodes handling" (#105780) with "[Vectorize] Fix warnings"

It introduced compiler crashes, see #104144.

This reverts commit 69332bb8995aef60d830406de12cb79a50390261 and
351f4a5593f1ef507708ec5eeca165b20add3340.
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 328 +++++++++-------
 .../PhaseOrdering/AArch64/slpordering.ll      |  74 ++--
 .../SLPVectorizer/AArch64/getelementptr.ll    |  11 +-
 .../SLPVectorizer/AArch64/loadorder.ll        | 192 ++++-----
 .../AArch64/multiple_reduction.ll             | 365 +++++++++++-------
 .../AArch64/scalarization-overhead.ll         |  62 +--
 .../AArch64/shuffle-vectors-mask-size.ll      |   7 +-
 .../SLPVectorizer/AArch64/tsc-s116.ll         |   8 +-
 .../vectorizable-selects-uniform-cmps.ll      |  32 +-
 .../RISCV/combined-loads-stored.ll            |   7 +-
 .../SLPVectorizer/RISCV/reductions.ll         |  48 +--
 .../SLPVectorizer/SystemZ/pr34619.ll          |  11 +-
 .../Transforms/SLPVectorizer/X86/addsub.ll    |  18 +-
 .../X86/extract-many-users-buildvector.ll     |  43 ++-
 .../X86/extract-scalar-from-undef.ll          |  27 +-
 .../X86/gather-node-same-as-vect-but-order.ll |  13 +-
 .../SLPVectorizer/X86/horizontal-minmax.ll    |  16 +-
 .../SLPVectorizer/X86/inst_size_bug.ll        |  18 +-
 .../SLPVectorizer/X86/landing_pad.ll          |  19 +-
 llvm/test/Transforms/SLPVectorizer/X86/phi.ll |  36 +-
 .../SLPVectorizer/X86/reduction-logical.ll    |  17 +-
 .../X86/remark-partial-loads-vectorize.ll     |  16 +-
 .../X86/scatter-vectorize-reused-pointer.ll   |  26 +-
 .../X86/schedule_budget_debug_info.ll         |  40 +-
 .../SLPVectorizer/X86/split-load8_2-unord.ll  |  39 +-
 .../Transforms/SLPVectorizer/X86/tiny-tree.ll |   5 +-
 .../X86/vect-gather-same-nodes.ll             |   6 +-
 27 files changed, 785 insertions(+), 699 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e8ab6839d9fa87..d7763a022f3b6e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3094,10 +3094,6 @@ class BoUpSLP {
   /// The index of this treeEntry in VectorizableTree.
   int Idx = -1;
 
-  /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
-  /// other nodes as a series of insertvector instructions.
-  SmallVector<std::pair<unsigned, unsigned>, 0> CombinedEntriesWithIndices;
-
 private:
   /// The operands of each instruction in each lane Operands[op_index][lane].
   /// Note: This helps avoid the replication of the code that performs the
@@ -3398,9 +3394,7 @@ class BoUpSLP {
       if (!isConstant(V)) {
         auto *I = dyn_cast<CastInst>(V);
         AllConstsOrCasts &= I && I->getType()->isIntegerTy();
-        if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
-            !UserTreeIdx.UserTE->isGather())
-          ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
+        ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
       }
       if (AllConstsOrCasts)
         CastMaxMinBWSizes =
@@ -8355,49 +8349,8 @@ getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
 
 void BoUpSLP::transformNodes() {
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  // The tree may grow here, so iterate over nodes, built before.
-  for (unsigned Idx : seq<unsigned>(VectorizableTree.size())) {
-    TreeEntry &E = *VectorizableTree[Idx];
-    if (E.isGather()) {
-      ArrayRef<Value *> VL = E.Scalars;
-      const unsigned Sz = getVectorElementSize(VL.front());
-      unsigned MinVF = getMinVF(2 * Sz);
-      if (VL.size() <= 2 ||
-          (E.getOpcode() &&
-           (E.isAltShuffle() || E.getOpcode() != Instruction::Load)))
-        continue;
-      // Try to find vectorizable sequences and transform them into a series of
-      // insertvector instructions.
-      unsigned StartIdx = 0;
-      unsigned End = VL.size();
-      for (unsigned VF = VL.size() / 2; VF >= MinVF; VF /= 2) {
-        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
-          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
-          InstructionsState S = getSameOpcode(Slice, *TLI);
-          if (!S.getOpcode() || S.isAltShuffle() ||
-              (S.getOpcode() != Instruction::Load &&
-               any_of(Slice, [&](Value *V) {
-                 return !areAllUsersVectorized(cast<Instruction>(V),
-                                               UserIgnoreList);
-               })))
-            continue;
-          if (!getTreeEntry(Slice.front()) && !getTreeEntry(Slice.back())) {
-            unsigned PrevSize = VectorizableTree.size();
-            buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
-            if (PrevSize + 1 == VectorizableTree.size() &&
-                VectorizableTree[PrevSize]->isGather()) {
-              VectorizableTree.pop_back();
-              continue;
-            }
-            E.CombinedEntriesWithIndices.emplace_back(PrevSize, Cnt);
-            if (StartIdx == Cnt)
-              StartIdx = Cnt + VF;
-            if (End == Cnt + VF)
-              End = Cnt;
-          }
-        }
-      }
-    }
+  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
+    TreeEntry &E = *TE;
     switch (E.getOpcode()) {
     case Instruction::Load: {
       // No need to reorder masked gather loads, just reorder the scalar
@@ -8520,7 +8473,175 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
     auto *VecTy = getWidenedType(ScalarTy, VL.size());
     InstructionCost GatherCost = 0;
     SmallVector<Value *> Gathers(VL);
-    if (!Root && isSplat(VL)) {
+    // Improve gather cost for gather of loads, if we can group some of the
+    // loads into vector loads.
+    InstructionsState S = getSameOpcode(VL, *R.TLI);
+    const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
+    unsigned MinVF = R.getMinVF(2 * Sz);
+    if (VL.size() > 2 &&
+        ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
+         (InVectors.empty() &&
+          any_of(seq<unsigned>(0, VL.size() / MinVF),
+                 [&](unsigned Idx) {
+                   ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
+                   InstructionsState S = getSameOpcode(SubVL, *R.TLI);
+                   return S.getOpcode() == Instruction::Load &&
+                          !S.isAltShuffle();
+                 }))) &&
+        !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
+        !isSplat(Gathers)) {
+      InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy);
+      SetVector<Value *> VectorizedLoads;
+      SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
+      SmallVector<unsigned> ScatterVectorized;
+      unsigned StartIdx = 0;
+      unsigned VF = VL.size() / 2;
+      for (; VF >= MinVF; VF /= 2) {
+        for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
+             Cnt += VF) {
+          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
+          if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
+            InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
+            if (SliceS.getOpcode() != Instruction::Load ||
+                SliceS.isAltShuffle())
+              continue;
+          }
+          if (!VectorizedLoads.count(Slice.front()) &&
+              !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
+            SmallVector<Value *> PointerOps;
+            OrdersType CurrentOrder;
+            LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(),
+                                                CurrentOrder, PointerOps);
+            switch (LS) {
+            case LoadsState::Vectorize:
+            case LoadsState::ScatterVectorize:
+            case LoadsState::StridedVectorize:
+              // Mark the vectorized loads so that we don't vectorize them
+              // again.
+              // TODO: better handling of loads with reorders.
+              if (((LS == LoadsState::Vectorize ||
+                    LS == LoadsState::StridedVectorize) &&
+                   CurrentOrder.empty()) ||
+                  (LS == LoadsState::StridedVectorize &&
+                   isReverseOrder(CurrentOrder)))
+                VectorizedStarts.emplace_back(Cnt, LS);
+              else
+                ScatterVectorized.push_back(Cnt);
+              VectorizedLoads.insert(Slice.begin(), Slice.end());
+              // If we vectorized initial block, no need to try to vectorize
+              // it again.
+              if (Cnt == StartIdx)
+                StartIdx += VF;
+              break;
+            case LoadsState::Gather:
+              break;
+            }
+          }
+        }
+        // Check if the whole array was vectorized already - exit.
+        if (StartIdx >= VL.size())
+          break;
+        // Found vectorizable parts - exit.
+        if (!VectorizedLoads.empty())
+          break;
+      }
+      if (!VectorizedLoads.empty()) {
+        unsigned NumParts = TTI.getNumberOfParts(VecTy);
+        bool NeedInsertSubvectorAnalysis =
+            !NumParts || (VL.size() / VF) > NumParts;
+        // Get the cost for gathered loads.
+        for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
+          if (VectorizedLoads.contains(VL[I]))
+            continue;
+          GatherCost +=
+              getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
+        }
+        // Exclude potentially vectorized loads from list of gathered
+        // scalars.
+        Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
+        // The cost for vectorized loads.
+        InstructionCost ScalarsCost = 0;
+        for (Value *V : VectorizedLoads) {
+          auto *LI = cast<LoadInst>(V);
+          ScalarsCost +=
+              TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
+                                  LI->getAlign(), LI->getPointerAddressSpace(),
+                                  CostKind, TTI::OperandValueInfo(), LI);
+        }
+        auto *LoadTy = getWidenedType(VL.front()->getType(), VF);
+        for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
+          auto *LI = cast<LoadInst>(VL[P.first]);
+          Align Alignment = LI->getAlign();
+          GatherCost +=
+              P.second == LoadsState::Vectorize
+                  ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
+                                        LI->getPointerAddressSpace(), CostKind,
+                                        TTI::OperandValueInfo(), LI)
+                  : TTI.getStridedMemoryOpCost(
+                        Instruction::Load, LoadTy, LI->getPointerOperand(),
+                        /*VariableMask=*/false, Alignment, CostKind, LI);
+          // Add external uses costs.
+          for (auto [Idx, V] : enumerate(VL.slice(
+                   P.first, std::min<unsigned>(VL.size() - P.first, VF))))
+            if (!R.areAllUsersVectorized(cast<Instruction>(V)))
+              GatherCost += TTI.getVectorInstrCost(Instruction::ExtractElement,
+                                                   LoadTy, CostKind, Idx);
+          // Estimate GEP cost.
+          SmallVector<Value *> PointerOps(VF);
+          for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
+            PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
+          auto [ScalarGEPCost, VectorGEPCost] =
+              getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
+                          Instruction::Load, CostKind, LI->getType(), LoadTy);
+          GatherCost += VectorGEPCost - ScalarGEPCost;
+        }
+        for (unsigned P : ScatterVectorized) {
+          auto *LI0 = cast<LoadInst>(VL[P]);
+          ArrayRef<Value *> Slice = VL.slice(P, VF);
+          Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
+          GatherCost += TTI.getGatherScatterOpCost(
+              Instruction::Load, LoadTy, LI0->getPointerOperand(),
+              /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
+          // Estimate GEP cost.
+          SmallVector<Value *> PointerOps(VF);
+          for (auto [I, V] : enumerate(Slice))
+            PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
+          OrdersType Order;
+          if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE,
+                              Order)) {
+            // TODO: improve checks if GEPs can be vectorized.
+            Value *Ptr0 = PointerOps.front();
+            Type *ScalarTy = Ptr0->getType();
+            auto *VecTy = getWidenedType(ScalarTy, VF);
+            auto [ScalarGEPCost, VectorGEPCost] =
+                getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
+                            CostKind, ScalarTy, VecTy);
+            GatherCost += VectorGEPCost - ScalarGEPCost;
+            if (!Order.empty()) {
+              SmallVector<int> Mask;
+              inversePermutation(Order, Mask);
+              GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
+                                             VecTy, Mask, CostKind);
+            }
+          } else {
+            GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true,
+                                          PointerOps.front()->getType());
+          }
+        }
+        if (NeedInsertSubvectorAnalysis) {
+          // Add the cost for the subvectors insert.
+          SmallVector<int> ShuffleMask(VL.size());
+          for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
+            for (unsigned Idx : seq<unsigned>(0, E))
+              ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
+            GatherCost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy,
+                                           ShuffleMask, CostKind, I, LoadTy);
+          }
+        }
+        GatherCost -= ScalarsCost;
+      }
+      GatherCost = std::min(BaseCost, GatherCost);
+    } else if (!Root && isSplat(VL)) {
       // Found the broadcasting of the single scalar, calculate the cost as
       // the broadcast.
       const auto *It = find_if_not(VL, IsaPred<UndefValue>);
@@ -9268,9 +9389,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
   InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
   /// Finalize emission of the shuffles.
   InstructionCost
-  finalize(ArrayRef<int> ExtMask,
-           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
-           unsigned VF = 0,
+  finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
            function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
     IsFinalized = true;
     if (Action) {
@@ -9288,29 +9407,6 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
       Action(V, CommonMask);
       InVectors.front() = V;
     }
-    if (!SubVectors.empty()) {
-      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
-      if (InVectors.size() == 2)
-        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
-      else
-        Cost += createShuffle(Vec, nullptr, CommonMask);
-      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
-        if (CommonMask[Idx] != PoisonMaskElem)
-          CommonMask[Idx] = Idx;
-      for (const auto &[E, Idx] : SubVectors) {
-        Cost += ::getShuffleCost(
-            TTI, TTI::SK_InsertSubvector,
-            FixedVectorType::get(ScalarTy, CommonMask.size()), std::nullopt,
-            CostKind, Idx,
-            FixedVectorType::get(ScalarTy, E->getVectorFactor()));
-        if (!CommonMask.empty()) {
-          std::iota(std::next(CommonMask.begin(), Idx),
-                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
-                    Idx);
-        }
-      }
-    }
-
     ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
     if (CommonMask.empty()) {
       assert(InVectors.size() == 1 && "Expected only one vector with no mask");
@@ -12408,9 +12504,7 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
   /// \param Action the action (if any) to be performed before final applying of
   /// the \p ExtMask mask.
   Value *
-  finalize(ArrayRef<int> ExtMask,
-           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
-           unsigned VF = 0,
+  finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
            function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
     IsFinalized = true;
     SmallVector<int> NewExtMask(ExtMask);
@@ -12444,29 +12538,6 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
       Action(Vec, CommonMask);
       InVectors.front() = Vec;
     }
-    if (!SubVectors.empty()) {
-      Value *Vec = InVectors.front();
-      if (InVectors.size() == 2) {
-        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
-        InVectors.pop_back();
-      } else {
-        Vec = createShuffle(Vec, nullptr, CommonMask);
-      }
-      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
-        if (CommonMask[Idx] != PoisonMaskElem)
-          CommonMask[Idx] = Idx;
-      for (const auto &[E, Idx] : SubVectors) {
-        Vec = Builder.CreateInsertVector(
-            Vec->getType(), Vec, E->VectorizedValue, Builder.getInt64(Idx));
-        if (!CommonMask.empty()) {
-          std::iota(std::next(CommonMask.begin(), Idx),
-                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
-                    Idx);
-        }
-      }
-      InVectors.front() = Vec;
-    }
-
     if (!ExtMask.empty()) {
       if (CommonMask.empty()) {
         CommonMask.assign(ExtMask.begin(), ExtMask.end());
@@ -12545,14 +12616,7 @@ Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                      : ScalarTy,
                       Builder, *this);
       ShuffleBuilder.add(V, Mask);
-      SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
-          E->CombinedEntriesWithIndices.size());
-      transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
-                [&](const auto &P) {
-                  return std::make_pair(VectorizableTree[P.first].get(),
-                                        P.second);
-                });
-      return ShuffleBuilder.finalize(std::nullopt, SubVectors);
+      return ShuffleBuilder.finalize(std::nullopt);
     };
     Value *V = vectorizeTree(VE, PostponedPHIs);
     if (VF * getNumElements(VL[0]->getType()) !=
@@ -12635,17 +12699,6 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
   SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
                                        E->ReuseShuffleIndices.end());
   SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
-  // Clear values, to be replaced by insertvector instructions.
-  for (const auto &[EIdx, Idx] : E->CombinedEntriesWithIndices)
-    for_each(MutableArrayRef(GatheredScalars)
-                 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
-             [&](Value *&V) { V = PoisonValue::get(V->getType()); });
-  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
-      E->CombinedEntriesWithIndices.size());
-  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
-            [&](const auto &P) {
-              return std::make_pair(VectorizableTree[P.first].get(), P.second);
-            });
   // Build a mask out of the reorder indices and reorder scalars per this
   // mask.
   SmallVector<int> ReorderMask;
@@ -12783,7 +12836,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
         }
       }
       ShuffleBuilder.add(*FrontTE, Mask);
-      Res = ShuffleBuilder.finalize(E->getCommonMask(), SubVectors);
+      Res = ShuffleBuilder.finalize(E->getCommonMask());
       return Res;
     }
     if (!Resized) {
@@ -13040,10 +13093,10 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                  (IsSingleShuffle && ((IsIdentityShuffle && IsNonPoisoned) ||
                                       IsUsedInExpr) && isa<UndefValue>(V));
         }))
-      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
+      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
     else
       Res = ShuffleBuilder.finalize(
-          E->ReuseShuffleIndices, SubVectors, E->Scalars.size(),
+          E->ReuseShuffleIndices, E->Scalars.size(),
           [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
            TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
            Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
@@ -13054,7 +13107,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
     TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
     Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
     ShuffleBuilder.add(BV, ReuseMask);
-    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
+    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
   } else {
     // Gather all constants.
     SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
@@ -13064,7 +13117,7 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
     }
     Value *BV = ShuffleBuilder.gather(GatheredScalars);
     ShuffleBuilder.add(BV, Mask);
-    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
+    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
   }
 
   if (NeedFreeze)
@@ -13073,8 +13126,6 @@
 }
 
 Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
-  for (const auto &[EIdx, _] : E->CombinedEntriesWithIndices)
-    (void)vectorizeTree(VectorizableTree[EIdx].get(), /*PostponedPHIs=*/false);
   return processBuildVector(E, ScalarTy, Builder, *this);
 }
 
@@ -13126,13 +13177,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
   } else {
     ShuffleBuilder.addOrdered(V, E->ReorderIndices);
   }
-  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
-      E->CombinedEntriesWithIndices.size());
-  transform(
-      E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
-        return std::make_pair(VectorizableTree[P.first].get(), P.second);
-      });
-  return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors);
+  return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
   };
 
   assert(!E->isGather() && "Unhandled state");
@@ -14535,7 +14580,7 @@ Value *BoUpSLP::vectorizeTree(
     ShuffleBuilder.add(V1, CombinedMask1);
     if (V2)
       ShuffleBuilder.add(V2, CombinedMask2);
-    return ShuffleBuilder.finalize(std::nullopt, std::nullopt);
+    return ShuffleBuilder.finalize(std::nullopt);
   };
 
   auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
@@ -14673,14 +14718,7 @@ Value *BoUpSLP::vectorizeTree(
   // Clear up reduction references, if any.
if (UserIgnoreList) { for (Instruction *I : RemovedInsts) { - const TreeEntry *IE = getTreeEntry(I); - if (IE->Idx != 0 && - !(VectorizableTree.front()->isGather() && isa(I) && - !IE->UserTreeIndices.empty() && - any_of(IE->UserTreeIndices, [&](const EdgeInfo &EI) { - return EI.UserTE == VectorizableTree.front().get() && - EI.EdgeIdx == UINT_MAX; - }))) + if (getTreeEntry(I)->Idx != 0) continue; SmallVector LogicalOpSelects; I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) { diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll index 2121775224098e..22511c018dca2d 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/slpordering.ll @@ -18,62 +18,62 @@ define i32 @slpordering(ptr noundef %p1, i32 noundef %ip1, ptr noundef %p2, i32 ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[IP2]] to i64 ; CHECK-NEXT: [[RRRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4 ; CHECK-NEXT: [[RRRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 4 -; CHECK-NEXT: [[RDD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[RDD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[RRRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 4 -; CHECK-NEXT: [[RRRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 4 -; CHECK-NEXT: [[RDD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[RDD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[RRRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 4 -; CHECK-NEXT: [[RRRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 4 -; CHECK-NEXT: [[RDD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[RDD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[RRRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_2]], i64 4 -; CHECK-NEXT: [[RRRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_2]], i64 4 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1, !tbaa [[TBAA0:![0-9]+]] ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[RDD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[RDD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[RRRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 4 +; CHECK-NEXT: [[RRRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[RDD_PTR]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[RDD_PTR64]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_1]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_1]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[RDD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[RDD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[RRRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr 
[[RDD_PTR_1]], i64 4 +; CHECK-NEXT: [[RRRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 4 ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[RDD_PTR_1]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_1]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_2]], align 1, !tbaa [[TBAA0]] ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[RDD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[RDD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_1]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[RRRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR_2]], i64 4 +; CHECK-NEXT: [[RRRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[RDD_PTR64_2]], i64 4 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[RDD_PTR_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]] -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[RDD_PTR64_2]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[RRRAYIDX3_3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[RRRAYIDX5_3]], align 1, !tbaa [[TBAA0]] +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x 
i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32> ; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> ; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]] +; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]] ; CHECK-NEXT: [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], -; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP26]] +; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP28]] ; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll index 91c8db14a45aa1..c1cef6ff3d10b4 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll @@ -169,12 +169,11 @@ define i32 @getelementptr_2x32(ptr nocapture readonly %g, i32 %n, i32 %x, i32 %y ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[T11]] to i64 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, ptr [[G]], i64 [[TMP9]] ; CHECK-NEXT: [[T12:%.*]] = load i32, ptr [[ARRAYIDX15]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[T10]], i64 2 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[T12]], i64 3 -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> 
[[TMP12]], <4 x i32> [[TMP11]], <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) -; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP14]], [[SUM_032]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[T10]], i64 2 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[T12]], i64 3 +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) +; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP13]], [[SUM_032]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i32 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll index 5b878108af59af..d79aed89b0be73 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll @@ -340,12 +340,12 @@ entry: define i16 @reduce_blockstrided4(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, i32 noundef %stride) { ; CHECK-LABEL: @reduce_blockstrided4( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X:%.*]], align 2 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]] -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X]], align 2 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]] ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y:%.*]], align 2 +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM]] ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]] @@ -416,31 +416,31 @@ define i32 @reduce_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[OFF2:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4 -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 +; 
CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[TMP10]] to <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP12]], <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> [[TMP9]], <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> [[TMP11]], <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = zext <16 x i8> [[TMP12]] to <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> [[TMP15]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = mul nuw nsw <16 x i32> [[TMP11]], [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = mul nuw nsw <16 x i32> [[TMP13]], [[TMP19]] ; CHECK-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP20]]) ; CHECK-NEXT: ret i32 [[TMP21]] ; @@ -677,63 +677,63 @@ entry: define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, ptr nocapture noundef writeonly %z, i32 noundef %stride) { ; CHECK-LABEL: @store_blockstrided3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i64 8 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[ADD4:%.*]] = add nsw i32 [[STRIDE:%.*]], 1 ; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[ADD4]] to i64 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4 ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[STRIDE]], 1 ; 
CHECK-NEXT: [[IDXPROM11:%.*]] = sext i32 [[MUL]] to i64 ; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM11]] -; CHECK-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL]], 2 -; CHECK-NEXT: [[IDXPROM19:%.*]] = sext i32 [[ADD18]] to i64 -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM19]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[ADD14:%.*]] = or disjoint i32 [[MUL]], 1 +; CHECK-NEXT: [[IDXPROM15:%.*]] = sext i32 [[ADD14]] to i64 +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM15]] ; CHECK-NEXT: [[MUL21:%.*]] = mul nsw i32 [[STRIDE]], 3 ; CHECK-NEXT: [[IDXPROM23:%.*]] = sext i32 [[MUL21]] to i64 ; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM23]] -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX24]], align 4 -; CHECK-NEXT: [[ADD26:%.*]] = add nsw i32 [[MUL21]], 1 -; CHECK-NEXT: [[IDXPROM27:%.*]] = sext i32 [[ADD26]] to i64 -; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM27]] -; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i64 8 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4 +; CHECK-NEXT: [[ADD30:%.*]] = add nsw i32 [[MUL21]], 2 +; CHECK-NEXT: [[IDXPROM31:%.*]] = sext i32 [[ADD30]] to i64 +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM31]] +; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX32]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[Y:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 8 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM11]] -; CHECK-NEXT: [[ARRAYIDX56:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM19]] -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX56]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4 +; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM15]] ; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM23]] -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX60]], align 4 -; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM27]] +; CHECK-NEXT: [[ARRAYIDX68:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM31]] +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX68]], align 4 ; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds i8, ptr [[Z:%.*]], i64 4 -; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP3]], [[TMP0]] +; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP6]], [[TMP1]] ; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 24 -; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[X]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr [[Y]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP8]], [[TMP6]] -; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP9]], [[TMP7]] +; CHECK-NEXT: 
[[TMP10:%.*]] = mul nsw <2 x i32> [[TMP5]], [[TMP0]] +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP7]], [[TMP2]] ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32> -; CHECK-NEXT: [[ARRAYIDX84:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 28 -; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX48]], align 4 +; CHECK-NEXT: [[MUL81:%.*]] = mul nsw i32 [[TMP8]], [[TMP3]] +; CHECK-NEXT: [[ARRAYIDX82:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 32 +; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX16]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX52]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP14]], [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[MUL85:%.*]] = mul nsw i32 [[TMP4]], [[TMP1]] -; CHECK-NEXT: [[MUL87:%.*]] = mul nsw i32 [[TMP5]], [[TMP2]] -; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 44 -; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 36 -; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX28]], align 4 -; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX64]], align 4 +; CHECK-NEXT: [[ARRAYIDX90:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 40 +; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX60]], align 4 ; CHECK-NEXT: store i32 [[MUL73]], ptr [[Z]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr [[ARRAYIDX72]], align 4 -; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[ARRAYIDX84]], align 4 -; CHECK-NEXT: store i32 [[MUL85]], ptr [[ARRAYIDX76]], align 4 -; CHECK-NEXT: store i32 [[MUL87]], ptr [[ARRAYIDX88]], align 4 +; CHECK-NEXT: store i32 [[MUL81]], ptr [[ARRAYIDX82]], align 4 +; CHECK-NEXT: store <2 x i32> [[TMP16]], ptr [[ARRAYIDX76]], align 4 ; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <2 x i32> [[TMP18]], [[TMP17]] ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: store <2 x i32> [[TMP20]], ptr [[ARRAYIDX92]], align 4 +; CHECK-NEXT: store <2 x i32> [[TMP20]], ptr [[ARRAYIDX90]], align 4 +; CHECK-NEXT: [[MUL91:%.*]] = mul nsw i32 [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 36 +; CHECK-NEXT: store i32 [[MUL91]], ptr [[ARRAYIDX92]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -833,12 +833,12 @@ entry: define void @store_blockstrided4(ptr nocapture noundef readonly %x, ptr nocapture noundef readonly %y, i32 noundef %stride, ptr %dst0) { ; CHECK-LABEL: @store_blockstrided4( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X:%.*]], align 2 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i64 [[IDXPROM]] -; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y:%.*]], i64 [[IDXPROM]] -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X]], align 2 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]] ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX4]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[Y:%.*]], align 2 +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 
[[IDXPROM]] ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i16>, ptr [[ARRAYIDX20]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i16> [[TMP2]], [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i16> [[TMP3]], [[TMP1]] @@ -1203,62 +1203,62 @@ define dso_local i32 @full(ptr nocapture noundef readonly %p1, i32 noundef %st1, ; CHECK-NEXT: [[IDX_EXT63:%.*]] = sext i32 [[ST2:%.*]] to i64 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P1:%.*]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[P2:%.*]], i64 4 -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 -; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4 -; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4 -; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[P2]], align 1 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR]], align 1 ; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[ADD_PTR64]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 ; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 4 +; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 4 ; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 ; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 ; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR64_2:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_1]], i64 [[IDX_EXT63]] +; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR_2]], i64 4 +; CHECK-NEXT: 
[[ARRAYIDX5_3:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64_2]], i64 4 ; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ADD_PTR_2]], align 1 -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP16]], <16 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = zext <16 x i8> [[TMP17]] to <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x i8> [[TMP20]], <16 x i8> [[TMP21]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP19]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP18]], [[TMP25]] -; CHECK-NEXT: [[TMP27:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i8> [[TMP28]], <16 x i8> [[TMP29]], <16 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP27]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = zext <16 x i8> [[TMP32]] to <16 x i32> -; CHECK-NEXT: [[TMP34:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_2]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_3]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP16]], <16 x i8> [[TMP17]], <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <16 x i8> [[TMP18]], <16 x i8> [[TMP19]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[TMP20]] to <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> [[TMP5]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> [[TMP23]], <16 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i8> [[TMP24]], <16 x i8> [[TMP25]], <16 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = zext <16 x i8> [[TMP26]] to <16 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP21]], [[TMP27]] +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP6]], <16 x 
i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP10]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <16 x i8> [[TMP29]], <16 x i8> [[TMP30]], <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <16 x i8> [[TMP31]], <16 x i8> [[TMP32]], <16 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = zext <16 x i8> [[TMP33]] to <16 x i32> ; CHECK-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> [[TMP7]], <16 x i32> ; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <16 x i8> [[TMP35]], <16 x i8> [[TMP36]], <16 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP34]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i8> [[TMP37]], <16 x i8> [[TMP38]], <16 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = zext <16 x i8> [[TMP39]] to <16 x i32> -; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP33]], [[TMP40]] +; CHECK-NEXT: [[TMP41:%.*]] = sub nsw <16 x i32> [[TMP34]], [[TMP40]] ; CHECK-NEXT: [[TMP42:%.*]] = shl nsw <16 x i32> [[TMP41]], -; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP26]] +; CHECK-NEXT: [[TMP43:%.*]] = add nsw <16 x i32> [[TMP42]], [[TMP28]] ; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP45:%.*]] = add nsw <16 x i32> [[TMP43]], [[TMP44]] ; CHECK-NEXT: [[TMP46:%.*]] = sub nsw <16 x i32> [[TMP43]], [[TMP44]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll index 07411cacb36268..d89d6286703605 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/multiple_reduction.ll @@ -14,161 +14,232 @@ define i64 @straight(ptr nocapture noundef readonly %p, i32 noundef %st) { ; CHECK-LABEL: @straight( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[ST:%.*]] to i64 -; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR_3:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]] -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[P]], align 2 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 2 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[ADD_PTR]], align 2 +; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[ADD_PTR_1]], align 2 +; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr [[ADD_PTR_2]], align 2 +; CHECK-NEXT: [[ADD_PTR_3:%.*]] = 
getelementptr inbounds i16, ptr [[ADD_PTR_2]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i16>, ptr [[ADD_PTR_3]], align 2 +; CHECK-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_3]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[TMP5:%.*]] = load <8 x i16>, ptr [[ADD_PTR_4]], align 2 +; CHECK-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_4]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[TMP6:%.*]] = load <8 x i16>, ptr [[ADD_PTR_5]], align 2 +; CHECK-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds i16, ptr [[ADD_PTR_5]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[TMP7:%.*]] = load <8 x i16>, ptr [[ADD_PTR_6]], align 2 -; CHECK-NEXT: [[TMP8:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> poison, <8 x i16> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP9:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP8]], <8 x i16> [[TMP1]], i64 8) -; CHECK-NEXT: [[TMP10:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP9]], <8 x i16> [[TMP2]], i64 16) -; CHECK-NEXT: [[TMP11:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP10]], <8 x i16> [[TMP3]], i64 24) -; CHECK-NEXT: [[TMP12:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP11]], <8 x i16> [[TMP4]], i64 32) -; CHECK-NEXT: [[TMP13:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP12]], <8 x i16> [[TMP5]], i64 40) -; CHECK-NEXT: [[TMP14:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP13]], <8 x i16> [[TMP6]], i64 48) -; CHECK-NEXT: [[TMP15:%.*]] = call <64 x i16> @llvm.vector.insert.v64i16.v8i16(<64 x i16> [[TMP14]], <8 x i16> [[TMP7]], i64 56) -; CHECK-NEXT: [[TMP16:%.*]] = zext <64 x i16> [[TMP15]] to <64 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <64 x i32> [[TMP16]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <64 x i32> [[TMP16]], i32 1 -; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[TMP17]], [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = mul nuw nsw <64 x i32> [[TMP16]], [[TMP16]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <64 x i32> [[TMP16]], i32 2 -; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[TMP20]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <64 x i32> [[TMP16]], i32 3 -; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[TMP21]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <64 x i32> [[TMP16]], i32 4 -; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <64 x i32> [[TMP16]], i32 5 -; CHECK-NEXT: [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[TMP23]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <64 x i32> [[TMP16]], i32 6 -; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[TMP24]] -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <64 x i32> [[TMP16]], i32 7 -; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[TMP25]] -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <64 x i32> [[TMP16]], i32 8 -; CHECK-NEXT: [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[TMP26]] -; CHECK-NEXT: [[TMP27:%.*]] = extractelement <64 x i32> [[TMP16]], i32 9 -; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[TMP27]] -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <64 x i32> [[TMP16]], i32 10 -; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[TMP28]] -; CHECK-NEXT: [[TMP29:%.*]] = extractelement <64 x i32> [[TMP16]], i32 11 -; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[TMP29]] -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <64 x i32> [[TMP16]], i32 12 -; 
CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[TMP30]] -; CHECK-NEXT: [[TMP31:%.*]] = extractelement <64 x i32> [[TMP16]], i32 13 -; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[TMP31]] -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <64 x i32> [[TMP16]], i32 14 -; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[TMP32]] -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <64 x i32> [[TMP16]], i32 15 -; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[TMP33]] -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <64 x i32> [[TMP16]], i32 16 -; CHECK-NEXT: [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[TMP34]] -; CHECK-NEXT: [[TMP35:%.*]] = extractelement <64 x i32> [[TMP16]], i32 17 -; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[TMP35]] -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <64 x i32> [[TMP16]], i32 18 -; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[TMP36]] -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <64 x i32> [[TMP16]], i32 19 -; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[TMP37]] -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <64 x i32> [[TMP16]], i32 20 -; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[TMP38]] -; CHECK-NEXT: [[TMP39:%.*]] = extractelement <64 x i32> [[TMP16]], i32 21 -; CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[TMP39]] -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <64 x i32> [[TMP16]], i32 22 -; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[TMP40]] -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <64 x i32> [[TMP16]], i32 23 -; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[TMP41]] -; CHECK-NEXT: [[TMP42:%.*]] = extractelement <64 x i32> [[TMP16]], i32 24 -; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[TMP42]] -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <64 x i32> [[TMP16]], i32 25 -; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[TMP43]] -; CHECK-NEXT: [[TMP44:%.*]] = extractelement <64 x i32> [[TMP16]], i32 26 -; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[TMP44]] -; CHECK-NEXT: [[TMP45:%.*]] = extractelement <64 x i32> [[TMP16]], i32 27 -; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[TMP45]] -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <64 x i32> [[TMP16]], i32 28 -; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[TMP46]] -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <64 x i32> [[TMP16]], i32 29 -; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[TMP47]] -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <64 x i32> [[TMP16]], i32 30 -; CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[TMP48]] -; CHECK-NEXT: [[TMP49:%.*]] = extractelement <64 x i32> [[TMP16]], i32 31 -; CHECK-NEXT: [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[TMP49]] -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <64 x i32> [[TMP16]], i32 32 -; CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[TMP50]] -; CHECK-NEXT: [[TMP51:%.*]] = extractelement <64 x i32> [[TMP16]], i32 33 -; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[TMP51]] -; CHECK-NEXT: [[TMP52:%.*]] = extractelement <64 x i32> [[TMP16]], i32 34 -; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[TMP52]] -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <64 x i32> [[TMP16]], i32 35 -; CHECK-NEXT: [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[TMP53]] -; CHECK-NEXT: [[TMP54:%.*]] = extractelement <64 x i32> [[TMP16]], i32 36 -; CHECK-NEXT: 
[[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[TMP54]] -; CHECK-NEXT: [[TMP55:%.*]] = extractelement <64 x i32> [[TMP16]], i32 37 -; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[TMP55]] -; CHECK-NEXT: [[TMP56:%.*]] = extractelement <64 x i32> [[TMP16]], i32 38 -; CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[TMP56]] -; CHECK-NEXT: [[TMP57:%.*]] = extractelement <64 x i32> [[TMP16]], i32 39 -; CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[TMP57]] -; CHECK-NEXT: [[TMP58:%.*]] = extractelement <64 x i32> [[TMP16]], i32 40 -; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[TMP58]] -; CHECK-NEXT: [[TMP59:%.*]] = extractelement <64 x i32> [[TMP16]], i32 41 -; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[TMP59]] -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <64 x i32> [[TMP16]], i32 42 -; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[TMP60]] -; CHECK-NEXT: [[TMP61:%.*]] = extractelement <64 x i32> [[TMP16]], i32 43 -; CHECK-NEXT: [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[TMP61]] -; CHECK-NEXT: [[TMP62:%.*]] = extractelement <64 x i32> [[TMP16]], i32 44 -; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[TMP62]] -; CHECK-NEXT: [[TMP63:%.*]] = extractelement <64 x i32> [[TMP16]], i32 45 -; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[TMP63]] -; CHECK-NEXT: [[TMP64:%.*]] = extractelement <64 x i32> [[TMP16]], i32 46 -; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[TMP64]] -; CHECK-NEXT: [[TMP65:%.*]] = extractelement <64 x i32> [[TMP16]], i32 47 -; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[TMP65]] -; CHECK-NEXT: [[TMP66:%.*]] = extractelement <64 x i32> [[TMP16]], i32 48 -; CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[TMP66]] -; CHECK-NEXT: [[TMP67:%.*]] = extractelement <64 x i32> [[TMP16]], i32 49 -; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], [[TMP67]] -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <64 x i32> [[TMP16]], i32 50 -; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[TMP68]] -; CHECK-NEXT: [[TMP69:%.*]] = extractelement <64 x i32> [[TMP16]], i32 51 -; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[TMP69]] -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <64 x i32> [[TMP16]], i32 52 -; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[TMP70]] -; CHECK-NEXT: [[TMP71:%.*]] = extractelement <64 x i32> [[TMP16]], i32 53 -; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[TMP71]] -; CHECK-NEXT: [[TMP72:%.*]] = extractelement <64 x i32> [[TMP16]], i32 54 -; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[TMP72]] -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <64 x i32> [[TMP16]], i32 55 -; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[TMP73]] -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <64 x i32> [[TMP16]], i32 56 -; CHECK-NEXT: [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[TMP74]] -; CHECK-NEXT: [[TMP75:%.*]] = extractelement <64 x i32> [[TMP16]], i32 57 -; CHECK-NEXT: [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[TMP75]] -; CHECK-NEXT: [[TMP76:%.*]] = extractelement <64 x i32> [[TMP16]], i32 58 -; CHECK-NEXT: [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[TMP76]] -; CHECK-NEXT: [[TMP77:%.*]] = extractelement <64 x i32> [[TMP16]], i32 59 -; CHECK-NEXT: [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[TMP77]] -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <64 x i32> [[TMP16]], i32 60 -; CHECK-NEXT: [[ADD_4_7:%.*]] = add nuw 
nsw i32 [[ADD_3_7]], [[TMP78]] -; CHECK-NEXT: [[TMP79:%.*]] = extractelement <64 x i32> [[TMP16]], i32 61 -; CHECK-NEXT: [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[TMP79]] -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <64 x i32> [[TMP16]], i32 62 -; CHECK-NEXT: [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[TMP80]] -; CHECK-NEXT: [[TMP81:%.*]] = extractelement <64 x i32> [[TMP16]], i32 63 -; CHECK-NEXT: [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[TMP81]] -; CHECK-NEXT: [[TMP82:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP19]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP7]], i32 7 +; CHECK-NEXT: [[CONV_7_7:%.*]] = zext i16 [[TMP8]] to i32 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <64 x i16> [[TMP9]], <64 x i16> [[TMP10]], <64 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <64 x i16> [[TMP11]], <64 x i16> [[TMP12]], <64 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <64 x i16> [[TMP13]], <64 x i16> [[TMP14]], <64 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <64 x i16> [[TMP15]], <64 x i16> [[TMP16]], <64 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <8 x i16> [[TMP5]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <64 x i16> [[TMP17]], <64 x i16> [[TMP18]], <64 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <64 x i16> [[TMP19]], <64 x i16> [[TMP20]], <64 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i16> [[TMP7]], <8 x i16> poison, <64 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <64 x i16> [[TMP21]], <64 x i16> [[TMP22]], <64 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = zext <64 x i16> [[TMP23]] to <64 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i16> [[TMP7]], i32 6 +; CHECK-NEXT: [[CONV_6_7:%.*]] = zext i16 [[TMP25]] to i32 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i16> [[TMP7]], i32 5 +; CHECK-NEXT: [[CONV_5_7:%.*]] = zext i16 [[TMP26]] to i32 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i16> [[TMP7]], i32 4 +; CHECK-NEXT: [[CONV_4_7:%.*]] = zext i16 [[TMP27]] to i32 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i16> [[TMP7]], i32 3 +; CHECK-NEXT: [[CONV_3_7:%.*]] = zext i16 [[TMP28]] to i32 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i16> [[TMP7]], i32 2 +; CHECK-NEXT: [[CONV_2_7:%.*]] = zext i16 [[TMP29]] to i32 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i16> [[TMP7]], i32 1 +; CHECK-NEXT: [[CONV_1_7:%.*]] = zext i16 [[TMP30]] to i32 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i16> [[TMP7]], i32 0 +; CHECK-NEXT: [[CONV_764:%.*]] = zext i16 [[TMP31]] to i32 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <8 x i16> [[TMP6]], i32 7 +; CHECK-NEXT: [[CONV_7_6:%.*]] = zext i16 [[TMP32]] to i32 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i16> [[TMP6]], i32 6 +; CHECK-NEXT: [[CONV_6_6:%.*]] = zext i16 [[TMP33]] to i32 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <8 x i16> [[TMP6]], i32 5 +; CHECK-NEXT: [[CONV_5_6:%.*]] = zext i16 [[TMP34]] to i32 +; CHECK-NEXT: 
[[TMP35:%.*]] = extractelement <8 x i16> [[TMP6]], i32 4 +; CHECK-NEXT: [[CONV_4_6:%.*]] = zext i16 [[TMP35]] to i32 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <8 x i16> [[TMP6]], i32 3 +; CHECK-NEXT: [[CONV_3_6:%.*]] = zext i16 [[TMP36]] to i32 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i16> [[TMP6]], i32 2 +; CHECK-NEXT: [[CONV_2_6:%.*]] = zext i16 [[TMP37]] to i32 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <8 x i16> [[TMP6]], i32 1 +; CHECK-NEXT: [[CONV_1_6:%.*]] = zext i16 [[TMP38]] to i32 +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <8 x i16> [[TMP6]], i32 0 +; CHECK-NEXT: [[CONV_660:%.*]] = zext i16 [[TMP39]] to i32 +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <8 x i16> [[TMP5]], i32 7 +; CHECK-NEXT: [[CONV_7_5:%.*]] = zext i16 [[TMP40]] to i32 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i16> [[TMP5]], i32 6 +; CHECK-NEXT: [[CONV_6_5:%.*]] = zext i16 [[TMP41]] to i32 +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <8 x i16> [[TMP5]], i32 5 +; CHECK-NEXT: [[CONV_5_5:%.*]] = zext i16 [[TMP42]] to i32 +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <8 x i16> [[TMP5]], i32 4 +; CHECK-NEXT: [[CONV_4_5:%.*]] = zext i16 [[TMP43]] to i32 +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <8 x i16> [[TMP5]], i32 3 +; CHECK-NEXT: [[CONV_3_5:%.*]] = zext i16 [[TMP44]] to i32 +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <8 x i16> [[TMP5]], i32 2 +; CHECK-NEXT: [[CONV_2_5:%.*]] = zext i16 [[TMP45]] to i32 +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <8 x i16> [[TMP5]], i32 1 +; CHECK-NEXT: [[CONV_1_5:%.*]] = zext i16 [[TMP46]] to i32 +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <8 x i16> [[TMP5]], i32 0 +; CHECK-NEXT: [[CONV_556:%.*]] = zext i16 [[TMP47]] to i32 +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <8 x i16> [[TMP4]], i32 7 +; CHECK-NEXT: [[CONV_7_4:%.*]] = zext i16 [[TMP48]] to i32 +; CHECK-NEXT: [[TMP49:%.*]] = extractelement <8 x i16> [[TMP4]], i32 6 +; CHECK-NEXT: [[CONV_6_4:%.*]] = zext i16 [[TMP49]] to i32 +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <8 x i16> [[TMP4]], i32 5 +; CHECK-NEXT: [[CONV_5_4:%.*]] = zext i16 [[TMP50]] to i32 +; CHECK-NEXT: [[TMP51:%.*]] = extractelement <8 x i16> [[TMP4]], i32 4 +; CHECK-NEXT: [[CONV_4_4:%.*]] = zext i16 [[TMP51]] to i32 +; CHECK-NEXT: [[TMP52:%.*]] = extractelement <8 x i16> [[TMP4]], i32 3 +; CHECK-NEXT: [[CONV_3_4:%.*]] = zext i16 [[TMP52]] to i32 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <8 x i16> [[TMP4]], i32 2 +; CHECK-NEXT: [[CONV_2_4:%.*]] = zext i16 [[TMP53]] to i32 +; CHECK-NEXT: [[TMP54:%.*]] = extractelement <8 x i16> [[TMP4]], i32 1 +; CHECK-NEXT: [[CONV_1_4:%.*]] = zext i16 [[TMP54]] to i32 +; CHECK-NEXT: [[TMP55:%.*]] = extractelement <8 x i16> [[TMP4]], i32 0 +; CHECK-NEXT: [[CONV_452:%.*]] = zext i16 [[TMP55]] to i32 +; CHECK-NEXT: [[TMP56:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; CHECK-NEXT: [[CONV_7_3:%.*]] = zext i16 [[TMP56]] to i32 +; CHECK-NEXT: [[TMP57:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; CHECK-NEXT: [[CONV_6_3:%.*]] = zext i16 [[TMP57]] to i32 +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; CHECK-NEXT: [[CONV_5_3:%.*]] = zext i16 [[TMP58]] to i32 +; CHECK-NEXT: [[TMP59:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; CHECK-NEXT: [[CONV_4_3:%.*]] = zext i16 [[TMP59]] to i32 +; CHECK-NEXT: [[TMP60:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; CHECK-NEXT: [[CONV_3_3:%.*]] = zext i16 [[TMP60]] to i32 +; CHECK-NEXT: [[TMP61:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; CHECK-NEXT: [[CONV_2_3:%.*]] = zext i16 [[TMP61]] to i32 +; 
CHECK-NEXT: [[TMP62:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; CHECK-NEXT: [[CONV_1_3:%.*]] = zext i16 [[TMP62]] to i32 +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; CHECK-NEXT: [[CONV_348:%.*]] = zext i16 [[TMP63]] to i32 +; CHECK-NEXT: [[TMP64:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +; CHECK-NEXT: [[CONV_7_2:%.*]] = zext i16 [[TMP64]] to i32 +; CHECK-NEXT: [[TMP65:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 +; CHECK-NEXT: [[CONV_6_2:%.*]] = zext i16 [[TMP65]] to i32 +; CHECK-NEXT: [[TMP66:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 +; CHECK-NEXT: [[CONV_5_2:%.*]] = zext i16 [[TMP66]] to i32 +; CHECK-NEXT: [[TMP67:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 +; CHECK-NEXT: [[CONV_4_2:%.*]] = zext i16 [[TMP67]] to i32 +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 +; CHECK-NEXT: [[CONV_3_2:%.*]] = zext i16 [[TMP68]] to i32 +; CHECK-NEXT: [[TMP69:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 +; CHECK-NEXT: [[CONV_2_2:%.*]] = zext i16 [[TMP69]] to i32 +; CHECK-NEXT: [[TMP70:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 +; CHECK-NEXT: [[CONV_1_2:%.*]] = zext i16 [[TMP70]] to i32 +; CHECK-NEXT: [[TMP71:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; CHECK-NEXT: [[CONV_244:%.*]] = zext i16 [[TMP71]] to i32 +; CHECK-NEXT: [[TMP72:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; CHECK-NEXT: [[CONV_7_1:%.*]] = zext i16 [[TMP72]] to i32 +; CHECK-NEXT: [[TMP73:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; CHECK-NEXT: [[CONV_6_1:%.*]] = zext i16 [[TMP73]] to i32 +; CHECK-NEXT: [[TMP74:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; CHECK-NEXT: [[CONV_5_1:%.*]] = zext i16 [[TMP74]] to i32 +; CHECK-NEXT: [[TMP75:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; CHECK-NEXT: [[CONV_4_1:%.*]] = zext i16 [[TMP75]] to i32 +; CHECK-NEXT: [[TMP76:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; CHECK-NEXT: [[CONV_3_1:%.*]] = zext i16 [[TMP76]] to i32 +; CHECK-NEXT: [[TMP77:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; CHECK-NEXT: [[CONV_2_1:%.*]] = zext i16 [[TMP77]] to i32 +; CHECK-NEXT: [[TMP78:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; CHECK-NEXT: [[CONV_1_1:%.*]] = zext i16 [[TMP78]] to i32 +; CHECK-NEXT: [[TMP79:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; CHECK-NEXT: [[CONV_140:%.*]] = zext i16 [[TMP79]] to i32 +; CHECK-NEXT: [[TMP80:%.*]] = extractelement <8 x i16> [[TMP0]], i32 7 +; CHECK-NEXT: [[CONV_7:%.*]] = zext i16 [[TMP80]] to i32 +; CHECK-NEXT: [[TMP81:%.*]] = extractelement <8 x i16> [[TMP0]], i32 6 +; CHECK-NEXT: [[CONV_6:%.*]] = zext i16 [[TMP81]] to i32 +; CHECK-NEXT: [[TMP82:%.*]] = extractelement <8 x i16> [[TMP0]], i32 5 +; CHECK-NEXT: [[CONV_5:%.*]] = zext i16 [[TMP82]] to i32 +; CHECK-NEXT: [[TMP83:%.*]] = extractelement <8 x i16> [[TMP0]], i32 4 +; CHECK-NEXT: [[CONV_4:%.*]] = zext i16 [[TMP83]] to i32 +; CHECK-NEXT: [[TMP84:%.*]] = extractelement <8 x i16> [[TMP0]], i32 3 +; CHECK-NEXT: [[CONV_3:%.*]] = zext i16 [[TMP84]] to i32 +; CHECK-NEXT: [[TMP85:%.*]] = extractelement <8 x i16> [[TMP0]], i32 2 +; CHECK-NEXT: [[CONV_2:%.*]] = zext i16 [[TMP85]] to i32 +; CHECK-NEXT: [[TMP86:%.*]] = extractelement <8 x i16> [[TMP0]], i32 0 +; CHECK-NEXT: [[CONV:%.*]] = zext i16 [[TMP86]] to i32 +; CHECK-NEXT: [[TMP87:%.*]] = extractelement <8 x i16> [[TMP0]], i32 1 +; CHECK-NEXT: [[CONV_1:%.*]] = zext i16 [[TMP87]] to i32 +; CHECK-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[CONV]], [[CONV_1]] +; CHECK-NEXT: [[TMP88:%.*]] = mul nuw nsw <64 x i32> [[TMP24]], [[TMP24]] 
+; CHECK-NEXT: [[ADD_2:%.*]] = add nuw nsw i32 [[ADD_1]], [[CONV_2]] +; CHECK-NEXT: [[ADD_3:%.*]] = add nuw nsw i32 [[ADD_2]], [[CONV_3]] +; CHECK-NEXT: [[ADD_4:%.*]] = add nuw nsw i32 [[ADD_3]], [[CONV_4]] +; CHECK-NEXT: [[ADD_5:%.*]] = add nuw nsw i32 [[ADD_4]], [[CONV_5]] +; CHECK-NEXT: [[ADD_6:%.*]] = add nuw nsw i32 [[ADD_5]], [[CONV_6]] +; CHECK-NEXT: [[ADD_7:%.*]] = add nuw nsw i32 [[ADD_6]], [[CONV_7]] +; CHECK-NEXT: [[ADD_141:%.*]] = add nuw nsw i32 [[ADD_7]], [[CONV_140]] +; CHECK-NEXT: [[ADD_1_1:%.*]] = add nuw nsw i32 [[ADD_141]], [[CONV_1_1]] +; CHECK-NEXT: [[ADD_2_1:%.*]] = add nuw nsw i32 [[ADD_1_1]], [[CONV_2_1]] +; CHECK-NEXT: [[ADD_3_1:%.*]] = add nuw nsw i32 [[ADD_2_1]], [[CONV_3_1]] +; CHECK-NEXT: [[ADD_4_1:%.*]] = add nuw nsw i32 [[ADD_3_1]], [[CONV_4_1]] +; CHECK-NEXT: [[ADD_5_1:%.*]] = add nuw nsw i32 [[ADD_4_1]], [[CONV_5_1]] +; CHECK-NEXT: [[ADD_6_1:%.*]] = add nuw nsw i32 [[ADD_5_1]], [[CONV_6_1]] +; CHECK-NEXT: [[ADD_7_1:%.*]] = add nuw nsw i32 [[ADD_6_1]], [[CONV_7_1]] +; CHECK-NEXT: [[ADD_245:%.*]] = add nuw nsw i32 [[ADD_7_1]], [[CONV_244]] +; CHECK-NEXT: [[ADD_1_2:%.*]] = add nuw nsw i32 [[ADD_245]], [[CONV_1_2]] +; CHECK-NEXT: [[ADD_2_2:%.*]] = add nuw nsw i32 [[ADD_1_2]], [[CONV_2_2]] +; CHECK-NEXT: [[ADD_3_2:%.*]] = add nuw nsw i32 [[ADD_2_2]], [[CONV_3_2]] +; CHECK-NEXT: [[ADD_4_2:%.*]] = add nuw nsw i32 [[ADD_3_2]], [[CONV_4_2]] +; CHECK-NEXT: [[ADD_5_2:%.*]] = add nuw nsw i32 [[ADD_4_2]], [[CONV_5_2]] +; CHECK-NEXT: [[ADD_6_2:%.*]] = add nuw nsw i32 [[ADD_5_2]], [[CONV_6_2]] +; CHECK-NEXT: [[ADD_7_2:%.*]] = add nuw nsw i32 [[ADD_6_2]], [[CONV_7_2]] +; CHECK-NEXT: [[ADD_349:%.*]] = add nuw nsw i32 [[ADD_7_2]], [[CONV_348]] +; CHECK-NEXT: [[ADD_1_3:%.*]] = add nuw nsw i32 [[ADD_349]], [[CONV_1_3]] +; CHECK-NEXT: [[ADD_2_3:%.*]] = add nuw nsw i32 [[ADD_1_3]], [[CONV_2_3]] +; CHECK-NEXT: [[ADD_3_3:%.*]] = add nuw nsw i32 [[ADD_2_3]], [[CONV_3_3]] +; CHECK-NEXT: [[ADD_4_3:%.*]] = add nuw nsw i32 [[ADD_3_3]], [[CONV_4_3]] +; CHECK-NEXT: [[ADD_5_3:%.*]] = add nuw nsw i32 [[ADD_4_3]], [[CONV_5_3]] +; CHECK-NEXT: [[ADD_6_3:%.*]] = add nuw nsw i32 [[ADD_5_3]], [[CONV_6_3]] +; CHECK-NEXT: [[ADD_7_3:%.*]] = add nuw nsw i32 [[ADD_6_3]], [[CONV_7_3]] +; CHECK-NEXT: [[ADD_453:%.*]] = add nuw nsw i32 [[ADD_7_3]], [[CONV_452]] +; CHECK-NEXT: [[ADD_1_4:%.*]] = add nuw nsw i32 [[ADD_453]], [[CONV_1_4]] +; CHECK-NEXT: [[ADD_2_4:%.*]] = add nuw nsw i32 [[ADD_1_4]], [[CONV_2_4]] +; CHECK-NEXT: [[ADD_3_4:%.*]] = add nuw nsw i32 [[ADD_2_4]], [[CONV_3_4]] +; CHECK-NEXT: [[ADD_4_4:%.*]] = add nuw nsw i32 [[ADD_3_4]], [[CONV_4_4]] +; CHECK-NEXT: [[ADD_5_4:%.*]] = add nuw nsw i32 [[ADD_4_4]], [[CONV_5_4]] +; CHECK-NEXT: [[ADD_6_4:%.*]] = add nuw nsw i32 [[ADD_5_4]], [[CONV_6_4]] +; CHECK-NEXT: [[ADD_7_4:%.*]] = add nuw nsw i32 [[ADD_6_4]], [[CONV_7_4]] +; CHECK-NEXT: [[ADD_557:%.*]] = add nuw nsw i32 [[ADD_7_4]], [[CONV_556]] +; CHECK-NEXT: [[ADD_1_5:%.*]] = add nuw nsw i32 [[ADD_557]], [[CONV_1_5]] +; CHECK-NEXT: [[ADD_2_5:%.*]] = add nuw nsw i32 [[ADD_1_5]], [[CONV_2_5]] +; CHECK-NEXT: [[ADD_3_5:%.*]] = add nuw nsw i32 [[ADD_2_5]], [[CONV_3_5]] +; CHECK-NEXT: [[ADD_4_5:%.*]] = add nuw nsw i32 [[ADD_3_5]], [[CONV_4_5]] +; CHECK-NEXT: [[ADD_5_5:%.*]] = add nuw nsw i32 [[ADD_4_5]], [[CONV_5_5]] +; CHECK-NEXT: [[ADD_6_5:%.*]] = add nuw nsw i32 [[ADD_5_5]], [[CONV_6_5]] +; CHECK-NEXT: [[ADD_7_5:%.*]] = add nuw nsw i32 [[ADD_6_5]], [[CONV_7_5]] +; CHECK-NEXT: [[ADD_661:%.*]] = add nuw nsw i32 [[ADD_7_5]], [[CONV_660]] +; CHECK-NEXT: [[ADD_1_6:%.*]] = add nuw nsw i32 [[ADD_661]], 
[[CONV_1_6]] +; CHECK-NEXT: [[ADD_2_6:%.*]] = add nuw nsw i32 [[ADD_1_6]], [[CONV_2_6]] +; CHECK-NEXT: [[ADD_3_6:%.*]] = add nuw nsw i32 [[ADD_2_6]], [[CONV_3_6]] +; CHECK-NEXT: [[ADD_4_6:%.*]] = add nuw nsw i32 [[ADD_3_6]], [[CONV_4_6]] +; CHECK-NEXT: [[ADD_5_6:%.*]] = add nuw nsw i32 [[ADD_4_6]], [[CONV_5_6]] +; CHECK-NEXT: [[ADD_6_6:%.*]] = add nuw nsw i32 [[ADD_5_6]], [[CONV_6_6]] +; CHECK-NEXT: [[ADD_7_6:%.*]] = add nuw nsw i32 [[ADD_6_6]], [[CONV_7_6]] +; CHECK-NEXT: [[ADD_765:%.*]] = add nuw nsw i32 [[ADD_7_6]], [[CONV_764]] +; CHECK-NEXT: [[ADD_1_7:%.*]] = add nuw nsw i32 [[ADD_765]], [[CONV_1_7]] +; CHECK-NEXT: [[ADD_2_7:%.*]] = add nuw nsw i32 [[ADD_1_7]], [[CONV_2_7]] +; CHECK-NEXT: [[ADD_3_7:%.*]] = add nuw nsw i32 [[ADD_2_7]], [[CONV_3_7]] +; CHECK-NEXT: [[ADD_4_7:%.*]] = add nuw nsw i32 [[ADD_3_7]], [[CONV_4_7]] +; CHECK-NEXT: [[ADD_5_7:%.*]] = add nuw nsw i32 [[ADD_4_7]], [[CONV_5_7]] +; CHECK-NEXT: [[ADD_6_7:%.*]] = add nuw nsw i32 [[ADD_5_7]], [[CONV_6_7]] +; CHECK-NEXT: [[ADD_7_7:%.*]] = add nuw nsw i32 [[ADD_6_7]], [[CONV_7_7]] +; CHECK-NEXT: [[TMP89:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[TMP88]]) ; CHECK-NEXT: [[CONV15:%.*]] = zext i32 [[ADD_7_7]] to i64 -; CHECK-NEXT: [[CONV16:%.*]] = zext i32 [[TMP82]] to i64 +; CHECK-NEXT: [[CONV16:%.*]] = zext i32 [[TMP89]] to i64 ; CHECK-NEXT: [[SHL:%.*]] = shl nuw i64 [[CONV16]], 32 ; CHECK-NEXT: [[ADD17:%.*]] = or i64 [[SHL]], [[CONV15]] ; CHECK-NEXT: ret i64 [[ADD17]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll index 8093285ad8717c..6f6b66255a4340 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll @@ -3,63 +3,39 @@ ; Test case reported on D134605 where the vectorization was causing a slowdown due to an underestimation in the cost of the extractions. -; NOTE: cost of shuffle <4 x float>, <4 x float>, <2 x i32> is 12! 
- define fastcc i64 @zot(float %arg, float %arg1, float %arg2, float %arg3, float %arg4, ptr %arg5, i1 %arg6, i1 %arg7, i1 %arg8) { ; CHECK-LABEL: @zot( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[VAL:%.*]] = fmul fast float 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[VAL9:%.*]] = fmul fast float 0.000000e+00, [[ARG:%.*]] -; CHECK-NEXT: [[VAL10:%.*]] = fmul fast float [[ARG3:%.*]], 1.000000e+00 -; CHECK-NEXT: [[VAL11:%.*]] = fmul fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[ARG]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float [[ARG3:%.*]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <4 x float> , [[TMP2]] ; CHECK-NEXT: [[VAL12:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 -; CHECK-NEXT: [[VAL13:%.*]] = fadd fast float [[VAL12]], 2.000000e+00 -; CHECK-NEXT: [[VAL14:%.*]] = fadd fast float 0.000000e+00, 0.000000e+00 -; CHECK-NEXT: [[VAL15:%.*]] = fadd fast float [[VAL14]], 1.000000e+00 -; CHECK-NEXT: [[VAL16:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 -; CHECK-NEXT: [[VAL17:%.*]] = fadd fast float [[ARG3]], 1.000000e+00 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP2]], float [[VAL12]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float 0.000000e+00, i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <4 x float> [[TMP5]], ; CHECK-NEXT: br i1 [[ARG6:%.*]], label [[BB18:%.*]], label [[BB57:%.*]] ; CHECK: bb18: -; CHECK-NEXT: [[VAL19:%.*]] = phi float [ [[VAL13]], [[BB:%.*]] ] -; CHECK-NEXT: [[VAL20:%.*]] = phi float [ [[VAL15]], [[BB]] ] -; CHECK-NEXT: [[VAL21:%.*]] = phi float [ [[VAL16]], [[BB]] ] -; CHECK-NEXT: [[VAL22:%.*]] = phi float [ [[VAL17]], [[BB]] ] -; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[VAL16]], 2.000000e+00 -; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[VAL17]], 3.000000e+00 +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP6]], [[BB:%.*]] ] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[TMP6]], i32 2 +; CHECK-NEXT: [[VAL23:%.*]] = fmul fast float [[TMP8]], 2.000000e+00 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP6]], i32 3 +; CHECK-NEXT: [[VAL24:%.*]] = fmul fast float [[TMP9]], 3.000000e+00 ; CHECK-NEXT: br i1 [[ARG7:%.*]], label [[BB25:%.*]], label [[BB57]] ; CHECK: bb25: -; CHECK-NEXT: [[VAL26:%.*]] = phi float [ [[VAL19]], [[BB18]] ] -; CHECK-NEXT: [[VAL27:%.*]] = phi float [ [[VAL20]], [[BB18]] ] -; CHECK-NEXT: [[VAL28:%.*]] = phi float [ [[VAL21]], [[BB18]] ] -; CHECK-NEXT: [[VAL29:%.*]] = phi float [ [[VAL22]], [[BB18]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x float> [ [[TMP7]], [[BB18]] ] ; CHECK-NEXT: br label [[BB30:%.*]] ; CHECK: bb30: ; CHECK-NEXT: [[VAL31:%.*]] = phi float [ [[VAL55:%.*]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] ; CHECK-NEXT: [[VAL32:%.*]] = phi float [ [[VAL9]], [[BB30]] ], [ 0.000000e+00, [[BB25]] ] -; CHECK-NEXT: [[VAL33:%.*]] = load i8, ptr [[ARG5:%.*]], align 1 -; CHECK-NEXT: [[VAL34:%.*]] = uitofp i8 [[VAL33]] to float -; CHECK-NEXT: [[VAL35:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 1 -; CHECK-NEXT: [[VAL36:%.*]] = load i8, ptr [[VAL35]], align 1 -; CHECK-NEXT: [[VAL37:%.*]] = uitofp i8 [[VAL36]] to float -; CHECK-NEXT: [[VAL38:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 2 -; CHECK-NEXT: [[VAL39:%.*]] = load i8, ptr [[VAL38]], align 1 -; CHECK-NEXT: [[VAL40:%.*]] = uitofp i8 [[VAL39]] to float -; CHECK-NEXT: [[VAL41:%.*]] = getelementptr inbounds i8, ptr [[ARG5]], i64 
3 -; CHECK-NEXT: [[VAL42:%.*]] = load i8, ptr [[VAL41]], align 1 -; CHECK-NEXT: [[VAL43:%.*]] = uitofp i8 [[VAL42]] to float -; CHECK-NEXT: [[VAL44:%.*]] = fsub fast float [[VAL34]], [[VAL]] -; CHECK-NEXT: [[VAL45:%.*]] = fsub fast float [[VAL37]], [[VAL9]] -; CHECK-NEXT: [[VAL46:%.*]] = fsub fast float [[VAL40]], [[VAL10]] -; CHECK-NEXT: [[VAL47:%.*]] = fsub fast float [[VAL43]], [[VAL11]] -; CHECK-NEXT: [[VAL48:%.*]] = fmul fast float [[VAL44]], [[VAL26]] -; CHECK-NEXT: [[VAL49:%.*]] = fmul fast float [[VAL45]], [[VAL27]] -; CHECK-NEXT: [[VAL50:%.*]] = fadd fast float [[VAL49]], [[VAL48]] -; CHECK-NEXT: [[VAL51:%.*]] = fmul fast float [[VAL46]], [[VAL28]] -; CHECK-NEXT: [[VAL52:%.*]] = fadd fast float [[VAL50]], [[VAL51]] -; CHECK-NEXT: [[VAL53:%.*]] = fmul fast float [[VAL47]], [[VAL29]] -; CHECK-NEXT: [[VAL54:%.*]] = fadd fast float [[VAL52]], [[VAL53]] +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ARG5:%.*]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = uitofp <4 x i8> [[TMP11]] to <4 x float> +; CHECK-NEXT: [[TMP13:%.*]] = fsub fast <4 x float> [[TMP12]], [[TMP3]] +; CHECK-NEXT: [[TMP14:%.*]] = fmul fast <4 x float> [[TMP13]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP14]]) ; CHECK-NEXT: [[VAL55]] = tail call fast float @llvm.minnum.f32(float [[VAL31]], float [[ARG1:%.*]]) -; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[VAL54]]) +; CHECK-NEXT: [[VAL56:%.*]] = tail call fast float @llvm.maxnum.f32(float [[ARG2:%.*]], float [[TMP15]]) ; CHECK-NEXT: call void @ham(float [[VAL55]], float [[VAL56]]) ; CHECK-NEXT: br i1 [[ARG8:%.*]], label [[BB30]], label [[BB57]] ; CHECK: bb57: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll index 4f881823746228..e39cd8aaa111b1 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/shuffle-vectors-mask-size.ll @@ -7,13 +7,16 @@ define void @p(double %0) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x double> , double [[TMP0]], i32 3 ; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], zeroinitializer +; CHECK-NEXT: [[MUL16_150_1_I:%.*]] = fmul double 0.000000e+00, 0.000000e+00 ; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> zeroinitializer, [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP3]], zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> [[TMP3]], <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP14]], double [[MUL16_150_1_I]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> zeroinitializer, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x double> [[TMP4]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x double> @llvm.vector.insert.v4f64.v2f64(<4 x double> , <2 x double> [[TMP7]], i64 2) +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> , <4 x double> [[TMP9]], <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x double> [[TMP8]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd <4 x double> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = 
fptosi <4 x double> [[TMP12]] to <4 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll index ff1d6253bec928..95aa40f664c0ce 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -25,11 +25,11 @@ define void @s116_modified(ptr %a) { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP6]], <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP7]], [[TMP9]] ; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[A]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll index f04c359b432b5e..b59659ca75eb24 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll @@ -241,9 +241,12 @@ entry: define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-LABEL: @select_uniform_ugt_16xi8( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i8 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR:%.*]], align 1 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 8 ; CHECK-NEXT: [[L_8:%.*]] = load i8, ptr [[GEP_8]], align 1 ; CHECK-NEXT: [[CMP_8:%.*]] = icmp ugt i8 [[L_8]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0 +; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]] ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 9 ; CHECK-NEXT: [[L_9:%.*]] = load i8, ptr [[GEP_9]], align 1 ; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 10 @@ -251,28 +254,19 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 11 ; CHECK-NEXT: [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1 ; CHECK-NEXT: [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12 -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR]], align 1 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0 -; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = 
shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[L_9]], i32 9 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[L_10]], i32 10 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[L_11]], i32 11 -; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP6]], <8 x i8> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP7]], <4 x i8> [[TMP2]], i64 12) -; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt <16 x i8> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[L_9]], i32 9 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[L_10]], i32 10 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[L_11]], i32 11 -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP16]], <16 x i8> [[TMP18]] -; CHECK-NEXT: store <16 x i8> [[TMP19]], ptr [[PTR]], align 2 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt <16 x i8> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i8> [[TMP11]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = select <16 x i1> [[TMP10]], <16 x i8> [[TMP9]], <16 x i8> [[TMP12]] +; CHECK-NEXT: store <16 x i8> [[TMP13]], ptr [[PTR]], align 2 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll index cd79250e8fb6be..94a55c435c8c39 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/combined-loads-stored.ll @@ -4,11 +4,12 @@ define void @test(ptr noalias %p, ptr %p1) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ptr noalias [[P:%.*]], ptr [[P1:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[P]], align 2 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr [[GEP2]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.vector.insert.v4i16.v2i16(<4 x i16> poison, <2 x i16> [[TMP1]], i64 0) -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i16> @llvm.vector.insert.v4i16.v2i16(<4 x i16> [[TMP3]], <2 x i16> [[TMP2]], i64 2) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: 
[[TMP5:%.*]] = shufflevector <4 x i16> [[TMP3]], <4 x i16> [[TMP4]], <4 x i32> ; CHECK-NEXT: store <4 x i16> [[TMP5]], ptr [[P1]], align 2 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll index 151b91184bf428..ff3d2c4c59394c 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll @@ -1013,20 +1013,22 @@ declare i32 @llvm.abs.i32(i32, i1) define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) { ; CHECK-LABEL: @stride_sum_abs_diff( -; CHECK-NEXT: [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[STRIDE:%.*]] -; CHECK-NEXT: [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i64 [[STRIDE]] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[STRIDE:%.*]] +; CHECK-NEXT: [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q]], i64 [[STRIDE]] ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[P_2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[Q_2]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP1]], i64 0) -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP5]], <2 x i32> [[TMP3]], i64 2) -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP2]], i64 0) -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP7]], <2 x i32> [[TMP4]], i64 2) -; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true) -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]) -; CHECK-NEXT: ret i32 [[TMP11]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP7]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP11]], i1 true) +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) +; CHECK-NEXT: ret i32 [[TMP13]] ; %x.0 = load i32, ptr %p %y.0 = load i32, ptr %q @@ -1066,11 +1068,12 @@ define i32 @reduce_sum_2arrays_a(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P:%.*]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Q:%.*]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> poison, <4 x i8> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP2]], <4 x i8> [[TMP1]], i64 4) -; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] 
to <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) -; CHECK-NEXT: ret i32 [[TMP5]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) +; CHECK-NEXT: ret i32 [[TMP6]] ; entry: %x.0 = load i8, ptr %p, align 1 @@ -1114,11 +1117,12 @@ define i32 @reduce_sum_2arrays_b(ptr noalias noundef %x, ptr noalias %y) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[X:%.*]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[Y:%.*]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> poison, <4 x i8> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.vector.insert.v8i8.v4i8(<8 x i8> [[TMP2]], <4 x i8> [[TMP1]], i64 4) -; CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) -; CHECK-NEXT: ret i32 [[TMP5]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP0]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) +; CHECK-NEXT: ret i32 [[TMP6]] ; entry: %0 = load i8, ptr %x, align 1 diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll index 413aedefe9b6ad..0fcbead65d0d66 100644 --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll @@ -13,11 +13,12 @@ define void @foo() local_unnamed_addr { ; CHECK-NEXT: [[ARRAYIDX372:%.*]] = getelementptr inbounds [4 x [4 x i32]], ptr @dct_luma, i64 0, i64 3, i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([4 x [4 x i32]], ptr @bar, i64 0, i64 3, i64 2), align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP2]], <2 x i32> [[TMP1]], i64 2) -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> , i32 [[ADD277]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = ashr <4 x i32> [[TMP5]], -; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[ARRAYIDX372]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[ADD277]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = ashr <4 x i32> [[TMP6]], +; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[ARRAYIDX372]], align 4 ; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll index 96b498ced7d0f8..f7bd2431a76054 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll @@ -392,14 +392,16 @@ define void @vec_shuff_reorder() #0 { ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr @fb, align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP1]], i64 0) -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP3]], i64 2) -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP2]], i64 0) -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP7]], <2 x float> [[TMP4]], i64 2) -; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = fsub <4 x float> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP10]], <4 x i32> -; CHECK-NEXT: store <4 x float> [[TMP11]], ptr @fc, align 4 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP7]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = fsub <4 x float> [[TMP7]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x i32> +; CHECK-NEXT: store <4 x float> [[TMP13]], ptr @fc, align 4 ; CHECK-NEXT: ret void ; %1 = load float, ptr @fb, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll index 87b1302e4cecf4..3b03ca13ea65d0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-many-users-buildvector.ll @@ -6,25 +6,30 @@ define i1 @test(float %0, double %1) { ; CHECK-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> , float [[TMP0]], i32 3 ; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> zeroinitializer, [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> , <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] -; CHECK-NEXT: [[TMP12:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP10]], i64 
0) -; CHECK-NEXT: [[TMP13:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v4f64(<8 x double> , <4 x double> [[TMP11]], i64 0) -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x double> @llvm.vector.insert.v8f64.v2f64(<8 x double> [[TMP13]], <2 x double> [[TMP6]], i64 4) -; CHECK-NEXT: [[TMP15:%.*]] = fsub <8 x double> [[TMP12]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = fmul <8 x double> [[TMP12]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> [[TMP15]], <8 x double> [[TMP16]], <8 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = fptrunc <8 x double> [[TMP17]] to <8 x float> -; CHECK-NEXT: [[TMP19:%.*]] = fmul <8 x float> [[TMP18]], zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = fcmp oeq <8 x float> [[TMP19]], zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = freeze <8 x i1> [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP21]]) -; CHECK-NEXT: ret i1 [[TMP22]] +; CHECK-NEXT: [[TMP5:%.*]] = fpext float 0.000000e+00 to double +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> zeroinitializer, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP8]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x double> , double [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP10]], double [[TMP5]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = fmul <4 x double> [[TMP9]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP12]], <4 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP14]], <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP16]], <8 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <8 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x double> [[TMP17]], <8 x double> [[TMP18]], <8 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = fsub <8 x double> [[TMP15]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = fmul <8 x double> [[TMP15]], [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x double> [[TMP20]], <8 x double> [[TMP21]], <8 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = fptrunc <8 x double> [[TMP22]] to <8 x float> +; CHECK-NEXT: [[TMP24:%.*]] = fmul <8 x float> [[TMP23]], zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = fcmp oeq <8 x float> [[TMP24]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = freeze <8 x i1> [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP26]]) +; CHECK-NEXT: ret i1 [[TMP27]] ; %3 = fpext float %0 to double %4 = fpext float 0.000000e+00 to double diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll index 6ff03acf85cdfd..d326c855a10912 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-scalar-from-undef.ll @@ -4,19 +4,20 @@ define i64 @foo(i32 %tmp7) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7:%.*]], 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 undef, i32 4 -; CHECK-NEXT: [[TMP1:%.*]] = 
insertelement <8 x i32> [[TMP0]], i32 0, i32 5 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> , i32 [[TMP8]], i32 3 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> , <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP3]], <2 x i32> zeroinitializer, i64 0) -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <8 x i32> zeroinitializer, [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <8 x i32> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP10]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[TMP7:%.*]], i32 2 +; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[TMP0]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = sub i32 undef, 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> , i32 [[TMP24]], i32 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 0, i32 5 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP24]], i32 6 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP77:%.*]] = sub nsw <8 x i32> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP77]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = xor <8 x i32> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP10]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP11]], 0 ; CHECK-NEXT: [[TMP64:%.*]] = zext i32 [[OP_RDX]] to i64 ; CHECK-NEXT: ret i64 [[TMP64]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll index 757d0b1708b6fb..d80d7b5ecd4e76 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-as-vect-but-order.ll @@ -8,18 +8,19 @@ define void @foo(ptr %i7, i32 %0, i1 %tobool62.not) { ; CHECK-SAME: ptr [[I7:%.*]], i32 [[TMP0:%.*]], i1 [[TOBOOL62_NOT:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[RC21:%.*]] = alloca [0 x [0 x %struct.rect]], i32 0, align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[RC21]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = sitofp <2 x i32> [[TMP3]] to <2 x float> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[Y0:%.*]] = getelementptr i8, ptr [[RC21]], i64 8 -; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[Y0]], align 4 +; CHECK-NEXT: [[X1:%.*]] = getelementptr i8, ptr [[RC21]], i64 4 +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr [[X1]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[I7]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[RC21]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> poison, float [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = 
insertelement <4 x float> poison, float [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 3 -; CHECK-NEXT: [[TMP13:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP8]], i64 0) -; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = fcmp olt <4 x float> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fcmp olt <4 x float> [[TMP5]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x float> [[TMP5]], <4 x float> zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP12]], <4 x float> zeroinitializer, <4 x float> [[TMP15]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll index b0d9fea43a0e6c..fa022ad69af791 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -1016,13 +1016,15 @@ define i32 @maxi8_wrong_parent(i32) { ; THRESH-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr @arr, align 16 ; THRESH-NEXT: br label [[PP:%.*]] ; THRESH: pp: -; THRESH-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 -; THRESH-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 4), align 16 -; THRESH-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP4]], i64 4) -; THRESH-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 0) -; THRESH-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP3]], i64 2) -; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP7]]) -; THRESH-NEXT: ret i32 [[TMP8]] +; THRESH-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 +; THRESH-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 +; THRESH-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; THRESH-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <8 x i32> +; THRESH-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> [[TMP6]], <8 x i32> +; THRESH-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <8 x i32> +; THRESH-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> +; THRESH-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> [[TMP9]]) +; THRESH-NEXT: ret i32 [[TMP10]] ; %2 = load i32, ptr @arr, align 16 %3 = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 1), align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll index 54c950a0785020..6c4572593027d6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/inst_size_bug.ll @@ -4,20 +4,14 @@ define void @inst_size(ptr %a, <2 x i64> %b) { ; CHECK-LABEL: @inst_size( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMPL1:%.*]] = load i64, 
ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[PTR2]], align 4 -; CHECK-NEXT: [[PTR4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 3 -; CHECK-NEXT: [[TMPL4:%.*]] = load i64, ptr [[PTR4]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[B:%.*]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[TMPL1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP2]], <2 x i64> [[TMP0]], i64 2) -; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <4 x i64> zeroinitializer, [[TMP3]] -; CHECK-NEXT: [[T45:%.*]] = icmp sgt i64 0, [[TMPL4]] +; CHECK-NEXT: [[VAL:%.*]] = extractelement <2 x i64> [[B:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[T41:%.*]] = icmp sgt i64 0, [[VAL]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i64> zeroinitializer, [[TMP1]] ; CHECK-NEXT: br label [[BLOCK:%.*]] ; CHECK: block: -; CHECK-NEXT: [[PHI5:%.*]] = phi i1 [ [[T45]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi <4 x i1> [ [[TMP4]], [[ENTRY]] ] +; CHECK-NEXT: [[PHI1:%.*]] = phi i1 [ [[T41]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i1> [ [[TMP2]], [[ENTRY]] ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll b/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll index 813c5e7418b30e..47b42bc8f32a7d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/landing_pad.ll @@ -10,10 +10,10 @@ define void @foo() personality ptr @bar { ; CHECK: bb2.loopexit: ; CHECK-NEXT: br label [[BB2:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ [[TMP8:%.*]], [[BB9:%.*]] ], [ poison, [[BB2_LOOPEXIT:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ [[TMP7:%.*]], [[BB9:%.*]] ], [ poison, [[BB2_LOOPEXIT:%.*]] ] ; CHECK-NEXT: ret void ; CHECK: bb3: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP4:%.*]], [[BB6:%.*]] ], [ poison, [[BB1:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP3:%.*]], [[BB6:%.*]] ], [ poison, [[BB1:%.*]] ] ; CHECK-NEXT: [[TMP2:%.*]] = invoke i32 poison(ptr addrspace(1) nonnull poison, i32 0, i32 0, i32 poison) [ "deopt"() ] ; CHECK-NEXT: to label [[BB4:%.*]] unwind label [[BB10:%.*]] ; CHECK: bb4: @@ -21,30 +21,29 @@ define void @foo() personality ptr @bar { ; CHECK: bb5: ; CHECK-NEXT: br label [[BB7:%.*]] ; CHECK: bb6: -; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ , [[BB8:%.*]] ] -; CHECK-NEXT: [[TMP4]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3]] = phi <2 x i32> [ , [[BB8:%.*]] ] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb7: ; CHECK-NEXT: [[LOCAL_5_84111:%.*]] = phi i32 [ poison, [[BB8]] ], [ poison, [[BB5]] ] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[LOCAL_5_84111]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = invoke i32 poison(ptr addrspace(1) nonnull poison, i32 poison, i32 poison, i32 poison) [ "deopt"() ] +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[LOCAL_5_84111]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = invoke i32 poison(ptr addrspace(1) nonnull poison, i32 poison, i32 poison, i32 poison) [ "deopt"() ] ; CHECK-NEXT: to label [[BB8]] unwind label [[BB12:%.*]] ; CHECK: bb8: ; CHECK-NEXT: br i1 poison, label [[BB7]], label [[BB6]] ; CHECK: bb9: ; CHECK-NEXT: [[INDVARS_IV528799:%.*]] = phi i64 [ poison, [[BB10]] ], [ poison, 
[[BB12]] ] -; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ [[TMP9:%.*]], [[BB10]] ], [ [[TMP10:%.*]], [[BB12]] ] -; CHECK-NEXT: [[TMP8]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> poison, <2 x i32> [[TMP7]], i64 2) +; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ [[TMP8:%.*]], [[BB10]] ], [ [[TMP9:%.*]], [[BB12]] ] +; CHECK-NEXT: [[TMP7]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: br label [[BB2]] ; CHECK: bb10: -; CHECK-NEXT: [[TMP9]] = phi <2 x i32> [ [[TMP1]], [[BB3]] ] +; CHECK-NEXT: [[TMP8]] = phi <2 x i32> [ [[TMP1]], [[BB3]] ] ; CHECK-NEXT: [[LANDING_PAD68:%.*]] = landingpad { ptr, i32 } ; CHECK-NEXT: cleanup ; CHECK-NEXT: br label [[BB9]] ; CHECK: bb11: ; CHECK-NEXT: ret void ; CHECK: bb12: -; CHECK-NEXT: [[TMP10]] = phi <2 x i32> [ [[TMP5]], [[BB7]] ] +; CHECK-NEXT: [[TMP9]] = phi <2 x i32> [ [[TMP4]], [[BB7]] ] ; CHECK-NEXT: [[LANDING_PAD149:%.*]] = landingpad { ptr, i32 } ; CHECK-NEXT: cleanup ; CHECK-NEXT: br label [[BB9]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll index 7201583f3450e0..96151e0bd6c418 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -144,8 +144,8 @@ define float @foo3(ptr nocapture readonly %A) #0 { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP14:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[TMP0]], [[ENTRY]] ], [ [[TMP8:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0 ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00 ; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] @@ -154,25 +154,23 @@ define float @foo3(ptr nocapture readonly %A) #0 { ; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 -; CHECK-NEXT: [[TMP9]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP7]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP11]], <2 x float> [[TMP8]], i64 2) -; CHECK-NEXT: [[TMP13:%.*]] = fmul <4 x float> [[TMP12]], -; CHECK-NEXT: [[TMP14]] = fadd <4 x float> [[TMP3]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP15]], 121 +; CHECK-NEXT: [[TMP8]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP8]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP10]], +; CHECK-NEXT: [[TMP12]] = fadd <4 x float> [[TMP3]], [[TMP11]] +; 
CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP13]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP14]], i32 0 -; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP16]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP14]], i32 1 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP14]], i32 2 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP14]], i32 3 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP19]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP12]], i32 0 +; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP12]], i32 1 +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP12]], i32 2 +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP12]], i32 3 +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP17]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll index 12389f4a3dbf4a..865d8178667167 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -390,15 +390,14 @@ define i1 @logical_and_icmp_clamp_partial(<4 x i32> %x) { define i1 @logical_and_icmp_clamp_pred_diff(<4 x i32> %x) { ; CHECK-LABEL: @logical_and_icmp_clamp_pred_diff( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP2]], <4 x i32> [[X]], i64 0) -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt <8 x i32> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp ult <8 x i32> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i1> [[TMP5]], <8 x i1> [[TMP6]], <8 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = freeze <8 x i1> [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP8]]) -; CHECK-NEXT: ret i1 [[TMP9]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult <8 x i32> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = freeze <8 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP7]]) +; CHECK-NEXT: ret i1 [[TMP8]] ; %x0 = extractelement <4 x i32> %x, i32 0 %x1 = extractelement <4 x i32> %x, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll index 8aaa71ef47a8c9..7de2cde45525ae 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark-partial-loads-vectorize.ll @@ -10,7 +10,16 @@ ; YAML-NEXT: - String: 'SLP vectorized with cost ' ; YAML-NEXT: - Cost: '-4' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '5' +; YAML-NEXT: - TreeSize: '4' +; YAML-LABEL: --- !Passed +; YAML-NEXT: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedList +; YAML-NEXT: Function: test +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'SLP vectorized with cost ' +; YAML-NEXT: - Cost: '-2' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '2' define <4 x float> @test(ptr %x, float %v, float %a) { ; CHECK-LABEL: define <4 x float> @test( @@ -19,8 +28,9 @@ define <4 x float> @test(ptr %x, float %v, float %a) { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[A]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> poison, float [[V]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP1]], i64 2) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP3]], [[TMP7]] ; CHECK-NEXT: ret <4 x float> [[TMP8]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll index c01c44ff03c153..dadf5992ba288d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reused-pointer.ll @@ -5,25 +5,23 @@ define void @test(i1 %c, ptr %arg) { ; CHECK-LABEL: @test( ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: -; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 24 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP4]], i64 0) -; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP5]], <2 x i64> [[TMP2]], i64 2) +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARG:%.*]], align 8 +; CHECK-NEXT: [[ARG2_2:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 24 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr [[ARG2_2]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <4 x i32> ; CHECK-NEXT: br label [[JOIN:%.*]] ; CHECK: else: +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 ; CHECK-NEXT: [[ARG_2:%.*]] = 
getelementptr inbounds i8, ptr [[ARG]], i64 24 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> poison, <2 x i64> [[TMP10]], i64 0) -; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP11]], <2 x i64> [[TMP8]], i64 2) +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i64>, ptr [[ARG_2]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <4 x i32> ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: -; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i64> [ [[TMP6]], [[IF]] ], [ [[TMP12]], [[ELSE]] ] +; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x i64> [ [[TMP5]], [[IF]] ], [ [[TMP10]], [[ELSE]] ] ; CHECK-NEXT: ret void ; br i1 %c, label %if, label %else diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll index 207b2d45c335e0..d45054b6bebce7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule_budget_debug_info.ll @@ -14,21 +14,7 @@ declare void @unknown() define void @test(ptr %a, ptr %b, ptr %c, ptr %d) { ; VECTOR_DBG-LABEL: @test( ; VECTOR_DBG-NEXT: entry: -; VECTOR_DBG-NEXT: [[L0:%.*]] = load float, ptr [[A:%.*]], align 4 -; VECTOR_DBG-NEXT: [[A1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 -; VECTOR_DBG-NEXT: [[L1:%.*]] = load float, ptr [[A1]], align 4 -; VECTOR_DBG-NEXT: [[A2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2 -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3:![0-9]+]], !DIExpression(), [[META5:![0-9]+]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) -; VECTOR_DBG-NEXT: [[B1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 -; VECTOR_DBG-NEXT: [[B2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 -; VECTOR_DBG-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A2]], align 4 +; VECTOR_DBG-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4 ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() @@ -57,22 +43,22 @@ define void @test(ptr %a, ptr %b, ptr %c, ptr %d) { ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() ; VECTOR_DBG-NEXT: call void @unknown() -; VECTOR_DBG-NEXT: store float [[L0]], ptr [[B]], align 4 -; VECTOR_DBG-NEXT: store float [[L1]], ptr [[B1]], align 4 -; VECTOR_DBG-NEXT: store <2 x float> [[TMP0]], ptr [[B2]], align 4 +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3:![0-9]+]], 
!DIExpression(), [[META5:![0-9]+]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: #dbg_value(i16 1, [[META3]], !DIExpression(), [[META5]]) +; VECTOR_DBG-NEXT: store <4 x float> [[TMP0]], ptr [[B:%.*]], align 4 ; VECTOR_DBG-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4 ; VECTOR_DBG-NEXT: store <4 x float> [[TMP1]], ptr [[D:%.*]], align 4 ; VECTOR_DBG-NEXT: ret void ; ; VECTOR_NODBG-LABEL: @test( ; VECTOR_NODBG-NEXT: entry: -; VECTOR_NODBG-NEXT: [[L0:%.*]] = load float, ptr [[A:%.*]], align 4 -; VECTOR_NODBG-NEXT: [[A1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 -; VECTOR_NODBG-NEXT: [[L1:%.*]] = load float, ptr [[A1]], align 4 -; VECTOR_NODBG-NEXT: [[A2:%.*]] = getelementptr inbounds float, ptr [[A]], i64 2 -; VECTOR_NODBG-NEXT: [[B1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1 -; VECTOR_NODBG-NEXT: [[B2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2 -; VECTOR_NODBG-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A2]], align 4 +; VECTOR_NODBG-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4 ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() @@ -101,9 +87,7 @@ define void @test(ptr %a, ptr %b, ptr %c, ptr %d) { ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() ; VECTOR_NODBG-NEXT: call void @unknown() -; VECTOR_NODBG-NEXT: store float [[L0]], ptr [[B]], align 4 -; VECTOR_NODBG-NEXT: store float [[L1]], ptr [[B1]], align 4 -; VECTOR_NODBG-NEXT: store <2 x float> [[TMP0]], ptr [[B2]], align 4 +; VECTOR_NODBG-NEXT: store <4 x float> [[TMP0]], ptr [[B:%.*]], align 4 ; VECTOR_NODBG-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4 ; VECTOR_NODBG-NEXT: store <4 x float> [[TMP1]], ptr [[D:%.*]], align 4 ; VECTOR_NODBG-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll index 6ca1f8119c1cf0..6825f43b5a9eb4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -14,21 +14,22 @@ define dso_local void @_Z4testP1S(ptr %p) local_unnamed_addr { ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4 ; CHECK-NEXT: [[I7:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 ; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 12 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 14 -; CHECK-NEXT: [[I13:%.*]] = load i32, ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: [[I9:%.*]] = load i32, ptr [[ARRAYIDX27]], align 4 +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 13 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX34]], align 4 ; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 5 ; CHECK-NEXT: [[I15:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4 -; CHECK-NEXT: 
[[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX27]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[I1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[I7]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[I13]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[I15]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP8]], <2 x i32> [[TMP1]], i64 4) -; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP2]] -; CHECK-NEXT: store <8 x i32> [[TMP10]], ptr [[P]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[I9]], i32 4 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x i32> [[TMP9]], i32 [[I15]], i32 7 +; CHECK-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[TMP10]], [[TMP2]] +; CHECK-NEXT: store <8 x i32> [[TMP11]], ptr [[P]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -105,10 +106,11 @@ define dso_local void @test_unordered_splits(ptr nocapture %p) local_unnamed_add ; CHECK-NEXT: [[G20:%.*]] = getelementptr inbounds [16 x i32], ptr [[P2]], i32 0, i64 12 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[G10]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[G20]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> poison, <4 x i32> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> [[TMP2]], <4 x i32> [[TMP1]], i64 4) -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: store <8 x i32> [[TMP4]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: store <8 x i32> [[TMP5]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -163,11 +165,14 @@ define dso_local void @test_cost_splits(ptr nocapture %p) local_unnamed_addr { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[G12]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[G20]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr [[G22]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> poison, <2 x i32> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP4]], <2 x i32> [[TMP1]], i64 2) -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP2]], i64 4) -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP6]], <2 x i32> [[TMP3]], i64 6) -; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> +; 
CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> +; CHECK-NEXT: store <8 x i32> [[TMP10]], ptr [[P:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll index 3eabed5882e58b..eb3d395f4c6a6f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/tiny-tree.ll @@ -160,8 +160,9 @@ define void @tiny_tree_not_fully_vectorizable2(ptr noalias nocapture %dst, ptr n ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP4]], <2 x float> [[TMP2]], i64 2) -; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[DST_ADDR_022]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP5]], <4 x i32> +; CHECK-NEXT: store <4 x float> [[TMP6]], ptr [[DST_ADDR_022]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, ptr [[SRC_ADDR_021]], i64 [[I_023]] ; CHECK-NEXT: [[ADD_PTR8]] = getelementptr inbounds float, ptr [[DST_ADDR_022]], i64 [[I_023]] ; CHECK-NEXT: [[INC]] = add i64 [[I_023]], 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll index e1b091cc6fcda7..6ac6884ca5377f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vect-gather-same-nodes.ll @@ -8,14 +8,14 @@ define void @test(ptr %a, ptr %b) { ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX120:%.*]] = getelementptr [4 x float], ptr [[B:%.*]], i64 0, i64 3 ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[ARRAYIDX120]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr null, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[A:%.*]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP3]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP6]], <2 x float> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP3]], i32 2 ; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[TMP5]], [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = 
shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP5]], zeroinitializer