From e588fd994fe8ce0fa7804284f2a2a9a6922980fd Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Sat, 21 Sep 2024 15:24:49 -0700
Subject: [PATCH] Revert "[SLP]Vectorize gathered loads"

This reverts commit dc2deb53131b9d4c5e881229190bdda1ca3ea47f to fix the issue
reported in https://lab.llvm.org/buildbot/#/builders/25/builds/2668

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 871 +++---------------
 .../SLPVectorizer/AArch64/tsc-s116.ll         |  10 +-
 .../SLPVectorizer/AArch64/vec3-calls.ll       |  32 +-
 .../vectorizable-selects-uniform-cmps.ll      |  28 +-
 .../SLPVectorizer/X86/crash_dequeue.ll        |  12 +-
 .../SLPVectorizer/X86/horizontal-minmax.ll    |  27 +-
 .../X86/load-merge-inseltpoison.ll            |  21 +-
 .../SLPVectorizer/X86/load-merge.ll           |  21 +-
 .../Transforms/SLPVectorizer/X86/lookahead.ll |  28 +-
 .../SLPVectorizer/X86/pr47629-inseltpoison.ll | 270 +++---
 .../Transforms/SLPVectorizer/X86/pr47629.ll   | 270 +++---
 .../SLPVectorizer/X86/pr48879-sroa.ll         | 162 ++--
 .../X86/reorder-possible-strided-node.ll      |  36 +-
 .../X86/scatter-vectorize-reorder.ll          |  14 +-
 .../Transforms/SLPVectorizer/X86/sin-sqrt.ll  |  99 +-
 .../SLPVectorizer/X86/split-load8_2-unord.ll  |  26 +-
 .../Transforms/SLPVectorizer/X86/supernode.ll |  80 +-
 .../SLPVectorizer/X86/vec3-calls.ll           |  14 +-
 .../X86/vec_list_bias-inseltpoison.ll         |  39 +-
 .../SLPVectorizer/X86/vec_list_bias.ll        |  39 +-
 .../vec_list_bias_external_insert_shuffled.ll |  22 +-
 21 files changed, 835 insertions(+), 1286 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 97859907b909bb..3102ed9d844e56 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1338,7 +1338,6 @@ class BoUpSLP {
     MustGather.clear();
     NonScheduledFirst.clear();
     EntryToLastInstruction.clear();
-    GatheredLoadsEntriesFirst = NoGatheredLoads;
     ExternalUses.clear();
     ExternalUsesAsOriginalScalar.clear();
     for (auto &Iter : BlocksSchedules) {
@@ -1355,11 +1354,7 @@ class BoUpSLP {
     ValueToGatherNodes.clear();
   }

-  unsigned getTreeSize() const {
-    return GatheredLoadsEntriesFirst == NoGatheredLoads
-               ? VectorizableTree.size()
-               : GatheredLoadsEntriesFirst;
-  }
+  unsigned getTreeSize() const { return VectorizableTree.size(); }

   /// Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
@@ -1479,27 +1474,13 @@ class BoUpSLP {
   /// \param VL0 main load value.
   /// \param Order returned order of load instructions.
   /// \param PointerOps returned list of pointer operands.
-  /// \param BestVF return best vector factor, if recursive check found better
-  /// vectorization sequences rather than masked gather.
   /// \param TryRecursiveCheck used to check if long masked gather can be
   /// represented as a serie of loads/insert subvector, if profitable.
LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, SmallVectorImpl &Order, SmallVectorImpl &PointerOps, - unsigned *BestVF = nullptr, bool TryRecursiveCheck = true) const; - /// Registers non-vectorizable sequence of loads - template void registerNonVectorizableLoads(ArrayRef VL) { - ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL)); - } - - /// Checks if the given loads sequence is known as not vectorizable - template - bool areKnownNonVectorizableLoads(ArrayRef VL) const { - return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL)); - } - OptimizationRemarkEmitter *getORE() { return ORE; } /// This structure holds any data we need about the edges being traversed @@ -2981,12 +2962,6 @@ class BoUpSLP { /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree(bool ForReduction) const; - /// Run through the list of all gathered loads in the graph and try to find - /// vector loads/masked gathers instead of regular gathers. Later these loads - /// are reshufled to build final gathered nodes. - void tryToVectorizeGatheredLoads( - ArrayRef>> GatheredLoads); - /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. static void reorderInputsAccordingToOpcode(ArrayRef VL, @@ -3059,7 +3034,7 @@ class BoUpSLP { } bool isOperandGatherNode(const EdgeInfo &UserEI) const { - return isGather() && !UserTreeIndices.empty() && + return isGather() && (Idx > 0 || !UserTreeIndices.empty()) && UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx && UserTreeIndices.front().UserTE == UserEI.UserTE; } @@ -3406,12 +3381,6 @@ class BoUpSLP { assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || (Bundle && EntryState != TreeEntry::NeedToGather)) && "Need to vectorize gather entry?"); - // Gathered loads still gathered? Do not create entry, use the original one. - if (GatheredLoadsEntriesFirst != NoGatheredLoads && - EntryState == TreeEntry::NeedToGather && - S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX && - !UserTreeIdx.UserTE) - return nullptr; VectorizableTree.push_back(std::make_unique(VectorizableTree)); TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; @@ -3522,7 +3491,7 @@ class BoUpSLP { /// and fills required data before actual scheduling of the instructions. TreeEntry::EntryState getScalarsVectorizationState( InstructionsState &S, ArrayRef VL, bool IsScatterVectorizeUserTE, - OrdersType &CurrentOrder, SmallVectorImpl &PointerOps); + OrdersType &CurrentOrder, SmallVectorImpl &PointerOps) const; /// Maps a specific scalar to its tree entry. SmallDenseMap ScalarToTreeEntry; @@ -3557,10 +3526,6 @@ class BoUpSLP { DenseMap>; ValueToGatherNodesMap ValueToGatherNodes; - /// The index of the first gathered load entry in the VectorizeTree. - constexpr static int NoGatheredLoads = -1; - int GatheredLoadsEntriesFirst = NoGatheredLoads; - /// This POD struct describes one external user in the vectorized tree. struct ExternalUser { ExternalUser(Value *S, llvm::User *U, int L) @@ -3645,9 +3610,6 @@ class BoUpSLP { /// A list of blocks that we are going to CSE. DenseSet CSEBlocks; - /// List of hashes of vector of loads, which are known to be non vectorizable. - DenseSet ListOfKnonwnNonVectorizableLoads; - /// Contains all scheduling relevant data for an instruction. 
/// A ScheduleData either represents a single instruction or a member of an /// instruction bundle (= a group of instructions which is combined into a @@ -4735,21 +4697,15 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args); } -BoUpSLP::LoadsState -BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, - SmallVectorImpl &Order, - SmallVectorImpl &PointerOps, - unsigned *BestVF, bool TryRecursiveCheck) const { +BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( + ArrayRef VL, const Value *VL0, SmallVectorImpl &Order, + SmallVectorImpl &PointerOps, bool TryRecursiveCheck) const { // Check that a vectorized load would load the same memory as a scalar // load. For example, we don't want to vectorize loads that are smaller // than 8-bit. Even though we have a packed struct {} LLVM // treats loading/storing it as an i8 struct. If we vectorize loads/stores // from such a struct, we read/write packed bits disagreeing with the // unvectorized version. - if (BestVF) - *BestVF = 0; - if (areKnownNonVectorizableLoads(VL)) - return LoadsState::Gather; Type *ScalarTy = VL0->getType(); if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) @@ -4770,28 +4726,22 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, } Order.clear(); + auto *VecTy = getWidenedType(ScalarTy, Sz); // Check the order of pointer operands or that all pointers are the same. bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order); - auto *VecTy = getWidenedType(ScalarTy, Sz); Align CommonAlignment = computeCommonAlignment(VL); - if (!IsSorted) { - if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) { - if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) && - calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order)) - return LoadsState::StridedVectorize; - } - - if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || - TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) - return LoadsState::Gather; + if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) && + TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) && + calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order)) + return LoadsState::StridedVectorize; - if (!all_of(PointerOps, [&](Value *P) { - return arePointersCompatible(P, PointerOps.front(), *TLI); - })) - return LoadsState::Gather; + if (!IsSorted && !all_of(PointerOps, [&](Value *P) { + return arePointersCompatible(P, PointerOps.front(), *TLI); + })) + return LoadsState::Gather; - } else { + if (IsSorted) { Value *Ptr0; Value *PtrN; if (Order.empty()) { @@ -4806,9 +4756,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, // Check that the sorted loads are consecutive. if (static_cast(*Diff) == Sz - 1) return LoadsState::Vectorize; - if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) || - TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) - return LoadsState::Gather; // Simple check if not a strided access - clear order. bool IsPossibleStrided = *Diff % (Sz - 1) == 0; // Try to generate strided load node if: @@ -4868,10 +4815,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, // strided/masked gather loads. Returns true if vectorized + shuffles // representation is better than just gather. auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment, - unsigned *BestVF, bool ProfitableGatherPointers) { - if (BestVF) - *BestVF = 0; // Compare masked gather cost and loads + insert subvector costs. 
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; auto [ScalarGEPCost, VectorGEPCost] = @@ -4939,14 +4883,10 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, SmallVector Order; SmallVector PointerOps; LoadsState LS = - canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF, + canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, /*TryRecursiveCheck=*/false); // Check that the sorted loads are consecutive. if (LS == LoadsState::Gather) { - if (BestVF) { - DemandedElts.setAllBits(); - break; - } DemandedElts.setBits(Cnt, Cnt + VF); continue; } @@ -5039,11 +4979,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, // consider it as a gather node. It will be better estimated // later. if (MaskedGatherCost >= VecLdCost && - VecLdCost - GatherCost < -SLPCostThreshold) { - if (BestVF) - *BestVF = VF; + VecLdCost - GatherCost < -SLPCostThreshold) return true; - } } return MaskedGatherCost - GatherCost >= -SLPCostThreshold; }; @@ -5061,14 +4998,20 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, (GEP && GEP->getNumOperands() == 2 && isa(GEP->getOperand(1))); })) { - // Check if potential masked gather can be represented as series - // of loads + insertsubvectors. - // If masked gather cost is higher - better to vectorize, so - // consider it as a gather node. It will be better estimated - // later. - if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF, - ProfitableGatherPointers)) + Align CommonAlignment = computeCommonAlignment(VL); + if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) && + !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) { + // Check if potential masked gather can be represented as series + // of loads + insertsubvectors. + if (TryRecursiveCheck && + CheckForShuffledLoads(CommonAlignment, ProfitableGatherPointers)) { + // If masked gather cost is higher - better to vectorize, so + // consider it as a gather node. It will be better estimated + // later. + return LoadsState::Gather; + } return LoadsState::ScatterVectorize; + } } return LoadsState::Gather; @@ -5482,16 +5425,7 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { if (TE.Scalars.size() >= 3) if (std::optional Order = findPartiallyOrderedLoads(TE)) return Order; - // Check if can include the order of vectorized loads. For masked gathers do - // extra analysis later, so include such nodes into a special list. - if (TE.isGather() && TE.getOpcode() == Instruction::Load) { - SmallVector PointerOps; - OrdersType CurrentOrder; - LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(), - CurrentOrder, PointerOps); - if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize) - return std::move(CurrentOrder); - } + // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars // has been auditted for correctness with non-power-of-two vectors. if (!TE.isNonPowOf2Vec()) @@ -6381,476 +6315,6 @@ void BoUpSLP::buildTree(ArrayRef Roots) { buildTree_rec(Roots, 0, EdgeInfo()); } -/// Tries to find subvector of loads and builds new vector of only loads if can -/// be profitable. 
-static void gatherPossiblyVectorizableLoads( - const BoUpSLP &R, ArrayRef VL, const DataLayout &DL, - ScalarEvolution &SE, const TargetTransformInfo &TTI, - SmallVectorImpl>> &GatheredLoads, - bool AddNew = true) { - if (VL.empty()) - return; - Type *ScalarTy = getValueType(VL.front()); - if (!isValidElementType(ScalarTy)) - return; - const int NumScalars = VL.size(); - int NumParts = 1; - if (NumScalars > 1) { - auto *VecTy = getWidenedType(ScalarTy, NumScalars); - NumParts = TTI.getNumberOfParts(VecTy); - if (NumParts == 0 || NumParts >= NumScalars) - NumParts = 1; - } - unsigned VF = PowerOf2Ceil(NumScalars / NumParts); - SmallVector>> ClusteredLoads; - for (int I : seq(NumParts)) { - for (Value *V : - VL.slice(I * VF, std::min(VF, VL.size() - I * VF))) { - auto *LI = dyn_cast(V); - if (!LI) - continue; - if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple()) - continue; - bool IsFound = false; - for (auto &Data : ClusteredLoads) { - if (LI->getParent() != Data.front().first->getParent()) - continue; - std::optional Dist = - getPointersDiff(LI->getType(), LI->getPointerOperand(), - Data.front().first->getType(), - Data.front().first->getPointerOperand(), DL, SE, - /*StrictCheck=*/true); - if (Dist && all_of(Data, [&](const std::pair &Pair) { - IsFound |= Pair.first == LI; - return IsFound || Pair.second != *Dist; - })) { - if (!IsFound) - Data.emplace_back(LI, *Dist); - IsFound = true; - break; - } - } - if (!IsFound) - ClusteredLoads.emplace_back().emplace_back(LI, 0); - } - } - auto FindMatchingLoads = - [&](ArrayRef> Loads, - SmallVectorImpl>> - &GatheredLoads, - SetVector &ToAdd, SetVector &Repeated, - int &Offset, unsigned &Start) { - if (Loads.empty()) - return GatheredLoads.end(); - SmallVector> Res; - LoadInst *LI = Loads.front().first; - for (auto [Idx, Data] : enumerate(GatheredLoads)) { - if (Idx < Start) - continue; - ToAdd.clear(); - if (LI->getParent() != Data.front().first->getParent()) - continue; - std::optional Dist = - getPointersDiff(LI->getType(), LI->getPointerOperand(), - Data.front().first->getType(), - Data.front().first->getPointerOperand(), DL, SE, - /*StrictCheck=*/true); - if (Dist) { - // Found matching gathered loads - check if all loads are unique or - // can be effectively vectorized. 
- unsigned NumUniques = 0; - for (auto [Cnt, Pair] : enumerate(Loads)) { - bool Used = any_of( - Data, [&, &P = Pair](const std::pair &PD) { - return PD.first == P.first; - }); - if (!Used && - none_of(Data, - [&, &P = Pair](const std::pair &PD) { - return *Dist + P.second == PD.second; - })) { - ++NumUniques; - ToAdd.insert(Cnt); - } else if (Used) { - Repeated.insert(Cnt); - } - } - if (NumUniques > 0 && - (Loads.size() == NumUniques || - (Loads.size() - NumUniques >= 2 && - Loads.size() - NumUniques >= Loads.size() / 2 && - (has_single_bit(Data.size() + NumUniques) || - bit_ceil(Data.size()) < - bit_ceil(Data.size() + NumUniques))))) { - Offset = *Dist; - Start = Idx + 1; - return std::next(GatheredLoads.begin(), Idx); - } - } - } - ToAdd.clear(); - return GatheredLoads.end(); - }; - for (ArrayRef> Data : ClusteredLoads) { - unsigned Start = 0; - SetVector ToAdd, LocalToAdd, Repeated; - int Offset = 0; - auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, - Offset, Start); - while (It != GatheredLoads.end()) { - assert(!LocalToAdd.empty() && "Expected some elements to add."); - for (unsigned Idx : LocalToAdd) - It->emplace_back(Data[Idx].first, Data[Idx].second + Offset); - ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end()); - It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset, - Start); - } - if (any_of(seq(Data.size()), [&](unsigned Idx) { - return !ToAdd.contains(Idx) && !Repeated.contains(Idx); - })) { - auto AddNewLoads = - [&](SmallVectorImpl> &Loads) { - for (unsigned Idx : seq(Data.size())) { - if (ToAdd.contains(Idx) || Repeated.contains(Idx)) - continue; - Loads.push_back(Data[Idx]); - } - }; - if (!AddNew) { - LoadInst *LI = Data.front().first; - It = find_if( - GatheredLoads, [&](ArrayRef> PD) { - return PD.front().first->getParent() == LI->getParent() && - PD.front().first->getType() == LI->getType(); - }); - while (It != GatheredLoads.end()) { - AddNewLoads(*It); - It = std::find_if( - std::next(It), GatheredLoads.end(), - [&](ArrayRef> PD) { - return PD.front().first->getParent() == LI->getParent() && - PD.front().first->getType() == LI->getType(); - }); - } - } - GatheredLoads.emplace_back().append(Data.begin(), Data.end()); - AddNewLoads(GatheredLoads.emplace_back()); - } - } -} - -void BoUpSLP::tryToVectorizeGatheredLoads( - ArrayRef>> GatheredLoads) { - GatheredLoadsEntriesFirst = VectorizableTree.size(); - - // Sort loads by distance. - auto LoadSorter = [](const std::pair &L1, - const std::pair &L2) { - return L1.second > L2.second; - }; - - auto IsMaskedGatherSupported = [&](ArrayRef Loads) { - ArrayRef Values(reinterpret_cast(Loads.begin()), - Loads.size()); - Align Alignment = computeCommonAlignment(Values); - auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size()); - return TTI->isLegalMaskedGather(Ty, Alignment) && - !TTI->forceScalarizeMaskedGather(Ty, Alignment); - }; - - auto GetVectorizedRanges = [this](ArrayRef Loads, - BoUpSLP::ValueSet &VectorizedLoads, - SmallVectorImpl &NonVectorized, - bool Final, unsigned MaxVF) { - SmallVector, LoadsState>> Results; - unsigned StartIdx = 0; - SmallVector CandidateVFs; - if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1)) - CandidateVFs.push_back(MaxVF); - for (int NumElts = bit_floor(MaxVF); NumElts > 1; NumElts /= 2) { - CandidateVFs.push_back(NumElts); - if (VectorizeNonPowerOf2 && NumElts > 2) - CandidateVFs.push_back(NumElts - 1); - } - - if (Final && CandidateVFs.empty()) - return Results; - - unsigned BestVF = Final ? 
CandidateVFs.back() : 0; - for (unsigned NumElts : CandidateVFs) { - if (Final && NumElts > BestVF) - continue; - SmallVector MaskedGatherVectorized; - for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt + NumElts <= E; - ++Cnt) { - ArrayRef Slice = ArrayRef(Loads).slice(Cnt, NumElts); - if (VectorizedLoads.count(Slice.front()) || - VectorizedLoads.count(Slice.back()) || - areKnownNonVectorizableLoads(Slice)) - continue; - // Check if it is profitable to try vectorizing gathered loads. It is - // profitable if we have more than 3 consecutive loads or if we have - // less but all users are vectorized or deleted. - bool AllowToVectorize = - NumElts >= 3 || - any_of(ValueToGatherNodes.at(Slice.front()), - [=](const TreeEntry *TE) { - return TE->Scalars.size() == 2 && - ((TE->Scalars.front() == Slice.front() && - TE->Scalars.back() == Slice.back()) || - (TE->Scalars.front() == Slice.back() && - TE->Scalars.back() == Slice.front())); - }); - // Check if it is profitable to vectorize 2-elements loads. - if (NumElts == 2) { - bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad( - Slice.front()->getType(), ElementCount::getFixed(NumElts)); - auto CheckIfAllowed = [=](ArrayRef Slice) { - for (LoadInst *LI : Slice) { - // If single use/user - allow to vectorize. - if (LI->hasOneUse()) - continue; - // 1. Check if number of uses equals number of users. - // 2. All users are deleted. - // 3. The load broadcasts are not allowed or the load is not - // broadcasted. - if (std::distance(LI->user_begin(), LI->user_end()) != - LI->getNumUses()) - return false; - if (!IsLegalBroadcastLoad) - continue; - if (LI->hasNUsesOrMore(UsesLimit)) - return false; - for (User *U : LI->users()) { - if (auto *UI = dyn_cast(U); UI && isDeleted(UI)) - continue; - if (const TreeEntry *UTE = getTreeEntry(U)) { - for (int I : seq(UTE->getNumOperands())) { - if (all_of(UTE->getOperand(I), - [LI](Value *V) { return V == LI; })) - // Found legal broadcast - do not vectorize. - return false; - } - } - } - } - return true; - }; - AllowToVectorize = CheckIfAllowed(Slice); - } - if (AllowToVectorize) { - SmallVector PointerOps; - OrdersType CurrentOrder; - // Try to build vector load. - ArrayRef Values( - reinterpret_cast(Slice.begin()), Slice.size()); - LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder, - PointerOps, &BestVF); - if (LS != LoadsState::Gather || - (BestVF > 1 && static_cast(NumElts) == 2 * BestVF)) { - if (LS == LoadsState::ScatterVectorize) { - if (MaskedGatherVectorized.empty() || - Cnt >= MaskedGatherVectorized.back() + NumElts) - MaskedGatherVectorized.push_back(Cnt); - continue; - } - if (LS != LoadsState::Gather) { - Results.emplace_back(Values, LS); - VectorizedLoads.insert(Slice.begin(), Slice.end()); - // If we vectorized initial block, no need to try to vectorize it - // again. - if (Cnt == StartIdx) - StartIdx += NumElts; - } - // Check if the whole array was vectorized already - exit. - if (StartIdx >= Loads.size()) - break; - // Erase last masked gather candidate, if another candidate within - // the range is found to be better. - if (!MaskedGatherVectorized.empty() && - Cnt < MaskedGatherVectorized.back() + NumElts) - MaskedGatherVectorized.pop_back(); - Cnt += NumElts - 1; - continue; - } - } - if (!AllowToVectorize || BestVF == 0) - registerNonVectorizableLoads(Slice); - } - // Mark masked gathers candidates as vectorized, if any. 
- for (unsigned Cnt : MaskedGatherVectorized) { - ArrayRef Slice = ArrayRef(Loads).slice(Cnt, NumElts); - ArrayRef Values( - reinterpret_cast(Slice.begin()), Slice.size()); - Results.emplace_back(Values, LoadsState::ScatterVectorize); - VectorizedLoads.insert(Slice.begin(), Slice.end()); - // If we vectorized initial block, no need to try to vectorize it again. - if (Cnt == StartIdx) - StartIdx += NumElts; - } - } - for (LoadInst *LI : Loads) { - if (!VectorizedLoads.contains(LI)) - NonVectorized.push_back(LI); - } - return Results; - }; - auto ProcessGatheredLoads = - [&](ArrayRef>> GatheredLoads, - bool Final = false) { - SmallVector NonVectorized; - for (ArrayRef> LoadsDists : GatheredLoads) { - if (LoadsDists.size() <= 1) { - NonVectorized.push_back(LoadsDists.back().first); - continue; - } - SmallVector> LocalLoadsDists(LoadsDists); - SmallVector OriginalLoads(LocalLoadsDists.size()); - transform( - LoadsDists, OriginalLoads.begin(), - [](const std::pair &L) { return L.first; }); - stable_sort(LocalLoadsDists, LoadSorter); - SmallVector Loads; - unsigned MaxConsecutiveDistance = 0; - unsigned CurrentConsecutiveDist = 1; - int LastDist = std::numeric_limits::max(); - bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads); - for (const std::pair &L : LocalLoadsDists) { - if (getTreeEntry(L.first)) - continue; - assert(LastDist >= L.second && - "Expected first distance always not less than second"); - if (static_cast(LastDist - L.second) == - CurrentConsecutiveDist) { - ++CurrentConsecutiveDist; - MaxConsecutiveDistance = - std::max(MaxConsecutiveDistance, CurrentConsecutiveDist); - Loads.push_back(L.first); - continue; - } - if (!AllowMaskedGather && CurrentConsecutiveDist == 1 && - !Loads.empty()) - Loads.pop_back(); - CurrentConsecutiveDist = 1; - LastDist = L.second; - Loads.push_back(L.first); - } - if (Loads.size() <= 1) - continue; - if (AllowMaskedGather) - MaxConsecutiveDistance = Loads.size(); - else if (MaxConsecutiveDistance < 2) - continue; - BoUpSLP::ValueSet VectorizedLoads; - SmallVector SortedNonVectorized; - SmallVector, LoadsState>> Results = - GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized, - Final, MaxConsecutiveDistance); - if (!Results.empty() && !SortedNonVectorized.empty() && - OriginalLoads.size() == Loads.size() && - MaxConsecutiveDistance == Loads.size() && - all_of(Results, - [](const std::pair, LoadsState> &P) { - return P.second == LoadsState::ScatterVectorize; - })) { - VectorizedLoads.clear(); - SmallVector UnsortedNonVectorized; - SmallVector, LoadsState>> - UnsortedResults = - GetVectorizedRanges(OriginalLoads, VectorizedLoads, - UnsortedNonVectorized, Final, - OriginalLoads.size()); - if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) { - SortedNonVectorized.swap(UnsortedNonVectorized); - Results.swap(UnsortedResults); - } - } - for (auto [Slice, _] : Results) { - LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads (" - << Slice.size() << ")\n"); - if (any_of(Slice, [&](Value *V) { return getTreeEntry(V); })) { - for (Value *L : Slice) - if (!getTreeEntry(L)) - SortedNonVectorized.push_back(cast(L)); - continue; - } - - // Select maximum VF as a maximum of user gathered nodes and - // distance between scalar loads in these nodes. 
- unsigned MaxVF = Slice.size(); - unsigned UserMaxVF = 0; - if (MaxVF == 2) { - UserMaxVF = MaxVF; - } else { - std::optional CommonVF = 0; - DenseMap EntryToPosition; - for (auto [Idx, V] : enumerate(Slice)) { - for (const TreeEntry *E : ValueToGatherNodes.at(V)) { - UserMaxVF = std::max(UserMaxVF, E->Scalars.size()); - unsigned Pos = - EntryToPosition.try_emplace(E, Idx).first->second; - UserMaxVF = std::max(UserMaxVF, Idx - Pos + 1); - if (CommonVF) { - if (*CommonVF == 0) { - CommonVF = E->Scalars.size(); - continue; - } - if (*CommonVF != E->Scalars.size()) - CommonVF.reset(); - } - } - } - // Try to build long masked gather loads. - UserMaxVF = bit_ceil(UserMaxVF); - } - for (unsigned VF = MaxVF; VF >= 2; VF /= 2) { - bool IsVectorized = true; - for (unsigned I = 0, E = Slice.size(); I < E; I += VF) { - ArrayRef SubSlice = - Slice.slice(I, std::min(VF, E - I)); - if (getTreeEntry(SubSlice.front())) - continue; - unsigned Sz = VectorizableTree.size(); - buildTree_rec(SubSlice, 0, EdgeInfo()); - if (Sz == VectorizableTree.size()) { - IsVectorized = false; - continue; - } - } - if (IsVectorized) - break; - } - } - NonVectorized.append(SortedNonVectorized); - } - return NonVectorized; - }; - SmallVector NonVectorized = ProcessGatheredLoads(GatheredLoads); - if (!GatheredLoads.empty() && !NonVectorized.empty() && - std::accumulate( - GatheredLoads.begin(), GatheredLoads.end(), 0u, - [](unsigned S, ArrayRef> LoadsDists) { - return S + LoadsDists.size(); - }) != NonVectorized.size() && - IsMaskedGatherSupported(NonVectorized)) { - SmallVector>> FinalGatheredLoads; - for (LoadInst *LI : NonVectorized) { - // Reinsert non-vectorized loads to other list of loads with the same - // base pointers. - gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI, - FinalGatheredLoads, - /*AddNew=*/false); - } - // Final attempt to vectorize non-vectorized loads. - (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true); - } - // If no new entries created, consider it as no gathered loads entries must be - // handled. - if (static_cast(GatheredLoadsEntriesFirst) == - VectorizableTree.size()) - GatheredLoadsEntriesFirst = NoGatheredLoads; -} - /// \return true if the specified list of values has only one instruction that /// requires scheduling, false otherwise. #ifndef NDEBUG @@ -7072,7 +6536,7 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S, BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( InstructionsState &S, ArrayRef VL, bool IsScatterVectorizeUserTE, - OrdersType &CurrentOrder, SmallVectorImpl &PointerOps) { + OrdersType &CurrentOrder, SmallVectorImpl &PointerOps) const { assert(S.MainOp && "Expected instructions with same/alternate opcodes only."); if (S.MainOp->getType()->isFloatingPointTy() && @@ -7171,7 +6635,6 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( else LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); #endif // NDEBUG - registerNonVectorizableLoads(VL); return TreeEntry::NeedToGather; } llvm_unreachable("Unexpected state of loads"); @@ -7562,65 +7025,64 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, InstructionsState S = getSameOpcode(VL, *TLI); - // Don't go into catchswitch blocks, which can happen with PHIs. - // Such blocks can only have PHIs and the catchswitch. There is no - // place to insert a shuffle if we need to, so just avoid that issue. 
- if (S.MainOp && - isa(S.MainOp->getParent()->getTerminator())) { - LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n"); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); - return; + // Don't vectorize ephemeral values. + if (!EphValues.empty()) { + for (Value *V : VL) { + if (EphValues.count(V)) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V + << ") is ephemeral.\n"); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); + return; + } + } } // Check if this is a duplicate of another entry. - if (S.getOpcode()) { - if (TreeEntry *E = getTreeEntry(S.OpValue)) { - LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); - if (GatheredLoadsEntriesFirst != NoGatheredLoads || !E->isSame(VL)) { - auto It = MultiNodeScalars.find(S.OpValue); - if (It != MultiNodeScalars.end()) { - auto *TEIt = find_if(It->getSecond(), - [&](TreeEntry *ME) { return ME->isSame(VL); }); - if (TEIt != It->getSecond().end()) - E = *TEIt; - else - E = nullptr; - } else { + if (TreeEntry *E = getTreeEntry(S.OpValue)) { + LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); + if (!E->isSame(VL)) { + auto It = MultiNodeScalars.find(S.OpValue); + if (It != MultiNodeScalars.end()) { + auto *TEIt = find_if(It->getSecond(), + [&](TreeEntry *ME) { return ME->isSame(VL); }); + if (TEIt != It->getSecond().end()) + E = *TEIt; + else E = nullptr; - } - } - if (!E) { - if (!doesNotNeedToBeScheduled(S.OpValue)) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - if (TryToFindDuplicates(S)) - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndices); - return; - } - SmallPtrSet Nodes; - Nodes.insert(getTreeEntry(S.OpValue)); - for (const TreeEntry *E : MultiNodeScalars.lookup(S.OpValue)) - Nodes.insert(E); - SmallPtrSet Values(VL.begin(), VL.end()); - if (any_of(Nodes, [&](const TreeEntry *E) { - return all_of(E->Scalars, - [&](Value *V) { return Values.contains(V); }); - })) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n"); - if (TryToFindDuplicates(S)) - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, - ReuseShuffleIndices); - return; - } } else { - // Record the reuse of the tree node. FIXME, currently this is only - // used to properly draw the graph rather than for the actual - // vectorization. - E->UserTreeIndices.push_back(UserTreeIdx); - LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue - << ".\n"); + E = nullptr; + } + } + if (!E) { + if (!doesNotNeedToBeScheduled(S.OpValue)) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndices); return; } + SmallPtrSet Nodes; + Nodes.insert(getTreeEntry(S.OpValue)); + for (const TreeEntry *E : MultiNodeScalars.lookup(S.OpValue)) + Nodes.insert(E); + SmallPtrSet Values(VL.begin(), VL.end()); + if (any_of(Nodes, [&](const TreeEntry *E) { + return all_of(E->Scalars, + [&](Value *V) { return Values.contains(V); }); + })) { + LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n"); + if (TryToFindDuplicates(S)) + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndices); + return; + } + } else { + // Record the reuse of the tree node. FIXME, currently this is only used + // to properly draw the graph rather than for the actual vectorization. 
+ E->UserTreeIndices.push_back(UserTreeIdx); + LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue + << ".\n"); + return; } } @@ -7739,7 +7201,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE, SortedIndices)); bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock; - if (!AreAllSameInsts || (!S.getOpcode() && allConstant(VL)) || isSplat(VL) || + if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) || (isa( S.OpValue) && !all_of(VL, isVectorLikeInstWithConstOps)) || @@ -7751,18 +7213,6 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, return; } - // Don't vectorize ephemeral values. - if (S.getOpcode() && !EphValues.empty()) { - for (Value *V : VL) { - if (EphValues.count(V)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V - << ") is ephemeral.\n"); - newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); - return; - } - } - } - // We now know that this is a vector of instructions of the same type from // the same block. @@ -7784,7 +7234,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // The reduction nodes (stored in UserIgnoreList) also should stay scalar. if (UserIgnoreList && !UserIgnoreList->empty()) { for (Value *V : VL) { - if (UserIgnoreList->contains(V)) { + if (UserIgnoreList && UserIgnoreList->contains(V)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); if (TryToFindDuplicates(S)) newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, @@ -7811,7 +7261,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, auto *VL0 = cast(S.OpValue); BB = VL0->getParent(); - if (S.MainOp && !DT->isReachableFromEntry(BB)) { + if (!DT->isReachableFromEntry(BB)) { // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); @@ -7819,6 +7269,15 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, return; } + // Don't go into catchswitch blocks, which can happen with PHIs. + // Such blocks can only have PHIs and the catchswitch. There is no + // place to insert a shuffle if we need to, so just avoid that issue. + if (isa(BB->getTerminator())) { + LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n"); + newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx); + return; + } + // Check that every instruction appears once in this bundle. if (!TryToFindDuplicates(S, /*DoNotFail=*/true)) return; @@ -7854,9 +7313,6 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndices); NonScheduledFirst.insert(VL.front()); - if (S.getOpcode() == Instruction::Load && - BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit) - registerNonVectorizableLoads(VL); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -9111,40 +8567,6 @@ void BoUpSLP::transformNodes() { break; } } - - // Single load node - exit. - if (VectorizableTree.size() <= 1 && - VectorizableTree.front()->getOpcode() == Instruction::Load) - return; - // Small graph with small VF - exit. - constexpr unsigned SmallTree = 3; - constexpr unsigned SmallVF = 2; - if ((VectorizableTree.size() <= SmallTree && - VectorizableTree.front()->Scalars.size() == SmallVF) || - (VectorizableTree.size() <= 2 && UserIgnoreList)) - return; - - // A list of loads to be gathered during the vectorization process. 
We can - // try to vectorize them at the end, if profitable. - SmallVector>> GatheredLoads; - - for (std::unique_ptr &TE : VectorizableTree) { - TreeEntry &E = *TE; - if (E.isGather() && - (E.getOpcode() == Instruction::Load || - (!E.getOpcode() && any_of(E.Scalars, - [&](Value *V) { - return isa(V) && - !isVectorized(V) && - !isDeleted(cast(V)); - }))) && - !isSplat(E.Scalars)) - gatherPossiblyVectorizableLoads(*this, E.Scalars, *DL, *SE, *TTI, - GatheredLoads); - } - // Try to vectorize gathered loads if this is not just a gather of loads. - if (!GatheredLoads.empty()) - tryToVectorizeGatheredLoads(GatheredLoads); } /// Merges shuffle masks and emits final shuffle instruction, if required. It @@ -10174,9 +9596,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, InstructionCost VecCost = VectorCost(CommonCost); // Check if the current node must be resized, if the parent node is not // resized. - if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0 && - (E->getOpcode() != Instruction::Load || - !E->UserTreeIndices.empty())) { + if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) { const EdgeInfo &EI = E->UserTreeIndices.front(); if ((EI.UserTE->getOpcode() != Instruction::Select || EI.EdgeIdx != 0) && @@ -10883,8 +10303,8 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { ((TE->getOpcode() == Instruction::ExtractElement || all_of(TE->Scalars, IsaPred)) && isFixedVectorShuffle(TE->Scalars, Mask)) || - (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) || - any_of(TE->Scalars, IsaPred)); + (TE->isGather() && TE->getOpcode() == Instruction::Load && + !TE->isAltShuffle())); }; // We only handle trees of heights 1 and 2. @@ -11360,11 +10780,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { } } - // Exclude cost of gather loads nodes which are not used. These nodes were - // built as part of the final attempt to vectorize gathered loads. - assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) && - "Expected gather nodes with users only."); - InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts); Cost += C; LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle " @@ -11553,9 +10968,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { if (IsProfitablePHIUser) { KeepScalar = true; } else if (KeepScalar && ScalarCost != TTI::TCC_Free && - ExtraCost - ScalarCost <= TTI::TCC_Basic && - (GatheredLoadsEntriesFirst == NoGatheredLoads || - Entry->Idx < GatheredLoadsEntriesFirst)) { + ExtraCost - ScalarCost <= TTI::TCC_Basic) { unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) { return ValueToExtUses->contains(V); }); @@ -11890,9 +11303,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( Entries.clear(); // TODO: currently checking only for Scalars in the tree entry, need to count // reused elements too for better cost estimation. - const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get() - ? 
EdgeInfo(const_cast(TE), 0) - : TE->UserTreeIndices.front(); + const EdgeInfo &TEUseEI = TE->UserTreeIndices.front(); const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE); const BasicBlock *TEInsertBlock = nullptr; // Main node of PHI entries keeps the correct order of operands/incoming @@ -11987,7 +11398,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( VToTEs.insert(TEPtr); } if (const TreeEntry *VTE = getTreeEntry(V)) { - if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst) { + if (ForOrder) { if (VTE->State != TreeEntry::Vectorize) { auto It = MultiNodeScalars.find(V); if (It == MultiNodeScalars.end()) @@ -12265,19 +11676,13 @@ BoUpSLP::isGatherShuffledEntry( "Expected positive number of registers."); Entries.clear(); // No need to check for the topmost gather node. - if (TE == VectorizableTree.front().get() && - (GatheredLoadsEntriesFirst == NoGatheredLoads || - none_of(ArrayRef(VectorizableTree).drop_front(), - [](const std::unique_ptr &TE) { - return !TE->isGather(); - }))) + if (TE == VectorizableTree.front().get()) return {}; // FIXME: Gathering for non-power-of-2 nodes not implemented yet. if (TE->isNonPowOf2Vec()) return {}; Mask.assign(VL.size(), PoisonMaskElem); - assert((TE->UserTreeIndices.size() == 1 || - TE == VectorizableTree.front().get()) && + assert(TE->UserTreeIndices.size() == 1 && "Expected only single user of the gather node."); assert(VL.size() % NumParts == 0 && "Number of scalars must be divisible by NumParts."); @@ -12396,23 +11801,17 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { return *Res; // Get the basic block this bundle is in. All instructions in the bundle // should be in this block (except for extractelement-like instructions with - // constant indices or gathered loads). + // constant indeces). auto *Front = E->getMainOp(); auto *BB = Front->getParent(); - assert(((GatheredLoadsEntriesFirst != NoGatheredLoads && - E->getOpcode() == Instruction::Load && E->isGather() && - E->Idx < GatheredLoadsEntriesFirst) || - all_of(E->Scalars, - [=](Value *V) -> bool { - if (E->getOpcode() == Instruction::GetElementPtr && - !isa(V)) - return true; - auto *I = cast(V); - return !E->isOpcodeOrAlt(I) || I->getParent() == BB || - isVectorLikeInstWithConstOps(I); - })) && - "Expected gathered loads or GEPs or instructions from same basic " - "block."); + assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool { + if (E->getOpcode() == Instruction::GetElementPtr && + !isa(V)) + return true; + auto *I = cast(V); + return !E->isOpcodeOrAlt(I) || I->getParent() == BB || + isVectorLikeInstWithConstOps(I); + })); auto FindLastInst = [&]() { Instruction *LastInst = Front; @@ -12428,10 +11827,7 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { assert(((E->getOpcode() == Instruction::GetElementPtr && !isa(I)) || (isVectorLikeInstWithConstOps(LastInst) && - isVectorLikeInstWithConstOps(I)) || - (GatheredLoadsEntriesFirst != NoGatheredLoads && - E->getOpcode() == Instruction::Load && E->isGather() && - E->Idx < GatheredLoadsEntriesFirst)) && + isVectorLikeInstWithConstOps(I))) && "Expected vector-like or non-GEP in GEP node insts only."); if (!DT->isReachableFromEntry(LastInst->getParent())) { LastInst = I; @@ -12488,14 +11884,6 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { return FirstInst; }; - // Set insertpoint for gathered loads to the very first load. 
- if (GatheredLoadsEntriesFirst != NoGatheredLoads && - E->Idx >= GatheredLoadsEntriesFirst && !E->isGather() && - E->getOpcode() == Instruction::Load) { - Res = FindFirstInst(); - return *Res; - } - // Set the insert point to the beginning of the basic block if the entry // should not be scheduled. if (doesNotNeedToSchedule(E->Scalars) || @@ -13476,12 +12864,6 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } // Gather extracts after we check for full matched gathers only. if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || - ((E->getOpcode() == Instruction::Load || - any_of(E->Scalars, IsaPred)) && - any_of(E->Scalars, - [this](Value *V) { - return isa(V) && getTreeEntry(V); - })) || E->isAltShuffle() || all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || @@ -14856,18 +14238,6 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues, else Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); - // Emit gathered loads first to emit better code for the users of those - // gathered loads. - for (const std::unique_ptr &TE : VectorizableTree) { - if (GatheredLoadsEntriesFirst != NoGatheredLoads && - TE->Idx >= GatheredLoadsEntriesFirst && - (!TE->isGather() || !TE->UserTreeIndices.empty())) { - assert((!TE->UserTreeIndices.empty() || - (TE->getOpcode() == Instruction::Load && !TE->isGather())) && - "Expected gathered load node."); - (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false); - } - } // Postpone emission of PHIs operands to avoid cyclic dependencies issues. (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true); for (const std::unique_ptr &TE : VectorizableTree) @@ -15427,15 +14797,10 @@ BoUpSLP::vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues, if (IE->Idx != 0 && !(VectorizableTree.front()->isGather() && isa(I) && !IE->UserTreeIndices.empty() && - any_of(IE->UserTreeIndices, - [&](const EdgeInfo &EI) { - return EI.UserTE == VectorizableTree.front().get() && - EI.EdgeIdx == UINT_MAX; - })) && - !(GatheredLoadsEntriesFirst != NoGatheredLoads && - IE->Idx >= GatheredLoadsEntriesFirst && - VectorizableTree.front()->isGather() && - is_contained(VectorizableTree.front()->Scalars, I))) + any_of(IE->UserTreeIndices, [&](const EdgeInfo &EI) { + return EI.UserTE == VectorizableTree.front().get() && + EI.EdgeIdx == UINT_MAX; + }))) continue; SmallVector LogicalOpSelects; I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) { diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll index fffa626cae0dda..ff1d6253bec928 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -23,12 +23,14 @@ define void @s116_modified(ptr %a) { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2) -; CHECK-NEXT: [[TMP7:%.*]] = 
shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP7]] -; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[A]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP9]] +; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[A]], align 4 ; CHECK-NEXT: ret void ; %gep1 = getelementptr inbounds float, ptr %a, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll index 833bc56c4ec6b6..67746f2cbf5d22 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll @@ -1,27 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,NON-POWER-OF-2 %s -; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK,POWER-OF-2 %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK %s define void @vec3_vectorize_call(ptr %Colour, float %0) { -; NON-POWER-OF-2-LABEL: @vec3_vectorize_call( -; NON-POWER-OF-2-NEXT: entry: -; NON-POWER-OF-2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4 -; NON-POWER-OF-2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2 -; NON-POWER-OF-2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> -; NON-POWER-OF-2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> -; NON-POWER-OF-2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) -; NON-POWER-OF-2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4 -; NON-POWER-OF-2-NEXT: ret void -; -; POWER-OF-2-LABEL: @vec3_vectorize_call( -; POWER-OF-2-NEXT: entry: -; POWER-OF-2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4 -; POWER-OF-2-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP1]], <2 x float> zeroinitializer, <2 x float> zeroinitializer) -; POWER-OF-2-NEXT: store <2 x float> [[TMP2]], ptr [[COLOUR]], align 4 -; POWER-OF-2-NEXT: [[ARRAYIDX99_I1:%.*]] = getelementptr float, ptr [[COLOUR]], i64 2 -; POWER-OF-2-NEXT: [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0:%.*]], float 0.000000e+00, float 0.000000e+00) -; POWER-OF-2-NEXT: store float [[TMP3]], ptr [[ARRAYIDX99_I1]], align 4 -; POWER-OF-2-NEXT: ret void +; CHECK-LABEL: @vec3_vectorize_call( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP1]], <2 x float> zeroinitializer, <2 x float> zeroinitializer) +; CHECK-NEXT: store <2 x float> [[TMP2]], ptr [[COLOUR]], align 4 +; CHECK-NEXT: [[ARRAYIDX99_I1:%.*]] = 
getelementptr float, ptr [[COLOUR]], i64 2 +; CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.fmuladd.f32(float [[TMP0:%.*]], float 0.000000e+00, float 0.000000e+00) +; CHECK-NEXT: store float [[TMP3]], ptr [[ARRAYIDX99_I1]], align 4 +; CHECK-NEXT: ret void ; entry: %1 = load float, ptr %Colour, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll index 9c086abe216c02..f04c359b432b5e 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll @@ -245,24 +245,34 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-NEXT: [[L_8:%.*]] = load i8, ptr [[GEP_8]], align 1 ; CHECK-NEXT: [[CMP_8:%.*]] = icmp ugt i8 [[L_8]], -1 ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 9 +; CHECK-NEXT: [[L_9:%.*]] = load i8, ptr [[GEP_9]], align 1 +; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 10 +; CHECK-NEXT: [[L_10:%.*]] = load i8, ptr [[GEP_10]], align 1 ; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 11 ; CHECK-NEXT: [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1 ; CHECK-NEXT: [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12 ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0 ; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[GEP_9]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[L_9]], i32 9 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[L_10]], i32 10 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[L_11]], i32 11 ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP6]], <8 x i8> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP7]], <4 x i8> [[TMP3]], i64 12) +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP7]], <4 x i8> [[TMP2]], i64 12) ; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt <16 x i8> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP8]], <16 x i8> [[TMP11]] -; CHECK-NEXT: store <16 x i8> [[TMP12]], ptr [[PTR]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP11]], <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[L_9]], i32 9 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[L_10]], i32 10 +; CHECK-NEXT: 
[[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[L_11]], i32 11 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP16]], <16 x i8> [[TMP18]] +; CHECK-NEXT: store <16 x i8> [[TMP19]], ptr [[PTR]], align 2 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll index 4de16a5d57793d..af354bb06ad46a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll @@ -10,8 +10,10 @@ define void @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_(ptr %__first, ptr n ; CHECK-LABEL: @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__FIRST:%.*]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x ptr>, ptr [[__LAST:%.*]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[_M_FIRST3_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", ptr [[__FIRST]], i64 0, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__LAST:%.*]], align 8 +; CHECK-NEXT: [[_M_FIRST3_I_I83:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", ptr [[__LAST]], i64 0, i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_M_FIRST3_I_I83]], align 8 ; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT:%.*]], label [[WHILE_COND_I_PREHEADER:%.*]] ; CHECK: while.cond.i.preheader: ; CHECK-NEXT: br label [[WHILE_COND_I:%.*]] @@ -20,8 +22,10 @@ define void @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_(ptr %__first, ptr n ; CHECK: while.body.i: ; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT]], label [[WHILE_COND_I]] ; CHECK: _ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit: -; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x ptr> [ [[TMP2]], [[ENTRY:%.*]] ], [ [[TMP1]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] -; CHECK-NEXT: store <2 x ptr> [[TMP3]], ptr [[__FIRST]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = phi ptr [ [[TMP2]], [[ENTRY:%.*]] ], [ [[TMP2]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi ptr [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] +; CHECK-NEXT: store ptr [[TMP4]], ptr [[__FIRST]], align 8 +; CHECK-NEXT: store ptr [[TMP3]], ptr [[_M_FIRST3_I_I]], align 8 ; CHECK-NEXT: br i1 undef, label [[IF_THEN_I55:%.*]], label [[WHILE_COND:%.*]] ; CHECK: if.then.i55: ; CHECK-NEXT: br label [[WHILE_COND]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll index d1f93eccc2a91d..b0d9fea43a0e6c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -837,18 +837,21 @@ define i32 @maxi8_mutiple_uses(i32) { ; THRESH-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]] ; THRESH-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]] ; THRESH-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr 
@arr, i64 0, i64 2), align 8 -; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP7]]) -; THRESH-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 -; THRESH-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP8]], i32 0 -; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP6]], i32 1 -; THRESH-NEXT: [[TMP12:%.*]] = icmp sgt <2 x i32> [[TMP10]], [[TMP11]] -; THRESH-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]] -; THRESH-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 -; THRESH-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 -; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] -; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP14]], i32 [[TMP15]] -; THRESH-NEXT: [[TMP16:%.*]] = select i1 [[TMP5]], i32 3, i32 4 -; THRESH-NEXT: store i32 [[TMP16]], ptr @var, align 8 +; THRESH-NEXT: [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 +; THRESH-NEXT: [[TMP9:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4 +; THRESH-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP7]]) +; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP10]], i32 0 +; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP9]], i32 1 +; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0 +; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP6]], i32 1 +; THRESH-NEXT: [[TMP15:%.*]] = icmp sgt <2 x i32> [[TMP12]], [[TMP14]] +; THRESH-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP12]], <2 x i32> [[TMP14]] +; THRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0 +; THRESH-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1 +; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] +; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP17]], i32 [[TMP18]] +; THRESH-NEXT: [[TMP19:%.*]] = select i1 [[TMP5]], i32 3, i32 4 +; THRESH-NEXT: store i32 [[TMP19]], ptr @var, align 8 ; THRESH-NEXT: ret i32 [[OP_RDX5]] ; %2 = load i32, ptr @arr, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll index 4f94784a24dd47..9a41c1dc5de225 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll @@ -100,14 +100,21 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16) define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) { ; CHECK-LABEL: @PR16739_byval( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16 -; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16 +; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X:%.*]], align 16 +; CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds <4 x float>, ptr [[X]], i64 0, i64 2 +; CHECK-NEXT: [[T4:%.*]] = load i64, ptr [[T2]], align 8 +; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[T1]] to i32 +; CHECK-NEXT: [[T6:%.*]] = bitcast i32 [[T5]] to float +; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> poison, float [[T6]], i32 0 ; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> -; 
CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float> -; CHECK-NEXT: ret <4 x float> [[TMP5]] +; CHECK-NEXT: [[T9:%.*]] = trunc i64 [[T8]] to i32 +; CHECK-NEXT: [[T10:%.*]] = bitcast i32 [[T9]] to float +; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1 +; CHECK-NEXT: [[T12:%.*]] = trunc i64 [[T4]] to i32 +; CHECK-NEXT: [[T13:%.*]] = bitcast i32 [[T12]] to float +; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2 +; CHECK-NEXT: [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3 +; CHECK-NEXT: ret <4 x float> [[T15]] ; %t1 = load i64, ptr %x, align 16 %t2 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll index 700e3ed9effc48..bc8e6626e55084 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -100,14 +100,21 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16) define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) { ; CHECK-LABEL: @PR16739_byval( -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16 -; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16 +; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X:%.*]], align 16 +; CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds <4 x float>, ptr [[X]], i64 0, i64 2 +; CHECK-NEXT: [[T4:%.*]] = load i64, ptr [[T2]], align 8 +; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[T1]] to i32 +; CHECK-NEXT: [[T6:%.*]] = bitcast i32 [[T5]] to float +; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> undef, float [[T6]], i32 0 ; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float> -; CHECK-NEXT: ret <4 x float> [[TMP5]] +; CHECK-NEXT: [[T9:%.*]] = trunc i64 [[T8]] to i32 +; CHECK-NEXT: [[T10:%.*]] = bitcast i32 [[T9]] to float +; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1 +; CHECK-NEXT: [[T12:%.*]] = trunc i64 [[T4]] to i32 +; CHECK-NEXT: [[T13:%.*]] = bitcast i32 [[T12]] to float +; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2 +; CHECK-NEXT: [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3 +; CHECK-NEXT: ret <4 x float> [[T15]] ; %t1 = load i64, ptr %x, align 16 %t2 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll index c3122d991da20c..5a28581913b8c7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -204,21 +204,25 @@ define void @lookahead_external_uses(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr ; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 1 ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 2 +; CHECK-NEXT: 
[[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1 +; CHECK-NEXT: [[B0:%.*]] = load double, ptr [[B]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load double, ptr [[D:%.*]], align 8 ; CHECK-NEXT: [[B2:%.*]] = load double, ptr [[IDXB2]], align 8 ; CHECK-NEXT: [[A2:%.*]] = load double, ptr [[IDXA2]], align 8 +; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 ; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP1]], double [[D0]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP7]] -; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[S:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[B1]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]] +; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[S:%.*]], align 8 ; CHECK-NEXT: store double [[A1]], ptr [[EXT1:%.*]], align 8 ; CHECK-NEXT: ret void ; @@ -280,22 +284,24 @@ define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 2 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1 +; CHECK-NEXT: [[B0:%.*]] = load double, ptr [[B]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load double, ptr [[D:%.*]], align 8 ; CHECK-NEXT: [[B2:%.*]] = load double, ptr [[IDXB2]], align 8 ; CHECK-NEXT: [[A2:%.*]] = load double, ptr [[IDXA2]], align 8 +; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 ; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8 -; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP1]], double [[D0]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP7]] -; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[S:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = 
insertelement <2 x double> poison, double [[D0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[B1]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]] +; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[S:%.*]], align 8 ; CHECK-NEXT: store double [[A1]], ptr [[EXT1:%.*]], align 8 ; CHECK-NEXT: store double [[A1]], ptr [[EXT2:%.*]], align 8 ; CHECK-NEXT: store double [[A1]], ptr [[EXT3:%.*]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index 19cbce0767c925..89bc44dc1d530a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -541,149 +541,161 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_div( ; SSE-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 -; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 -; SSE-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 16 -; SSE-NEXT: [[TMP10:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 -; SSE-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> -; SSE-NEXT: [[TMP15:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x i32> -; SSE-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> -; SSE-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> -; SSE-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> -; SSE-NEXT: [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <4 x i32> -; SSE-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP12]], <4 x i32> -; SSE-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP14]], <4 x i32> -; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i64 3 -; SSE-NEXT: [[TMP23:%.*]] = fdiv <4 x float> [[TMP19]], [[TMP22]] -; SSE-NEXT: store <4 x float> [[TMP23]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 -; SSE-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 -; SSE-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +; SSE-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; 
SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; SSE-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 +; SSE-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; SSE-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 +; SSE-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 +; SSE-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 +; SSE-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 16 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP7]], i64 1 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP11]], i64 2 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP15]], i64 3 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP9]], i64 1 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP13]], i64 2 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP17]], i64 3 +; SSE-NEXT: [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]] +; SSE-NEXT: store <4 x float> [[TMP27]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 ; SSE-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 +; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 ; SSE-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 ; SSE-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 +; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 ; SSE-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 +; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 ; SSE-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 +; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 ; SSE-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i64 0 -; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i64 1 -; SSE-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i64 2 -; SSE-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i64 
3 -; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i64 0 -; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i64 1 -; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i64 2 -; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i64 3 -; SSE-NEXT: [[TMP48:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP47]] -; SSE-NEXT: store <4 x float> [[TMP48]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 +; SSE-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP40]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 +; SSE-NEXT: [[TMP43:%.*]] = load float, ptr [[TMP42]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP29]], i64 0 +; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP33]], i64 1 +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP37]], i64 2 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP41]], i64 3 +; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> poison, float [[TMP31]], i64 0 +; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP35]], i64 1 +; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP39]], i64 2 +; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP43]], i64 3 +; SSE-NEXT: [[TMP52:%.*]] = fdiv <4 x float> [[TMP47]], [[TMP51]] +; SSE-NEXT: store <4 x float> [[TMP52]], ptr [[TMP18]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( ; AVX-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 -; AVX-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 -; AVX-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 -; AVX-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 -; AVX-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 -; AVX-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 -; AVX-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 -; AVX-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 -; AVX-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 -; AVX-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; 
AVX-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> -; AVX-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> -; AVX-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> -; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4 -; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7 -; AVX-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> -; AVX-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3 -; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4 -; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5 -; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6 -; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7 -; AVX-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] -; AVX-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +; AVX-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; AVX-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 +; AVX-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 +; AVX-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 +; AVX-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 +; AVX-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; AVX-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; AVX-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +; AVX-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa 
[[TBAA0]] +; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 +; AVX-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 +; AVX-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 +; AVX-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 +; AVX-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 +; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 +; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 +; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 +; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 +; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 +; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 +; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 +; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 +; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 +; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 +; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 +; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 +; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 +; AVX-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 +; AVX-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] +; AVX-NEXT: store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( ; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 -; AVX2-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 -; AVX2-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 -; AVX2-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 -; AVX2-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 -; AVX2-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: 
[[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 -; AVX2-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 -; AVX2-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 -; AVX2-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> -; AVX2-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> -; AVX2-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> -; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4 -; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7 -; AVX2-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> -; AVX2-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3 -; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4 -; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5 -; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6 -; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7 -; AVX2-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] -; AVX2-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +; AVX2-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; AVX2-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 +; AVX2-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX2-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 +; AVX2-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = 
getelementptr inbounds i8, ptr [[TMP1]], i64 56 +; AVX2-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 +; AVX2-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; AVX2-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; AVX2-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +; AVX2-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 +; AVX2-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 +; AVX2-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 +; AVX2-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 +; AVX2-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 +; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 +; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 +; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 +; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 +; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 +; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 +; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 +; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 +; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 +; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 +; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 +; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 +; AVX2-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 +; AVX2-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] +; AVX2-NEXT: store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index 9ac4208c632851..c1b501015e81e4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -541,149 +541,161 @@ define void @gather_load_4(ptr noalias 
nocapture %t0, ptr noalias nocapture read define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_div( ; SSE-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 -; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 -; SSE-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 16 -; SSE-NEXT: [[TMP10:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 -; SSE-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> -; SSE-NEXT: [[TMP15:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x i32> -; SSE-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> -; SSE-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> -; SSE-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> -; SSE-NEXT: [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <4 x i32> -; SSE-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP12]], <4 x i32> -; SSE-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP14]], <4 x i32> -; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i64 3 -; SSE-NEXT: [[TMP23:%.*]] = fdiv <4 x float> [[TMP19]], [[TMP22]] -; SSE-NEXT: store <4 x float> [[TMP23]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 -; SSE-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 -; SSE-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +; SSE-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; SSE-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 +; SSE-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; SSE-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 +; SSE-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 +; SSE-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 +; SSE-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa 
[[TBAA0]] +; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 16 +; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 +; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP7]], i64 1 +; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP11]], i64 2 +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP15]], i64 3 +; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 +; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP9]], i64 1 +; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP13]], i64 2 +; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP17]], i64 3 +; SSE-NEXT: [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]] +; SSE-NEXT: store <4 x float> [[TMP27]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 ; SSE-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 +; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 ; SSE-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 ; SSE-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 +; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 ; SSE-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 +; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 ; SSE-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 +; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 ; SSE-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i64 0 -; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i64 1 -; SSE-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i64 2 -; SSE-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i64 3 -; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i64 0 -; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i64 1 -; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i64 2 -; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i64 3 -; SSE-NEXT: [[TMP48:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP47]] -; SSE-NEXT: store <4 x float> [[TMP48]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 +; SSE-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP40]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 +; SSE-NEXT: [[TMP43:%.*]] = load float, ptr [[TMP42]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP29]], i64 0 +; SSE-NEXT: 
[[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP33]], i64 1 +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP37]], i64 2 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP41]], i64 3 +; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> poison, float [[TMP31]], i64 0 +; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP35]], i64 1 +; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP39]], i64 2 +; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP43]], i64 3 +; SSE-NEXT: [[TMP52:%.*]] = fdiv <4 x float> [[TMP47]], [[TMP51]] +; SSE-NEXT: store <4 x float> [[TMP52]], ptr [[TMP18]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( ; AVX-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 -; AVX-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 -; AVX-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 -; AVX-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 -; AVX-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 -; AVX-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 -; AVX-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 -; AVX-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 -; AVX-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 -; AVX-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> -; AVX-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> -; AVX-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> -; AVX-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> -; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], 
i64 4 -; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7 -; AVX-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> -; AVX-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3 -; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4 -; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5 -; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6 -; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7 -; AVX-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] -; AVX-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +; AVX-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; AVX-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 +; AVX-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 +; AVX-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 +; AVX-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 +; AVX-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; AVX-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; AVX-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +; AVX-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 +; AVX-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 +; AVX-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 +; AVX-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 +; AVX-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX-NEXT: [[TMP35:%.*]] = 
insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 +; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 +; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 +; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 +; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 +; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 +; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 +; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 +; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 +; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 +; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 +; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 +; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 +; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 +; AVX-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 +; AVX-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] +; AVX-NEXT: store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( ; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 -; AVX2-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 -; AVX2-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 -; AVX2-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 -; AVX2-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 -; AVX2-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 -; AVX2-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 -; AVX2-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 -; AVX2-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = 
insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> -; AVX2-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> -; AVX2-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> -; AVX2-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> -; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4 -; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7 -; AVX2-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> -; AVX2-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3 -; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4 -; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5 -; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6 -; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7 -; AVX2-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] -; AVX2-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +; AVX2-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; AVX2-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 +; AVX2-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX2-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 +; AVX2-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 +; AVX2-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 +; AVX2-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; AVX2-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; AVX2-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +; AVX2-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 +; AVX2-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa 
[[TBAA0]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 +; AVX2-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 +; AVX2-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 +; AVX2-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 +; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 +; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 +; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 +; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 +; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 +; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 +; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 +; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 +; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 +; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 +; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 +; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 +; AVX2-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 +; AVX2-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] +; AVX2-NEXT: store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll index 92a4095c7c57a8..249b51592760c0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll @@ -71,66 +71,120 @@ define { i64, i64 } @compute_min(ptr nocapture noundef nonnull readonly align 2 ; ; AVX-LABEL: @compute_min( ; AVX-NEXT: entry: -; AVX-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[Y:%.*]], align 2 -; AVX-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[X:%.*]], align 2 -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP2]], <2 x i16> [[TMP3]]) -; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP7:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP5]], <2 x i16> [[TMP6]]) -; AVX-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP9:%.*]] = 
shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP8]], <2 x i16> [[TMP9]]) -; AVX-NEXT: [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> -; AVX-NEXT: [[TMP13:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP11]], <2 x i16> [[TMP12]]) -; AVX-NEXT: [[TMP14:%.*]] = zext <2 x i16> [[TMP13]] to <2 x i64> -; AVX-NEXT: [[TMP15:%.*]] = shl nuw <2 x i64> [[TMP14]], -; AVX-NEXT: [[TMP16:%.*]] = zext <2 x i16> [[TMP10]] to <2 x i64> -; AVX-NEXT: [[TMP17:%.*]] = shl nuw <2 x i64> [[TMP16]], -; AVX-NEXT: [[TMP18:%.*]] = or <2 x i64> [[TMP15]], [[TMP17]] -; AVX-NEXT: [[TMP19:%.*]] = zext <2 x i16> [[TMP7]] to <2 x i64> -; AVX-NEXT: [[TMP20:%.*]] = shl nuw nsw <2 x i64> [[TMP19]], -; AVX-NEXT: [[TMP21:%.*]] = or <2 x i64> [[TMP18]], [[TMP20]] -; AVX-NEXT: [[TMP22:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i64> -; AVX-NEXT: [[TMP23:%.*]] = or <2 x i64> [[TMP21]], [[TMP22]] -; AVX-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP23]], i32 0 -; AVX-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP24]], 0 -; AVX-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP23]], i32 1 -; AVX-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP25]], 1 +; AVX-NEXT: [[TMP0:%.*]] = load i16, ptr [[Y:%.*]], align 2 +; AVX-NEXT: [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 2 +; AVX-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP0]], i16 [[TMP1]]) +; AVX-NEXT: [[ARRAYIDX_I_I_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 1 +; AVX-NEXT: [[ARRAYIDX_I_I10_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 1 +; AVX-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_1]], align 2 +; AVX-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I_I_1]], align 2 +; AVX-NEXT: [[TMP5:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP3]], i16 [[TMP4]]) +; AVX-NEXT: [[ARRAYIDX_I_I_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 2 +; AVX-NEXT: [[ARRAYIDX_I_I10_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 2 +; AVX-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_2]], align 2 +; AVX-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_I_I_2]], align 2 +; AVX-NEXT: [[TMP8:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP6]], i16 [[TMP7]]) +; AVX-NEXT: [[ARRAYIDX_I_I_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 3 +; AVX-NEXT: [[ARRAYIDX_I_I10_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 3 +; AVX-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_3]], align 2 +; AVX-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I_I_3]], align 2 +; AVX-NEXT: [[TMP11:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP9]], i16 [[TMP10]]) +; AVX-NEXT: [[ARRAYIDX_I_I_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 4 +; AVX-NEXT: [[ARRAYIDX_I_I10_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 4 +; AVX-NEXT: [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_4]], align 2 +; AVX-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_I_I_4]], align 2 +; AVX-NEXT: [[TMP14:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP12]], i16 [[TMP13]]) +; AVX-NEXT: [[ARRAYIDX_I_I_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 5 +; AVX-NEXT: [[ARRAYIDX_I_I10_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 5 +; AVX-NEXT: [[TMP15:%.*]] = load i16, ptr 
[[ARRAYIDX_I_I10_5]], align 2 +; AVX-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX_I_I_5]], align 2 +; AVX-NEXT: [[TMP17:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP15]], i16 [[TMP16]]) +; AVX-NEXT: [[ARRAYIDX_I_I_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 6 +; AVX-NEXT: [[ARRAYIDX_I_I10_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 6 +; AVX-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_6]], align 2 +; AVX-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX_I_I_6]], align 2 +; AVX-NEXT: [[TMP20:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP18]], i16 [[TMP19]]) +; AVX-NEXT: [[ARRAYIDX_I_I_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 7 +; AVX-NEXT: [[ARRAYIDX_I_I10_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 7 +; AVX-NEXT: [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_7]], align 2 +; AVX-NEXT: [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX_I_I_7]], align 2 +; AVX-NEXT: [[TMP23:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP21]], i16 [[TMP22]]) +; AVX-NEXT: [[RETVAL_SROA_4_0_INSERT_EXT:%.*]] = zext i16 [[TMP11]] to i64 +; AVX-NEXT: [[RETVAL_SROA_4_0_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_4_0_INSERT_EXT]], 48 +; AVX-NEXT: [[RETVAL_SROA_3_0_INSERT_EXT:%.*]] = zext i16 [[TMP8]] to i64 +; AVX-NEXT: [[RETVAL_SROA_3_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_3_0_INSERT_EXT]], 32 +; AVX-NEXT: [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_4_0_INSERT_SHIFT]], [[RETVAL_SROA_3_0_INSERT_SHIFT]] +; AVX-NEXT: [[RETVAL_SROA_2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i64 +; AVX-NEXT: [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_2_0_INSERT_EXT]], 16 +; AVX-NEXT: [[RETVAL_SROA_2_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], [[RETVAL_SROA_2_0_INSERT_SHIFT]] +; AVX-NEXT: [[RETVAL_SROA_0_0_INSERT_EXT:%.*]] = zext i16 [[TMP2]] to i64 +; AVX-NEXT: [[RETVAL_SROA_0_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_2_0_INSERT_INSERT]], [[RETVAL_SROA_0_0_INSERT_EXT]] +; AVX-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[RETVAL_SROA_0_0_INSERT_INSERT]], 0 +; AVX-NEXT: [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = zext i16 [[TMP23]] to i64 +; AVX-NEXT: [[RETVAL_SROA_9_8_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_9_8_INSERT_EXT]], 48 +; AVX-NEXT: [[RETVAL_SROA_8_8_INSERT_EXT:%.*]] = zext i16 [[TMP20]] to i64 +; AVX-NEXT: [[RETVAL_SROA_8_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_8_8_INSERT_EXT]], 32 +; AVX-NEXT: [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_9_8_INSERT_SHIFT]], [[RETVAL_SROA_8_8_INSERT_SHIFT]] +; AVX-NEXT: [[RETVAL_SROA_7_8_INSERT_EXT:%.*]] = zext i16 [[TMP17]] to i64 +; AVX-NEXT: [[RETVAL_SROA_7_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_7_8_INSERT_EXT]], 16 +; AVX-NEXT: [[RETVAL_SROA_7_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], [[RETVAL_SROA_7_8_INSERT_SHIFT]] +; AVX-NEXT: [[RETVAL_SROA_5_8_INSERT_EXT:%.*]] = zext i16 [[TMP14]] to i64 +; AVX-NEXT: [[RETVAL_SROA_5_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_7_8_INSERT_INSERT]], [[RETVAL_SROA_5_8_INSERT_EXT]] +; AVX-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[RETVAL_SROA_5_8_INSERT_INSERT]], 1 ; AVX-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] ; ; AVX2-LABEL: @compute_min( ; AVX2-NEXT: entry: -; AVX2-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[Y:%.*]], align 2 -; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[X:%.*]], align 2 -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP0]], 
<8 x i16> poison, <2 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> -; AVX2-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP2]], <2 x i16> [[TMP3]]) -; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> -; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> -; AVX2-NEXT: [[TMP7:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP5]], <2 x i16> [[TMP6]]) -; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> -; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> -; AVX2-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP8]], <2 x i16> [[TMP9]]) -; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> -; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> -; AVX2-NEXT: [[TMP13:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP11]], <2 x i16> [[TMP12]]) -; AVX2-NEXT: [[TMP14:%.*]] = zext <2 x i16> [[TMP13]] to <2 x i64> -; AVX2-NEXT: [[TMP15:%.*]] = shl nuw <2 x i64> [[TMP14]], -; AVX2-NEXT: [[TMP16:%.*]] = zext <2 x i16> [[TMP10]] to <2 x i64> -; AVX2-NEXT: [[TMP17:%.*]] = shl nuw <2 x i64> [[TMP16]], -; AVX2-NEXT: [[TMP18:%.*]] = or <2 x i64> [[TMP15]], [[TMP17]] -; AVX2-NEXT: [[TMP19:%.*]] = zext <2 x i16> [[TMP7]] to <2 x i64> -; AVX2-NEXT: [[TMP20:%.*]] = shl nuw nsw <2 x i64> [[TMP19]], -; AVX2-NEXT: [[TMP21:%.*]] = or <2 x i64> [[TMP18]], [[TMP20]] -; AVX2-NEXT: [[TMP22:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i64> -; AVX2-NEXT: [[TMP23:%.*]] = or <2 x i64> [[TMP21]], [[TMP22]] +; AVX2-NEXT: [[TMP0:%.*]] = load i16, ptr [[Y:%.*]], align 2 +; AVX2-NEXT: [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 2 +; AVX2-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP0]], i16 [[TMP1]]) +; AVX2-NEXT: [[ARRAYIDX_I_I_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 1 +; AVX2-NEXT: [[ARRAYIDX_I_I10_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 1 +; AVX2-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_1]], align 2 +; AVX2-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I_I_1]], align 2 +; AVX2-NEXT: [[TMP5:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP3]], i16 [[TMP4]]) +; AVX2-NEXT: [[ARRAYIDX_I_I_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 2 +; AVX2-NEXT: [[ARRAYIDX_I_I10_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 2 +; AVX2-NEXT: [[ARRAYIDX_I_I_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 4 +; AVX2-NEXT: [[ARRAYIDX_I_I10_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 4 +; AVX2-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_4]], align 2 +; AVX2-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_I_I_4]], align 2 +; AVX2-NEXT: [[TMP8:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP6]], i16 [[TMP7]]) +; AVX2-NEXT: [[ARRAYIDX_I_I_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 5 +; AVX2-NEXT: [[ARRAYIDX_I_I10_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 5 +; AVX2-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_5]], align 2 +; AVX2-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I_I_5]], align 2 +; AVX2-NEXT: [[TMP11:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP9]], i16 [[TMP10]]) +; AVX2-NEXT: [[ARRAYIDX_I_I_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 6 +; AVX2-NEXT: [[ARRAYIDX_I_I10_6:%.*]] = 
getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 6 +; AVX2-NEXT: [[TMP12:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I10_2]], align 2 +; AVX2-NEXT: [[TMP13:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I_2]], align 2 +; AVX2-NEXT: [[TMP14:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP12]], <2 x i16> [[TMP13]]) +; AVX2-NEXT: [[TMP15:%.*]] = zext <2 x i16> [[TMP14]] to <2 x i64> +; AVX2-NEXT: [[TMP16:%.*]] = shl nuw <2 x i64> [[TMP15]], +; AVX2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP16]], i32 0 +; AVX2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP16]], i32 1 +; AVX2-NEXT: [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or i64 [[TMP18]], [[TMP17]] +; AVX2-NEXT: [[RETVAL_SROA_2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i64 +; AVX2-NEXT: [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_2_0_INSERT_EXT]], 16 +; AVX2-NEXT: [[RETVAL_SROA_2_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], [[RETVAL_SROA_2_0_INSERT_SHIFT]] +; AVX2-NEXT: [[RETVAL_SROA_0_0_INSERT_EXT:%.*]] = zext i16 [[TMP2]] to i64 +; AVX2-NEXT: [[RETVAL_SROA_0_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_2_0_INSERT_INSERT]], [[RETVAL_SROA_0_0_INSERT_EXT]] +; AVX2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[RETVAL_SROA_0_0_INSERT_INSERT]], 0 +; AVX2-NEXT: [[TMP19:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I10_6]], align 2 +; AVX2-NEXT: [[TMP20:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I_6]], align 2 +; AVX2-NEXT: [[TMP21:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP19]], <2 x i16> [[TMP20]]) +; AVX2-NEXT: [[TMP22:%.*]] = zext <2 x i16> [[TMP21]] to <2 x i64> +; AVX2-NEXT: [[TMP23:%.*]] = shl nuw <2 x i64> [[TMP22]], ; AVX2-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP23]], i32 0 -; AVX2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP24]], 0 ; AVX2-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP23]], i32 1 -; AVX2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP25]], 1 +; AVX2-NEXT: [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or i64 [[TMP25]], [[TMP24]] +; AVX2-NEXT: [[RETVAL_SROA_7_8_INSERT_EXT:%.*]] = zext i16 [[TMP11]] to i64 +; AVX2-NEXT: [[RETVAL_SROA_7_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_7_8_INSERT_EXT]], 16 +; AVX2-NEXT: [[RETVAL_SROA_7_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], [[RETVAL_SROA_7_8_INSERT_SHIFT]] +; AVX2-NEXT: [[RETVAL_SROA_5_8_INSERT_EXT:%.*]] = zext i16 [[TMP8]] to i64 +; AVX2-NEXT: [[RETVAL_SROA_5_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_7_8_INSERT_INSERT]], [[RETVAL_SROA_5_8_INSERT_EXT]] +; AVX2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[RETVAL_SROA_5_8_INSERT_INSERT]], 1 ; AVX2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll index 5bd954e741d432..eacfbda5447c7b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll @@ -5,17 +5,20 @@ define void @test() { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr 
[[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP9]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP8]] ; CHECK-NEXT: [[TMP4:%.*]] = ashr <4 x i32> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: store <4 x i32> [[TMP11]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 @@ -64,17 +67,20 @@ define void @test1() { ; CHECK-LABEL: define void @test1( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP8:%.*]] = load <2 x i32>, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP10]], i32 0 ; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 2 -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP8]] ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[TMP3]] to <4 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP4]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i64> [[TMP5]] to <4 x i32> @@ -125,17 +131,20 @@ define void @test_div() { ; CHECK-LABEL: define void @test_div( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr null, align 
4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = udiv <4 x i32> [[TMP9]], ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 @@ -184,17 +193,20 @@ define void @test_rem() { ; CHECK-LABEL: define void @test_rem( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i32, ptr null, i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr null, align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP4]], i32 2 -; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = urem <4 x i32> [[TMP9]], ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll index 360b258f216c56..82085ade519e23 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll @@ -12,17 +12,17 @@ define void @test() { ; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr undef, align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> , float [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP0]], float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> , <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP6]]) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> , 
float [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> , <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP6]], <2 x float> [[TMP7]]) ; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x float> [ [[TMP8]], [[BB2]] ], [ zeroinitializer, [[BB1]] ] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x float> [ [[TMP9]], [[BB2]] ], [ zeroinitializer, [[BB1]] ] ; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x float> [[TMP1]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = fsub <2 x float> [[TMP12]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll index 68fb6c6202ce7f..9810d50beea736 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll @@ -11,51 +11,84 @@ declare double @llvm.sin.f64(double) define void @test() { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, ptr @src, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[A0:%.*]] = load double, ptr @src, align 8 +; CHECK-NEXT: [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8 +; CHECK-NEXT: [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 +; CHECK-NEXT: [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8 +; CHECK-NEXT: [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 +; CHECK-NEXT: [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8 +; CHECK-NEXT: [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 +; CHECK-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A6]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP2]]) -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP4]]) -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP6]]) -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A7]], i32 1 +; CHECK-NEXT: 
[[TMP6:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A4]], i32 1 ; CHECK-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]]) -; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP7]], [[TMP5]] -; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP11]] -; CHECK-NEXT: store <2 x double> [[TMP12]], ptr @dst, align 8 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[A5]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP14]] +; CHECK-NEXT: store <2 x double> [[TMP15]], ptr @dst, align 8 ; CHECK-NEXT: ret void ; ; VECLIB-LABEL: @test( -; VECLIB-NEXT: [[TMP1:%.*]] = load <8 x double>, ptr @src, align 8 -; VECLIB-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; VECLIB-NEXT: [[A0:%.*]] = load double, ptr @src, align 8 +; VECLIB-NEXT: [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8 +; VECLIB-NEXT: [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 +; VECLIB-NEXT: [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8 +; VECLIB-NEXT: [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 +; VECLIB-NEXT: [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8 +; VECLIB-NEXT: [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 +; VECLIB-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8 +; VECLIB-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0 +; VECLIB-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A6]], i32 1 ; VECLIB-NEXT: [[TMP3:%.*]] = call fast <2 x double> @__svml_sin2(<2 x double> [[TMP2]]) -; VECLIB-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> -; VECLIB-NEXT: [[TMP5:%.*]] = call fast <2 x double> @__svml_sin2(<2 x double> [[TMP4]]) -; VECLIB-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> -; VECLIB-NEXT: [[TMP7:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP6]]) -; VECLIB-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; VECLIB-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A3]], i32 0 +; VECLIB-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A7]], i32 1 +; VECLIB-NEXT: [[TMP6:%.*]] = call fast <2 x double> @__svml_sin2(<2 x double> [[TMP5]]) +; VECLIB-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 +; VECLIB-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A4]], i32 1 ; VECLIB-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> 
[[TMP8]]) -; VECLIB-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP7]], [[TMP5]] -; VECLIB-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP9]] -; VECLIB-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP11]] -; VECLIB-NEXT: store <2 x double> [[TMP12]], ptr @dst, align 8 +; VECLIB-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0 +; VECLIB-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[A5]], i32 1 +; VECLIB-NEXT: [[TMP12:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP11]]) +; VECLIB-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]] +; VECLIB-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP12]] +; VECLIB-NEXT: [[TMP15:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP14]] +; VECLIB-NEXT: store <2 x double> [[TMP15]], ptr @dst, align 8 ; VECLIB-NEXT: ret void ; ; AMDLIBM-LABEL: @test( -; AMDLIBM-NEXT: [[TMP1:%.*]] = load <8 x double>, ptr @src, align 8 -; AMDLIBM-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; AMDLIBM-NEXT: [[A0:%.*]] = load double, ptr @src, align 8 +; AMDLIBM-NEXT: [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8 +; AMDLIBM-NEXT: [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 +; AMDLIBM-NEXT: [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8 +; AMDLIBM-NEXT: [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 +; AMDLIBM-NEXT: [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8 +; AMDLIBM-NEXT: [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 +; AMDLIBM-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8 +; AMDLIBM-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0 +; AMDLIBM-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A6]], i32 1 ; AMDLIBM-NEXT: [[TMP3:%.*]] = call fast <2 x double> @amd_vrd2_sin(<2 x double> [[TMP2]]) -; AMDLIBM-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> -; AMDLIBM-NEXT: [[TMP5:%.*]] = call fast <2 x double> @amd_vrd2_sin(<2 x double> [[TMP4]]) -; AMDLIBM-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> -; AMDLIBM-NEXT: [[TMP7:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP6]]) -; AMDLIBM-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <2 x i32> +; AMDLIBM-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A3]], i32 0 +; AMDLIBM-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A7]], i32 1 +; AMDLIBM-NEXT: [[TMP6:%.*]] = call fast <2 x double> @amd_vrd2_sin(<2 x double> [[TMP5]]) +; AMDLIBM-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 +; AMDLIBM-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A4]], i32 1 ; AMDLIBM-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]]) -; AMDLIBM-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP7]], [[TMP5]] -; AMDLIBM-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP9]] -; AMDLIBM-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP11]] -; AMDLIBM-NEXT: store <2 x double> [[TMP12]], ptr @dst, 
align 8 +; AMDLIBM-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0 +; AMDLIBM-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[A5]], i32 1 +; AMDLIBM-NEXT: [[TMP12:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP11]]) +; AMDLIBM-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]] +; AMDLIBM-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP12]] +; AMDLIBM-NEXT: [[TMP15:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP14]] +; AMDLIBM-NEXT: store <2 x double> [[TMP15]], ptr @dst, align 8 ; AMDLIBM-NEXT: ret void ; %a0 = load double, ptr @src, align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll index 202ec9633712f6..6ca1f8119c1cf0 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -7,18 +7,28 @@ define dso_local void @_Z4testP1S(ptr %p) local_unnamed_addr { ; CHECK-LABEL: @_Z4testP1S( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[P:%.*]], i64 0, i32 1, i64 0 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 15 +; CHECK-NEXT: [[I1:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 6 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4 +; CHECK-NEXT: [[I7:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 ; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 12 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 14 -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX27]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: [[I13:%.*]] = load i32, ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 5 +; CHECK-NEXT: [[I15:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX27]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX20]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP3]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP0]], i64 4) -; CHECK-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[TMP6]], [[TMP2]] -; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr [[P]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[I1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[I7]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[I13]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[I15]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> 
@llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP8]], <2 x i32> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP2]] +; CHECK-NEXT: store <8 x i32> [[TMP10]], ptr [[P]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll index 69ae26b9f25856..87063fc3f7a820 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll @@ -7,13 +7,19 @@ define void @test_supernode_add(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) { ; ENABLED-LABEL: @test_supernode_add( ; ENABLED-NEXT: entry: -; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8 -; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 -; ENABLED-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[CARRAY:%.*]], align 8 -; ENABLED-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP2]], <2 x i32> -; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] -; ENABLED-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> -; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP5]] +; ENABLED-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1 +; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, ptr [[CARRAY:%.*]], i64 1 +; ENABLED-NEXT: [[A0:%.*]] = load double, ptr [[AARRAY]], align 8 +; ENABLED-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 +; ENABLED-NEXT: [[C0:%.*]] = load double, ptr [[CARRAY]], align 8 +; ENABLED-NEXT: [[C1:%.*]] = load double, ptr [[IDXC1]], align 8 +; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 +; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C1]], i32 1 +; ENABLED-NEXT: [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP2]] +; ENABLED-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 +; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A1]], i32 1 +; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] ; ENABLED-NEXT: store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8 ; ENABLED-NEXT: ret void ; @@ -48,13 +54,19 @@ entry: define void @test_supernode_addsub(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) { ; ENABLED-LABEL: @test_supernode_addsub( ; ENABLED-NEXT: entry: -; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8 -; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 -; ENABLED-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[CARRAY:%.*]], align 8 -; ENABLED-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP2]], <2 x i32> -; ENABLED-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]] -; ENABLED-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> -; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP5]] +; ENABLED-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1 +; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, ptr [[CARRAY:%.*]], i64 1 +; ENABLED-NEXT: [[A0:%.*]] = load double, ptr [[AARRAY]], align 8 +; ENABLED-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 +; ENABLED-NEXT: [[C0:%.*]] = load double, ptr [[CARRAY]], align 8 +; 
ENABLED-NEXT: [[C1:%.*]] = load double, ptr [[IDXC1]], align 8 +; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 +; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C1]], i32 1 +; ENABLED-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP0]] +; ENABLED-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 +; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A1]], i32 1 +; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] ; ENABLED-NEXT: store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8 ; ENABLED-NEXT: ret void ; @@ -90,16 +102,22 @@ entry: define void @test_supernode_addsub_alt(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) { ; ENABLED-LABEL: @test_supernode_addsub_alt( ; ENABLED-NEXT: entry: -; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8 -; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 -; ENABLED-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[CARRAY:%.*]], align 8 -; ENABLED-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP2]], <2 x i32> -; ENABLED-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]] -; ENABLED-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP1]] -; ENABLED-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> -; ENABLED-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> -; ENABLED-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP7]] -; ENABLED-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP7]] +; ENABLED-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1 +; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, ptr [[CARRAY:%.*]], i64 1 +; ENABLED-NEXT: [[A0:%.*]] = load double, ptr [[AARRAY]], align 8 +; ENABLED-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 +; ENABLED-NEXT: [[C0:%.*]] = load double, ptr [[CARRAY]], align 8 +; ENABLED-NEXT: [[C1:%.*]] = load double, ptr [[IDXC1]], align 8 +; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 +; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C1]], i32 1 +; ENABLED-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP0]] +; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP0]] +; ENABLED-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <2 x i32> +; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 +; ENABLED-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A1]], i32 1 +; ENABLED-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] +; ENABLED-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]] ; ENABLED-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> ; ENABLED-NEXT: store <2 x double> [[TMP10]], ptr [[SARRAY:%.*]], align 8 ; ENABLED-NEXT: ret void @@ -159,15 +177,19 @@ entry: define void @supernode_scheduling(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Darray, ptr %Sarray) { ; ENABLED-LABEL: @supernode_scheduling( ; ENABLED-NEXT: entry: +; ENABLED-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[BARRAY:%.*]], i64 1 ; ENABLED-NEXT: 
[[C:%.*]] = load double, ptr [[CARRAY:%.*]], align 8 +; ENABLED-NEXT: [[B0:%.*]] = load double, ptr [[BARRAY]], align 8 +; ENABLED-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 ; ENABLED-NEXT: [[D:%.*]] = load double, ptr [[DARRAY:%.*]], align 8 ; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8 -; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C]], i32 0 +; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 +; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B1]], i32 1 ; ENABLED-NEXT: [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP2]] -; ENABLED-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP1]], double [[D]], i32 1 -; ENABLED-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP4]] -; ENABLED-NEXT: store <2 x double> [[TMP5]], ptr [[SARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 +; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[D]], i32 1 +; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] +; ENABLED-NEXT: store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8 ; ENABLED-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll index fd3c1a57aff344..243087c6d8d95b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll @@ -5,12 +5,14 @@ define void @vec3_vectorize_call(ptr %Colour, float %0) { ; NON-POW2-LABEL: @vec3_vectorize_call( ; NON-POW2-NEXT: entry: -; NON-POW2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4 -; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2 -; NON-POW2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> -; NON-POW2-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> -; NON-POW2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) -; NON-POW2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4 +; NON-POW2-NEXT: [[TMP1:%.*]] = load float, ptr [[COLOUR:%.*]], align 4 +; NON-POW2-NEXT: [[ARRAYIDX91_I:%.*]] = getelementptr float, ptr [[COLOUR]], i64 1 +; NON-POW2-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX91_I]], align 4 +; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2 +; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP1]], i32 0 +; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i32 1 +; NON-POW2-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) +; NON-POW2-NEXT: store <3 x float> [[TMP6]], ptr [[COLOUR]], align 4 ; NON-POW2-NEXT: ret void ; ; POW2-ONLY-LABEL: @vec3_vectorize_call( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll index 373feb4a658f0b..7bcb2ece779210 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll @@ -13,9 +13,11 @@ define void @test(ptr 
nocapture %t2) { ; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7 ; CHECK-NEXT: [[T5:%.*]] = load i32, ptr [[T4]], align 4 ; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1 +; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 ; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6 ; CHECK-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4 ; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2 +; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 ; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5 ; CHECK-NEXT: [[T17:%.*]] = load i32, ptr [[T16]], align 4 ; CHECK-NEXT: [[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3 @@ -25,7 +27,10 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]] ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]] ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]] -; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270 +; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] +; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] +; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 +; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 ; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]] ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] ; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[T37]], [[T38]] @@ -34,25 +39,19 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] -; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4 -; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 -; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] -; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] -; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 -; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[T40]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T48]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2 -; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3 -; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP6]], <8 x i32> -; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6 -; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7 -; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T47]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[T9]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> 
[[TMP6]], i32 [[T40]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[T34]], i32 6 +; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T71]], ; CHECK-NEXT: store <8 x i32> [[T76]], ptr [[T2]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll index da935173b20d11..82f8aa5f9be1be 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll @@ -13,9 +13,11 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7 ; CHECK-NEXT: [[T5:%.*]] = load i32, ptr [[T4]], align 4 ; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1 +; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 ; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6 ; CHECK-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4 ; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2 +; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 ; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5 ; CHECK-NEXT: [[T17:%.*]] = load i32, ptr [[T16]], align 4 ; CHECK-NEXT: [[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3 @@ -25,7 +27,10 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]] ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]] ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]] -; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270 +; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] +; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] +; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 +; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 ; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]] ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] ; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[T37]], [[T38]] @@ -34,25 +39,19 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] -; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4 -; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 -; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] -; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] -; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 -; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[T40]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T48]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2 -; CHECK-NEXT: 
[[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3 -; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP6]], <8 x i32> -; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6 -; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7 -; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T47]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[T9]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T40]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[T34]], i32 6 +; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T71]], ; CHECK-NEXT: store <8 x i32> [[T76]], ptr [[T2]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll index e52b29a7f681c7..69ecf1852aedd7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll @@ -7,9 +7,11 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7 ; CHECK-NEXT: [[T5:%.*]] = load i32, ptr [[T4]], align 4 ; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1 +; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 ; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6 ; CHECK-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4 ; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2 +; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 ; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5 ; CHECK-NEXT: [[T17:%.*]] = load i32, ptr [[T16]], align 4 ; CHECK-NEXT: [[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3 @@ -19,7 +21,11 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]] ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]] ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]] +; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] +; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] +; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 ; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270 +; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 ; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]] ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] ; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[T37]], [[T38]] @@ -29,17 +35,11 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 ; CHECK-NEXT: [[T49:%.*]] = add 
nsw i32 [[T40]], [[T47]] -; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4 -; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 -; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] -; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] -; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 -; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[T40]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T48]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2 ; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3