diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b754950199f93a..56fa9371fb7f9d 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1186,6 +1186,12 @@ class BoUpSLP {
     return VectorizableTree.front()->Scalars;
   }
 
+  /// Checks if the root graph node can be emitted with narrower bitwidth at
+  /// codegen and returns its signedness, if so.
+  bool isSignedMinBitwidthRootNode() const {
+    return MinBWs.at(VectorizableTree.front().get()).second;
+  }
+
   /// Builds external uses of the vectorized scalars, i.e. the list of
   /// vectorized scalars to be extracted, their lanes and their scalar users. \p
   /// ExternallyUsedValues contains additional list of external uses to handle
@@ -2453,6 +2459,90 @@ class BoUpSLP {
     DeletedInstructions.insert(I);
   }
 
+  /// Remove instructions from the parent function and clear the operands of \p
+  /// DeadVals instructions, marking for deletion trivially dead operands.
+  template <typename T>
+  void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
+    SmallVector<WeakTrackingVH> DeadInsts;
+    for (T *V : DeadVals) {
+      auto *I = cast<Instruction>(V);
+      DeletedInstructions.insert(I);
+    }
+    for (T *V : DeadVals) {
+      if (!V)
+        continue;
+      auto *I = cast<Instruction>(V);
+      salvageDebugInfo(*I);
+      SmallVector<const TreeEntry *> Entries;
+      if (const TreeEntry *Entry = getTreeEntry(I)) {
+        Entries.push_back(Entry);
+        auto It = MultiNodeScalars.find(I);
+        if (It != MultiNodeScalars.end())
+          Entries.append(It->second.begin(), It->second.end());
+      }
+      for (Use &U : I->operands()) {
+        if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
+            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
+            wouldInstructionBeTriviallyDead(OpI, TLI) &&
+            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
+               return Entry->VectorizedValue == OpI;
+             })))
+          DeadInsts.push_back(OpI);
+      }
+      I->dropAllReferences();
+    }
+    for (T *V : DeadVals) {
+      auto *I = cast<Instruction>(V);
+      if (!I->getParent())
+        continue;
+      assert((I->use_empty() || all_of(I->uses(),
+                                       [&](Use &U) {
+                                         return isDeleted(
+                                             cast<Instruction>(U.getUser()));
+                                       })) &&
+             "trying to erase instruction with users.");
+      I->removeFromParent();
+      SE->forgetValue(I);
+    }
+    // Process the dead instruction list until empty.
+    while (!DeadInsts.empty()) {
+      Value *V = DeadInsts.pop_back_val();
+      Instruction *VI = cast_or_null<Instruction>(V);
+      if (!VI || !VI->getParent())
+        continue;
+      assert(isInstructionTriviallyDead(VI, TLI) &&
+             "Live instruction found in dead worklist!");
+      assert(VI->use_empty() && "Instructions with uses are not dead.");
+
+      // Don't lose the debug info while deleting the instructions.
+      salvageDebugInfo(*VI);
+
+      // Null out all of the instruction's operands to see if any operand
+      // becomes dead as we go.
+      for (Use &OpU : VI->operands()) {
+        Value *OpV = OpU.get();
+        if (!OpV)
+          continue;
+        OpU.set(nullptr);
+
+        if (!OpV->use_empty())
+          continue;
+
+        // If the operand is an instruction that became dead as we nulled out
+        // the operand, and if it is 'trivially' dead, delete it in a future
+        // loop iteration.
+        if (auto *OpI = dyn_cast<Instruction>(OpV))
+          if (!DeletedInstructions.contains(OpI) &&
+              isInstructionTriviallyDead(OpI, TLI))
+            DeadInsts.push_back(OpI);
+      }
+
+      VI->removeFromParent();
+      DeletedInstructions.insert(VI);
+      SE->forgetValue(VI);
+    }
+  }
+
   /// Checks if the instruction was already analyzed for being possible
   /// reduction root.
   bool isAnalyzedReductionRoot(Instruction *I) const {
@@ -3987,6 +4077,10 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
 BoUpSLP::~BoUpSLP() {
   SmallVector<WeakTrackingVH> DeadInsts;
   for (auto *I : DeletedInstructions) {
+    if (!I->getParent()) {
+      I->insertBefore(F->getEntryBlock().getTerminator());
+      continue;
+    }
     for (Use &U : I->operands()) {
       auto *Op = dyn_cast<Instruction>(U.get());
       if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
@@ -14075,11 +14169,8 @@ Value *BoUpSLP::vectorizeTree(
       }
 #endif
       LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
-      eraseInstruction(cast<Instruction>(Scalar));
-      // Retain to-be-deleted instructions for some debug-info
-      // bookkeeping. NOTE: eraseInstruction only marks the instruction for
-      // deletion - instructions are not deleted until later.
-      RemovedInsts.push_back(cast<Instruction>(Scalar));
+      auto *I = cast<Instruction>(Scalar);
+      RemovedInsts.push_back(I);
     }
   }
 
@@ -14088,6 +14179,22 @@ Value *BoUpSLP::vectorizeTree(
   if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
     V->mergeDIAssignID(RemovedInsts);
 
+  // Clear up reduction references, if any.
+  if (UserIgnoreList) {
+    for (Instruction *I : RemovedInsts) {
+      if (getTreeEntry(I)->Idx != 0)
+        continue;
+      I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
+        return UserIgnoreList->contains(U.getUser());
+      });
+    }
+  }
+  // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
+  // cache correctness.
+  // NOTE: removeInstructionsAndOperands only marks the instruction for deletion
+  // - instructions are not deleted until later.
+  removeInstructionsAndOperands(ArrayRef(RemovedInsts));
+
   Builder.ClearInsertionPoint();
   InstrElementSize.clear();
 
@@ -16137,15 +16244,18 @@ bool SLPVectorizerPass::vectorizeStores(
     Res.first = Idx;
     Res.second.emplace(Idx, 0);
   };
-  StoreInst *PrevStore = Stores.front();
+  Type *PrevValTy = nullptr;
   for (auto [I, SI] : enumerate(Stores)) {
+    if (R.isDeleted(SI))
+      continue;
+    if (!PrevValTy)
+      PrevValTy = SI->getValueOperand()->getType();
     // Check that we do not try to vectorize stores of different types.
-    if (PrevStore->getValueOperand()->getType() !=
-        SI->getValueOperand()->getType()) {
+    if (PrevValTy != SI->getValueOperand()->getType()) {
       for (auto &Set : SortedStores)
         TryToVectorize(Set.second);
       SortedStores.clear();
-      PrevStore = SI;
+      PrevValTy = SI->getValueOperand()->getType();
     }
     FillStoresSet(I, SI);
   }
@@ -17028,9 +17138,12 @@ class HorizontalReduction {
     Value *VectorizedTree = nullptr;
     bool CheckForReusedReductionOps = false;
     // Try to vectorize elements based on their type.
+    SmallVector<InstructionsState> States;
+    for (ArrayRef<Value *> RV : ReducedVals)
+      States.push_back(getSameOpcode(RV, TLI));
     for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
       ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
-      InstructionsState S = getSameOpcode(OrigReducedVals, TLI);
+      InstructionsState S = States[I];
       SmallVector<Value *> Candidates;
       Candidates.reserve(2 * OrigReducedVals.size());
       DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
@@ -17355,14 +17468,11 @@ class HorizontalReduction {
         Value *ReducedSubTree =
             emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
         if (ReducedSubTree->getType() != VL.front()->getType()) {
-          ReducedSubTree = Builder.CreateIntCast(
-              ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
-                KnownBits Known = computeKnownBits(
-                    R, cast<Instruction>(ReductionOps.front().front())
-                           ->getModule()
-                           ->getDataLayout());
-                return !Known.isNonNegative();
-              }));
+          assert(ReducedSubTree->getType() != VL.front()->getType() &&
+                 "Expected different reduction type.");
+          ReducedSubTree =
+              Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
+                                    V.isSignedMinBitwidthRootNode());
         }
 
         // Improved analysis for add/fadd/xor reductions with same scale factor
@@ -17524,11 +17634,11 @@ class HorizontalReduction {
         }
 #endif
         if (!Ignore->use_empty()) {
-          Value *Undef = UndefValue::get(Ignore->getType());
-          Ignore->replaceAllUsesWith(Undef);
+          Value *P = PoisonValue::get(Ignore->getType());
+          Ignore->replaceAllUsesWith(P);
         }
-        V.eraseInstruction(cast<Instruction>(Ignore));
       }
+      V.removeInstructionsAndOperands(RdxOps);
     }
   } else if (!CheckForReusedReductionOps) {
     for (ReductionOpsType &RdxOps : ReductionOps)
@@ -18076,6 +18186,8 @@ bool SLPVectorizerPass::vectorizeHorReduction(
         Stack.emplace(I, Level);
         continue;
       }
+      if (R.isDeleted(Inst))
+        continue;
     } else {
       // We could not vectorize `Inst` so try to use it as a future seed.
       if (!TryAppendToPostponedInsts(Inst)) {
@@ -18161,15 +18273,28 @@ static bool tryToVectorizeSequence(
   // Try to vectorize elements base on their type.
   SmallVector<T *> Candidates;
-  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
+  SmallVector<T *> VL;
+  for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
+       VL.clear()) {
     // Look for the next elements with the same type, parent and operand
     // kinds.
+    auto *I = dyn_cast<Instruction>(*IncIt);
+    if (!I || R.isDeleted(I)) {
+      ++IncIt;
+      continue;
+    }
     auto *SameTypeIt = IncIt;
-    while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
+    while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
+                               R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
+                               AreCompatible(*SameTypeIt, *IncIt))) {
+      auto *I = dyn_cast<Instruction>(*SameTypeIt);
       ++SameTypeIt;
+      if (I && !R.isDeleted(I))
+        VL.push_back(cast<T>(I));
+    }
 
     // Try to vectorize them.
-    unsigned NumElts = (SameTypeIt - IncIt);
+    unsigned NumElts = VL.size();
     LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
                       << NumElts << ")\n");
     // The vectorization is a 3-state attempt:
     // 1. Try to vectorize instructions with the same/alternate opcodes with the
     // size of maximal register at first.
     // 2. Try to vectorize remaining instructions with the same type, if
     // possible. This may result in the better vectorization results rather than
     // if we try just to vectorize instructions with the same/alternate opcodes.
     // 3. Final attempt to try to vectorize all instructions with the
     // same/alternate ops only, this may result in some extra final
     // vectorization.
-    if (NumElts > 1 &&
-        TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
+    if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
       // Success start over because instructions might have been changed.
       Changed = true;
+      VL.swap(Candidates);
+      Candidates.clear();
+      for (T *V : VL) {
+        if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
+          Candidates.push_back(V);
+      }
     } else {
       /// \Returns the minimum number of elements that we will attempt to
       /// vectorize.
@@ -18195,7 +18325,10 @@ static bool tryToVectorizeSequence(
       if (NumElts < GetMinNumElements(*IncIt) &&
          (Candidates.empty() ||
           Candidates.front()->getType() == (*IncIt)->getType())) {
-        Candidates.append(IncIt, std::next(IncIt, NumElts));
+        for (T *V : VL) {
+          if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
+            Candidates.push_back(V);
+        }
       }
     }
     // Final attempt to vectorize instructions with the same types.
@@ -18206,13 +18339,26 @@ static bool tryToVectorizeSequence(
         Changed = true;
       } else if (MaxVFOnly) {
         // Try to vectorize using small vectors.
-        for (auto *It = Candidates.begin(), *End = Candidates.end();
-             It != End;) {
+        SmallVector<T *> VL;
+        for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
+             VL.clear()) {
+          auto *I = dyn_cast<Instruction>(*It);
+          if (!I || R.isDeleted(I)) {
+            ++It;
+            continue;
+          }
           auto *SameTypeIt = It;
-          while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
+          while (SameTypeIt != End &&
+                 (!isa<Instruction>(*SameTypeIt) ||
+                  R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
+                  AreCompatible(*SameTypeIt, *It))) {
+            auto *I = dyn_cast<Instruction>(*SameTypeIt);
             ++SameTypeIt;
-          unsigned NumElts = (SameTypeIt - It);
-          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
+            if (I && !R.isDeleted(I))
+              VL.push_back(cast<T>(I));
+          }
+          unsigned NumElts = VL.size();
+          if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
                                                   /*MaxVFOnly=*/false))
             Changed = true;
           It = SameTypeIt;
@@ -18486,7 +18632,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
     }
     return false;
   };
-  auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
+  auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
    if (V1 == V2)
      return true;
    if (V1->getType() != V2->getType())
      return false;
@@ -18501,6 +18647,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
        continue;
      if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
        if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
+          if (R.isDeleted(I1) || R.isDeleted(I2))
+            return false;
          if (I1->getParent() != I2->getParent())
            return false;
          InstructionsState S = getSameOpcode({I1, I2}, *TLI);
@@ -18721,8 +18869,13 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
    // are trying to vectorize the index computations, so the maximum number of
    // elements is based on the size of the index expression, rather than the
    // size of the GEP itself (the target's pointer size).
+    auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
+      return !R.isDeleted(GEP);
+    });
+    if (It == Entry.second.end())
+      continue;
    unsigned MaxVecRegSize = R.getMaxVecRegSize();
-    unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
+    unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
    if (MaxVecRegSize < EltSize)
      continue;
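For reference, BoUpSLP::removeInstructionsAndOperands() above generalizes a standard LLVM cleanup idiom: erase a batch of known-dead instructions, then chase the operand chains that the erasure made trivially dead. The following standalone sketch (a hypothetical helper, not code from this patch) shows the same two-phase shape built from the stock utilities in llvm/Transforms/Utils/Local.h; the in-tree version must additionally skip operands that serve as the VectorizedValue of a tree entry and keep DeletedInstructions and ScalarEvolution in sync:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/Transforms/Utils/Local.h"

    using namespace llvm;

    // Erase Roots (assumed to have no remaining users outside the batch) plus
    // any operand chains that only the erased instructions kept alive.
    static void eraseWithDeadOperands(ArrayRef<Instruction *> Roots,
                                      const TargetLibraryInfo *TLI) {
      SmallVector<WeakTrackingVH> Worklist;
      for (Instruction *I : Roots) {
        // Queue operands whose single user is an instruction being erased.
        for (Value *Op : I->operands())
          if (auto *OpI = dyn_cast<Instruction>(Op))
            if (OpI->hasOneUser() && wouldInstructionBeTriviallyDead(OpI, TLI))
              Worklist.push_back(OpI);
        // Drop operand uses first so the roots may reference each other.
        I->dropAllReferences();
      }
      for (Instruction *I : Roots)
        I->eraseFromParent();
      // Transitively deletes operands-of-operands, mirroring the manual
      // worklist loop in removeInstructionsAndOperands().
      RecursivelyDeleteTriviallyDeadInstructions(Worklist, TLI);
    }

The phase ordering matters: the new member function first inserts every value into DeletedInstructions and only then inspects operands, so the !DeletedInstructions.contains(OpI) test never queues an instruction that is itself part of the batch being removed.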
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll
index daa534fcd0c227..0315adb5452b40 100644
--- a/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll
+++ b/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll
@@ -24,11 +24,11 @@
 ;; Test that dbg.assigns linked to the the scalar stores to quad get linked to
 ;; the vector store that replaces them.
 
-; CHECK: #dbg_assign(float undef, ![[VAR:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 32), ![[ID:[0-9]+]], ptr %arrayidx, !DIExpression(),
-; CHECK: #dbg_assign(float undef, ![[VAR]], !DIExpression(DW_OP_LLVM_fragment, 32, 32), ![[ID]], ptr %quad, !DIExpression(DW_OP_plus_uconst, 4),
-; CHECK: #dbg_assign(float undef, ![[VAR]], !DIExpression(DW_OP_LLVM_fragment, 64, 32), ![[ID]], ptr %quad, !DIExpression(DW_OP_plus_uconst, 8),
+; CHECK: #dbg_assign(float poison, ![[VAR:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 32), ![[ID:[0-9]+]], ptr %arrayidx, !DIExpression(),
+; CHECK: #dbg_assign(float poison, ![[VAR]], !DIExpression(DW_OP_LLVM_fragment, 32, 32), ![[ID]], ptr %quad, !DIExpression(DW_OP_plus_uconst, 4),
+; CHECK: #dbg_assign(float poison, ![[VAR]], !DIExpression(DW_OP_LLVM_fragment, 64, 32), ![[ID]], ptr %quad, !DIExpression(DW_OP_plus_uconst, 8),
 ; CHECK: store <4 x float> {{.*}} !DIAssignID ![[ID]]
-; CHECK: #dbg_assign(float undef, ![[VAR]], !DIExpression(DW_OP_LLVM_fragment, 96, 32), ![[ID]], ptr %quad, !DIExpression(DW_OP_plus_uconst, 12),
+; CHECK: #dbg_assign(float poison, ![[VAR]], !DIExpression(DW_OP_LLVM_fragment, 96, 32), ![[ID]], ptr %quad, !DIExpression(DW_OP_plus_uconst, 12),
 
 target triple = "x86_64-unknown-unknown"
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll
index 5232ae76fa8870..9cb2badc25fb20 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll
@@ -13,8 +13,8 @@ define void @patatino(i64 %n, i64 %i, ptr %p) !dbg !7 {
 ; CHECK-NEXT: #dbg_value(i64 [[I:%.*]], [[META19:![0-9]+]], !DIExpression(), [[META24:![0-9]+]])
 ; CHECK-NEXT: #dbg_value(ptr [[P:%.*]], [[META20:![0-9]+]], !DIExpression(), [[META25:![0-9]+]])
 ; CHECK-NEXT: [[X1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[P]], i64 [[N]], i32 0, !dbg [[DBG26:![0-9]+]]
-; CHECK-NEXT: #dbg_value(i64 undef, [[META21:![0-9]+]], !DIExpression(), [[META27:![0-9]+]])
-; CHECK-NEXT: #dbg_value(i64 undef, [[META22:![0-9]+]], !DIExpression(), [[META28:![0-9]+]])
+; CHECK-NEXT: #dbg_value(i64 poison, [[META21:![0-9]+]], !DIExpression(), [[META27:![0-9]+]])
+; CHECK-NEXT: #dbg_value(i64 poison, [[META22:![0-9]+]], !DIExpression(), [[META28:![0-9]+]])
 ; CHECK-NEXT: [[X5:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 [[I]], i32 0, !dbg [[DBG29:![0-9]+]]
 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[X1]], align 8, !dbg [[DBG26]], !tbaa [[TBAA30:![0-9]+]]
 ; CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[X5]], align 8, !dbg [[DBG34:![0-9]+]], !tbaa [[TBAA30]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
index 24c5fcb0680865..8c4903dbc92bbe 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-ssat.ll
@@ -503,10 +503,10 @@ define void @add_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
@@ -522,10 +522,10 @@ define void @add_v64i8() {
 ; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SLM-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
index fab022d691c07a..cb8d45b1a21a20 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
@@ -401,10 +401,10 @@ define void @add_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
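One note before the remaining test updates: the HorizontalReduction change in SLPVectorizer.cpp above replaces a per-element computeKnownBits() scan with the signedness the tree already recorded in MinBWs, exposed through the new isSignedMinBitwidthRootNode() hook. A minimal sketch of the consuming side (a hypothetical free function, assuming the caller knows the reduction was demoted to a narrower integer type):

    #include "llvm/IR/IRBuilder.h"

    using namespace llvm;

    // Widen a reduction computed at a narrower bit width back to the scalar
    // type the caller expects. IsSigned is the signedness stored in MinBWs
    // for the root node, i.e. BoUpSLP::isSignedMinBitwidthRootNode().
    static Value *widenReducedValue(IRBuilderBase &Builder, Value *Reduced,
                                    Type *OrigScalarTy, bool IsSigned) {
      if (Reduced->getType() == OrigScalarTy)
        return Reduced;
      // CreateIntCast emits sext when IsSigned is true and zext otherwise.
      return Builder.CreateIntCast(Reduced, OrigScalarTy, IsSigned);
    }

The arith-*.ll and shift-*.ll diffs below all show the same mechanical reordering already seen above: each vectorized store now appears directly after the instruction that computes its value instead of being grouped with the other stores at the end of the block. This is an instruction-ordering side effect of erasing the replaced scalars eagerly; no computed values change.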
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll
index dafed43e6e71c1..a7ae2d9e02ff4b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-add.ll
@@ -439,10 +439,10 @@ define void @add_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP7]], [[TMP8]]
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP10]], [[TMP11]]
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
@@ -458,10 +458,10 @@ define void @add_v64i8() {
 ; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP9:%.*]] = add <16 x i8> [[TMP7]], [[TMP8]]
+; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP12:%.*]] = add <16 x i8> [[TMP10]], [[TMP11]]
-; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SLM-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll
index e4c76daddb02e4..d4eafdeb50a470 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fix.ll
@@ -520,10 +520,10 @@ define void @smul_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
@@ -539,10 +539,10 @@ define void @smul_v64i8() {
 ; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
+; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SLM-NEXT: ret void
 ;
@@ -1323,10 +1323,10 @@ define void @umul_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
@@ -1342,10 +1342,10 @@ define void @umul_v64i8() {
 ; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]], i32 3)
+; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umul.fix.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]], i32 3)
-; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SLM-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
index 9b8480cd0088a3..16977c025e3eaa 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl-rot.ll
@@ -480,10 +480,10 @@ define void @fshl_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll
index daf28b9a0bb4da..609a9024e5bf7b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshl.ll
@@ -575,21 +575,21 @@ define void @fshl_v64i8() {
 ; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
 ; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr @c8, align 1
 ; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
+; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @d8, align 1
 ; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
 ; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i8> [[TMP7]])
+; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 16), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP15:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i8> [[TMP13]])
-; SSE-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP11]], <16 x i8> [[TMP12]], <16 x i8> [[TMP14]])
-; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @d8, align 1
-; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 16), align 1
-; SSE-NEXT: store <16 x i8> [[TMP15]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
+; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
+; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
+; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
+; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
+; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
+; SSE-NEXT: [[TMP15:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
+; SSE-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i8> [[TMP15]])
 ; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
@@ -598,11 +598,11 @@ define void @fshl_v64i8() {
 ; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
 ; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr @c8, align 1
 ; AVX-NEXT: [[TMP4:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <32 x i8> [[TMP3]])
+; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @d8, align 1
 ; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; AVX-NEXT: [[TMP7:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; AVX-NEXT: [[TMP8:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]], <32 x i8> [[TMP7]])
-; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @d8, align 1
 ; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
 ; AVX-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll
index f3e73d0e6840e0..090a9daa6a1136 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr-rot.ll
@@ -480,10 +480,10 @@ define void @fshr_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr.ll
index fb7532768c4b3f..3dc7d164f5bc94 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-fshr.ll
@@ -575,21 +575,21 @@ define void @fshr_v64i8() {
 ; SSE-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr @b8, align 1
 ; SSE-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr @c8, align 1
 ; SSE-NEXT: [[TMP4:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP1]], <16 x i8> [[TMP2]], <16 x i8> [[TMP3]])
+; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @d8, align 1
 ; SSE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 16), align 1
 ; SSE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 16), align 1
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 16), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]], <16 x i8> [[TMP7]])
+; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 16), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP12:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
-; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
-; SSE-NEXT: [[TMP15:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i8> [[TMP13]])
-; SSE-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP11]], <16 x i8> [[TMP12]], <16 x i8> [[TMP14]])
-; SSE-NEXT: store <16 x i8> [[TMP4]], ptr @d8, align 1
-; SSE-NEXT: store <16 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 16), align 1
-; SSE-NEXT: store <16 x i8> [[TMP15]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
+; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
+; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP9]], <16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
+; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
+; SSE-NEXT: [[TMP13:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
+; SSE-NEXT: [[TMP14:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
+; SSE-NEXT: [[TMP15:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
+; SSE-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> [[TMP13]], <16 x i8> [[TMP14]], <16 x i8> [[TMP15]])
 ; SSE-NEXT: store <16 x i8> [[TMP16]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
@@ -598,11 +598,11 @@ define void @fshr_v64i8() {
 ; AVX-NEXT: [[TMP2:%.*]] = load <32 x i8>, ptr @b8, align 1
 ; AVX-NEXT: [[TMP3:%.*]] = load <32 x i8>, ptr @c8, align 1
 ; AVX-NEXT: [[TMP4:%.*]] = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> [[TMP1]], <32 x i8> [[TMP2]], <32 x i8> [[TMP3]])
+; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @d8, align 1
 ; AVX-NEXT: [[TMP5:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; AVX-NEXT: [[TMP6:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; AVX-NEXT: [[TMP7:%.*]] = load <32 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; AVX-NEXT: [[TMP8:%.*]] = call <32 x i8> @llvm.fshr.v32i8(<32 x i8> [[TMP5]], <32 x i8> [[TMP6]], <32 x i8> [[TMP7]])
-; AVX-NEXT: store <32 x i8> [[TMP4]], ptr @d8, align 1
 ; AVX-NEXT: store <32 x i8> [[TMP8]], ptr getelementptr inbounds ([64 x i8], ptr @d8, i32 0, i64 32), align 1
 ; AVX-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll
index 94976a8cdee252..51cf32242bfdfe 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-mul.ll
@@ -528,10 +528,10 @@ define void @mul_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP7]], [[TMP8]]
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP10]], [[TMP11]]
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
@@ -547,10 +547,10 @@ define void @mul_v64i8() {
 ; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP7]], [[TMP8]]
+; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP10]], [[TMP11]]
-; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SLM-NEXT: ret void
 ;
@@ -566,10 +566,10 @@ define void @mul_v64i8() {
 ; AVX128-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; AVX128-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; AVX128-NEXT: [[TMP9:%.*]] = mul <16 x i8> [[TMP7]], [[TMP8]]
+; AVX128-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; AVX128-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; AVX128-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; AVX128-NEXT: [[TMP12:%.*]] = mul <16 x i8> [[TMP10]], [[TMP11]]
-; AVX128-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; AVX128-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; AVX128-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll
index c63b672f4187cd..dd76992c2570b9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-smax.ll
@@ -385,10 +385,10 @@ define void @smax_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll
index 826f97f2a2d895..678477fa1e3977 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-smin.ll
@@ -385,10 +385,10 @@ define void @smin_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.smin.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll
index afaab8b8ca642b..65e2a011cc9a14 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-ssat.ll
@@ -503,10 +503,10 @@ define void @sub_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
@@ -522,10 +522,10 @@ define void @sub_v64i8() {
 ; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SLM-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
index 3510863c889301..18df499c6646ec 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
@@ -401,10 +401,10 @@ define void @sub_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll
index be54c1e04ca39a..9d34edbb506c06 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-sub.ll
@@ -439,10 +439,10 @@ define void @sub_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = sub <16 x i8> [[TMP7]], [[TMP8]]
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = sub <16 x i8> [[TMP10]], [[TMP11]]
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
@@ -458,10 +458,10 @@ define void @sub_v64i8() {
 ; SLM-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP9:%.*]] = sub <16 x i8> [[TMP7]], [[TMP8]]
+; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SLM-NEXT: [[TMP12:%.*]] = sub <16 x i8> [[TMP10]], [[TMP11]]
-; SLM-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SLM-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SLM-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll
index 3a187930055f0e..a3f2b97a08a6e9 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-umax.ll
@@ -385,10 +385,10 @@ define void @umax_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll b/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll
index 15119a96280673..0c7688345ac481 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/arith-umin.ll
@@ -385,10 +385,10 @@ define void @umin_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]])
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
index e3dc67558af028..f036801865048f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -810,14 +810,9 @@ define float @extra_args_same_several_times(ptr nocapture readonly %x, i32 %a, i
 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float
 ; THRESHOLD-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[X:%.*]], align 4
 ; THRESHOLD-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP0]])
-; THRESHOLD-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0
-; THRESHOLD-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[CONV]], i32 1
-; THRESHOLD-NEXT: [[TMP4:%.*]] = fadd fast <2 x float> [[TMP3]],
-; THRESHOLD-NEXT: [[TMP5:%.*]] = fmul fast <2 x float> [[TMP3]],
-; THRESHOLD-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x i32>
-; THRESHOLD-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0
-; THRESHOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1
-; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[TMP7]], [[TMP8]]
+; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP1]], 1.300000e+01
+; THRESHOLD-NEXT: [[TMP2:%.*]] = fmul fast float [[CONV]], 2.000000e+00
+; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP2]]
 ; THRESHOLD-NEXT: ret float [[OP_RDX1]]
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll b/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll
index 51798deae694a3..88aafb2bf148bc 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shift-ashr.ll
@@ -464,10 +464,10 @@ define void @ashr_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = ashr <16 x i8> [[TMP7]], [[TMP8]]
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = ashr <16 x i8> [[TMP10]], [[TMP11]]
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll b/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll
index 7583561bbecf90..96977cd4fb7d75 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shift-lshr.ll
@@ -413,10 +413,10 @@ define void @lshr_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = lshr <16 x i8> [[TMP7]], [[TMP8]]
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = lshr <16 x i8> [[TMP10]], [[TMP11]]
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll b/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll
index 5ec327c131fb78..789316ab33c434 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/shift-shl.ll
@@ -461,10 +461,10 @@ define void @shl_v64i8() {
 ; SSE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP9:%.*]] = shl <16 x i8> [[TMP7]], [[TMP8]]
+; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @a8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([64 x i8], ptr @b8, i32 0, i64 48), align 1
 ; SSE-NEXT: [[TMP12:%.*]] = shl <16 x i8> [[TMP10]], [[TMP11]]
-; SSE-NEXT: store <16 x i8> [[TMP9]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 32), align 1
 ; SSE-NEXT: store <16 x i8> [[TMP12]], ptr getelementptr inbounds ([64 x i8], ptr @c8, i32 0, i64 48), align 1
 ; SSE-NEXT: ret void
 ;
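A closing note on the ~BoUpSLP() change near the top of the patch: removeInstructionsAndOperands() only calls removeFromParent(), so a vectorized-away scalar stays allocated (the "alias cache correctness" comment refers to this) but ends up with no parent block. Erasing requires a parent, so the destructor now re-attaches such instructions to the entry block before the final sweep. A simplified, hypothetical sketch of that idiom (the real destructor additionally walks operands to collect more dead instructions):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Instruction.h"

    using namespace llvm;

    static void flushDeleted(Function &F, ArrayRef<Instruction *> Deleted) {
      // Park detached instructions back in the entry block so that
      // eraseFromParent() below has a parent to remove them from.
      for (Instruction *I : Deleted)
        if (!I->getParent())
          I->insertBefore(F.getEntryBlock().getTerminator());
      // The dead instructions may still reference one another; drop those
      // uses before destroying any of them.
      for (Instruction *I : Deleted)
        I->dropAllReferences();
      for (Instruction *I : Deleted)
        I->eraseFromParent();
    }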