[SLP]Remove operands upon marking instruction for deletion.
If an instruction is marked for deletion, it is better to also drop all of
its operands and mark them for deletion too (where allowed). This exposes
more vectorizable patterns and generates fewer useless extractelement
instructions.
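
As a rough illustration of the approach, here is a minimal, self-contained C++ sketch (not the actual LLVM implementation): the vectorized scalar is marked dead, its operand links are dropped, and any operand that thereby loses its last use and has no side effects is queued and swept in the same pass. The Node type and markDeadAndSweepOperands are hypothetical stand-ins for IR instructions and for BoUpSLP::removeInstructionsAndOperands; the real code works on Instruction use-lists via dropAllReferences and isInstructionTriviallyDead.

#include <iostream>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>

// Toy stand-in for an IR instruction: it has operands and a use count.
struct Node {
  std::string Name;
  std::vector<Node *> Operands;
  unsigned NumUses = 0;
  bool HasSideEffects = false; // "trivially dead" only if false and unused
};

static Node *makeNode(std::vector<Node *> &Pool, std::string Name,
                      std::vector<Node *> Ops, bool SideEffects = false) {
  Pool.push_back(new Node{std::move(Name), std::move(Ops), 0, SideEffects});
  for (Node *Op : Pool.back()->Operands)
    ++Op->NumUses;
  return Pool.back();
}

// Mark Roots dead, drop their operand links, and iteratively mark any
// operand that becomes unused and has no side effects, sweeping the whole
// dead expression tree in one pass.
static void markDeadAndSweepOperands(const std::vector<Node *> &Roots,
                                     std::unordered_set<Node *> &Deleted) {
  std::vector<Node *> Worklist(Roots.begin(), Roots.end());
  for (Node *R : Roots)
    Deleted.insert(R);
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    // Drop operand links so their use counts fall (mimics dropAllReferences).
    for (Node *Op : N->Operands) {
      if (--Op->NumUses == 0 && !Op->HasSideEffects &&
          Deleted.insert(Op).second)
        Worklist.push_back(Op); // became trivially dead: sweep it too
    }
    N->Operands.clear();
  }
}

int main() {
  std::vector<Node *> Pool;
  Node *Load = makeNode(Pool, "load", {});
  Node *Ext = makeNode(Pool, "extractelement", {Load});
  Node *Add = makeNode(Pool, "add", {Ext, Ext});
  // Vectorization replaces the add; previously only the add was marked
  // deleted and the extractelement survived as a dead scalar.
  std::unordered_set<Node *> Deleted;
  markDeadAndSweepOperands({Add}, Deleted);
  std::cout << "deleted " << Deleted.size() << " nodes\n"; // prints: deleted 3 nodes
  for (Node *N : Pool)
    delete N;
  return 0;
}

The toy mirrors the motivating case: once the add is vectorized, the now-unused extractelement and its load are removed as well instead of lingering as dead scalars.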

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: llvm#97409
alexey-bataev committed Jul 8, 2024
1 parent 9dca3ac commit 3851186
Showing 23 changed files with 242 additions and 94 deletions.
221 changes: 187 additions & 34 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1186,6 +1186,12 @@ class BoUpSLP {
return VectorizableTree.front()->Scalars;
}

/// Checks if the root graph node can be emitted with narrower bitwidth at
/// codegen and returns its signedness, if so.
bool isSignedMinBitwidthRootNode() const {
return MinBWs.at(VectorizableTree.front().get()).second;
}

/// Builds external uses of the vectorized scalars, i.e. the list of
/// vectorized scalars to be extracted, their lanes and their scalar users. \p
/// ExternallyUsedValues contains additional list of external uses to handle
@@ -2453,6 +2459,90 @@ class BoUpSLP {
DeletedInstructions.insert(I);
}

/// Remove instructions from the parent function, clear the operands of \p
/// DeadVals instructions, and mark trivially dead operands for deletion.
template <typename T>
void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
SmallVector<WeakTrackingVH> DeadInsts;
for (T *V : DeadVals) {
auto *I = cast<Instruction>(V);
DeletedInstructions.insert(I);
}
for (T *V : DeadVals) {
if (!V)
continue;
auto *I = cast<Instruction>(V);
salvageDebugInfo(*I);
SmallVector<const TreeEntry *> Entries;
if (const TreeEntry *Entry = getTreeEntry(I)) {
Entries.push_back(Entry);
auto It = MultiNodeScalars.find(I);
if (It != MultiNodeScalars.end())
Entries.append(It->second.begin(), It->second.end());
}
for (Use &U : I->operands()) {
if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
wouldInstructionBeTriviallyDead(OpI, TLI) &&
(Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
return Entry->VectorizedValue == OpI;
})))
DeadInsts.push_back(OpI);
}
I->dropAllReferences();
}
for (T *V : DeadVals) {
auto *I = cast<Instruction>(V);
if (!I->getParent())
continue;
assert((I->use_empty() || all_of(I->uses(),
[&](Use &U) {
return isDeleted(
cast<Instruction>(U.getUser()));
})) &&
"trying to erase instruction with users.");
I->removeFromParent();
SE->forgetValue(I);
}
// Process the dead instruction list until empty.
while (!DeadInsts.empty()) {
Value *V = DeadInsts.pop_back_val();
Instruction *VI = cast_or_null<Instruction>(V);
if (!VI || !VI->getParent())
continue;
assert(isInstructionTriviallyDead(VI, TLI) &&
"Live instruction found in dead worklist!");
assert(VI->use_empty() && "Instructions with uses are not dead.");

// Don't lose the debug info while deleting the instructions.
salvageDebugInfo(*VI);

// Null out all of the instruction's operands to see if any operand
// becomes dead as we go.
for (Use &OpU : VI->operands()) {
Value *OpV = OpU.get();
if (!OpV)
continue;
OpU.set(nullptr);

if (!OpV->use_empty())
continue;

// If the operand is an instruction that became dead as we nulled out
// the operand, and if it is 'trivially' dead, delete it in a future
// loop iteration.
if (auto *OpI = dyn_cast<Instruction>(OpV))
if (!DeletedInstructions.contains(OpI) &&
isInstructionTriviallyDead(OpI, TLI))
DeadInsts.push_back(OpI);
}

VI->removeFromParent();
DeletedInstructions.insert(VI);
SE->forgetValue(VI);
}
}

/// Checks if the instruction was already analyzed for being possible
/// reduction root.
bool isAnalyzedReductionRoot(Instruction *I) const {
@@ -3987,6 +4077,10 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
BoUpSLP::~BoUpSLP() {
SmallVector<WeakTrackingVH> DeadInsts;
for (auto *I : DeletedInstructions) {
if (!I->getParent()) {
I->insertBefore(F->getEntryBlock().getTerminator());
continue;
}
for (Use &U : I->operands()) {
auto *Op = dyn_cast<Instruction>(U.get());
if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
@@ -14075,11 +14169,8 @@ Value *BoUpSLP::vectorizeTree(
}
#endif
LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
eraseInstruction(cast<Instruction>(Scalar));
// Retain to-be-deleted instructions for some debug-info
// bookkeeping. NOTE: eraseInstruction only marks the instruction for
// deletion - instructions are not deleted until later.
RemovedInsts.push_back(cast<Instruction>(Scalar));
auto *I = cast<Instruction>(Scalar);
RemovedInsts.push_back(I);
}
}

@@ -14088,6 +14179,22 @@ Value *BoUpSLP::vectorizeTree(
if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
V->mergeDIAssignID(RemovedInsts);

// Clear up reduction references, if any.
if (UserIgnoreList) {
for (Instruction *I : RemovedInsts) {
if (getTreeEntry(I)->Idx != 0)
continue;
I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
return UserIgnoreList->contains(U.getUser());
});
}
}
// Retain to-be-deleted instructions for some debug-info bookkeeping and alias
// cache correctness.
// NOTE: removeInstructionsAndOperands only marks the instruction for deletion
// - instructions are not deleted until later.
removeInstructionsAndOperands(ArrayRef(RemovedInsts));

Builder.ClearInsertionPoint();
InstrElementSize.clear();

@@ -16137,15 +16244,18 @@ bool SLPVectorizerPass::vectorizeStores(
Res.first = Idx;
Res.second.emplace(Idx, 0);
};
StoreInst *PrevStore = Stores.front();
Type *PrevValTy = nullptr;
for (auto [I, SI] : enumerate(Stores)) {
if (R.isDeleted(SI))
continue;
if (!PrevValTy)
PrevValTy = SI->getValueOperand()->getType();
// Check that we do not try to vectorize stores of different types.
if (PrevStore->getValueOperand()->getType() !=
SI->getValueOperand()->getType()) {
if (PrevValTy != SI->getValueOperand()->getType()) {
for (auto &Set : SortedStores)
TryToVectorize(Set.second);
SortedStores.clear();
PrevStore = SI;
PrevValTy = SI->getValueOperand()->getType();
}
FillStoresSet(I, SI);
}
@@ -17028,9 +17138,12 @@ class HorizontalReduction {
Value *VectorizedTree = nullptr;
bool CheckForReusedReductionOps = false;
// Try to vectorize elements based on their type.
SmallVector<InstructionsState> States;
for (ArrayRef<Value *> RV : ReducedVals)
States.push_back(getSameOpcode(RV, TLI));
for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
InstructionsState S = getSameOpcode(OrigReducedVals, TLI);
InstructionsState S = States[I];
SmallVector<Value *> Candidates;
Candidates.reserve(2 * OrigReducedVals.size());
DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
@@ -17355,14 +17468,11 @@ class HorizontalReduction {
Value *ReducedSubTree =
emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
if (ReducedSubTree->getType() != VL.front()->getType()) {
ReducedSubTree = Builder.CreateIntCast(
ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
KnownBits Known = computeKnownBits(
R, cast<Instruction>(ReductionOps.front().front())
->getModule()
->getDataLayout());
return !Known.isNonNegative();
}));
assert(ReducedSubTree->getType() != VL.front()->getType() &&
"Expected different reduction type.");
ReducedSubTree =
Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
V.isSignedMinBitwidthRootNode());
}

// Improved analysis for add/fadd/xor reductions with same scale factor
@@ -17524,11 +17634,11 @@ class HorizontalReduction {
}
#endif
if (!Ignore->use_empty()) {
Value *Undef = UndefValue::get(Ignore->getType());
Ignore->replaceAllUsesWith(Undef);
Value *P = PoisonValue::get(Ignore->getType());
Ignore->replaceAllUsesWith(P);
}
V.eraseInstruction(cast<Instruction>(Ignore));
}
V.removeInstructionsAndOperands(RdxOps);
}
} else if (!CheckForReusedReductionOps) {
for (ReductionOpsType &RdxOps : ReductionOps)
@@ -18076,6 +18186,8 @@ bool SLPVectorizerPass::vectorizeHorReduction(
Stack.emplace(I, Level);
continue;
}
if (R.isDeleted(Inst))
continue;
} else {
// We could not vectorize `Inst` so try to use it as a future seed.
if (!TryAppendToPostponedInsts(Inst)) {
@@ -18161,15 +18273,28 @@ static bool tryToVectorizeSequence(

// Try to vectorize elements based on their type.
SmallVector<T *> Candidates;
for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
SmallVector<T *> VL;
for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
VL.clear()) {
// Look for the next elements with the same type, parent and operand
// kinds.
auto *I = dyn_cast<Instruction>(*IncIt);
if (!I || R.isDeleted(I)) {
++IncIt;
continue;
}
auto *SameTypeIt = IncIt;
while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
AreCompatible(*SameTypeIt, *IncIt))) {
auto *I = dyn_cast<Instruction>(*SameTypeIt);
++SameTypeIt;
if (I && !R.isDeleted(I))
VL.push_back(cast<T>(I));
}

// Try to vectorize them.
unsigned NumElts = (SameTypeIt - IncIt);
unsigned NumElts = VL.size();
LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
<< NumElts << ")\n");
// The vectorization is a 3-state attempt:
@@ -18181,10 +18306,15 @@ static bool tryToVectorizeSequence(
// 3. Final attempt to try to vectorize all instructions with the
// same/alternate ops only, this may result in some extra final
// vectorization.
if (NumElts > 1 &&
TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
// Success start over because instructions might have been changed.
Changed = true;
VL.swap(Candidates);
Candidates.clear();
for (T *V : VL) {
if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
Candidates.push_back(V);
}
} else {
/// \Returns the minimum number of elements that we will attempt to
/// vectorize.
Expand All @@ -18195,7 +18325,10 @@ static bool tryToVectorizeSequence(
if (NumElts < GetMinNumElements(*IncIt) &&
(Candidates.empty() ||
Candidates.front()->getType() == (*IncIt)->getType())) {
Candidates.append(IncIt, std::next(IncIt, NumElts));
for (T *V : VL) {
if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
Candidates.push_back(V);
}
}
}
// Final attempt to vectorize instructions with the same types.
Expand All @@ -18206,13 +18339,26 @@ static bool tryToVectorizeSequence(
Changed = true;
} else if (MaxVFOnly) {
// Try to vectorize using small vectors.
for (auto *It = Candidates.begin(), *End = Candidates.end();
It != End;) {
SmallVector<T *> VL;
for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
VL.clear()) {
auto *I = dyn_cast<Instruction>(*It);
if (!I || R.isDeleted(I)) {
++It;
continue;
}
auto *SameTypeIt = It;
while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
while (SameTypeIt != End &&
(!isa<Instruction>(*SameTypeIt) ||
R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
AreCompatible(*SameTypeIt, *It))) {
auto *I = dyn_cast<Instruction>(*SameTypeIt);
++SameTypeIt;
unsigned NumElts = (SameTypeIt - It);
if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
if (I && !R.isDeleted(I))
VL.push_back(cast<T>(I));
}
unsigned NumElts = VL.size();
if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
/*MaxVFOnly=*/false))
Changed = true;
It = SameTypeIt;
@@ -18486,7 +18632,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
}
return false;
};
auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
if (V1 == V2)
return true;
if (V1->getType() != V2->getType())
@@ -18501,6 +18647,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
continue;
if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
if (R.isDeleted(I1) || R.isDeleted(I2))
return false;
if (I1->getParent() != I2->getParent())
return false;
InstructionsState S = getSameOpcode({I1, I2}, *TLI);
@@ -18721,8 +18869,13 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
// are trying to vectorize the index computations, so the maximum number of
// elements is based on the size of the index expression, rather than the
// size of the GEP itself (the target's pointer size).
auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
return !R.isDeleted(GEP);
});
if (It == Entry.second.end())
continue;
unsigned MaxVecRegSize = R.getMaxVecRegSize();
unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
if (MaxVecRegSize < EltSize)
continue;

@@ -24,11 +24,11 @@
;; Test that dbg.assigns linked to the scalar stores to quad get linked to
;; the vector store that replaces them.

; CHECK: #dbg_assign(float undef, ![[VAR:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 32), ![[ID:[0-9]+]], ptr %arrayidx, !DIExpression(),
; CHECK: #dbg_assign(float undef, ![[VAR]], !DIExpression(DW_OP_LLVM_fragment, 32, 32), ![[ID]], ptr %quad, !DIExpression(DW_OP_plus_uconst, 4),
; CHECK: #dbg_assign(float undef, ![[VAR]], !DIExpression(DW_OP_LLVM_fragment, 64, 32), ![[ID]], ptr %quad, !DIExpression(DW_OP_plus_uconst, 8),
; CHECK: #dbg_assign(float poison, ![[VAR:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 32), ![[ID:[0-9]+]], ptr %arrayidx, !DIExpression(),
; CHECK: #dbg_assign(float poison, ![[VAR]], !DIExpression(DW_OP_LLVM_fragment, 32, 32), ![[ID]], ptr %quad, !DIExpression(DW_OP_plus_uconst, 4),
; CHECK: #dbg_assign(float poison, ![[VAR]], !DIExpression(DW_OP_LLVM_fragment, 64, 32), ![[ID]], ptr %quad, !DIExpression(DW_OP_plus_uconst, 8),
; CHECK: store <4 x float> {{.*}} !DIAssignID ![[ID]]
; CHECK: #dbg_assign(float undef, ![[VAR]], !DIExpression(DW_OP_LLVM_fragment, 96, 32), ![[ID]], ptr %quad, !DIExpression(DW_OP_plus_uconst, 12),
; CHECK: #dbg_assign(float poison, ![[VAR]], !DIExpression(DW_OP_LLVM_fragment, 96, 32), ![[ID]], ptr %quad, !DIExpression(DW_OP_plus_uconst, 12),

target triple = "x86_64-unknown-unknown"

4 changes: 2 additions & 2 deletions llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll
@@ -13,8 +13,8 @@ define void @patatino(i64 %n, i64 %i, ptr %p) !dbg !7 {
; CHECK-NEXT: #dbg_value(i64 [[I:%.*]], [[META19:![0-9]+]], !DIExpression(), [[META24:![0-9]+]])
; CHECK-NEXT: #dbg_value(ptr [[P:%.*]], [[META20:![0-9]+]], !DIExpression(), [[META25:![0-9]+]])
; CHECK-NEXT: [[X1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[P]], i64 [[N]], i32 0, !dbg [[DBG26:![0-9]+]]
; CHECK-NEXT: #dbg_value(i64 undef, [[META21:![0-9]+]], !DIExpression(), [[META27:![0-9]+]])
; CHECK-NEXT: #dbg_value(i64 undef, [[META22:![0-9]+]], !DIExpression(), [[META28:![0-9]+]])
; CHECK-NEXT: #dbg_value(i64 poison, [[META21:![0-9]+]], !DIExpression(), [[META27:![0-9]+]])
; CHECK-NEXT: #dbg_value(i64 poison, [[META22:![0-9]+]], !DIExpression(), [[META28:![0-9]+]])
; CHECK-NEXT: [[X5:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 [[I]], i32 0, !dbg [[DBG29:![0-9]+]]
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[X1]], align 8, !dbg [[DBG26]], !tbaa [[TBAA30:![0-9]+]]
; CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[X5]], align 8, !dbg [[DBG34:![0-9]+]], !tbaa [[TBAA30]]
