Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JIT: Implement greedy RPO-based block layout #101473

Merged
merged 17 commits into from
May 2, 2024
Merged
21 changes: 14 additions & 7 deletions src/coreclr/jit/block.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,33 +137,40 @@ void FlowEdge::addLikelihood(weight_t addedLikelihood)
// AllSuccessorEnumerator: Construct an instance of the enumerator.
//
// Arguments:
// comp - Compiler instance
// block - The block whose successors are to be iterated
// comp - Compiler instance
// block - The block whose successors are to be iterated
// useProfile - If true, determines the order of successors visited using profile data
//
AllSuccessorEnumerator::AllSuccessorEnumerator(Compiler* comp, BasicBlock* block)
AllSuccessorEnumerator::AllSuccessorEnumerator(Compiler* comp, BasicBlock* block, const bool useProfile /* = false */)
: m_block(block)
{
m_numSuccs = 0;
block->VisitAllSuccs(comp, [this](BasicBlock* succ) {
block->VisitAllSuccs(
comp,
[this](BasicBlock* succ) {
if (m_numSuccs < ArrLen(m_successors))
{
m_successors[m_numSuccs] = succ;
}

m_numSuccs++;
return BasicBlockVisit::Continue;
});
},
useProfile);

if (m_numSuccs > ArrLen(m_successors))
{
m_pSuccessors = new (comp, CMK_BasicBlock) BasicBlock*[m_numSuccs];

unsigned numSuccs = 0;
block->VisitAllSuccs(comp, [this, &numSuccs](BasicBlock* succ) {
block->VisitAllSuccs(
comp,
[this, &numSuccs](BasicBlock* succ) {
assert(numSuccs < m_numSuccs);
m_pSuccessors[numSuccs++] = succ;
return BasicBlockVisit::Continue;
});
},
useProfile);

assert(numSuccs == m_numSuccs);
}
Expand Down
4 changes: 2 additions & 2 deletions src/coreclr/jit/block.h
Original file line number Diff line number Diff line change
Expand Up @@ -1820,7 +1820,7 @@ struct BasicBlock : private LIR::Range
BasicBlockVisit VisitEHEnclosedHandlerSecondPassSuccs(Compiler* comp, TFunc func);

template <typename TFunc>
BasicBlockVisit VisitAllSuccs(Compiler* comp, TFunc func);
BasicBlockVisit VisitAllSuccs(Compiler* comp, TFunc func, const bool useProfile = false);

template <typename TFunc>
BasicBlockVisit VisitEHSuccs(Compiler* comp, TFunc func);
Expand Down Expand Up @@ -2518,7 +2518,7 @@ class AllSuccessorEnumerator

public:
// Constructs an enumerator of all `block`'s successors.
AllSuccessorEnumerator(Compiler* comp, BasicBlock* block);
AllSuccessorEnumerator(Compiler* comp, BasicBlock* block, const bool useProfile = false);

// Gets the block whose successors are enumerated.
BasicBlock* Block()
Expand Down
8 changes: 7 additions & 1 deletion src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -2793,6 +2793,9 @@ class Compiler
EHblkDsc* ehIsBlockHndLast(BasicBlock* block);
bool ehIsBlockEHLast(BasicBlock* block);

template <typename GetTryLast, typename SetTryLast>
void ehUpdateTryLasts(GetTryLast getTryLast, SetTryLast setTryLast);

bool ehBlockHasExnFlowDsc(BasicBlock* block);

// Return the region index of the most nested EH region this block is in.
Expand Down Expand Up @@ -6054,6 +6057,8 @@ class Compiler
bool fgComputeCalledCount(weight_t returnWeight);

bool fgReorderBlocks(bool useProfile);
void fgDoReversePostOrderLayout();
void fgMoveColdBlocks();

bool fgFuncletsAreCold();

Expand All @@ -6074,9 +6079,10 @@ class Compiler
PhaseStatus fgSetBlockOrder();
bool fgHasCycleWithoutGCSafePoint();

template<typename VisitPreorder, typename VisitPostorder, typename VisitEdge>
template <typename VisitPreorder, typename VisitPostorder, typename VisitEdge, const bool useProfile = false>
unsigned fgRunDfs(VisitPreorder assignPreorder, VisitPostorder assignPostorder, VisitEdge visitEdge);

template <const bool useProfile = false>
FlowGraphDfsTree* fgComputeDfs();
void fgInvalidateDfsTree();

Expand Down
32 changes: 23 additions & 9 deletions src/coreclr/jit/compiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -623,12 +623,13 @@ BasicBlockVisit BasicBlock::VisitEHSuccs(Compiler* comp, TFunc func)
// Arguments:
// comp - Compiler instance
// func - Callback
// useProfile - If true, determines the order of successors visited using profile data
//
// Returns:
// Whether or not the visiting was aborted.
//
template <typename TFunc>
BasicBlockVisit BasicBlock::VisitAllSuccs(Compiler* comp, TFunc func)
BasicBlockVisit BasicBlock::VisitAllSuccs(Compiler* comp, TFunc func, const bool useProfile /* = false */)
{
switch (bbKind)
{
Expand Down Expand Up @@ -662,10 +663,22 @@ BasicBlockVisit BasicBlock::VisitAllSuccs(Compiler* comp, TFunc func)
return VisitEHSuccs(comp, func);

case BBJ_COND:
RETURN_ON_ABORT(func(GetFalseTarget()));

if (!TrueEdgeIs(GetFalseEdge()))
if (TrueEdgeIs(GetFalseEdge()))
{
RETURN_ON_ABORT(func(GetFalseTarget()));
}
else if (useProfile && (GetTrueEdge()->getLikelihood() < GetFalseEdge()->getLikelihood()))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Am I confused, or is this visiting the less likely successor first?

Suggested change
else if (useProfile && (GetTrueEdge()->getLikelihood() < GetFalseEdge()->getLikelihood()))
else if (useProfile && (GetTrueEdge()->getLikelihood() > GetFalseEdge()->getLikelihood()))

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks unintuitive, but I think we need to flip the comparison so the DFS visits successors in the order we want. Consider the following block list, before layout:

BBnum BBid ref try hnd preds           weight   IBC [IL range]   [jump]                            [EH region]        [flags]
---------------------------------------------------------------------------------------------------------------------------------------------------------------------
BB01 [0014]  1                             1        [???..???)-> BB02(1)                 (always)                     i keep internal
BB02 [0000]  1       BB01                  1    100 [000..00E)-> BB04(0),BB03(1)         ( cond )                     i IBC
BB03 [0010]  1       BB02                  0.50  50 [00D..00E)-> BB05(1)                 (always)                     i IBC nullcheck
BB04 [0011]  1       BB02                  0      0 [00D..00E)-> BB05(1)                 (always)                     i IBC rare
BB05 [0012]  2       BB03,BB04             1        [00D..017)                           (return)                     i hascall gcsafe

When processing BB02, if we visit BB03 before BB04, then we end up with a postorder that looks something like [<BB03's successors>, BB03, BB04, BB02, ...], so after reversing it for layout, we get this:

BBnum BBid ref try hnd preds           weight   IBC [IL range]   [jump]                            [EH region]        [flags]
---------------------------------------------------------------------------------------------------------------------------------------------------------------------
BB01 [0014]  1                             1        [???..???)-> BB02(1)                 (always)                     i keep internal
BB02 [0000]  1       BB01                  1    100 [000..00E)-> BB04(0),BB03(1)         ( cond )                     i IBC
BB04 [0011]  1       BB02                  0      0 [00D..00E)-> BB05(1)                 (always)                     i IBC rare
BB03 [0010]  1       BB02                  0.50  50 [00D..00E)-> BB05(1)                 (always)                     i IBC nullcheck
BB05 [0012]  2       BB03,BB04             1        [00D..017)                           (return)                     i hascall gcsafe

If we instead visit the less likely successor (BB04) first, the more likely successor BB03 ends up immediately after BB02 in the RPO, and we get this layout:

BBnum BBid ref try hnd preds           weight   IBC [IL range]   [jump]                            [EH region]        [flags]
---------------------------------------------------------------------------------------------------------------------------------------------------------------------
BB01 [0014]  1                             1        [???..???)-> BB02(1)                 (always)                     i keep internal
BB02 [0000]  1       BB01                  1    100 [000..00E)-> BB04(0),BB03(1)         ( cond )                     i IBC
BB03 [0010]  1       BB02                  0.50  50 [00D..00E)-> BB05(1)                 (always)                     i IBC nullcheck
BB04 [0011]  1       BB02                  0      0 [00D..00E)-> BB05(1)                 (always)                     i IBC rare
BB05 [0012]  2       BB03,BB04             1        [00D..017)                           (return)                     i hascall gcsafe

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I think that because we're going to form an RPO, visiting the less-likely successor first is correct. If we wanted to view the depth-first spanning tree as a pseudo maximum weight tree then we'd do it the other way around.

Can you add a comment here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure thing.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This logic seems very specialized to block layout and tied into how the DFS traversal works to end up with the result it wants. It makes it seem a bit odd for it to live in this very general utility function.
I'm ok with this for now, but if we end up with even more logic to handle other cases (like BBJ_SWITCH) then I'd suggest we introduce a separate version of the visitor that lives next to the block layout code. It would save a bit on throughput as well since now everyone is paying for this useProfile check.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, I'll fix this in a follow-up PR. As for what the new abstraction should look like, would you prefer we move the useProfile check into AllSuccessorEnumerator, or even introduce a new enumerator like ProfileGuidedSuccessorEnumerator?

Copy link
Member

@jakobbotsch jakobbotsch May 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps introduce two instance initializer methods on AllSuccessorEnumerator and then pass some factory method to fgRunDfs? E.g. the normal use would be:

fgRunDfs([](SuccessorEnumerator* enumerator, BasicBlock* block) { enumerator->InitializeAllSuccs(block); }, ...);

and the block layout use could be

fgRunDfs([](SuccessorEnumerator* enumerator, BasicBlock* block) { enumerator->InitializeAllSuccsForBlockLayout(block); }, ...);

{
// When building an RPO-based block layout, we want to visit the unlikely successor first
// so that in the DFS computation, the likely successor will be processed right before this block,
// meaning the RPO-based layout will enable fall-through into the likely successor.
//
RETURN_ON_ABORT(func(GetTrueTarget()));
RETURN_ON_ABORT(func(GetFalseTarget()));
}
else
{
RETURN_ON_ABORT(func(GetFalseTarget()));
RETURN_ON_ABORT(func(GetTrueTarget()));
}

Expand Down Expand Up @@ -696,8 +709,8 @@ BasicBlockVisit BasicBlock::VisitAllSuccs(Compiler* comp, TFunc func)
// VisitRegularSuccs: Visit regular successors of this block.
//
// Arguments:
// comp - Compiler instance
// func - Callback
// comp - Compiler instance
// func - Callback
//
// Returns:
// Whether or not the visiting was aborted.
Expand Down Expand Up @@ -4745,6 +4758,7 @@ inline bool Compiler::compCanHavePatchpoints(const char** reason)
// VisitPreorder - Functor type that takes a BasicBlock* and its preorder number
// VisitPostorder - Functor type that takes a BasicBlock* and its postorder number
// VisitEdge - Functor type that takes two BasicBlock*.
// useProfile - If true, determines order of successors visited using profile data
//
// Parameters:
// visitPreorder - Functor to visit block in its preorder
Expand All @@ -4755,7 +4769,7 @@ inline bool Compiler::compCanHavePatchpoints(const char** reason)
// Returns:
// Number of blocks visited.
//
template <typename VisitPreorder, typename VisitPostorder, typename VisitEdge>
template <typename VisitPreorder, typename VisitPostorder, typename VisitEdge, const bool useProfile /* = false */>
unsigned Compiler::fgRunDfs(VisitPreorder visitPreorder, VisitPostorder visitPostorder, VisitEdge visitEdge)
{
BitVecTraits traits(fgBBNumMax + 1, this);
Expand All @@ -4768,7 +4782,7 @@ unsigned Compiler::fgRunDfs(VisitPreorder visitPreorder, VisitPostorder visitPos

auto dfsFrom = [&](BasicBlock* firstBB) {
BitVecOps::AddElemD(&traits, visited, firstBB->bbNum);
blocks.Emplace(this, firstBB);
blocks.Emplace(this, firstBB, useProfile);
visitPreorder(firstBB, preOrderIndex++);

while (!blocks.Empty())
Expand All @@ -4780,7 +4794,7 @@ unsigned Compiler::fgRunDfs(VisitPreorder visitPreorder, VisitPostorder visitPos
{
if (BitVecOps::TryAddElemD(&traits, visited, succ->bbNum))
{
blocks.Emplace(this, succ);
blocks.Emplace(this, succ, useProfile);
visitPreorder(succ, preOrderIndex++);
}

Expand Down
Loading
Loading