diff --git a/visa/LocalScheduler/G4_Sched.cpp b/visa/LocalScheduler/G4_Sched.cpp index bc537925d935..e209347c4420 100644 --- a/visa/LocalScheduler/G4_Sched.cpp +++ b/visa/LocalScheduler/G4_Sched.cpp @@ -67,6 +67,9 @@ class preEdge { } return false; } + bool isFlowDep() const { + return (mType == DepType::RAW || mType == DepType::WAW); + } void setLatency(int L) { mLatency = L; } int getLatency() { return isDataDep() ? mLatency : 0; } @@ -176,6 +179,10 @@ class preNode { /* The following data may be overwritten by a scheduler. */ + // height by counting edges not by summing latency + // not used as priority in latency-scheduling + unsigned Height = 0; + // Tuple node, which should be schedule in pair with this node. preNode *TupleLead = nullptr; unsigned TupleParts = 0; @@ -191,8 +198,6 @@ class preNode { // True once scheduled. bool isScheduled = false; - bool isClustered = false; - bool isClusterLead = false; bool ACCCandidate = false; friend class preDDD; @@ -431,23 +436,23 @@ struct SchedConfig { enum { MASK_DUMP = 1U << 0, MASK_LATENCY = 1U << 1, - MASK_SETHI_ULLMAN = 1U << 2, - MASK_CLUSTTERING = 1U << 3, + MASK_MIN_REG = 1U << 2, + MASK_SKIP_CLUSTER = 1U << 3, MASK_SKIP_HOLD = 1U << 4, MASK_NOT_ITERATE = 1U << 5, }; unsigned Dump : 1; unsigned UseLatency : 1; - unsigned UseSethiUllman : 1; - unsigned DoClustering : 1; + unsigned UseMinReg : 1; + unsigned SkipClustering : 1; // default 0 i.e. try min-reg with clustering unsigned SkipHoldList : 1; // default 0 i.e. use hold list in latency-hiding unsigned DoNotIterate : 1; // default 0 i.e. 
iterative latency-scheduling explicit SchedConfig(unsigned Config) : Dump((Config & MASK_DUMP) != 0), UseLatency((Config & MASK_LATENCY) != 0), - UseSethiUllman((Config & MASK_SETHI_ULLMAN) != 0), - DoClustering((Config & MASK_CLUSTTERING) != 0), + UseMinReg((Config & MASK_MIN_REG) != 0), + SkipClustering((Config & MASK_SKIP_CLUSTER) != 0), SkipHoldList((Config & MASK_SKIP_HOLD) != 0), DoNotIterate((Config & MASK_NOT_ITERATE) != 0) {} }; @@ -501,9 +506,9 @@ class BB_Scheduler { // UpperBoundGRF is the measure max reg-pressure of this kernel before scheduling bool scheduleBlockForLatency(unsigned &MaxPressure, bool ReassignID, unsigned UpperBoundGRF); + void SethiUllmanScheduling(bool DoClustering); private: - void SethiUllmanScheduling(); void LatencyScheduling(unsigned GroupingThreshold); bool verifyScheduling(); @@ -800,13 +805,8 @@ class SethiUllmanQueue : public QueueBase { // max-reg-pressure for the sub-exp-tree starting from a node std::vector MaxRegs; std::vector DstSizes; - - // The clustering nodes. - std::vector Clusterings; - std::set Visited; - - // Scheduling in clustering mode. - bool IsInClusteringMode = false; + // the max time-stamp among node uses + std::vector LiveTS; public: SethiUllmanQueue(preDDD &ddd, RegisterPressure &rp, SchedConfig config) @@ -816,19 +816,15 @@ class SethiUllmanQueue : public QueueBase { // Add a new ready node. void push(preNode *N) override { - // Clustering nodes have been added. - if (N->isClustered && !N->isClusterLead) { - vASSERT(std::find(Clusterings.begin(), Clusterings.end(), N) != - Clusterings.end()); - } else { - Q.push_back(N); - } + Q.push_back(N); } // Schedule the top node. preNode *pickNode() override { return select(); } - bool empty() const { return Q.empty() && Clusterings.empty(); } + bool empty() const { return Q.empty(); } + + friend void BB_Scheduler::SethiUllmanScheduling(bool); private: // Initialize Sethi-Ullman numbers. 
@@ -836,8 +832,8 @@ class SethiUllmanQueue : public QueueBase { // Select next ready node to schedule. preNode *select(); - - preNode *scheduleClusteringNode(); + // In clustering mode + void formWorkingSet(preNode *seed, std::vector &W); // Compare two ready nodes and decide which one should be scheduled first. // Return true if N2 has a higher priority than N1, false otherwise. @@ -924,6 +920,7 @@ void SethiUllmanQueue::init() { unsigned N = (unsigned)Nodes.size(); MaxRegs.resize(N, 0); DstSizes.resize(N, 0); + LiveTS.resize(N, N); for (auto I = Nodes.rbegin(); I != Nodes.rend(); ++I) { calculateSethiUllmanNumber((*I)); } @@ -950,21 +947,6 @@ bool SethiUllmanQueue::compare(preNode *N1, preNode *N2) { if (N1->getInst()->isPseudoKill()) return false; - // Prefer to unlock a pending clustering node. - if (IsInClusteringMode) { - // Only kick in when top clustering node is not ready. - vASSERT(!Clusterings.empty()); - preNode *Top = Clusterings.back(); - if (Top->NumSuccsLeft > 0) { - for (auto &SuccN : Top->succs()) { - if (SuccN.getNode() == N1) - return false; - if (SuccN.getNode() == N2) - return true; - } - } - } - auto SU1 = MaxRegs[N1->getID()] - DstSizes[N1->getID()]; auto SU2 = MaxRegs[N2->getID()] - DstSizes[N2->getID()]; @@ -979,128 +961,9 @@ bool SethiUllmanQueue::compare(preNode *N1, preNode *N2) { return N1->getID() > N2->getID(); } -preNode *SethiUllmanQueue::scheduleClusteringNode() { - // Clustering does not work well for SIMD32 before PVC - // because all instructions are sliced into SIMD16 - if (isSlicedSIMD32(ddd.getKernel())) - return nullptr; - - // Schedule clustering nodes first. - if (IsInClusteringMode && !Clusterings.empty()) { - // Pop off already scheduled node, if any. - while (!Clusterings.empty() && Clusterings.back()->isScheduled) - Clusterings.pop_back(); - - // All are scheduled, ending clustering mode. - if (Clusterings.empty()) { - IsInClusteringMode = false; - return nullptr; - } - - // The next clustering node is not ready yet. 
- preNode *Top = Clusterings.back(); - if (Top->NumSuccsLeft > 0) - return nullptr; - - // The next clustering node is ready and not scheduled yet. - Clusterings.pop_back(); - if (Clusterings.empty()) - IsInClusteringMode = false; - return Top; - } - - // The width limit of clustering. - const unsigned CLUSTER_SIZE_MIN = 3; - const unsigned CLUSTER_SIZE_MAX = 8; - - // Match clustering nodes. - auto collectClustering = [&](preNode *Node, preNode *predNode) { - for (auto &E : predNode->succs()) { - preNode *N = E.getNode(); - // Match nodes may not be ready. - if (!E.isDataDep() || N->isScheduled) - continue; - // Do not cluster sends, which may confuse send pairing. - if (N->getInst() == nullptr || N->getInst()->isSend()) - continue; - Clusterings.push_back(N); - } - - // Check if the first matching is successful. - if (unsigned(Clusterings.size()) == predNode->NumSuccsLeft && - unsigned(Clusterings.size()) >= CLUSTER_SIZE_MIN && - unsigned(Clusterings.size()) <= CLUSTER_SIZE_MAX) - return true; - // Check if the second matching is successful. - if (unsigned(Q.size()) >= CLUSTER_SIZE_MIN) { - Clusterings.clear(); - for (auto &E : predNode->succs()) { - preNode *N = E.getNode(); - // Only match ready nodes. - if (!E.isDataDep() || N->isScheduled || N->NumSuccsLeft) - continue; - // Do not cluster sends, which may confuse send pairing. 
- if (N->getInst() == nullptr || N->getInst()->isSend()) - continue; - Clusterings.push_back(N); - } - if (unsigned(Clusterings.size()) == predNode->NumSuccsLeft && - unsigned(Clusterings.size()) >= CLUSTER_SIZE_MIN && - unsigned(Clusterings.size()) <= CLUSTER_SIZE_MAX) { - return true; - } - } - - Clusterings.clear(); - return false; - }; - - if (config.DoClustering && Clusterings.empty()) { - for (auto Node : Q) { - for (auto I = Node->pred_begin(), E = Node->pred_end(); I != E; ++I) { - if (I->isDataDep()) { - preNode *predN = I->getNode(); - if (!Visited.insert(predN).second) - continue; - if (collectClustering(Node, predN)) - break; - } - } - - if (!Clusterings.empty()) { - for (auto N : Clusterings) - N->isClustered = true; - - Q.erase(std::remove_if(Q.begin(), Q.end(), - [](preNode *N) { return N->isClustered; }), - Q.end()); - - std::sort( - Clusterings.begin(), Clusterings.end(), - [](preNode *A, preNode *B) { return A->getID() > B->getID(); }); - - // We put the leading node back to the regular queue to - // participate SU number comparison. - preNode *Top = Clusterings.back(); - Top->isClusterLead = true; - if (Top->NumSuccsLeft == 0) { - Clusterings.pop_back(); - Q.push_back(Top); - return nullptr; - } - break; - } - } - } - - return nullptr; -} preNode *SethiUllmanQueue::select() { - if (auto Top = scheduleClusteringNode()) - return Top; - vASSERT(!Q.empty()); auto TopIter = Q.end(); for (auto I = Q.begin(), E = Q.end(); I != E; ++I) { @@ -1129,24 +992,116 @@ preNode *SethiUllmanQueue::select() { std::swap(*TopIter, Q.back()); Q.pop_back(); - // This selected node is clustered. From now on, schedule all - // other clustered nodes. 
- if (Top->isClustered) { - IsInClusteringMode = true; - return Top; - } - return Top; } +void SethiUllmanQueue::formWorkingSet(preNode *seed, + std::vector &W) { + if (!seed->getInst() || seed->getInst()->isSend()) { + W.push_back(seed); + return; + } + std::vector Cluster; + Cluster.push_back(seed); + preNode *ClusterWait = nullptr; + unsigned MaxHeight = seed->Height; + // looking for cluster by checking source-operand sharing + for (auto I = seed->pred_begin(), E = seed->pred_end(); I != E; ++I) { + preNode *Pred = I->getNode(); + // not looking for cluster for instructions sharing tiny-size def + if (Pred->getInst() == nullptr || DstSizes[Pred->getID()] < 4) + continue; + if (I->isFlowDep()) { + unsigned UseCnt = 0; + for (auto J = Pred->succ_begin(), JE = Pred->succ_end(); J != JE; ++J) { + if (J->isFlowDep()) { + UseCnt++; + } + } + // not looking for cluster on operand with too many sharing + if (UseCnt > 8) + continue; + // adding the other uses of Pred to the cluster + for (auto J = Pred->succ_begin(), JE = Pred->succ_end(); J != JE; ++J) { + preNode *Succ = J->getNode(); + if (!J->isFlowDep()) + continue; + if (Succ->isScheduled) + continue; + // this use is not in cluster + if (std::find(Cluster.begin(), Cluster.end(), Succ) == Cluster.end()) { + Cluster.push_back(Succ); + // Succ is not in ready list + if (Succ->NumSuccsLeft > 0) { + if (ClusterWait == nullptr) + ClusterWait = Succ; + else if (Succ->Height > ClusterWait->Height) + ClusterWait = Succ; + MaxHeight = std::max(MaxHeight, ClusterWait->Height); + } + } + } + } + } + // when the entire cluster is ready or the entire cluster is too small + // too large, just schedule all ready nodes in the cluster + if (!ClusterWait || Cluster.size() <= 2 || Cluster.size() > 5) { + W.push_back(seed); + unsigned idx = 0; + while (idx < Q.size()) { + preNode *Tmp = Q[idx]; + if (std::find(Cluster.begin(), Cluster.end(), Tmp) != Cluster.end()) { + W.push_back(Tmp); + Q[idx] = Q.back(); + Q.pop_back(); + } else + 
idx++; + } + return; + } + // cluster is not ready, find a ready node that can enable the cluster + auto Top = ClusterWait; + bool Searching = (MaxHeight - seed->Height <= 20); + while (Searching) { + Searching = false; + for (auto I = Top->succ_begin(), E = Top->succ_end(); I != E; ++I) { + auto Node = I->getNode(); + if (!Node->isScheduled) { + if (Node->NumSuccsLeft > 0) { + Top = Node; // continue search + Searching = true; + break; + } else { + if (Node != seed) { + Q.erase(std::remove(Q.begin(), Q.end(), Node), Q.end()); + Q.push_back(seed); + } + W.push_back(Node); + return; + } + } + } + } + if (W.empty()) { + W.push_back(seed); + return; + } +} + // The basic idea is... // bool BB_Scheduler::scheduleBlockForPressure(unsigned &MaxPressure, unsigned Threshold) { auto tryRPReduction = [=]() { - if (!config.UseSethiUllman) + if (!config.UseMinReg) return false; - return MaxPressure >= Threshold; + if (MaxPressure < Threshold) + return false; + // MaxRP at the block entry or exit, cannot change that + if (MaxPressure == rp.getPressure(ddd.getBB()->front()) || + MaxPressure == rp.getPressure(ddd.getBB()->back())) + return false; + return true; }; bool Changed = false; @@ -1155,55 +1110,101 @@ bool BB_Scheduler::scheduleBlockForPressure(unsigned &MaxPressure, if (kernel.getOptions()->getOption(vISA_DumpDagTxt)) { ddd.dumpDagTxt(rp); } - SethiUllmanScheduling(); - if (commitIfBeneficial(MaxPressure)) { - SCHED_DUMP(rp.dump(ddd.getBB(), "After scheduling for presssure, ")); - Changed = true; - } else if (!config.DoClustering && - !isSlicedSIMD32(ddd.getKernel())) { // try clustering - ddd.reset(false); - auto SaveClustering = config.DoClustering; - config.DoClustering = 1; - SethiUllmanScheduling(); - config.DoClustering = SaveClustering; + if (!config.SkipClustering) { + // try clustering first + SethiUllmanScheduling(true); if (commitIfBeneficial(MaxPressure)) { SCHED_DUMP(rp.dump(ddd.getBB(), "After scheduling for presssure, ")); + 
kernel.fg.builder->getJitInfo()->statsVerbose.minRegClusterCount++; + Changed = true; + } else { + ddd.reset(false); + } + } + if (!Changed) { + // try not-clustering + SethiUllmanScheduling(false); + if (commitIfBeneficial(MaxPressure)) { + SCHED_DUMP(rp.dump(ddd.getBB(), "After scheduling for presssure, ")); + kernel.fg.builder->getJitInfo()->statsVerbose.minRegSUCount++; Changed = true; } } } + if (!Changed) + kernel.fg.builder->getJitInfo()->statsVerbose.minRegRestCount++; return Changed; } -void BB_Scheduler::SethiUllmanScheduling() { +void BB_Scheduler::SethiUllmanScheduling(bool DoClustering) { schedule.clear(); SethiUllmanQueue Q(ddd, rp, config); Q.push(ddd.getExitNode()); while (!Q.empty()) { preNode *N = Q.pickNode(); - vASSERT(!N->isScheduled && N->NumSuccsLeft == 0); - if (N->getInst() != nullptr) { - // std::cerr << "emit: "; N->getInst()->dump(); - if (N->getInst()->isSend() && N->getTupleLead()) { - // If it's the pair of the current node, reset the node to be - // paired. If it's send with pair, ensure its pair is scheduled - // before other sends by setting the current node to be paired. - if (!Q.getCurrTupleLead()) - Q.setCurrTupleLead(N); - if (Q.getCurrTupleLead()) - Q.updateCurrTupleLead(N); + std::vector W; + if (DoClustering) + Q.formWorkingSet(N, W); + else + W.push_back(N); + while (!W.empty()) { + N = W.back(); + W.pop_back(); + vASSERT(!N->isScheduled && N->NumSuccsLeft == 0); + if (N->getInst() != nullptr) { + // std::cerr << "emit: "; N->getInst()->dump(); + if (N->getInst()->isSend() && N->getTupleLead()) { + // If it's the pair of the current node, reset the node to be + // paired. If it's send with pair, ensure its pair is scheduled + // before other sends by setting the current node to be paired. 
+ if (!Q.getCurrTupleLead()) + Q.setCurrTupleLead(N); + if (Q.getCurrTupleLead()) + Q.updateCurrTupleLead(N); + } + schedule.push_back(N->getInst()); + N->isScheduled = true; + } + // update LiveTS, which is used both in priority-comparison + // and register-releasing scheduling + for (auto I = N->pred_begin(), E = N->pred_end(); I != E; ++I) { + preNode *Node = I->getNode(); + vASSERT(!Node->isScheduled && Node->NumSuccsLeft); + if (I->isFlowDep()) { + Q.LiveTS[Node->getID()] = + std::min(Q.LiveTS[Node->getID()], (unsigned)schedule.size()); + } + } + for (auto I = N->pred_begin(), E = N->pred_end(); I != E; ++I) { + preNode *Node = I->getNode(); + --Node->NumSuccsLeft; + if (Node->NumSuccsLeft == 0) { + if (Node->getInst() == nullptr) { + W.push_back(Node); + } else if (Node->getInst()->isSend() && Node->getTupleLead()) { + Q.push(Node); + } else { + // if newly available node does not increase pressure, + // schedule it immediately + int SzDelta = (Q.LiveTS[Node->getID()] <= schedule.size()) + ?
(int)Q.DstSizes[Node->getID()] + : 0; + for (auto J = Node->pred_begin(), JE = Node->pred_end(); J != JE; + ++J) { + if (J->isFlowDep()) { + auto NodeJ = J->getNode(); + if (Q.LiveTS[NodeJ->getID()] > schedule.size()) + SzDelta = SzDelta - (int)Q.DstSizes[NodeJ->getID()]; + } + } + if (SzDelta >= 0) + W.push_back(Node); + else + Q.push(Node); + } + } } - schedule.push_back(N->getInst()); - N->isScheduled = true; - } - - for (auto I = N->pred_begin(), E = N->pred_end(); I != E; ++I) { - preNode *Node = I->getNode(); - vASSERT(!Node->isScheduled && Node->NumSuccsLeft); - --Node->NumSuccsLeft; - if (Node->NumSuccsLeft == 0) - Q.push(Node); } } @@ -2695,23 +2696,29 @@ void preDDD::reset(bool ReassignNodeID) { N->NumSuccsLeft = unsigned(N->Succs.size()); N->isScheduled = false; N->setReadyCycle(0); - N->isClustered = false; - N->isClusterLead = false; } EntryNode.NumPredsLeft = 0; EntryNode.NumSuccsLeft = unsigned(EntryNode.Succs.size()); EntryNode.isScheduled = false; EntryNode.setReadyCycle(0); - EntryNode.isClustered = false; - EntryNode.isClusterLead = false; ExitNode.NumPredsLeft = unsigned(ExitNode.Preds.size()); ExitNode.NumSuccsLeft = 0; ExitNode.isScheduled = false; ExitNode.setReadyCycle(0); - ExitNode.isClustered = false; - ExitNode.isClusterLead = false; + // compute height + // sort its successor in height-descending order + for (auto N : SNodes) { + std::sort(N->Succs.begin(), N->Succs.end(), [](preEdge &A, preEdge &B) { + auto *AN = A.getNode(); + auto *BN = B.getNode(); + return (AN->Height > BN->Height) || + (AN->Height == BN->Height && AN->getID() > BN->getID()); + }); + if (N->succs().size()) + N->Height = N->succs().front().getNode()->Height + 1; + } } void preDDD::dumpDagTxt(RegisterPressure &rp) { @@ -2778,10 +2785,8 @@ void preDDD::dumpDagTxt(RegisterPressure &rp) { } // Edge for (auto &E : N->Preds) { - DepType depType = E.getType(); auto DefInst = E.getNode()->getInst(); - if (DefInst && !DefInst->isPseudoKill() && - (depType == RAW || depType 
== WAW)) + if (DefInst && !DefInst->isPseudoKill() && E.isFlowDep()) ofile << ", " << E.getNode()->ID; } ofile << "\n"; diff --git a/visa/include/JitterDataStruct.h b/visa/include/JitterDataStruct.h index bbc210ba6d46..822162134ec1 100644 --- a/visa/include/JitterDataStruct.h +++ b/visa/include/JitterDataStruct.h @@ -151,6 +151,11 @@ struct PERF_STATS_VERBOSE { uint32_t normIntfNum = 0; // Number of SIMD inteference edges. uint32_t augIntfNum = 0; + + // preRA min-reg scheduler counters: blocks committed with clustering, with plain Sethi-Ullman, and left unchanged + uint32_t minRegClusterCount = 0; + uint32_t minRegSUCount = 0; + uint32_t minRegRestCount = 0; }; struct FINALIZER_INFO {