diff --git a/Makefile.in b/Makefile.in index f518f647..18185e83 100644 --- a/Makefile.in +++ b/Makefile.in @@ -111,7 +111,7 @@ cache_lib_path := @CHARM_PATH@/tmp/libs/ck-libs/cache threadsafe_ht_path := $(cache_lib_path)/threadsafe_hashtable # ------- Modules to build ---------------------------------------------------- -changa_modules := $(strip MultistepLB MultistepLB_notopo \ +changa_modules := $(strip MultistepLB MultistepLB_SFC MultistepLB_notopo \ MultistepNodeLB_notopo Orb3dLB Orb3dLB_notopo HierarchOrbLB) charm_modules := $(strip CkCache CkIO CkMulticast RefineLB \ diff --git a/MultistepLB_SFC.ci b/MultistepLB_SFC.ci new file mode 100644 index 00000000..23c35358 --- /dev/null +++ b/MultistepLB_SFC.ci @@ -0,0 +1,10 @@ +module MultistepLB_SFC { + +extern module CentralLB; +initnode void lbinit(void); + +group [migratable] MultistepLB_SFC : CentralLB { + entry void MultistepLB_SFC(const CkLBOptions &); +}; + +}; diff --git a/MultistepLB_SFC.cpp b/MultistepLB_SFC.cpp new file mode 100644 index 00000000..a3376efe --- /dev/null +++ b/MultistepLB_SFC.cpp @@ -0,0 +1,285 @@ +#include +#include "MultistepLB_SFC.h" +#include "ParallelGravity.h" +#include "Vector3D.h" +#include "formatted_string.h" + +CkpvExtern(int, _lb_obj_index); +using namespace std; + +#if CHARM_VERSION > 61002 +static void lbinit() +{ + LBRegisterBalancer("MultistepLB_SFC", + "Works best with multistepped runs; uses SFC distribution"); +} +#else +CreateLBFunc_Def(MultistepLB_SFC, + "Works best with multistepped runs; uses SFC distribution"); +#endif + +void MultistepLB_SFC::init() { + lbname = "MultistepLB_SFC"; + if (CkpvAccess(_lb_obj_index) == -1) + CkpvAccess(_lb_obj_index) = LBRegisterObjUserData(sizeof(TaggedVector3D)); +} + + +MultistepLB_SFC::MultistepLB_SFC(const CkLBOptions &opt): CBase_MultistepLB_SFC(opt) +{ + init(); + if (CkMyPe() == 0){ + CkPrintf("[%d] MultistepLB_SFC created\n",CkMyPe()); + } +} + +bool MultistepLB_SFC::QueryBalanceNow(int step){ + if(CkMyPe() == 0) 
CkPrintf("LB_SFC: Step %d\n", step); + return true; +} + +/// @brief Implement load balancing: store loads and determine active +/// processors and objects, sort by SFC, then divide up among processors. +/// @param stats The Load Balancer statistics object. +void MultistepLB_SFC::work(BaseLB::LDStats* stats) +{ +#if CMK_LBDB_ON + // find active objects - mark the inactive ones as non-migratable + const auto num_objs = stats->objData.size(); + + if(_lb_args.debug() >= 2 && step() > 0) { + // Write out "particle file" of measured load balance information + auto achFileName = make_formatted_string("lb_a.%d.sim", step()-1); + write_LB_particles(stats, achFileName.c_str(), true); + } + + int numActiveObjects = 0; + int numInactiveObjects = 0; + int minActiveProc = INT_MAX; + int maxActiveProc = 0; + + for(int i = 0; i < num_objs; i++){ + stats->to_proc[i] = stats->from_proc[i]; + } + + for(int i = 0; i < num_objs; i++){ + if (!stats->objData[i].migratable) continue; + + LDObjData &odata = stats->objData[i]; + TaggedVector3D* udata = (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index)); + + if(udata->myNumParticles == 0){ // ignore pieces with no particles + stats->objData[i].migratable = 0; + stats->n_migrateobjs--; + continue; + } + if(udata->numActiveParticles == 0){ + numInactiveObjects++; + } + else{ + numActiveObjects++; + if(minActiveProc > stats->from_proc[i]) + minActiveProc = stats->from_proc[i]; + if(maxActiveProc < stats->from_proc[i]) + maxActiveProc = stats->from_proc[i]; + } + } + CkPrintf("numActiveObjects: %d, numInactiveObjects: %d\n", numActiveObjects, + numInactiveObjects); + CkPrintf("active PROC range: %d to %d\n", minActiveProc, maxActiveProc); + if(numActiveObjects < 0.1*numInactiveObjects) { + // only a small number of active objects, only migrate them + for(int i = 0; i < stats->objData.size(); i++){ + if (!stats->objData[i].migratable) continue; + + LDObjData &odata = stats->objData[i]; + TaggedVector3D* udata = + (TaggedVector3D 
*)odata.getUserData(CkpvAccess(_lb_obj_index)); + if(udata->numActiveParticles == 0) { + stats->objData[i].migratable = 0; + stats->n_migrateobjs--; + } + } + } + else { + CkPrintf("Migrating all: numActiveObjects: %d, numInactiveObjects: %d\n", + numActiveObjects, numInactiveObjects); + } + + // let the strategy take over on this modified instrumented data and processor information + work2(stats); +#endif //CMK_LDB_ON +} + +/// @brief SFC load balance. +void MultistepLB_SFC::work2(BaseLB::LDStats *stats){ + const int numobjs = stats->objData.size(); + const int nmig = stats->n_migrateobjs; + + // this data structure is used by the SFC strategy + // to balance objects. it is NOT indexed by tree piece index + // there are as many entries in it as there are + // migratable (active) tree pieces + vector tp_array; + tp_array.resize(nmig); + + if (_lb_args.debug()>=2) { + CkPrintf("[work2] ready tp_array data structure\n"); + } + + int numProcessed = 0; + + double dBgLoad = 0.0; + for(int i = 0; i < stats->nprocs(); i++){ + dBgLoad += stats->procs[i].bg_walltime; + } + dBgLoad /= numobjs; + + dTotalLoad = 0.0; + for(int i = 0; i < numobjs; i++){ + if(!stats->objData[i].migratable) continue; + + float load; + LDObjData &odata = stats->objData[i]; + TaggedVector3D* udata = (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index)); + if(step() == 0){ // no load information, balance by particle numbers + load = udata->myNumParticles; + } + else{ + // give each piece a portion of the background load + load = stats->objData[i].wallTime + dBgLoad; + } + + tp_array[numProcessed] = SFCObject(i, load); + tp_array[numProcessed].centroid = udata->vec; + numProcessed++; + dTotalLoad += load; + } + + if(verbosity > 0) + CkPrintf("Avg active load %g; Avg bg load %g\n", dTotalLoad/numobjs, + dBgLoad); + + CkAssert(numProcessed==nmig); + + sfcPrepare(tp_array, nmig, stats); + sfcPartition(stats->nprocs(),tp_array, stats); + + // refine(stats, numobjs); + Orb_PrintLBStats(stats, 
numobjs); + + if(_lb_args.debug() >= 2) { + // Write out "particle file" of load balance information + auto achFileName = make_formatted_string("lb.%d.sim", step()); + write_LB_particles(stats, achFileName.c_str(), false); + } +} + +/// @brief Prepare structures for the ORB partition. +/// @param tp_array Reference to Vector of Objects representing TreePieces. +/// @param nObjs Number of tree pieces to partition. +/// @param stats Data from the load balancing framework. +/// @param node_partition Are we partitioning on nodes. +void MultistepLB_SFC::sfcPrepare(vector &tp_array, + int nObjs, + BaseLB::LDStats *stats, + bool node_partition){ + + OrientedBox boundingBox; + int nmig = stats->n_migrateobjs; + if(dMaxBalance < 1.0) + dMaxBalance = 1.0; + + // If using node based orb partition, then the maxPieceProc is total + // migratable objs / total number of node. + if (node_partition) { + maxPieceProc = dMaxBalance * nmig / CkNumNodes(); + } else { + maxPieceProc = dMaxBalance*nmig/stats->nprocs(); + } + + if(maxPieceProc < 1.0) + maxPieceProc = 1.01; + + CkAssert(tp_array.size() == nObjs); + + mapping = &stats->to_proc; + from = &stats->from_proc; + + CkPrintf("[LB_SFC] sorting\n"); + for(int i = 0; i < nObjs; i++) + boundingBox.grow(tp_array[i].centroid); + + // N.B. code below from TreePiece::assignKeys(). + // Refactoring is a possibility. + // get longest axis + Vector3D bsize = boundingBox.size(); + float max = (bsize.x > bsize.y) ? bsize.x : bsize.y; + max = (max > bsize.z) ? max : bsize.z; + // + // Make the bounding box cubical. + // + Vector3D bcenter = boundingBox.center(); + // The magic number below is approximately 2^(-19) + const float fEps = 1.0 + 1.91e-6; // slop to ensure keys fall + // between 0 and 1. 
+    bsize = Vector3D(fEps*0.5*max);
+    boundingBox = OrientedBox(bcenter-bsize, bcenter+bsize);
+    if(verbosity > 1)
+        ckout << "TreePiece: Bounding box now: " << boundingBox << endl;
+
+    for(unsigned int i = 0; i < nObjs; ++i) {
+        tp_array[i].key = SFC::generateKey(tp_array[i].centroid, boundingBox);
+    }
+    sort(tp_array.begin(),tp_array.end());
+}
+
+/// @brief Partition treepieces among processors by dividing the SFC as evenly as possible.
+/// @param nProcs Number of processors over which to partition the
+/// pieces.  N.B. if node_partition is true, then this is the number of nodes.
+/// @param tp Vector of TreePiece data, in the order the curve is to be cut.
+/// @param stats Load balance data
+/// @param node_partition Not referenced in this function body.
+void MultistepLB_SFC::sfcPartition(int nProcs, vector<SFCObject> & tp,
+                                   BaseLB::LDStats *stats,
+                                   bool node_partition){
+
+    double loadPrev = 0.0;      // load on all lower processors
+    int iCurrPiece = 0;         // Piece under consideration
+    const int nPieces = tp.size();
+    for (int iProc = 0; iProc < nProcs && iCurrPiece < nPieces; iProc++) {
+        if (!stats->procs[iProc].available)
+            continue;
+        // always assign one piece to a processor
+        const SFCObject &oFirst = tp[iCurrPiece];
+        (*mapping)[oFirst.lbindex] = iProc;
+        double loadCurr = oFirst.load;
+        iCurrPiece++;
+        int nCurrPiece = 1;     // number of pieces on this processor
+        double loadTarget = (iProc+1)*dTotalLoad/nProcs;
+        double dLoadError = fabs(loadPrev + loadCurr - loadTarget);
+
+        while (iCurrPiece < nPieces   // bounds test first: tp[iCurrPiece] is read below
+               && (nCurrPiece < maxPieceProc)
+               && fabs(tp[iCurrPiece].load + loadPrev + loadCurr - loadTarget)
+               < dLoadError) { // add pieces to this
+                               // processor to get the
+                               // closest to the target
+                               // load
+            const SFCObject &oNext = tp[iCurrPiece]; // bind a new reference; assigning to an existing SFCObject& would copy over the element it refers to
+            loadCurr += oNext.load;
+            (*mapping)[oNext.lbindex] = iProc;
+            dLoadError = fabs(loadPrev + loadCurr - loadTarget);
+            iCurrPiece++;
+            nCurrPiece++;
+        }
+        loadPrev += loadCurr;
+    }
+    CkAssert(iCurrPiece == nPieces); // fails if any piece was left unassigned
+}
+
+void MultistepLB_SFC::pup(PUP::er &p){
+    CBase_MultistepLB_SFC::pup(p);
+}
+
+#include "MultistepLB_SFC.def.h"
diff --git 
a/MultistepLB_SFC.h b/MultistepLB_SFC.h
new file mode 100644
index 00000000..7d8f91e4
--- /dev/null
+++ b/MultistepLB_SFC.h
@@ -0,0 +1,82 @@
+#ifndef _MULTISTEPLB_SFC_H_
+#define _MULTISTEPLB_SFC_H_
+
+#include "MultistepLB_SFC.decl.h"
+#include "Vector3D.h"
+#include "cosmoType.h"
+#include "SFC.h"
+#include "CentralLB.h"
+
+void Orb_PrintLBStats(BaseLB::LDStats *stats, int numobjs);
+void write_LB_particles(BaseLB::LDStats* stats, const char *achFileName, bool bFrom);
+
+/// @brief Multistep load balancer using Space Filling Curve
+///
+/// This balancer recognizes different "phases" (called rungs in other
+/// parts of the code), and uses loads based on measurements of the
+/// previous calculation at the same phase.  For large phases (i.e.,
+/// when many particles are active), the TreePieces are divided among
+/// the processors using a Space Filling Curve based on the centroids
+/// of the TreePieces.
+///
+
+class MultistepLB_SFC : public CBase_MultistepLB_SFC {
+private:
+    /// Set lbname and register the per-object user data slot if needed.
+    void init();
+    /// Returns true for every step.
+    bool QueryBalanceNow(int step);
+
+    /// Set by sfcPrepare() to &stats->to_proc; written by sfcPartition().
+    decltype(BaseLB::LDStats::to_proc) *mapping = nullptr;
+    /// Set by sfcPrepare() to &stats->from_proc.
+    decltype(BaseLB::LDStats::from_proc) *from = nullptr;
+    /// total computational cost to be balanced
+    double dTotalLoad = 0.0;
+    /// Maximum number of pieces per processor
+    double maxPieceProc = 0.0;
+
+public:
+    MultistepLB_SFC(const CkLBOptions &);
+    MultistepLB_SFC(CkMigrateMessage *m) : CBase_MultistepLB_SFC(m) {
+        init();
+    }
+
+    class SFCObject
+    {
+    public:
+        /// index into LB stats->objData
+        int lbindex;
+        /// Spatial location of TreePiece
+        Vector3D centroid;
+        SFC::Key key; ///< position along the curve; set by sfcPrepare(), compared by operator<
+        /// computational cost of this object
+        double load;
+
+        SFCObject() : lbindex(-1), key(), load(0) {}
+        SFCObject(int _lbindex, double _load) :
+            lbindex(_lbindex), key(),
+            load(_load)
+        {
+        }
+        bool operator<(const SFCObject &o) const{
+            return key < o.key;
+        }
+    };
+
+    /// Mark empty pieces (and, when few pieces are active, inactive pieces) non-migratable, then call work2().
+    void work(BaseLB::LDStats* stats);
+    /// Build one SFCObject per migratable piece, then run sfcPrepare() and sfcPartition().
+    void work2(BaseLB::LDStats* stats);
+    /// Assign each piece an SFC key from its centroid and sort tp_array by key.
+    void sfcPrepare(std::vector<SFCObject> &tp_array,
+                    int nObjs, BaseLB::LDStats * stats,
+                    bool node_partition=false);
+    /// Walk the sorted pieces and assign consecutive runs of them to processors.
+    void sfcPartition(int nProcs, std::vector<SFCObject> & tp,
+                      BaseLB::LDStats *stats, bool node_partition=false);
+    void pup(PUP::er &p);
+};
+
+
+#endif /* _MULTISTEPLB_SFC_H_ */
diff --git a/MultistepLB_notopo.cpp b/MultistepLB_notopo.cpp index 676a5368..9af4be5c 100644 --- a/MultistepLB_notopo.cpp +++ b/MultistepLB_notopo.cpp @@ -45,7 +45,7 @@ bool MultistepLB_notopo::QueryBalanceNow(int step){ } // helper functions for multistepping -#ifdef MCLBMS +#ifdef MCLBMSV void MultistepLB_notopo::makeActiveProcessorList(BaseLB::LDStats *stats, int numActiveObjs){ int objsPerProc = 8; @@ -54,9 +54,7 @@ void MultistepLB_notopo::makeActiveProcessorList(BaseLB::LDStats *stats, int num procsNeeded = expandFactor*numActiveObjs/objsPerProc > stats->nprocs() ? stats->nprocs() : expandFactor*numActiveObjs/objsPerProc; /* currently, only the first procsNeeded procs are used - could do something more sophisticated here in the future - FIXME */ -#ifdef MCLBMSV CkPrintf("Processors 0 to %d active\n", procsNeeded-1); -#endif } #endif @@ -72,32 +70,8 @@ void MultistepLB_notopo::work(BaseLB::LDStats* stats) if(_lb_args.debug() >= 2 && step() > 0) { // Write out "particle file" of measured load balance information auto achFileName = make_formatted_string("lb_a.%d.sim", step()-1); - FILE *fp = fopen(achFileName.c_str(), "w"); - CkAssert(fp != NULL); - - int num_migratables = num_objs; - for(int i = 0; i < num_objs; i++) { - if (!stats->objData[i].migratable) { - num_migratables--; - } - } - - fprintf(fp, "%d %d 0\n", num_migratables, num_migratables); - for(int i = 0; i < num_objs; i++) { - if (!stats->objData[i].migratable) continue; - - LDObjData &odata = stats->objData[i]; - TaggedVector3D* udata = (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index)); - fprintf(fp, "%g %g %g %g 0.0 0.0 0.0 %d %d\n", - stats->objData[i].wallTime, - udata->vec.x, - udata->vec.y, - udata->vec.z, - stats->from_proc[i], - udata->tp); - } - fclose(fp); - } + write_LB_particles(stats, 
achFileName.c_str(), true); + } int numActiveObjects = 0; int numInactiveObjects = 0; @@ -114,6 +88,11 @@ void MultistepLB_notopo::work(BaseLB::LDStats* stats) LDObjData &odata = stats->objData[i]; TaggedVector3D* udata = (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index)); + if(udata->myNumParticles == 0){ // ignore pieces with no particles + stats->objData[i].migratable = 0; + stats->n_migrateobjs--; + continue; + } if(udata->numActiveParticles == 0){ numInactiveObjects++; } @@ -148,10 +127,9 @@ void MultistepLB_notopo::work(BaseLB::LDStats* stats) // select processors #ifdef MCLBMSV - //printData(*stats, phase, NULL); CkPrintf("making active processor list\n"); -#endif makeActiveProcessorList(stats, numActiveObjects); +#endif count = stats->nprocs(); // let the strategy take over on this modified instrumented data and processor information @@ -221,36 +199,160 @@ void MultistepLB_notopo::work2(BaseLB::LDStats *stats, int count){ if(_lb_args.debug() >= 2) { // Write out "particle file" of load balance information auto achFileName = make_formatted_string("lb.%d.sim", step()); - FILE *fp = fopen(achFileName.c_str(), "w"); - CkAssert(fp != NULL); + write_LB_particles(stats, achFileName.c_str(), false); + } +} + +void Orb_PrintLBStats(BaseLB::LDStats *stats, int numobjs) +{ + std::vector predLoad(stats->nprocs(), 0.0); + std::vector predCount(stats->nprocs(), 0); + double maxObjLoad = 0.0; + + int migr = 0; + for(int i = 0; i < numobjs; i++){ + LDObjData &odata = stats->objData[i]; + TaggedVector3D* udata = (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index)); + if(udata->myNumParticles == 0) // ignore empty TreePieces + continue; + if(stats->to_proc[i] != stats->from_proc[i]) + migr++; + double ld = stats->objData[i].wallTime; + int proc = stats->to_proc[i]; + predLoad[proc] += ld; + predCount[proc] ++; + if(ld > maxObjLoad) + maxObjLoad = ld; + } + + double minWall = 0.0; + double maxWall = 0.0; + double avgWall = 0.0; + + double minIdle = 0.0; 
+ double maxIdle = 0.0; + double avgIdle = 0.0; + + double minBg = 0.0; + double maxBg = 0.0; + double avgBg = 0.0; + + double avgPred = 0.0; + double minPred = 0.0; + double maxPred = 0.0; - int num_migratables = numobjs; - for(int i = 0; i < numobjs; i++) { + double avgPiece = 0.0; + double minPiece = 0.0; + double maxPiece = 0.0; + + CkPrintf("***************************\n"); + for(int i = 0; i < stats->nprocs(); i++){ + double wallTime = stats->procs[i].total_walltime; + double idleTime = stats->procs[i].idletime; + double bgTime = stats->procs[i].bg_walltime; + double pred = predLoad[i]; + double npiece = predCount[i]; + + avgWall += wallTime; + avgIdle += idleTime; + avgBg += bgTime; + avgPred += pred; + avgPiece += npiece; + + if(i==0 || minWall > wallTime) minWall = wallTime; + if(i==0 || maxWall < wallTime) maxWall = wallTime; + + if(i==0 || minIdle > idleTime) minIdle = idleTime; + if(i==0 || maxIdle < idleTime) maxIdle = idleTime; + + if(i==0 || minBg > bgTime) minBg = bgTime; + if(i==0 || maxBg < bgTime) maxBg = bgTime; + + if(i==0 || minPred > pred) minPred = pred; + if(i==0 || maxPred < pred) maxPred = pred; + + if(i==0 || minPiece > npiece) minPiece = npiece; + if(i==0 || maxPiece < npiece) maxPiece = npiece; + + } + + avgWall /= stats->nprocs(); + avgIdle /= stats->nprocs(); + avgBg /= stats->nprocs(); + avgPred /= stats->nprocs(); + avgPiece /= stats->nprocs(); + +#ifdef PRINT_LOAD_PERCENTILES + double accumVar = 0; + vector objectWallTimes; + for(int i = 0; i < stats->nprocs(); i++){ + double wallTime = stats->procs[i].total_walltime; + objectWallTimes.push_back(wallTime); + accumVar += (wallTime - avgWall) * (wallTime - avgWall); + } + double stdDev = sqrt(accumVar / stats->nprocs()); + CkPrintf("Average load: %.3f\n", avgWall); + CkPrintf("Standard deviation: %.3f\n", stdDev); + + std::sort(objectWallTimes.begin(), objectWallTimes.end()); + CkPrintf("Object load percentiles: \n"); + double increment = (double) objectWallTimes.size() / 10; + int 
j = 0; + double index = 0; + for (int j = 0; j < 100; j += 10) { + index += increment; + CkPrintf("%d: %.3f\n", j, objectWallTimes[(int) index]); + } + CkPrintf("100: %.3f\n", objectWallTimes.back()); +#endif + + CkPrintf("LB stats: maxObjLoad %f\n", maxObjLoad); + CkPrintf("LB stats: minWall %f maxWall %f avgWall %f maxWall/avgWall %f\n", minWall, maxWall, avgWall, maxWall/avgWall); + CkPrintf("LB stats: minIdle %f maxIdle %f avgIdle %f minIdle/avgIdle %f\n", minIdle, maxIdle, avgIdle, minIdle/avgIdle); + CkPrintf("LB stats: minPred %f maxPred %f avgPred %f maxPred/avgPred %f\n", minPred, maxPred, avgPred, maxPred/avgPred); + CkPrintf("LB stats: minPiece %f maxPiece %f avgPiece %f maxPiece/avgPiece %f\n", minPiece, maxPiece, avgPiece, maxPiece/avgPiece); + CkPrintf("LB stats: minBg %f maxBg %f avgBg %f maxBg/avgBg %f\n", minBg, maxBg, avgBg, maxBg/avgBg); + CkPrintf("LB stats: orb migrated %d objects\n", migr); +} + +/// @brief Write out TreePieces as "particles" into a simple file +/// that can be converted into a tipsy file for visualization and +/// analysis. +/// @param stats LB structure +/// @param achFileName file to write. 
+/// @param bFrom use "from" processor if true, otherwise, use "to" processor +void write_LB_particles(BaseLB::LDStats* stats, const char *achFileName, bool bFrom) +{ + const auto num_objs = stats->objData.size(); + FILE *fp = fopen(achFileName, "w"); + CkAssert(fp != NULL); + + int num_migratables = num_objs; + for(int i = 0; i < num_objs; i++) { if (!stats->objData[i].migratable) { - num_migratables--; + num_migratables--; } - } - fprintf(fp, "%d %d 0\n", num_migratables, num_migratables); + } - for(int i = 0; i < numobjs; i++) { - if(!stats->objData[i].migratable) continue; + fprintf(fp, "%d %d 0\n", num_migratables, num_migratables); + for(int i = 0; i < num_objs; i++) { + if (!stats->objData[i].migratable) continue; - LDObjData &odata = stats->objData[i]; - TaggedVector3D* udata = - (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index)); - fprintf(fp, "%g %g %g %g 0.0 0.0 0.0 %d %d\n", - stats->objData[i].wallTime, - udata->vec.x, - udata->vec.y, - udata->vec.z, - stats->to_proc[i], - udata->tp); - } - fclose(fp); - } + LDObjData &odata = stats->objData[i]; + TaggedVector3D* udata = (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index)); + int proc; + if(bFrom) + proc = stats->from_proc[i]; + else + proc = stats->to_proc[i]; + fprintf(fp, "%g %g %g %g 0.0 0.0 0.0 %d %d\n", + stats->objData[i].wallTime, + udata->vec.x, udata->vec.y, udata->vec.z, + proc, udata->tp); + } + fclose(fp); } - void MultistepLB_notopo::pup(PUP::er &p){ CBase_MultistepLB_notopo::pup(p); } diff --git a/MultistepLB_notopo.h b/MultistepLB_notopo.h index 46c1974c..06a380de 100644 --- a/MultistepLB_notopo.h +++ b/MultistepLB_notopo.h @@ -35,8 +35,9 @@ class MultistepLB_notopo : public CBase_MultistepLB_notopo, public Orb3dCommon { private: void init(); bool QueryBalanceNow(int step); +#ifdef MCLBMSV void makeActiveProcessorList(BaseLB::LDStats *stats, int numActiveObjs); - +#endif public: MultistepLB_notopo(const CkLBOptions &); diff --git a/MultistepNodeLB_notopo.cpp 
b/MultistepNodeLB_notopo.cpp index b69c02e3..79076211 100644 --- a/MultistepNodeLB_notopo.cpp +++ b/MultistepNodeLB_notopo.cpp @@ -58,32 +58,8 @@ void MultistepNodeLB_notopo::work(BaseLB::LDStats* stats) if(_lb_args.debug() >= 2 && step() > 0) { // Write out "particle file" of measured load balance information auto achFileName = make_formatted_string("lb_a.%d.sim", step()-1); - FILE *fp = fopen(achFileName.c_str(), "w"); - CkAssert(fp != NULL); - - int num_migratables = num_objs; - for(int i = 0; i < num_objs; i++) { - if (!stats->objData[i].migratable) { - num_migratables--; - } - } - fprintf(fp, "%d %d 0\n", num_migratables, num_migratables); - - for(int i = 0; i < num_objs; i++) { - if(!stats->objData[i].migratable) continue; - - LDObjData &odata = stats->objData[i]; - TaggedVector3D* udata = (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index)); - fprintf(fp, "%g %g %g %g 0.0 0.0 0.0 %d %d\n", - stats->objData[i].wallTime, - udata->vec.x, - udata->vec.y, - udata->vec.z, - stats->from_proc[i], - udata->tp); - } - fclose(fp); - } + write_LB_particles(stats, achFileName.c_str(), true); + } int numActiveObjects = 0; int numInactiveObjects = 0; @@ -229,33 +205,8 @@ void MultistepNodeLB_notopo::work2(BaseLB::LDStats *stats, int count){ if(_lb_args.debug() >= 2) { // Write out "particle file" of load balance information auto achFileName = make_formatted_string("lb.%d.sim", step()); - FILE *fp = fopen(achFileName.c_str(), "w"); - CkAssert(fp != NULL); - - int num_migratables = numobjs; - for(int i = 0; i < numobjs; i++) { - if (!stats->objData[i].migratable) { - num_migratables--; - } - } - fprintf(fp, "%d %d 0\n", num_migratables, num_migratables); - - for(int i = 0; i < numobjs; i++) { - if(!stats->objData[i].migratable) continue; - - LDObjData &odata = stats->objData[i]; - TaggedVector3D* udata = - (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index)); - fprintf(fp, "%g %g %g %g 0.0 0.0 0.0 %d %d\n", - stats->objData[i].wallTime, - udata->vec.x, - 
udata->vec.y, - udata->vec.z, - stats->to_proc[i], - udata->tp); - } - fclose(fp); - } + write_LB_particles(stats, achFileName.c_str(), false); + } } /// @brief Class for sorting lightly loaded Pes. diff --git a/Orb3dLBCommon.h b/Orb3dLBCommon.h index 852ab011..fc68d6f1 100644 --- a/Orb3dLBCommon.h +++ b/Orb3dLBCommon.h @@ -16,6 +16,9 @@ #define ORB3DLB_NOTOPO_DEBUG(X) // #define ORB3DLB_NOTOPO_DEBUG(X) CkPrintf X +void Orb_PrintLBStats(BaseLB::LDStats *stats, int numobjs); +void write_LB_particles(BaseLB::LDStats* stats, const char *achFileName, bool bFrom); + /// @brief Hold information about Pe load and number of objects. class PeInfo { public: @@ -358,9 +361,7 @@ class Orb3dCommon{ int *to_procs = Refiner::AllocProcs(stats->nprocs(), stats); #endif - int migr = 0; for(int i = 0; i < numobjs; i++){ - if(stats->to_proc[i] != stats->from_proc[i]) migr++; #ifdef DO_REFINE int pe = stats->to_proc[i]; from_procs[i] = pe; @@ -380,150 +381,7 @@ class Orb3dCommon{ } #endif - double *predLoad = new double[stats->nprocs()]; - double *predCount = new double[stats->nprocs()]; - for(int i = 0; i < stats->nprocs(); i++){ - predLoad[i] = 0.0; - predCount[i] = 0.0; - } - - double maxObjLoad = 0.0; - - for(int i = 0; i < numobjs; i++){ - double ld = stats->objData[i].wallTime; - int proc = stats->to_proc[i]; - predLoad[proc] += ld; - predCount[proc] += 1.0; - if(ld > maxObjLoad) - maxObjLoad = ld; - } - - double minWall = 0.0; - double maxWall = 0.0; - double avgWall = 0.0; - - double minIdle = 0.0; - double maxIdle = 0.0; - double avgIdle = 0.0; - - double minBg = 0.0; - double maxBg = 0.0; - double avgBg = 0.0; - - double avgPred = 0.0; - double minPred = 0.0; - double maxPred = 0.0; - - double avgPiece = 0.0; - double minPiece = 0.0; - double maxPiece = 0.0; - - CkPrintf("***************************\n"); - // CkPrintf("Before LB step %d\n", step()); - for(int i = 0; i < stats->nprocs(); i++){ - double wallTime = stats->procs[i].total_walltime; - double idleTime = 
stats->procs[i].idletime; - double bgTime = stats->procs[i].bg_walltime; - double pred = predLoad[i]; - double npiece = predCount[i]; - /* - CkPrintf("[pestats] %d %d %f %f %f %f\n", - i, - stats->procs[i].pe, - wallTime, - idleTime, - bgTime, - objTime); - */ - - avgWall += wallTime; - avgIdle += idleTime; - avgBg += bgTime; - avgPred += pred; - avgPiece += npiece; - - if(i==0 || minWall > wallTime) minWall = wallTime; - if(i==0 || maxWall < wallTime) maxWall = wallTime; - - if(i==0 || minIdle > idleTime) minIdle = idleTime; - if(i==0 || maxIdle < idleTime) maxIdle = idleTime; - - if(i==0 || minBg > bgTime) minBg = bgTime; - if(i==0 || maxBg < bgTime) maxBg = bgTime; - - if(i==0 || minPred > pred) minPred = pred; - if(i==0 || maxPred < pred) maxPred = pred; - - if(i==0 || minPiece > npiece) minPiece = npiece; - if(i==0 || maxPiece < npiece) maxPiece = npiece; - - } - - avgWall /= stats->nprocs(); - avgIdle /= stats->nprocs(); - avgBg /= stats->nprocs(); - avgPred /= stats->nprocs(); - avgPiece /= stats->nprocs(); - -#ifdef PRINT_LOAD_PERCENTILES - double accumVar = 0; - vector objectWallTimes; - for(int i = 0; i < stats->nprocs(); i++){ - double wallTime = stats->procs[i].total_walltime; - objectWallTimes.push_back(wallTime); - accumVar += (wallTime - avgWall) * (wallTime - avgWall); - } - double stdDev = sqrt(accumVar / stats->nprocs()); - CkPrintf("Average load: %.3f\n", avgWall); - CkPrintf("Standard deviation: %.3f\n", stdDev); - - std::sort(objectWallTimes.begin(), objectWallTimes.end()); - CkPrintf("Object load percentiles: \n"); - double increment = (double) objectWallTimes.size() / 10; - int j = 0; - double index = 0; - for (int j = 0; j < 100; j += 10) { - index += increment; - CkPrintf("%d: %.3f\n", j, objectWallTimes[(int) index]); - } - CkPrintf("100: %.3f\n", objectWallTimes.back()); -#endif - - delete[] predLoad; - delete[] predCount; - -#if 0 - float minload, maxload, avgload; - minload = maxload = procload[0]; - avgload = 0.0; - for(int i = 0; i < 
stats->nprocs(); i++){ - CkPrintf("pe %d load %f box %f %f %f %f %f %f\n", i, procload[i], - procbox[i].lesser_corner.x, - procbox[i].lesser_corner.y, - procbox[i].lesser_corner.z, - procbox[i].greater_corner.x, - procbox[i].greater_corner.y, - procbox[i].greater_corner.z - ); - avgload += procload[i]; - if(minload > procload[i]) minload = procload[i]; - if(maxload < procload[i]) maxload = procload[i]; - } - - avgload /= stats->nprocs(); - - CkPrintf("Orb3dLB_notopo stats: min %f max %f avg %f max/avg %f\n", minload, maxload, avgload, maxload/avgload); -#endif - - - CkPrintf("Orb3dLB_notopo stats: maxObjLoad %f\n", maxObjLoad); - CkPrintf("Orb3dLB_notopo stats: minWall %f maxWall %f avgWall %f maxWall/avgWall %f\n", minWall, maxWall, avgWall, maxWall/avgWall); - CkPrintf("Orb3dLB_notopo stats: minIdle %f maxIdle %f avgIdle %f minIdle/avgIdle %f\n", minIdle, maxIdle, avgIdle, minIdle/avgIdle); - CkPrintf("Orb3dLB_notopo stats: minPred %f maxPred %f avgPred %f maxPred/avgPred %f\n", minPred, maxPred, avgPred, maxPred/avgPred); - CkPrintf("Orb3dLB_notopo stats: minPiece %f maxPiece %f avgPiece %f maxPiece/avgPiece %f\n", minPiece, maxPiece, avgPiece, maxPiece/avgPiece); - - CkPrintf("Orb3dLB_notopo stats: minBg %f maxBg %f avgBg %f maxBg/avgBg %f\n", minBg, maxBg, avgBg, maxBg/avgBg); - CkPrintf("Orb3dLB_notopo stats: orb migrated %d refine migrated %d objects\n", migr, numRefineMigrated); + Orb_PrintLBStats(stats, numobjs); #ifdef DO_REFINE // Free the refine buffers diff --git a/Orb3dLB_notopo.cpp b/Orb3dLB_notopo.cpp index e13953c6..fcdc3f6a 100644 --- a/Orb3dLB_notopo.cpp +++ b/Orb3dLB_notopo.cpp @@ -125,33 +125,10 @@ void Orb3dLB_notopo::work(BaseLB::LDStats* stats) CkMyPe(), mcount, gstarttime, CkWallTimer() - gstarttime); if(_lb_args.debug() >= 2) { - // Write out "particle file" of load balance information - auto achFileName = make_formatted_string("lb.%d.sim", step()); - FILE *fp = fopen(achFileName.c_str(), "w"); - CkAssert(fp != NULL); - - int 
num_migratables = numobjs; - for(int i = 0; i < numobjs; i++) { - if (!stats->objData[i].migratable) { - num_migratables--; - } + // Write out "particle file" of load balance information + auto achFileName = make_formatted_string("lb.%d.sim", step()); + write_LB_particles(stats, achFileName.c_str(), false); } - fprintf(fp, "%d %d 0\n", num_migratables, num_migratables); - - for(int i = 0; i < numobjs; i++) { - if (!stats->objData[i].migratable) continue; - - CkAssert(tps[i].lbindex < numobjs); - CkAssert(tps[i].lbindex >= 0); - fprintf(fp, "%g %g %g %g 0.0 0.0 0.0 %d 0.0\n", - stats->objData[tps[i].lbindex].wallTime, - tps[i].centroid.x, - tps[i].centroid.y, - tps[i].centroid.z, - stats->to_proc[tps[i].lbindex]); - } - fclose(fp); - } if(doSimulateLB){ CkExit(); diff --git a/ParallelGravity.h b/ParallelGravity.h index f014a602..c1dcd3a9 100644 --- a/ParallelGravity.h +++ b/ParallelGravity.h @@ -74,6 +74,7 @@ enum LBStrategy{ MultistepNode_notopo, Orb3d_notopo, MultistepOrb, + Multistep_SFC, HierarchOrb }; PUPbytes(LBStrategy); diff --git a/TreePiece.cpp b/TreePiece.cpp index cc8ee43b..4d2b8130 100644 --- a/TreePiece.cpp +++ b/TreePiece.cpp @@ -14,6 +14,7 @@ #include "Reductions.h" // jetley #include "MultistepLB.h" +#include "MultistepLB_SFC.h" #include "MultistepLB_notopo.h" #include "MultistepNodeLB_notopo.h" #include "Orb3dLB.h" @@ -6525,6 +6526,7 @@ void TreePiece::balanceBeforeInitialForces(const CkCallback &cb){ string msname("MultistepLB"); string orb3dname("Orb3dLB"); + string ms_sfcname("MultistepLB_SFC"); string ms_notoponame("MultistepLB_notopo"); string msnode_notoponame("MultistepNodeLB_notopo"); string orb3d_notoponame("Orb3dLB_notopo"); @@ -6543,6 +6545,10 @@ void TreePiece::balanceBeforeInitialForces(const CkCallback &cb){ foundLB = Orb3d; break; } + else if(ms_sfcname == string(lbs[i]->lbName())){ + foundLB = Multistep_SFC; + break; + } else if(ms_notoponame == string(lbs[i]->lbName())){ foundLB = Multistep_notopo; break;