N-BodyShop · trquinn · Jul 26, 2024 · Jun 30, 2024 · Jul 10, 2024 · Jul 12, 2024
diff --git a/Makefile.in b/Makefile.in
@@ -111,7 +111,7 @@ cache_lib_path := @CHARM_PATH@/tmp/libs/ck-libs/cache
 threadsafe_ht_path := $(cache_lib_path)/threadsafe_hashtable
 
 # ------- Modules to build ----------------------------------------------------
-changa_modules := $(strip MultistepLB MultistepLB_notopo \
+changa_modules := $(strip MultistepLB MultistepLB_SFC MultistepLB_notopo \
  MultistepNodeLB_notopo Orb3dLB Orb3dLB_notopo HierarchOrbLB)
 
 charm_modules := $(strip CkCache CkIO CkMulticast RefineLB \

diff --git a/MultistepLB_SFC.ci b/MultistepLB_SFC.ci
@@ -0,0 +1,10 @@
+// Charm++ interface declarations for the MultistepLB_SFC load balancer.
+module MultistepLB_SFC {
+
+// The group declared below derives from CentralLB, which lives in this
+// external module.
+extern module CentralLB;
+// lbinit is declared as an initnode routine of this module.
+initnode void lbinit(void);
+
+// Migratable group; the only entry method declared is its constructor,
+// which takes the load balancer options.
+group [migratable] MultistepLB_SFC : CentralLB {
+ entry void MultistepLB_SFC(const CkLBOptions &);
+};
+
+};
diff --git a/MultistepLB_SFC.cpp b/MultistepLB_SFC.cpp
@@ -0,0 +1,285 @@
+#include <charm++.h>
+#include "MultistepLB_SFC.h"
+#include "ParallelGravity.h"
+#include "Vector3D.h"
+#include "formatted_string.h"
+
+CkpvExtern(int, _lb_obj_index);
+using namespace std;
+
+// Registration of this balancer with the Charm++ load balancing
+// framework.  The mechanism used depends on CHARM_VERSION.
+#if CHARM_VERSION > 61002
+/// @brief Register MultistepLB_SFC under the name "MultistepLB_SFC"
+/// together with a one-line description.
+static void lbinit()
+{
+ LBRegisterBalancer<MultistepLB_SFC>("MultistepLB_SFC",
+ "Works best with multistepped runs; uses SFC distribution");
+}
+#else
+// Older Charm++ versions: the CreateLBFunc_Def macro is handed the class
+// name and the same description.
+// NOTE(review): presumably the macro expands to the lbinit() definition
+// that the .ci file declares -- confirm against the Charm++ headers.
+CreateLBFunc_Def(MultistepLB_SFC,
+ "Works best with multistepped runs; uses SFC distribution");
+#endif
+
+/// @brief Common initialisation shared by both constructors: set the
+/// balancer name and make sure a per-object user-data slot sized for a
+/// TaggedVector3D has been registered.
+void MultistepLB_SFC::init()
+{
+    lbname = "MultistepLB_SFC";
+
+    // An index of -1 is treated as "no slot registered yet"; register
+    // the slot only in that case.
+    const bool bSlotRegistered = (CkpvAccess(_lb_obj_index) != -1);
+    if (!bSlotRegistered) {
+        CkpvAccess(_lb_obj_index) =
+            LBRegisterObjUserData(sizeof(TaggedVector3D));
+    }
+}
+
+
+/// @brief Construct the balancer and announce its creation on processor 0.
+/// @param opt Load balancer options, forwarded to the base class.
+MultistepLB_SFC::MultistepLB_SFC(const CkLBOptions &opt)
+    : CBase_MultistepLB_SFC(opt)
+{
+    init();
+
+    // Only processor 0 reports, so the message appears once.
+    const int iMyPe = CkMyPe();
+    if (iMyPe != 0)
+        return;
+    CkPrintf("[%d] MultistepLB_SFC created\n", iMyPe);
+}
+
+/// @brief Report the step on processor 0 and always ask for a balance.
+/// @param step Step number; used only in the log message.
+/// @return Always true.
+bool MultistepLB_SFC::QueryBalanceNow(int step)
+{
+    const bool bReporter = (CkMyPe() == 0);
+    if (bReporter) {
+        CkPrintf("LB_SFC: Step %d\n", step);
+    }
+    return true;
+}
+
+/// @brief Implement load balancing: store loads and determine active
+/// processors and objects, sort by SFC, then divide up among processors.
+///
+/// Objects without particles are always excluded from migration.  When
+/// the number of active objects is below 10% of the number of inactive
+/// ones, the inactive objects are excluded as well, so that only the
+/// active objects are redistributed by work2().
+/// @param stats The Load Balancer statistics object.  Its to_proc table,
+/// the migratable flags and n_migrateobjs are modified.
+void MultistepLB_SFC::work(BaseLB::LDStats* stats)
+{
+#if CMK_LBDB_ON
+    // Keep the object count as an int (as work2() does) so that it has
+    // the same signedness as the loop indices below.
+    const int num_objs = static_cast<int>(stats->objData.size());
+
+    if(_lb_args.debug() >= 2 && step() > 0) {
+        // Write out "particle file" of measured load balance information
+        auto achFileName = make_formatted_string("lb_a.%d.sim", step()-1);
+        write_LB_particles(stats, achFileName.c_str(), true);
+    }
+
+    int numActiveObjects = 0;
+    int numInactiveObjects = 0;
+    int minActiveProc = INT_MAX;
+    int maxActiveProc = 0;
+
+    // By default every object stays on the processor it came from.
+    for(int i = 0; i < num_objs; i++){
+        stats->to_proc[i] = stats->from_proc[i];
+    }
+
+    // First pass: drop empty pieces, count active/inactive pieces and
+    // record the range of processors that hold active pieces.
+    for(int i = 0; i < num_objs; i++){
+        LDObjData &odata = stats->objData[i];
+        if (!odata.migratable) continue;
+
+        TaggedVector3D* udata =
+            (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index));
+
+        if(udata->myNumParticles == 0){ // ignore pieces with no particles
+            odata.migratable = 0;
+            stats->n_migrateobjs--;
+            continue;
+        }
+        if(udata->numActiveParticles == 0){
+            numInactiveObjects++;
+        }
+        else{
+            numActiveObjects++;
+            const int iFromProc = stats->from_proc[i];
+            if(minActiveProc > iFromProc)
+                minActiveProc = iFromProc;
+            if(maxActiveProc < iFromProc)
+                maxActiveProc = iFromProc;
+        }
+    }
+    CkPrintf("numActiveObjects: %d, numInactiveObjects: %d\n", numActiveObjects,
+             numInactiveObjects);
+    CkPrintf("active PROC range: %d to %d\n", minActiveProc, maxActiveProc);
+    if(numActiveObjects < 0.1*numInactiveObjects) {
+        // only a small number of active objects, only migrate them
+        for(int i = 0; i < num_objs; i++){
+            LDObjData &odata = stats->objData[i];
+            if (!odata.migratable) continue;
+
+            TaggedVector3D* udata =
+                (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index));
+            if(udata->numActiveParticles == 0) {
+                odata.migratable = 0;
+                stats->n_migrateobjs--;
+            }
+        }
+    }
+    else {
+        CkPrintf("Migrating all: numActiveObjects: %d, numInactiveObjects: %d\n",
+                 numActiveObjects, numInactiveObjects);
+    }
+
+    // let the strategy take over on this modified instrumented data and processor information
+    work2(stats);
+#endif //CMK_LBDB_ON
+}
+
+/// @brief SFC load balance.
+///
+/// Builds one SFCObject per migratable object (load plus centroid),
+/// accumulates the total load in dTotalLoad, then sorts the objects
+/// along the space filling curve (sfcPrepare) and distributes them
+/// over the processors (sfcPartition).
+/// @param stats Load balancing data; stats->to_proc receives the new
+/// assignment.
+void MultistepLB_SFC::work2(BaseLB::LDStats *stats){
+    const int numobjs = stats->objData.size();
+    const int nmig = stats->n_migrateobjs;
+
+    // this data structure is used by the SFC strategy
+    // to balance objects. it is NOT indexed by tree piece index
+    // there are as many entries in it as there are
+    // migratable (active) tree pieces
+    //
+    // Entries are appended rather than written into a pre-sized array,
+    // so a migratable count that disagrees with n_migrateobjs is caught
+    // by the assertion below instead of writing past the end.
+    vector<SFCObject> tp_array;
+    tp_array.reserve(nmig > 0 ? nmig : 0);
+
+    if (_lb_args.debug()>=2) {
+        CkPrintf("[work2] ready tp_array data structure\n");
+    }
+
+    // Background load, spread evenly over all objects.
+    double dBgLoad = 0.0;
+    for(int i = 0; i < stats->nprocs(); i++){
+        dBgLoad += stats->procs[i].bg_walltime;
+    }
+    // Avoid 0/0 (NaN) when there are no objects at all.
+    dBgLoad = (numobjs > 0) ? dBgLoad/numobjs : 0.0;
+
+    dTotalLoad = 0.0;
+    for(int i = 0; i < numobjs; i++){
+        LDObjData &odata = stats->objData[i];
+        if(!odata.migratable) continue;
+
+        TaggedVector3D* udata =
+            (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index));
+
+        // Kept in double: SFCObject::load and dTotalLoad are doubles, so
+        // there is no reason to round the measurement through a float.
+        double load;
+        if(step() == 0){ // no load information, balance by particle numbers
+            load = udata->myNumParticles;
+        }
+        else{
+            // give each piece a portion of the background load
+            load = odata.wallTime + dBgLoad;
+        }
+
+        SFCObject oPiece(i, load);
+        oPiece.centroid = udata->vec;
+        tp_array.push_back(oPiece);
+        dTotalLoad += load;
+    }
+
+    if(verbosity > 0) {
+        const double dAvgActive = (numobjs > 0) ? dTotalLoad/numobjs : 0.0;
+        CkPrintf("Avg active load %g; Avg bg load %g\n", dAvgActive,
+                 dBgLoad);
+    }
+
+    CkAssert(static_cast<int>(tp_array.size()) == nmig);
+
+    sfcPrepare(tp_array, nmig, stats);
+    sfcPartition(stats->nprocs(),tp_array, stats);
+
+    // refine(stats, numobjs);
+    Orb_PrintLBStats(stats, numobjs);
+
+    if(_lb_args.debug() >= 2) {
+        // Write out "particle file" of load balance information
+        auto achFileName = make_formatted_string("lb.%d.sim", step());
+        write_LB_particles(stats, achFileName.c_str(), false);
+    }
+}
+
+/// @brief Prepare structures for the SFC partition.
+///
+/// Sets the per-processor piece limit (maxPieceProc), records pointers
+/// to the to_proc/from_proc tables of @p stats, computes a cubical
+/// bounding box around all centroids, assigns each object its space
+/// filling curve key within that box and sorts the objects by key.
+/// @param tp_array Reference to Vector of Objects representing TreePieces.
+/// On return it is sorted by SFC key.
+/// @param nObjs Number of tree pieces to partition; must equal
+/// tp_array.size().
+/// @param stats Data from the load balancing framework.
+/// @param node_partition Are we partitioning on nodes.
+void MultistepLB_SFC::sfcPrepare(vector<SFCObject> &tp_array,
+                                 int nObjs,
+                                 BaseLB::LDStats *stats,
+                                 bool node_partition){
+
+    OrientedBox<float> boundingBox;
+    int nmig = stats->n_migrateobjs;
+    if(dMaxBalance < 1.0)
+        dMaxBalance = 1.0;
+
+    // If using node based partition, then the maxPieceProc is total
+    // migratable objs / total number of node.
+    if (node_partition) {
+        maxPieceProc = dMaxBalance * nmig / CkNumNodes();
+    } else {
+        maxPieceProc = dMaxBalance*nmig/stats->nprocs();
+    }
+
+    if(maxPieceProc < 1.0)
+        maxPieceProc = 1.01;
+
+    CkAssert(nObjs >= 0 && tp_array.size() == static_cast<size_t>(nObjs));
+
+    mapping = &stats->to_proc;
+    from = &stats->from_proc;
+
+    CkPrintf("[LB_SFC] sorting\n");
+    // Iterate over the vector itself rather than up to nObjs: the sort
+    // below covers the whole vector, so every element needs a key, and
+    // no element beyond the end may be touched.
+    for(const SFCObject &oPiece : tp_array)
+        boundingBox.grow(oPiece.centroid);
+
+    // N.B. code below from TreePiece::assignKeys().
+    // Refactoring is a possibility.
+    // get longest axis
+    Vector3D<float> bsize = boundingBox.size();
+    float fMaxExtent = (bsize.x > bsize.y) ? bsize.x : bsize.y;
+    fMaxExtent = (fMaxExtent > bsize.z) ? fMaxExtent : bsize.z;
+    //
+    // Make the bounding box cubical.
+    //
+    Vector3D<float> bcenter = boundingBox.center();
+    // The magic number below is approximately 2^(-19)
+    const float fEps = 1.0 + 1.91e-6;  // slop to ensure keys fall
+                                       // between 0 and 1.
+    bsize = Vector3D<float>(fEps*0.5*fMaxExtent);
+    boundingBox = OrientedBox<float>(bcenter-bsize, bcenter+bsize);
+    if(verbosity > 1)
+        ckout << "TreePiece: Bounding box now: " << boundingBox << endl;
+
+    for(SFCObject &oPiece : tp_array) {
+        oPiece.key = SFC::generateKey(oPiece.centroid, boundingBox);
+    }
+    sort(tp_array.begin(),tp_array.end());
+}
+
+/// @brief Partition treepieces among processors by
+/// dividing the SFC as evenly as possible.
+///
+/// Processors are visited in increasing order; each available one gets
+/// at least one piece and then keeps taking the next piece along the
+/// curve while that brings the cumulative load closer to the
+/// processor's cumulative target, up to maxPieceProc pieces.
+/// Requires sfcPrepare() to have been called first: it sets the
+/// `mapping` table written here and sorts @p tp by SFC key.
+/// @param nProcs Number of processors over which to partition the
+/// pieces. N.B. if node_partition is true, then this is the number of nodes.
+/// @param tp Vector of TreePiece data.  It is only read.
+/// @param stats Load balance data
+/// @param node_partition Not used by this implementation.
+void MultistepLB_SFC::sfcPartition(int nProcs, vector<SFCObject> & tp,
+                                   BaseLB::LDStats *stats,
+                                   bool node_partition){
+
+    double loadPrev = 0.0;      // load on all lower processors
+    int iCurrPiece = 0;         // Piece under consideration
+    const int nPieces = tp.size();
+    for (int iProc = 0; iProc < nProcs && iCurrPiece < nPieces; iProc++) {
+        if (!stats->procs[iProc].available)
+            continue;
+        // always assign one piece to a processor
+        const SFCObject &oFirst = tp[iCurrPiece];
+        (*mapping)[oFirst.lbindex] = iProc;
+        double loadCurr = oFirst.load;
+        iCurrPiece++;
+        int nCurrPiece = 1;     // number of pieces on this processor
+        const double loadTarget = (iProc+1)*dTotalLoad/nProcs;
+        double dLoadError = fabs(loadPrev + loadCurr - loadTarget);
+
+        // add pieces to this processor to get the closest to the
+        // target load.  The bounds check comes first so that
+        // tp[iCurrPiece] is never read past the last piece.
+        while (iCurrPiece < nPieces && nCurrPiece < maxPieceProc) {
+            // A fresh reference per piece: assigning through a
+            // reference that is already bound would overwrite the
+            // element of tp it refers to.
+            const SFCObject &oNext = tp[iCurrPiece];
+            if (!(fabs(oNext.load + loadPrev + loadCurr - loadTarget)
+                  < dLoadError))
+                break;
+            loadCurr += oNext.load;
+            (*mapping)[oNext.lbindex] = iProc;
+            dLoadError = fabs(loadPrev + loadCurr - loadTarget);
+            iCurrPiece++;
+            nCurrPiece++;
+        }
+        loadPrev += loadCurr;
+    }
+    // NOTE(review): pieces can remain unassigned here (e.g. the last
+    // processors unavailable, or the maxPieceProc limit reached on the
+    // last processor); the assertion is the only check for that.
+    CkAssert(iCurrPiece == nPieces);
+}
+
+/// @brief Pack/unpack the balancer.  Only the base class part is
+/// handled; no member of this class is packed here.
+/// @param p The PUP::er to pack into or unpack from.
+void MultistepLB_SFC::pup(PUP::er &p)
+{
+    this->CBase_MultistepLB_SFC::pup(p);
+}
+
+#include "MultistepLB_SFC.def.h"
diff --git a/MultistepLB_SFC.h b/MultistepLB_SFC.h
@@ -0,0 +1,74 @@
+#ifndef _MULTISTEPLB_SFC_H_
+#define _MULTISTEPLB_SFC_H_
+
+#include "MultistepLB_SFC.decl.h"
+#include "Vector3D.h"
+#include "cosmoType.h"
+#include "SFC.h"
+#include "CentralLB.h"
+
+/// Declared here, defined elsewhere.  Called by MultistepLB_SFC::work2()
+/// after the partition with the total number of objects.
+/// NOTE(review): presumably prints a summary of the resulting balance --
+/// confirm against the definition.
+void Orb_PrintLBStats(BaseLB::LDStats *stats, int numobjs);
+/// Declared here, defined elsewhere.  Used by MultistepLB_SFC to write a
+/// "particle file" of load balance information to achFileName.
+/// NOTE(review): bFrom is passed as true for the measured (pre-balance)
+/// file and false for the post-balance file; presumably it selects
+/// from_proc vs. to_proc -- confirm against the definition.
+void write_LB_particles(BaseLB::LDStats* stats, const char *achFileName, bool bFrom);
+
+/// @brief Multistep load balancer using Space Filling Curve
+///
+/// This balancer recognizes different "phases" (called rungs in other
+/// parts of the code), and uses loads based on measurements of the
+/// previous calculation at the same phase. For large phases (i.e.,
+/// when many particles are active), the TreePieces are divided among
+/// the processors using a Space Filling Curve based on the centroids
+/// of the TreePieces.
+///
+
+class MultistepLB_SFC : public CBase_MultistepLB_SFC {
+private:
+    /// Set the balancer name and register the per-object user data slot.
+    void init();
+    /// Always returns true; processor 0 logs the step.
+    bool QueryBalanceNow(int step);
+
+    // The members below are scratch state for one balancing pass.  They
+    // are given defined initial values so that nothing is indeterminate
+    // before sfcPrepare()/work2() have run.
+
+    /// Points at stats->to_proc; set by sfcPrepare(), written by
+    /// sfcPartition().
+    decltype(BaseLB::LDStats::to_proc) *mapping = nullptr;
+    /// Points at stats->from_proc; set by sfcPrepare().
+    decltype(BaseLB::LDStats::from_proc) *from = nullptr;
+    /// total computational cost to be balanced
+    double dTotalLoad = 0.0;
+    /// Maximum number of pieces per processor
+    double maxPieceProc = 0.0;
+
+public:
+    MultistepLB_SFC(const CkLBOptions &);
+    /// Migration constructor.
+    MultistepLB_SFC(CkMigrateMessage *m) : CBase_MultistepLB_SFC(m) {
+        init();
+    }
+
+    /// One migratable object as seen by the SFC strategy.
+    class SFCObject
+    {
+    public:
+        /// index into LB stats->objData
+        int lbindex;
+        /// Spatial location of TreePiece
+        Vector3D<cosmoType> centroid;
+        /// Space filling curve key; value-initialized here and assigned
+        /// by sfcPrepare().
+        SFC::Key key;
+        /// computational cost of this object
+        double load;
+
+        SFCObject() : lbindex(-1), key(), load(0) {}
+        SFCObject(int _lbindex, double _load) :
+            lbindex(_lbindex),
+            key(),
+            load(_load)
+        {
+        }
+        /// Order objects along the space filling curve.
+        bool operator<(const SFCObject &o) const{
+            return key < o.key;
+        }
+    };
+
+    /// Entry point of the strategy: select the objects to migrate, then
+    /// call work2().
+    void work(BaseLB::LDStats* stats);
+    /// Build the SFCObject list, then sort and partition it.
+    void work2(BaseLB::LDStats* stats);
+    /// Compute SFC keys for the objects and sort them by key.
+    void sfcPrepare(std::vector<SFCObject> &tp_array,
+                    int nObjs, BaseLB::LDStats * stats,
+                    bool node_partition=false);
+    /// Assign the sorted objects to processors.
+    void sfcPartition(int nProcs, std::vector<SFCObject> & tp,
+                      BaseLB::LDStats *stats, bool node_partition=false);
+    void pup(PUP::er &p);
+};
+
+
+#endif /* _MULTISTEPLB_SFC_H_ */