Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SFC based Multistep load balancer. #175

Merged
merged 5 commits into from
Jul 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ cache_lib_path := @CHARM_PATH@/tmp/libs/ck-libs/cache
threadsafe_ht_path := $(cache_lib_path)/threadsafe_hashtable

# ------- Modules to build ----------------------------------------------------
changa_modules := $(strip MultistepLB MultistepLB_notopo \
changa_modules := $(strip MultistepLB MultistepLB_SFC MultistepLB_notopo \
MultistepNodeLB_notopo Orb3dLB Orb3dLB_notopo HierarchOrbLB)

charm_modules := $(strip CkCache CkIO CkMulticast RefineLB \
Expand Down
10 changes: 10 additions & 0 deletions MultistepLB_SFC.ci
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Charm++ interface declarations for the MultistepLB_SFC load balancer.
module MultistepLB_SFC {

// The group declared below derives from CentralLB, which lives in this module.
extern module CentralLB;
// Declared as an initnode routine; lbinit() in MultistepLB_SFC.cpp registers
// the balancer under the name "MultistepLB_SFC".
initnode void lbinit(void);

// Migratable group; its only entry method is the constructor taking the
// load balancer options.
group [migratable] MultistepLB_SFC : CentralLB {
entry void MultistepLB_SFC(const CkLBOptions &);
};

};
285 changes: 285 additions & 0 deletions MultistepLB_SFC.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
#include <charm++.h>
#include "MultistepLB_SFC.h"
#include "ParallelGravity.h"
#include "Vector3D.h"
#include "formatted_string.h"

// Index of the per-object user-data slot. Only declared here; it is read and
// written through CkpvAccess() in init(), work() and work2().
CkpvExtern(int, _lb_obj_index);
using namespace std;

// Make the balancer known to the load balancing framework under the name
// "MultistepLB_SFC". Which mechanism is compiled depends on CHARM_VERSION.
#if CHARM_VERSION > 61002
/// @brief Register MultistepLB_SFC together with its one-line help text.
static void lbinit()
{
LBRegisterBalancer<MultistepLB_SFC>("MultistepLB_SFC",
"Works best with multistepped runs; uses SFC distribution");
}
#else
// NOTE(review): presumably CreateLBFunc_Def generates the equivalent
// registration code (including lbinit) for older Charm++ releases -- confirm
// against the Charm++ headers.
CreateLBFunc_Def(MultistepLB_SFC,
"Works best with multistepped runs; uses SFC distribution");
#endif

/// @brief Common setup shared by both constructors: set the balancer name
/// and make sure the per-object user-data slot has been registered.
void MultistepLB_SFC::init() {
    lbname = "MultistepLB_SFC";

    // The slot is registered only while the index still holds -1.
    if (CkpvAccess(_lb_obj_index) != -1) {
        return;
    }
    CkpvAccess(_lb_obj_index) = LBRegisterObjUserData(sizeof(TaggedVector3D));
}


/// @brief Construct the balancer and announce its creation on PE 0.
/// @param opt Load balancer options, forwarded to the generated base class.
MultistepLB_SFC::MultistepLB_SFC(const CkLBOptions &opt)
    : CBase_MultistepLB_SFC(opt)
{
    init();

    // Only PE 0 reports, so the message appears once.
    const int myPe = CkMyPe();
    if (myPe != 0) {
        return;
    }
    CkPrintf("[%d] MultistepLB_SFC created\n", myPe);
}

/// @brief Report whether load balancing should run now; this balancer
/// always answers yes.
/// @param step Step number, echoed to the log on PE 0.
/// @return Always true.
bool MultistepLB_SFC::QueryBalanceNow(int step){
    const bool onRootPe = (CkMyPe() == 0);
    if (onRootPe) {
        CkPrintf("LB_SFC: Step %d\n", step);
    }
    return true;
}

/// @brief Load balancing entry point: decide which objects are allowed to
/// migrate, then hand the filtered statistics to work2() for the SFC split.
///
/// Every object starts out mapped to the processor it currently lives on.
/// Migratable objects that hold no particles are frozen in place.  If the
/// objects with active particles number fewer than a tenth of the inactive
/// ones, the inactive objects are frozen as well so that only active objects
/// are redistributed.
/// @param stats The Load Balancer statistics object; its to_proc,
/// objData[].migratable and n_migrateobjs fields are updated here.
void MultistepLB_SFC::work(BaseLB::LDStats* stats)
{
#if CMK_LBDB_ON
    const auto objCount = stats->objData.size();

    // At debug level 2 and above, dump the measured loads of the previous step.
    if (_lb_args.debug() >= 2 && step() > 0) {
        auto measuredFile = make_formatted_string("lb_a.%d.sim", step()-1);
        write_LB_particles(stats, measuredFile.c_str(), true);
    }

    // Default assignment: nothing moves unless the partitioner says so.
    for (size_t idx = 0; idx < objCount; idx++) {
        stats->to_proc[idx] = stats->from_proc[idx];
    }

    int nActive = 0;
    int nInactive = 0;
    int lowestActivePe = INT_MAX;
    int highestActivePe = 0;

    // Classify the migratable objects and record the processor range that
    // currently hosts active ones.
    for (size_t idx = 0; idx < objCount; idx++) {
        LDObjData &obj = stats->objData[idx];
        if (!obj.migratable) {
            continue;
        }

        TaggedVector3D *info =
            (TaggedVector3D *)obj.getUserData(CkpvAccess(_lb_obj_index));

        if (info->myNumParticles == 0) {
            // pieces with no particles are left where they are
            obj.migratable = 0;
            stats->n_migrateobjs--;
            continue;
        }

        if (info->numActiveParticles != 0) {
            const int homePe = stats->from_proc[idx];
            nActive++;
            if (homePe < lowestActivePe) {
                lowestActivePe = homePe;
            }
            if (homePe > highestActivePe) {
                highestActivePe = homePe;
            }
        }
        else {
            nInactive++;
        }
    }

    CkPrintf("numActiveObjects: %d, numInactiveObjects: %d\n", nActive,
             nInactive);
    CkPrintf("active PROC range: %d to %d\n", lowestActivePe, highestActivePe);

    const bool fewActive = nActive < 0.1*nInactive;
    if (!fewActive) {
        CkPrintf("Migrating all: numActiveObjects: %d, numInactiveObjects: %d\n",
                 nActive, nInactive);
    }
    else {
        // Only a small number of active objects: freeze the inactive ones so
        // that only the active ones are migrated.
        for (size_t idx = 0; idx < objCount; idx++) {
            LDObjData &obj = stats->objData[idx];
            if (!obj.migratable) {
                continue;
            }

            TaggedVector3D *info =
                (TaggedVector3D *)obj.getUserData(CkpvAccess(_lb_obj_index));
            if (info->numActiveParticles == 0) {
                obj.migratable = 0;
                stats->n_migrateobjs--;
            }
        }
    }

    // The partitioner works on the statistics as modified above.
    work2(stats);
#endif // CMK_LBDB_ON
}

/// @brief SFC load balance: build one SFCObject per migratable object,
/// order them along the space filling curve and divide them among the
/// processors.
///
/// On step 0 there are no load measurements, so the particle count of each
/// piece is used as its load.  On later steps the load is the measured wall
/// time of the object plus an equal share of the total background time.
/// @param stats Data from the load balancing framework; stats->to_proc is
/// updated by the partition step.
void MultistepLB_SFC::work2(BaseLB::LDStats *stats){
    const int numobjs = stats->objData.size();
    const int nmig = stats->n_migrateobjs;

    // This array is used by the SFC strategy to balance objects.  It is NOT
    // indexed by tree piece index: it holds one entry per migratable
    // (active) tree piece.  Entries are appended rather than written into a
    // pre-sized array, so a stale n_migrateobjs can neither overrun the
    // buffer nor leave placeholder entries with lbindex == -1 behind.
    vector<SFCObject> tp_array;
    tp_array.reserve(nmig > 0 ? nmig : 0);

    if (_lb_args.debug()>=2) {
        CkPrintf("[work2] ready tp_array data structure\n");
    }

    // Average background load per object; zero when there are no objects,
    // which avoids dividing by zero.
    double dBgLoad = 0.0;
    for(int i = 0; i < stats->nprocs(); i++){
        dBgLoad += stats->procs[i].bg_walltime;
    }
    dBgLoad = (numobjs > 0) ? dBgLoad/numobjs : 0.0;

    dTotalLoad = 0.0;
    for(int i = 0; i < numobjs; i++){
        LDObjData &odata = stats->objData[i];
        if(!odata.migratable) continue;

        TaggedVector3D* udata =
            (TaggedVector3D *)odata.getUserData(CkpvAccess(_lb_obj_index));

        // Kept in double: SFCObject::load and dTotalLoad are double, so a
        // float temporary would only throw precision away.
        double load;
        if(step() == 0){ // no load information, balance by particle numbers
            load = udata->myNumParticles;
        }
        else{
            // give each piece a portion of the background load
            load = odata.wallTime + dBgLoad;
        }

        tp_array.emplace_back(i, load);
        tp_array.back().centroid = udata->vec;
        dTotalLoad += load;
    }
    const int numProcessed = tp_array.size();

    if(verbosity > 0)
        CkPrintf("Avg active load %g; Avg bg load %g\n",
                 (numobjs > 0) ? dTotalLoad/numobjs : 0.0,
                 dBgLoad);

    // The framework's migratable count must agree with what was collected.
    CkAssert(numProcessed==nmig);

    sfcPrepare(tp_array, numProcessed, stats);
    sfcPartition(stats->nprocs(),tp_array, stats);

    // refine(stats, numobjs);
    Orb_PrintLBStats(stats, numobjs);

    if(_lb_args.debug() >= 2) {
        // Write out "particle file" of load balance information
        auto achFileName = make_formatted_string("lb.%d.sim", step());
        write_LB_particles(stats, achFileName.c_str(), false);
    }
}

/// @brief Prepare structures for the SFC partition: set the per-processor
/// piece limit, remember the mapping arrays, and sort the pieces by their
/// space filling curve key.
///
/// Keys are generated from the piece centroids inside a cubical bounding box
/// that encloses all centroids.
/// @param tp_array Reference to Vector of Objects representing TreePieces;
/// sorted by key on return.
/// @param nObjs Number of tree pieces to partition; must equal
/// tp_array.size().
/// @param stats Data from the load balancing framework.
/// @param node_partition Are we partitioning on nodes.
void MultistepLB_SFC::sfcPrepare(vector<SFCObject> &tp_array,
                                 int nObjs,
                                 BaseLB::LDStats *stats,
                                 bool node_partition){

    const int nMigratable = stats->n_migrateobjs;

    // The balance factor is never allowed below 1.
    if (dMaxBalance < 1.0) {
        dMaxBalance = 1.0;
    }

    // Piece limit: balance factor times the average number of migratable
    // objects per node (node partition) or per processor.
    maxPieceProc = node_partition
        ? dMaxBalance * nMigratable / CkNumNodes()
        : dMaxBalance * nMigratable / stats->nprocs();
    if (maxPieceProc < 1.0) {
        maxPieceProc = 1.01;
    }

    CkAssert(tp_array.size() == nObjs);

    // Remembered for the partition step.
    mapping = &stats->to_proc;
    from = &stats->from_proc;

    CkPrintf("[LB_SFC] sorting\n");

    // Enclose every centroid.
    OrientedBox<float> boundingBox;
    for (int iObj = 0; iObj < nObjs; iObj++) {
        boundingBox.grow(tp_array[iObj].centroid);
    }

    // N.B. code below from TreePiece::assignKeys().
    // Refactoring is a possibility.
    // Find the longest axis of the box.
    Vector3D<float> bsize = boundingBox.size();
    float longest;
    if (bsize.x > bsize.y) {
        longest = bsize.x;
    } else {
        longest = bsize.y;
    }
    if (!(longest > bsize.z)) {
        longest = bsize.z;
    }

    // Make the bounding box cubical, centered where it was.
    Vector3D<float> bcenter = boundingBox.center();
    // The magic number below is approximately 2^(-19): slop to ensure keys
    // fall between 0 and 1.
    const float fEps = 1.0 + 1.91e-6;
    bsize = Vector3D<float>(fEps*0.5*longest);
    boundingBox = OrientedBox<float>(bcenter-bsize, bcenter+bsize);
    if (verbosity > 1) {
        ckout << "TreePiece: Bounding box now: " << boundingBox << endl;
    }

    // Key every piece, then order the pieces along the curve.
    for (int iObj = 0; iObj < nObjs; ++iObj) {
        SFCObject &piece = tp_array[iObj];
        piece.key = SFC::generateKey(piece.centroid, boundingBox);
    }
    sort(tp_array.begin(), tp_array.end());
}

/// @brief Partition treepieces among processors by
/// dividing the SFC as evenly as possible.
/// @param nprocs Number of processors over which to partition the
/// pieces. N.B. if node_partition is true, then this is the number of nodes.
/// @param tp Vector of TreePiece data.
/// @param stats Load balance data
void MultistepLB_SFC::sfcPartition(int nProcs, vector<SFCObject> & tp,
BaseLB::LDStats *stats,
bool node_partition){

double loadPrev = 0.0; // load on all lower processors
int iCurrPiece = 0; // Piece under consideration
const int nPieces = tp.size();
for (int iProc = 0; iProc < nProcs && iCurrPiece < nPieces; iProc++) {
if (!stats->procs[iProc].available)
continue;
// always assign one piece to a processor
SFCObject &oSFC = tp[iCurrPiece];
(*mapping)[oSFC.lbindex] = iProc;
double loadCurr = oSFC.load;
iCurrPiece++;
int nCurrPiece = 1; // number of pieces on this processor
double loadTarget = (iProc+1)*dTotalLoad/nProcs;
double dLoadError = fabs(loadPrev + loadCurr - loadTarget);

while ((nCurrPiece < maxPieceProc)
&& fabs(tp[iCurrPiece].load + loadPrev + loadCurr - loadTarget)
< dLoadError
&& iCurrPiece < nPieces) { // add pieces to this
// processor to get the
// closest to the target
// load
oSFC = tp[iCurrPiece];
loadCurr += oSFC.load;
(*mapping)[oSFC.lbindex] = iProc;
dLoadError = fabs(loadPrev + loadCurr - loadTarget);
iCurrPiece++;
nCurrPiece++;
}
loadPrev += loadCurr;
}
CkAssert(iCurrPiece == tp.size());
}

/// @brief Pack/unpack the balancer.  Only the base class state is
/// serialized; the members declared by this class are not.
/// @param p The PUP::er doing the packing or unpacking.
void MultistepLB_SFC::pup(PUP::er &p)
{
    this->CBase_MultistepLB_SFC::pup(p);
}

#include "MultistepLB_SFC.def.h"
74 changes: 74 additions & 0 deletions MultistepLB_SFC.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#ifndef _MULTISTEPLB_SFC_H_
#define _MULTISTEPLB_SFC_H_

#include "MultistepLB_SFC.decl.h"
#include "Vector3D.h"
#include "cosmoType.h"
#include "SFC.h"
#include "CentralLB.h"

/// Helpers declared here and defined elsewhere in the project.
/// Orb_PrintLBStats is called at the end of MultistepLB_SFC::work2() with the
/// total number of LB objects.
void Orb_PrintLBStats(BaseLB::LDStats *stats, int numobjs);
/// Writes a "particle file" of load balance information to achFileName.
/// work() passes bFrom = true for the measured data of the previous step and
/// work2() passes bFrom = false after partitioning.
/// NOTE(review): presumably bFrom selects from_proc rather than to_proc --
/// confirm against the definition.
void write_LB_particles(BaseLB::LDStats* stats, const char *achFileName, bool bFrom);

/// @brief Multistep load balancer using Space Filling Curve
///
/// This balancer recognizes different "phases" (called rungs in other
/// parts of the code), and uses loads based on measurements of the
/// previous calculation at the same phase. For large phases, (i.e.,
/// when many particles are active, the TreePieces are divided among
/// the processors using a Space Filling Curve based on the centroids
/// of the TreePieces.
///

class MultistepLB_SFC : public CBase_MultistepLB_SFC {
private:
void init();
bool QueryBalanceNow(int step);

decltype(BaseLB::LDStats::to_proc) *mapping;
decltype(BaseLB::LDStats::from_proc) *from;
/// total computational cost to be balanced
double dTotalLoad;
/// Maximum number of pieces per processor
double maxPieceProc;

public:
MultistepLB_SFC(const CkLBOptions &);
MultistepLB_SFC(CkMigrateMessage *m) : CBase_MultistepLB_SFC(m) {
init();
}

class SFCObject
{
public:
/// index into LB stats->objData
int lbindex;
/// Spacial location of TreePiece
Vector3D<cosmoType> centroid;
SFC::Key key;
/// computational cost of this object
double load;

SFCObject() : lbindex(-1), load(0) {}
SFCObject(int _lbindex, double _load) :
lbindex(_lbindex),
load(_load)
{
}
bool operator<(const SFCObject &o) const{
return key < o.key;
}
};

void work(BaseLB::LDStats* stats);
void work2(BaseLB::LDStats* stats);
void sfcPrepare(std::vector<SFCObject> &tp_array,
int nObjs, BaseLB::LDStats * stats,
bool node_partition=false);
void sfcPartition(int nProcs, std::vector<SFCObject> & tp,
BaseLB::LDStats *stats, bool node_partition=false);
void pup(PUP::er &p);
};


#endif /* _MULTISTEPLB_SFC_H_ */
Loading
Loading