forked from N-BodyShop/changa
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ParallelGravity.h
2109 lines (1837 loc) · 69.3 KB
/
ParallelGravity.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/** @file ParallelGravity.h
*/
#ifndef PARALLELGRAVITY_H
#define PARALLELGRAVITY_H
#include "config.h"
#include <algorithm>
#include <cmath>
#include <limits>
#include <map>
#include <string>
#include <vector>
#include "pup_stl.h"
#include "ckio.h"
#include "Vector3D.h"
#include "tree_xdr.h"
#include "TipsyFile.h"
#include "SFC.h"
#include "TreeNode.h"
#include "GenericTreeNode.h"
#include "Interval.h"
#include "parameters.h"
#include "param.h"
#include "dumpframe.h"
#include <liveViz.h>
#include "TaggedVector3D.h"
#include "codes.h"
#include "CacheInterface.h"
#ifdef SPCUDA
#include "EwaldCUDA.h"
#endif
#ifdef CUDA
#include "cuda_typedef.h"
#endif
#include "keytype.h"
PUPbytes(InDumpFrame);
PUPbytes(COOL);
PUPbytes(COOLPARAM);
#ifdef HPM_COUNTER
#include <libhpm.h>
#endif
#include <map>
#define MERGE_REMOTE_REQUESTS_VERBOSE /*CkPrintf*/
using namespace std;
using namespace Tree;
/// Load balancers that need the spatial information.
///
/// Enumerators take consecutive integer values starting at Null = 0.
/// The type is declared with PUPbytes() right after this definition.
enum LBStrategy{
Null=0,
Multistep,
Orb3d,
Multistep_notopo,
MultistepNode_notopo,
Orb3d_notopo,
MultistepOrb,
HierarchOrb
};
PUPbytes(LBStrategy);
#ifdef SELECTIVE_TRACING
/// States for selective tracing; only compiled when SELECTIVE_TRACING
/// is defined.
/// NOTE(review): presumably TraceNormal means iterations are being
/// traced and TraceSkip means they are being skipped -- confirm against
/// the code that updates Main::traceState.
enum TraceState {
TraceNormal = 0,
TraceSkip
};
#endif
/// Possible domain decomposition methods.
///
/// The numeric values are explicit because the enum is packed and
/// unpacked as a plain int (see operator|(PUP::er &, DomainsDec &)).
enum DomainsDec {
SFC_dec=0, ///< Space Filling Curve with Morton ordering
Oct_dec=1, ///< Oct tree
ORB_dec=2, ///< Bisect the longest axis, balancing particles
SFC_peano_dec=3, ///< SFC with Peano-Hilbert ordering
SFC_peano_dec_3D=4, ///< Joachim Stadel's implementation of P-H ordering
SFC_peano_dec_2D=5, ///< 2D version of Peano-Hilbert ordering
ORB_space_dec=6 ///< Bisect space
};
/// Directions for sending boundaries.
/// LEFT is 0 and RIGHT is 1.
enum NborDir {
LEFT = 0,
RIGHT
};
PUPbytes(NborDir);
/// tolerance for unequal pieces in SFC based decompositions.
const double ddTolerance = 0.1;
/// @brief Pack or unpack a DomainsDec by round-tripping it through an int.
///
/// @param p the PUP::er doing the (un)packing.
/// @param d the enum value; only written when @p p is unpacking.
inline void operator|(PUP::er &p,DomainsDec &d) {
    const bool bUnpacking = p.isUnpacking();
    // When not unpacking, the int carries the current enum value out;
    // when unpacking, its initial value is overwritten by the PUP::er.
    int iDecomp = bUnpacking ? 0 : (int)d;
    p | iDecomp;
    if (bUnpacking)
        d = (DomainsDec)iDecomp;
}
#include "GravityParticle.h"
class SmoothParams;
/// Class for new maxOrder broadcast.
///
/// Bundles three 64-bit maximum-order values so they can be sent
/// together; pup() serializes them in declaration order.
class NewMaxOrder
{
public:
int64_t nMaxOrderGas;
int64_t nMaxOrderDark;
int64_t nMaxOrder;
/// Pack/unpack all three fields.
void pup(PUP::er& p) {
p| nMaxOrderGas;
p| nMaxOrderDark;
p| nMaxOrder;
}
};
#include "InOutput.h"
#include "ParallelGravity.decl.h"
// Program-wide globals.  Every name in this run is declared extern:
// this header only announces them.
extern CProxy_Main mainChare;
extern int verbosity;
extern bool _cache;
extern int _nocache;
extern int _cacheLineDepth;
extern unsigned int _yieldPeriod;
extern DomainsDec domainDecomposition;
extern double dExtraStore;
extern double dMaxBalance;
extern double dFracLoadBalance;
extern double dGlassDamper;
extern int bUseCkLoopPar;
extern GenericTrees useTree;
extern CProxy_TreePiece treeProxy;
#ifdef REDUCTION_HELPER
extern CProxy_ReductionHelper reductionHelperProxy;
#endif
extern CProxy_LvArray lvProxy; // Proxy for the liveViz array
extern CProxy_LvArray smoothProxy; // Proxy for smooth reduction
extern CProxy_LvArray gravityProxy; // Proxy for gravity reduction
extern CProxy_TreePiece streamingProxy;
extern CProxy_DataManager dMProxy;
extern CProxy_IntraNodeLBManager nodeLBMgrProxy;
extern unsigned int numTreePieces;
extern unsigned int particlesPerChare;
extern int nIOProcessor;
extern CProxy_DumpFrameData dfDataProxy;
extern CProxy_PETreeMerger peTreeMergerProxy;
// Cache manager proxies, all keyed by KeyType.
extern CProxy_CkCacheManager<KeyType> cacheGravPart;
extern CProxy_CkCacheManager<KeyType> cacheSmoothPart;
extern CProxy_CkCacheManager<KeyType> cacheNode;
/// The group ID of your DataManager. You must set this!
extern CkGroupID dataManagerID;
// NOTE(review): the *UE names below look like tracing user-event
// identifiers -- confirm where they are registered.
extern int boundaryEvaluationUE;
extern int weightBalanceUE;
extern int networkProgressUE;
extern int nodeForceUE;
extern int partForceUE;
extern int tbRecursiveUE;
extern int tbFlushRequestsUE;
extern int prefetchDoneUE;
extern int _prefetch;
extern int _randChunks;
extern int _numChunks;
extern unsigned int bucketSize;
//jetley
// Per-request node and particle counts for the local, remote and
// remote-resume cases.
extern int localNodesPerReq;
extern int remoteNodesPerReq;
extern int remoteResumeNodesPerReq;
extern int localPartsPerReq;
extern int remotePartsPerReq;
extern int remoteResumePartsPerReq;
// Active-particle fraction at or above which TreePiece::largePhase()
// returns true.
extern double largePhaseThreshold;
extern cosmoType theta;
extern cosmoType thetaMono;
extern int numInitDecompBins;
extern int octRefineLevel;
/// @brief Message to efficiently start entry methods with no arguments.
class dummyMsg : public CMessage_dummyMsg{
public:
};
#if COSMO_STATS > 0
/// @brief Counters describing the tree-walk work done by a TreePiece.
///
/// Instances from several TreePieces are added field by field in
/// sumFn().  NOTE(review): sumFn() is presumably the function registered
/// for the @c sum reducer type -- confirm at the registration site.
class TreePieceStatistics {
    u_int64_t nodesOpenedLocal;
    u_int64_t nodesOpenedRemote;
    u_int64_t nodeInterLocal;
    u_int64_t nodeInterRemote;
    u_int64_t particleInterLocal;
    u_int64_t particleInterRemote;
    u_int64_t openCriterionCalls;
    int nActive;

    /// All-zero instance; private, used as the accumulator in sumFn().
    TreePieceStatistics()
        : nodesOpenedLocal(0),
          nodesOpenedRemote(0),
          nodeInterLocal(0),
          nodeInterRemote(0),
          particleInterLocal(0),
          particleInterRemote(0),
          openCriterionCalls(0),
          nActive(0)
    { }

public:
    /// Build a record from individual counters.  Note the argument
    /// order: the open-criterion call count comes third.
    TreePieceStatistics(u_int64_t nol, u_int64_t nor, u_int64_t occ, u_int64_t nil, u_int64_t nir,
                        u_int64_t pil, u_int64_t pir, int na)
        : nodesOpenedLocal(nol),
          nodesOpenedRemote(nor),
          nodeInterLocal(nil),
          nodeInterRemote(nir),
          particleInterLocal(pil),
          particleInterRemote(pir),
          openCriterionCalls(occ),
          nActive(na)
    { }

    /// Write a human readable summary of all counters, including
    /// per-active-particle averages, to @p os.
    void printTo(CkOStream &os) {
        const double dActive = (double) nActive;
        os << " TreePiece: " << nActive << " particles active." << endl;
        os << " TreePiece: " << nodesOpenedLocal << " local nodes opened, "
           << nodesOpenedRemote << " remote" << endl;
        os << " TreePiece: " << openCriterionCalls << " num of open criterion calls" << endl;
        os << " TreePiece: " << nodeInterLocal << " local particle-node interactions, "
           << nodeInterRemote << " remote" << endl;
        os << " TreePiece: " << particleInterLocal << " local particle-particle interactions, "
           << particleInterRemote << " remote" << endl;
        os << " TreePiece: "
           << (particleInterLocal + particleInterRemote)/dActive
           << " particles, "
           << (nodeInterLocal + nodeInterRemote)/dActive
           << " nodes per particle" << endl;
    }

    static CkReduction::reducerType sum;

    /// Add up the statistics carried by @p nMsg reduction messages and
    /// return a new message holding the totals.  Each message must
    /// contain exactly one TreePieceStatistics.
    static CkReductionMsg *sumFn(int nMsg, CkReductionMsg **msgs) {
        TreePieceStatistics total;
        for (int iMsg = 0; iMsg < nMsg; iMsg++) {
            CkReductionMsg *msg = msgs[iMsg];
            CkAssert(msg->getSize() == sizeof(TreePieceStatistics));
            const TreePieceStatistics &part = *(TreePieceStatistics *)msg->getData();
            total.nodesOpenedLocal += part.nodesOpenedLocal;
            total.nodesOpenedRemote += part.nodesOpenedRemote;
            total.openCriterionCalls += part.openCriterionCalls;
            total.nodeInterLocal += part.nodeInterLocal;
            total.nodeInterRemote += part.nodeInterRemote;
            total.particleInterLocal += part.particleInterLocal;
            total.particleInterRemote += part.particleInterRemote;
            total.nActive += part.nActive;
        }
        return CkReductionMsg::buildNew(sizeof(TreePieceStatistics), &total);
    }
};
#endif
/// @brief Message to start a remote gravity walk.
///
/// Carries a single chunk number.
class ComputeChunkMsg : public CMessage_ComputeChunkMsg {
ComputeChunkMsg() {} // not available
public:
int chunkNum;
/// @param i value stored in chunkNum.
ComputeChunkMsg(int i) : chunkNum(i) {
}
};
/// @brief Message for evaluating splits for the ORB domain decomposition
///
/// This message contains the splitting dimensions and values for an
/// ORB tree. The size arrays pos and dim are set by arguments to
/// the new() operator, but should be equal to length.
class ORBSplittersMsg : public CMessage_ORBSplittersMsg{
public:
/// Number of splits
int length;
/// Positions of splits
double *pos;
/// Dimension of splits
char *dim;
/// Callback for reduction of particle counts
CkCallback cb;
/// Stores @p len and @p callback.  The constructor does not touch
/// pos or dim.
ORBSplittersMsg(int len, CkCallback callback): length (len), cb(callback) {}
};
/// Message for shuffling particles during domain decomposition.
///
/// The constructor fills in only the counts and the load; the pointer
/// members (loads, parts_per_phase, particles, pGas, pStar) are not set
/// by it.
/// NOTE(review): presumably the arrays are sized through the message's
/// operator new and hold nloads, nloads, n, nSPH and nStar elements
/// respectively -- confirm against the allocation sites.
class ParticleShuffleMsg : public CMessage_ParticleShuffleMsg{
public:
int nloads;
int n;
int nSPH;
int nStar;
double load;
double *loads;
unsigned int *parts_per_phase;
GravityParticle *particles;
extraSPHData *pGas;
extraStarData *pStar;
/// @param nload stored in nloads
/// @param npart stored in n
/// @param nsph stored in nSPH
/// @param nstar stored in nStar
/// @param pload stored in load
ParticleShuffleMsg(int nload, int npart, int nsph, int nstar, double pload):
nloads(nload), n(npart), nSPH(nsph), nStar(nstar), load(pload) {}
};
#ifdef PUSH_GRAVITY
#include "ckmulticast.h"
/// Multicast message carrying buckets and particles; only compiled when
/// PUSH_GRAVITY is defined.
/// NOTE(review): presumably buckets holds numBuckets entries and
/// particles holds numParticles entries -- confirm in createBucketMsg().
struct BucketMsg : public CkMcastBaseMsg, public CMessage_BucketMsg {
GenericTreeNode *buckets;
int numBuckets;
ExternalGravityParticle *particles;
int numParticles;
int whichTreePiece;
};
#endif
/// Class to count added and deleted particles.
///
/// Holds one add counter and one delete counter for each of gas, dark
/// and star particles, plus the index of the chare reporting them.
class CountSetPart
{
public:
int index; /* chare index */
int nAddGas;
int nDelGas;
int nAddDark;
int nDelDark;
int nAddStar;
int nDelStar;
/// Pack/unpack every field in declaration order.
void pup(PUP::er& p) {
p | index;
p | nAddGas;
p | nDelGas;
p | nAddDark;
p | nDelDark;
p | nAddStar;
p | nDelStar;
}
};
/*
* Multistepping routines
*
* Each major timestep can have MAXSUBSTEPS substeps where MAXSUBSTEPS
* is a large power of 2 that can fit in an integer: 1 << MAXRUNG,
* with MAXRUNG something like 30.
* A given particle is on a "Rung" such that it takes 1 << Rung
* substeps per major timestep. That is, its force is updated every
* 1 << (MAXRUNG - Rung) smallest substeps.
*
* Needed routines:
* DtToRung(): take an ideal timestep, the major timestep and
* determine a rung.
*/
const int MAXRUNG = 30;
const int MAXSUBSTEPS = 1 << MAXRUNG;
const double MAXSUBSTEPS_INV = 1 / (double)MAXSUBSTEPS;
/// @brief Given a rung, return the length of one step on that rung
/// measured in smallest substeps, i.e. 1 << (MAXRUNG - iRung).
///
/// @param iRung rung number; must satisfy 0 <= iRung <= MAXRUNG.
/// @return MAXSUBSTEPS for rung 0, halving with every higher rung.
inline int RungToSubsteps(int iRung) {
    // A negative rung would make the shift count exceed MAXRUNG and
    // overflow int; a rung above MAXRUNG would make it negative.
    CkAssert(iRung >= 0);
    CkAssert(iRung <= MAXRUNG);
    return 1 << (MAXRUNG - iRung);
}
/// @brief Given the size of the big step, and a desired timestep,
/// return the rung of the largest timestep less than dTideal.
///
/// The result is the smallest rung r for which dDelta / 2^r does not
/// exceed dTideal, that is ceil(log2(ceil(dDelta/dTideal))).
///
/// @param dDelta size of the big step.
/// @param dTideal desired timestep.
/// @return the rung; 0 when the ratio dDelta/dTideal is at most 1
/// (including negative ratios).  When the ratio is too large for an
/// int, infinite (dTideal == 0) or NaN, it is clamped to the largest
/// int, which yields a rung above 30 for a 32-bit int so that callers
/// checking against MAXRUNG can detect the problem.
inline int DtToRung(double dDelta, double dTideal) {
    const double dSteps = std::ceil(dDelta/dTideal);
    if (dSteps <= 1.0)
        return 0;

    // Converting an out-of-range or NaN double to int is undefined
    // behavior, so clamp first.  The comparison is written so that NaN
    // also takes the clamped branch.
    const int iMaxSteps = std::numeric_limits<int>::max();
    int iSteps = (dSteps < (double)iMaxSteps) ? (int)dSteps : iMaxSteps;

    // Count the bits needed to represent iSteps - 1.
    int iRung = 0;
    iSteps--;
    while(iSteps > 0) {
        iRung++;
        iSteps >>= 1;
    }
    return iRung;
}
/// @brief Given the size of the big step, and a rung, return the
/// corresponding timestep size.
inline double RungToDt(double dDelta, int iRung) {
return dDelta*RungToSubsteps(iRung)*MAXSUBSTEPS_INV;
}
/// @brief slot in MultistepLB to hold feedback phase load information
const int PHASE_FEEDBACK = MAXRUNG + 1;
/// @brief Pressure floor to force Jeans length to be larger than the
/// spatial resolution.
///
/// Adds a pressure floor that keeps the Jeans mass resolved.  In
/// comparison with Agertz et al. 2009, dResolveJeans should be 3.0:
///     P_min = 3*G*max(h,eps)^2*rho^2
/// with G = 1 in this code.
///
/// The length scale squared is chosen at compile time:
///  - JEANSSOFTONLY: the softening squared;
///  - otherwise: (fBall/2)^2, and with JEANSSOFT additionally no
///    smaller than the softening squared.
///
/// @param dResolveJeans multiplicative factor in the floor.
/// @param p particle supplying soft, fBall and fDensity.
/// @return length scale squared * dResolveJeans * density.
inline double PoverRhoFloorJeans(double dResolveJeans, GravityParticle *p)
{
#ifdef JEANSSOFTONLY
    const double dLength2 = p->soft*p->soft;
#else
    double dLength2 = 0.25*p->fBall*p->fBall;
#ifdef JEANSSOFT
    /* Jeans scale can't be smaller than softening */
    const double dSoft2 = p->soft*p->soft;
    if (dSoft2 > dLength2)
        dLength2 = dSoft2;
#endif
#endif
    return dLength2*dResolveJeans*p->fDensity;
}
/// @brief Adiabatic index to use with the Jeans pressure floor.
const double GAMMA_JEANS = 2.0;
const double GAMMA_NONCOOL = 5.0/3.0;
/// @brief Overall flow control of the simulation.
///
/// As well as controlling the overall flow of the simulation, the
/// constructors are the main entry points into the program.
/// The sequence of tasks is: read the simulation parameters (Main()),
/// read in the initial conditions (setupICs()), calculate the initial
/// forces (initialForces()), then iterate across timesteps and write
/// the final output (doSimulation()).
///
class Main : public CBase_Main {
CkArgMsg *args;
std::string basefilename;
/// Save parameters for output
OutputParams *pOutput;
/// globally finished IO
CkCallback cbIO;
/// Save file token for CkIO
Ck::IO::File fIOFile;
CProxy_Sorter sorter;
// Global particle counts, total and per type.
int64_t nTotalParticles;
int64_t nTotalSPH;
int64_t nTotalDark;
int64_t nTotalStar;
/// Total Sink Particles
int64_t nSink;
int64_t nMaxOrderGas; /* Maximum iOrders */
int64_t nMaxOrderDark;
int64_t nMaxOrder;
double dTime; /* Simulation time */
double dTime0; ///< Simulation time at dStep = 0
double dEcosmo; /* variables for integrating
Layzer-Irvine eq. */
double dUOld;
double dTimeOld;
PRM prm; /* parameter parsing info */
Parameters param; /* actual parameters */
CkVec<double> vdOutTime; // Desired output times
int iOut;
/*
** Tracking for frame dumping function
*/
int bDumpFrame;
struct DumpFrameContext **df;
int bIsRestarting;
/// SPH Alpha has been read in.
int bHaveAlpha;
int bChkFirst; /* alternate between 0 and 1 for checkpoint */
double dSimStartTime; // Start time for entire simulation
int iStop; /* indicate we're stopping the
simulation early */
int64_t nActiveGrav;
int64_t nActiveSPH;
#ifdef CUDA
// Double-valued counterparts of the global *PerReq settings; only
// present in CUDA builds.
double localNodesPerReqDouble;
double remoteNodesPerReqDouble;
double remoteResumeNodesPerReqDouble;
double localPartsPerReqDouble;
double remotePartsPerReqDouble;
double remoteResumePartsPerReqDouble;
#endif
#ifdef CHECK_TIME_WITHIN_BIGSTEP
double wallTimeStart;
#endif
/// @brief Hold wall clock timings of calculation phases for a
/// given rung.
class timing_fields {
public:
int count; ///< number of times on this rung
double tGrav; ///< Gravity time
double tuDot; ///< Energy integration
double tDD; ///< Domain Decomposition
double tLoadB; ///< Load Balancing
double tTBuild; ///< Tree Building
double tAdjust; ///< Timestep adjustment
double tEmergAdjust; ///< Emergency Timestep adjustment
double tKick; ///< Kick time
double tDrift; ///< Drift time
double tCache; ///< Cache teardown
public:
///@brief Zero out fields
/// Resets count and every timer to zero.
void clear() {
count = 0;
tGrav = tuDot = tDD = tLoadB = tTBuild = tAdjust
= tEmergAdjust = tKick = tDrift = tCache = 0.0;
}
};
CkVec<timing_fields> timings; ///< One element for each rung.
void writeTimings(int iStep);
#ifdef SELECTIVE_TRACING
// State for selective tracing; only present when SELECTIVE_TRACING is
// defined.
int monitorRung;
int monitorStart;
int numTraceIterations;
int numSkipIterations;
int numMaxTrace;
int traceIteration;
int traceState;
bool projectionsOn;
void turnProjectionsOn(int activeRung);
void turnProjectionsOff();
#endif
public:
// Constructors: from command line arguments and for migration.
Main(CkArgMsg* m);
Main(CkMigrateMessage *m);
void niceExit();
void setupICs();
void initialForces();
void doSimulation();
void restart(CkCheckpointStatusMsg *msg);
void waitForGravity(const CkCallback &cb, double startTime,
int activeRung);
void advanceBigStep(int);
void domainDecomp(int iPhase);
void loadBalance(int iPhase);
void buildTree(int iPhase);
void startGravity(const CkCallback& cbGravity, int iActiveRung,
double *startTime) ;
void externalGravity(int iActiveRung);
void updateuDot(int iActiveRung, const double duKick[],
const double dStartTime[], int bUpdateState, int bAll);
void kick(bool bClosing, int iActiveRung, int nextMaxRung,
const CkCallback &cbGravity, double gravStartTime);
int adjust(int iKickRung);
void rungStats();
void countActive(int activeRung);
void emergencyAdjust(int iRung);
void starCenterOfMass();
void calcEnergy(double, double, const char *);
void getStartTime();
void getOutTimes();
int bOutTime();
void writeOutput(int iStep) ;
void outputBinary(OutputParams& params, int bParaWrite,
const CkCallback& cb);
// Callbacks taking CkIO or generic messages.
void cbOpen(Ck::IO::FileReadyMsg *msg);
void cbIOReady(Ck::IO::SessionReadyMsg *msg);
void cbIOComplete(CkMessage *msg);
void cbIOClosed(CkMessage *msg);
std::string getNCNextOutput(OutputParams& params);
void updateSoft();
void growMass(double dTime, double dDelta);
void initSph();
void initCooling();
void initStarLog();
int ReadASCII(char *extension, int nDataPerLine, double *dDataOut);
void restartGas();
void doSph(int activeRung, int bNeedDensity = 1);
void AGORAfeedbackPreCheck(double dTime, double dDelta, double dTimeToSF);
void FormStars(double dTime, double dDelta);
void StellarFeedback(double dTime, double dDelta);
void outputBlackHoles(double dTime);
void SetSink();
void FormSinks(double dTime, double dDelta, int iKickRung);
void doSinks(double dTime, double dDelta, int iKickRung);
int DumpFrameInit(double dTime, double dStep, int bRestart);
void DumpFrame(double dTime, double dStep);
int nextMaxRungIncDF(int nextMaxRung);
void addDelParticles();
void memoryStats();
void memoryStatsCache();
void pup(PUP::er& p);
void liveVizImagePrep(liveVizRequestMsg *msg);
};
/* IBM brain damage */
#undef hz
/// @brief Coefficients for the Fourier space part of the Ewald sum.
///
/// NOTE(review): presumably hx, hy, hz are the components of one wave
/// vector and hCfac, hSfac the matching cosine and sine factors --
/// confirm against the Ewald initialization code.
typedef struct ewaldTable {
double hx,hy,hz;
double hCfac,hSfac;
} EWT;
// jetley
class MissRecord;
class State;
/// Remote Cell interaction lists for all tree levels.
///
/// One entry pairs a tree node with an integer offset identifier.
/// NOTE(review): offsetID presumably encodes which periodic replica of
/// the node is meant -- confirm against the code that decodes it.
typedef struct OffsetNodeStruct
{
GenericTreeNode *node;
int offsetID;
}OffsetNode;
#if INTERLIST_VER > 0
/// @brief Queue of nodes to check for interactions.
typedef CkQ<OffsetNode> CheckList;
/// @brief Vector of nodes that are undecided at this level.
typedef CkVec<OffsetNode> UndecidedList;
/// @brief Vector of undecided lists, one for each level.
typedef CkVec<UndecidedList> UndecidedLists;
#endif
/// @brief Remote particles in an interaction list.
///
/// The key member only exists when one of
/// CHANGA_REFACTOR_PRINT_INTERACTIONS, CHANGA_REFACTOR_WALKCHECK_INTERLIST
/// or CUDA is defined; nd only exists when COSMO_DEBUG > 1.
typedef struct particlesInfoR{
ExternalGravityParticle* particles; ///< pointer to the particle data
int numParticles; ///< particle count
Vector3D<cosmoType> offset; ///< offset vector associated with the particles
#if defined CHANGA_REFACTOR_PRINT_INTERACTIONS || defined CHANGA_REFACTOR_WALKCHECK_INTERLIST || defined CUDA
NodeKey key;
#endif
#if COSMO_DEBUG > 1
GenericTreeNode *nd;
#endif
} RemotePartInfo;
/// @brief Local particles in an interaction list.
///
/// The key member only exists when one of
/// CHANGA_REFACTOR_PRINT_INTERACTIONS, CHANGA_REFACTOR_WALKCHECK_INTERLIST
/// or CUDA is defined; nd exists when COSMO_DEBUG > 1 or CUDA is
/// defined (unlike RemotePartInfo, where CUDA alone does not add it).
typedef struct particlesInfoL{
GravityParticle* particles; ///< pointer to the particle data
int numParticles; ///< particle count
Vector3D<cosmoType> offset; ///< offset vector associated with the particles
#if defined CHANGA_REFACTOR_PRINT_INTERACTIONS || defined CHANGA_REFACTOR_WALKCHECK_INTERLIST || defined CUDA
NodeKey key;
#endif
#if COSMO_DEBUG > 1 || defined CUDA
GenericTreeNode *nd;
#endif
} LocalPartInfo;
/** @brief Data needed for the CkLoop intranode parallelization.
*
* This structure holds the data that needs to be passed to a free
* processor so that it can calculate the gravity interactions.
* Each attribute is a list so that multiple buckets can be operated on.
*/
typedef struct LoopParDataStruct {
CkVec<GenericTreeNode*> lowNodes; ///< Lowest node containing the
/// buckets to interact
CkVec<int> bucketids; ///< startBucket number
CkVec<int> chunkids; ///< remote walk chunk number
CkVec<CkVec<OffsetNode> > clists; ///< Cell interactions
CkVec<CkVec<RemotePartInfo> > rpilists; ///< Remote particle interactions
CkVec<CkVec<LocalPartInfo> > lpilists; ///< Local particle interactions
TreePiece* tp; ///< Treepiece that owns this data
} LoopParData;
#ifdef CUDA
/// Position of a bucket's active particles in the array filled by
/// TreePiece::getDMParticles() during a phase that is not "large".
struct BucketActiveInfo{
int start; ///< fill index at which the bucket's active particles begin
int size; ///< number of active particles copied for the bucket
};
#endif
class SmoothCompute;
#include "Compute.h"
#if INTERLIST_VER > 0 && defined CUDA
template<typename T> class GenericList;
#endif
/// @brief client that has requested a moment.
///
/// Pairs the requesting TreePiece with one of its tree nodes.
struct NonLocalMomentsClient {
TreePiece *clientTreePiece;
GenericTreeNode *clientNode;
/// Both pointers start out NULL.
NonLocalMomentsClient() :
clientTreePiece(NULL),
clientNode(NULL)
{}
/// @param tp stored in clientTreePiece
/// @param node stored in clientNode
NonLocalMomentsClient(TreePiece *tp, GenericTreeNode *node) :
clientTreePiece(tp),
clientNode(node)
{}
};
/// @brief List of clients needing a particular moment
///
/// targetNode identifies the node whose moment is wanted; clients
/// collects everyone waiting for it.
struct NonLocalMomentsClientList {
GenericTreeNode *targetNode;
CkVec<NonLocalMomentsClient> clients;
/// Empty list with a NULL target node.
NonLocalMomentsClientList() :
targetNode(NULL)
{}
/// Empty list for the given target node.
NonLocalMomentsClientList(GenericTreeNode *node) :
targetNode(node)
{}
/// Append a copy of @p cli to the client list.
void addClient(const NonLocalMomentsClient &cli){
clients.push_back(cli);
}
};
/// Fundamental structure that holds particle and tree data.
class TreePiece : public CBase_TreePiece {
// jetley
friend class PrefetchCompute;
friend class GravityCompute;
friend class SmoothCompute;
friend class KNearestSmoothCompute;
friend class ReSmoothCompute;
friend class MarkSmoothCompute;
friend class ListCompute;
friend class NearNeighborState;
friend class ReNearNeighborState;
friend class MarkNeighborState;
friend class BottomUpTreeWalk;
#if INTERLIST_VER > 0 && defined CUDA
friend class DataManager;
template<typename T> friend class GenericList;
#endif
friend class RemoteTreeBuilder;
friend class LocalTreeBuilder;
/// @brief Walk for gravity prefetch
TreeWalk *sTopDown;
TreeWalk *twSmooth;
#if INTERLIST_VER > 0
TreeWalk *sInterListWalk;
// clearable, used for resumed walks
State *sInterListStateRemoteResume;
#endif
Compute *sGravity, *sPrefetch;
SmoothCompute *sSmooth;
Opt *sLocal, *sRemote, *sPref;
Opt *optSmooth;
State *sPrefetchState;
/// Keeps track of the gravity walks over the local tree.
State *sLocalGravityState, *sRemoteGravityState, *sSmoothState;
typedef std::map<KeyType, CkVec<int>* > SmPartRequestType;
// buffer of requests for smoothParticles.
SmPartRequestType smPartRequests;
CkVec<ActiveWalk> activeWalks;
int completedActiveWalks; // XXX this should be part of the gravity
// walk state.
int nCacheAccesses; // keep track of outstanding cache accesses to
// know when writebacks complete. XXX this
// should be part of the smooth state
double treePieceLoad; // used to store CPU load data for incoming particles
double treePieceLoadTmp; // temporary accumulator for above
double treePieceLoadExp;
unsigned int treePieceActivePartsTmp;
std::vector<double> savedPhaseLoad;
std::vector<unsigned int> savedPhaseParticle;
std::vector<double> savedPhaseLoadTmp;
std::vector<unsigned int> savedPhaseParticleTmp;
int memWithCache, memPostCache; // store memory usage.
int nNodeCacheEntries, nPartCacheEntries; // store memory usage.
#ifdef PUSH_GRAVITY
bool doMerge;
bool createdSpanningTree;
CProxySection_TreePiece allTreePieceSection;
CkVec<GravityParticle> foreignParticles;
CkVec<double> foreignParticleAccelerations;
map<int,CkSectionInfo> cookieJar;
BucketMsg *createBucketMsg();
void unpackBuckets(BucketMsg *, GenericTreeNode *&foreignBuckets, int &numForeignBuckets);
void calculateForces(GenericTreeNode *foreignBuckets, int numForeignBuckets);
#endif
public:
#ifdef PUSH_GRAVITY
void startPushGravity(int am, double myTheta);
void recvPushBuckets(BucketMsg *);
void recvPushAccelerations(CkReductionMsg *);
#endif
#if COSMO_PRINT_BK > 1
State *getSRemoteGravityState(){ return sRemoteGravityState; }
State *getSLocalGravityState(){ return sLocalGravityState; }
#endif
void memCacheStats(const CkCallback &cb);
void addActiveWalk(int iAwi, TreeWalk *tw, Compute *c, Opt *o, State *s);
/// @brief Called when walk on the current TreePiece is done.
void markWalkDone();
/// @brief Called when walk on all TreePieces is done.
void finishWalk();
/// @brief Called when smooth walk on the current TreePiece is done.
void markSmoothWalkDone();
/// @brief Called when smooth walk on all TreePieces is done.
void finishSmoothWalk();
int getIndex() {
return thisIndex;
}
/// @brief accumulate node interaction count for statistics
void addToNodeInterRemote(int chunk, int howmany){
nodeInterRemote[chunk] += howmany;
}
/// @brief accumulate particle interaction count for statistics
void addToParticleInterRemote(int chunk, int howmany){
particleInterRemote[chunk] += howmany;
}
/// @brief accumulate node interaction count for statistics
void addToNodeInterLocal(int howmany){
nodeInterLocal += howmany;
}
/// @brief accumulate particle interaction count for statistics
void addToParticleInterLocal(int howmany){
particleInterLocal += howmany;
}
/// Start prefetching the specfied chunk; prefetch compute
/// calls startRemoteChunk() once chunk prefetch is complete
void initiatePrefetch(int chunk);
/// Start a new remote computation upon prefetch finished
void startRemoteChunk();
/// Return the number of particles on this TreePiece.
int getNumParticles(){
return myNumParticles;
}
/// Return the pointer to the particles on this TreePiece.
GravityParticle *getParticles(){return myParticles;}
#ifdef CUDA
// this variable holds the number of buckets active at
// the start of an iteration
// it is used to ascertain how many buckets still need to
// be processed via the stateReady function with regard
// to their local and remote-no-resume walks.
// if all numActiveBuckets
// have been processed but we still have leftover nodes/particles
// in the list of interations to the sent to the gpu, we flush
// the list
int numActiveBuckets;
int myNumActiveParticles;
// First and Last indices of GPU particle
int FirstGPUParticleIndex;
int LastGPUParticleIndex;
int NumberOfGPUParticles;
BucketActiveInfo *bucketActiveInfo;
int getNumBuckets(){
return numBuckets;
}
void callFreeRemoteChunkMemory(int chunk);
int getActiveRung(){ return activeRung; }
#ifdef HAPI_INSTRUMENT_WRS
int getInstrumentId(){ return instrumentId; }
#endif
/// Return the total particle count when largePhase() is true, i.e. the
/// fraction of active particles is large enough, and the active
/// particle count otherwise.
int getDMNumParticles(){
    return largePhase() ? myNumParticles : myNumActiveParticles;
}
int getNumActiveParticles(){
return myNumActiveParticles;
}
/// Recount the particles whose rung is at least activeRung and store
/// the result in myNumActiveParticles.  Particles are visited at
/// indices 1 through myNumParticles inclusive.
void calculateNumActiveParticles(){
    int nActive = 0;
    for (int iPart = 1; iPart <= myNumParticles; ++iPart) {
        if (myParticles[iPart].rung >= activeRung)
            ++nActive;
    }
    myNumActiveParticles = nActive;
}
/// True when the fraction of active particles is at least
/// largePhaseThreshold.  The ratio is computed in floating point; with
/// myNumParticles == 0 and no active particles it is 0/0 = NaN, for
/// which the comparison is false.
bool largePhase(){
return (1.0*myNumActiveParticles/myNumParticles) >= largePhaseThreshold;
}
/// Copy this TreePiece's particles, bucket by bucket, into @p fillArray.
///
/// @param fillArray destination array; written starting at @p fillIndex.
/// @param fillIndex in: first free slot; out: one past the last slot
/// written by this call.
///
/// In a large phase every particle of every bucket is copied and the
/// bucket's starting slot is saved in its bucketArrayIndex.  Otherwise
/// only buckets and particles on a rung >= activeRung are copied, and
/// the slot range is saved in bucketActiveInfo[b].
/// On return FirstGPUParticleIndex, LastGPUParticleIndex and
/// NumberOfGPUParticles describe the range written, or are -1, -1, 0
/// when nothing was copied.
void getDMParticles(CompactPartData *fillArray, int &fillIndex){
NumberOfGPUParticles = 0;
FirstGPUParticleIndex = fillIndex;//This is for the GPU Ewald
if(largePhase()){
// Large phase: copy all particles of all buckets.
for(int b = 0; b < numBuckets; b++){
GenericTreeNode *bucket = bucketList[b];
int buckstart = bucket->firstParticle;
int buckend = bucket->lastParticle;
GravityParticle *buckparts = bucket->particlePointer;
bucket->bucketArrayIndex = fillIndex;
for(int i = buckstart; i <= buckend; i++){
// buckparts is indexed relative to the bucket's first particle.
fillArray[fillIndex] = buckparts[i-buckstart];
fillIndex++;
}
}
}
else{
// Small phase: copy active particles only.
for(int b = 0; b < numBuckets; b++){
GenericTreeNode *bucket = bucketList[b];
// Skip buckets whose rungs value is below the active rung.
if(bucket->rungs < activeRung){
continue;
}
BucketActiveInfo *binfo = &(bucketActiveInfo[b]);
int buckstart = bucket->firstParticle;
int buckend = bucket->lastParticle;
GravityParticle *buckparts = bucket->particlePointer;
binfo->start = fillIndex;
for(int i = buckstart; i <= buckend; i++){
if(buckparts[i-buckstart].rung >= activeRung){
fillArray[fillIndex] = buckparts[i-buckstart];
fillIndex++;
}
}
// Number of active particles actually copied for this bucket.
binfo->size = fillIndex-binfo->start;
}
}
//This is for the GPU Ewald
if(FirstGPUParticleIndex == fillIndex){
//This means no particle is on GPU
FirstGPUParticleIndex = -1;
LastGPUParticleIndex = -1;
NumberOfGPUParticles = 0;
}
else{
LastGPUParticleIndex = fillIndex - 1;
NumberOfGPUParticles = LastGPUParticleIndex - FirstGPUParticleIndex + 1;
}
}
bool isActive(int partNum){
return myParticles[partNum].rung >= activeRung;
}
void clearMarkedBuckets(CkVec<GenericTreeNode *> &markedBuckets);
void clearMarkedBucketsAll();
#ifdef CUDA_STATS
long long localNodeInteractions;
long long localPartInteractions;
long long remoteNodeInteractions;
long long remotePartInteractions;
long long remoteResumeNodeInteractions;
long long remoteResumePartInteractions;
#endif
#ifdef HAPI_INSTRUMENT_WRS
int instrumentId;
double localNodeListConstructionTime;
double remoteNodeListConstructionTime;
double remoteResumeNodeListConstructionTime;
double localPartListConstructionTime;
double remotePartListConstructionTime;
double remoteResumePartListConstructionTime;
int nLocalNodeReqs;
int nRemoteNodeReqs;
int nRemoteResumeNodeReqs;
int nLocalPartReqs;
int nRemotePartReqs;
int nRemoteResumePartReqs;
#endif