Skip to content

Commit

Permalink
add some sanity checks
Browse files Browse the repository at this point in the history
  • Loading branch information
MoraruMaxim committed Mar 29, 2024
1 parent e7f1851 commit 44944a3
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 11 deletions.
15 changes: 13 additions & 2 deletions src/bvals/comms/mm_neigh_token.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ namespace neigh_comm{
MPI_Dist_graph_create_adjacent(comm_, mpi_procs.size(), mpi_procs.data(), MPI_UNWEIGHTED,
mpi_procs.size(), mpi_procs.data(), MPI_UNWEIGHTED,
MPI_INFO_NULL, false, &neigh_comm);

nb_of_comm_built++;
}
/*
* calculate_off_prefix_sum()
Expand Down Expand Up @@ -143,6 +143,8 @@ namespace neigh_comm{
* start_data_exchange_neigh_alltoallv()
*/
// Posts the non-blocking neighbourhood all-to-all exchange on neigh_comm.
// Aborts through PARTHENON_FAIL when neigh_comm_in_use is still set, i.e. a
// previously posted exchange has not yet been reported complete.
// The same counts/displs arrays are passed for both the send and the receive
// side; the pending operation is tracked in neigh_request.
void start_data_exchange_neigh_alltoallv(){
  if(neigh_comm_in_use){
    PARTHENON_FAIL("trying to launch an alltoallv operation while the communicator is in use");
  }
  // Flag the communicator as busy before the request is posted.
  neigh_comm_in_use = true;

  auto send_ptr = send_comm_buffer.data();
  auto recv_ptr = recv_comm_buffer.data();
  MPI_Ineighbor_alltoallv(send_ptr, counts.data(), displs.data(), MPI_PARTHENON_REAL,
                          recv_ptr, counts.data(), displs.data(), MPI_PARTHENON_REAL,
                          neigh_comm, &neigh_request);
}
Expand All @@ -154,13 +156,20 @@ namespace neigh_comm{
int flag_nc = 0;
//if(neigh_request)
MPI_Test(&neigh_request, &flag_nc, MPI_STATUS_IGNORE);
if(flag_nc) neigh_comm_in_use = false;
//MPI_Wait(&neigh_request, MPI_STATUS_IGNORE);
//return true;
return flag_nc;
}

// On destruction, the rank equal to root_rank (0) prints the value of
// nb_of_comm_built to stdout; every other rank prints nothing.
~NeighToken(){
  const int root_rank = 0;
  if(parthenon::Globals::my_rank != root_rank) return;
  std::cout << "# COMM_BUILD_INFO: Nb_of_comm_build=" << nb_of_comm_built << std::endl;
}

public:
NeighToken(): building_token_on(false), neigh_request(), send_comm_buffer("send_neigh_buf",100), recv_comm_buffer("recv_neigh_buf",100) {}
NeighToken(): building_token_on(false), neigh_request(), nb_of_comm_built(0), send_comm_buffer("send_neigh_buf",100), recv_comm_buffer("recv_neigh_buf",100), neigh_comm_in_use(false) {}

std::set<int> mpi_neighbors;
std::vector<int> displs;
Expand All @@ -170,11 +179,13 @@ namespace neigh_comm{
std::map<int, int> total_buff_size_per_rank;
std::map<int, std::vector<std::pair<int,int>>> buff_info_per_rank;
int total_buf_size;
size_t nb_of_comm_built;

MPI_Request neigh_request;

bool building_token_on;
bool enable_add_buff;
bool neigh_comm_in_use;
MPI_Comm neigh_comm; // created with MPI_Dist_graph_create_adjacent

//Kokkos::View<parthenon::Real*>
Expand Down
6 changes: 3 additions & 3 deletions src/utils/communication_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -279,8 +279,8 @@ bool CommBuffer<T>::IsAvailableForWrite() {
if (*state_ == BufferState::stale) return true;
if (*my_request_ == MPI_REQUEST_NULL) return true;
int flag, test;
/*PARTHENON_MPI_CHECK(MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &test,
MPI_STATUS_IGNORE));*/ // Moraru : remove Iprobe
PARTHENON_MPI_CHECK(MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &test,
MPI_STATUS_IGNORE)); // Moraru : remove Iprobe
PARTHENON_MPI_CHECK(MPI_Test(my_request_.get(), &flag, MPI_STATUS_IGNORE));
if (flag) *state_ = BufferState::stale;
return flag;
Expand Down Expand Up @@ -374,7 +374,7 @@ bool CommBuffer<T>::TryReceive(const logger::COMM_TYPE & prof_comm_type) noexcep
// as the total number of buffers being communicated, I have found this can have
// anywhere from no impact on the walltime to a factor of a few reduction in the
// walltime. It seems to be unpredictable as far as I can tell.
for (int i = 0; i < 0; ++i) // Moraru : set i=0 (before i=1)
for (int i = 0; i < 1; ++i) // Moraru : set i<=>0 (before i<=>1)
PARTHENON_MPI_CHECK(MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &flag,
MPI_STATUS_IGNORE));
PARTHENON_MPI_CHECK(MPI_Test(my_request_.get(), &flag, &status));
Expand Down
34 changes: 28 additions & 6 deletions src/utils/mm_logger.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ namespace logger{
public:
My_Logger(): filename("mm_logger.log"), is_init(false), time_recv_bound_bufs(0),\
time_send_bound_bufs(0), time_recv_flux_corr(0), time_send_flux_corr(0),\
time_token_creation(0), token_id(0), print_only(false), log_times(false), rank(-1) {
time_token_creation(0), token_id(0), print_only(false), log_times(false), rank(-1), timer_recvs_in_use(false), timer_comm_build_in_use(false) {
const char * env_filename = getenv("MM_LOGGER_OUT_FILE");
if(env_filename != NULL) filename = env_filename;
//_start_timer(total_exec_time);
Expand All @@ -33,7 +33,7 @@ namespace logger{
My_Logger(bool _print_only): filename("mm_logger.log"), is_init(false), time_recv_bound_bufs(0),\
time_send_bound_bufs(0), time_recv_flux_corr(0), time_send_flux_corr(0),\
time_token_creation(0), token_id(0), print_only(_print_only), log_times(true),\
log_time_sends(0), log_time_recvs(0), log_time_build_comm(0), rank(-1) {
log_time_sends(0), log_time_recvs(0), log_time_build_comm(0), rank(-1), timer_recvs_in_use(false), timer_comm_build_in_use(false) {
const char * env_filename = getenv("MM_LOGGER_OUT_FILE");
if(!print_only && env_filename != NULL) filename = env_filename;

Expand All @@ -44,12 +44,32 @@ namespace logger{
// Stops the "sends" timer by forwarding to _end_timer with the
// log_time_sends accumulator.
// NOTE(review): presumably adds the elapsed time to log_time_sends --
// confirm against _end_timer.
void end_timer_sends(){
  _end_timer(log_time_sends);
}

/* Recv */
void start_timer_recvs(){_start_timer(log_time_recvs);}
void end_timer_recvs(){_end_timer(log_time_recvs);}
// Starts the "recvs" timer unless it is already running.
// A repeated start while timer_recvs_in_use is set is ignored, so the
// measurement that is already in progress is not restarted.
void start_timer_recvs(){
  if(timer_recvs_in_use) return; // already running: nothing to do
  timer_recvs_in_use = true;
  _start_timer(log_time_recvs);
}
// Stops the "recvs" timer if, and only if, it is currently running.
// Calling this without a matching start is a no-op.
void end_timer_recvs(){
  if(!timer_recvs_in_use) return; // no measurement in progress
  _end_timer(log_time_recvs);
  timer_recvs_in_use = false;
}

/* Build communication token */
void start_timer_build_comm(){_start_timer(log_time_build_comm);}
void end_timer_build_comm(){_end_timer(log_time_build_comm);}
// Starts the "build communicator" timer unless it is already running.
// A repeated start while timer_comm_build_in_use is set is ignored.
void start_timer_build_comm(){
  if(timer_comm_build_in_use) return; // already running: nothing to do
  timer_comm_build_in_use = true;
  _start_timer(log_time_build_comm);
}
// Stops the "build communicator" timer if, and only if, it is currently
// running, then clears the in-use flag so the timer can be started again.
// Fix: the guard used to be inverted (`!timer_comm_build_in_use`). A started
// timer was therefore never stopped and its flag never cleared, which made
// every later start a no-op, while an end without a start still called
// _end_timer. The condition now mirrors end_timer_recvs().
void end_timer_build_comm(){
  if(timer_comm_build_in_use){
    _end_timer(log_time_build_comm);
    timer_comm_build_in_use = false;
  }
}

#endif

Expand Down Expand Up @@ -169,6 +189,8 @@ namespace logger{
bool is_init;
bool print_only; // used only when ENABLE_MM_LOG_TIME is on
bool log_times; // used only when ENABLE_MM_LOG_TIME is on
bool timer_recvs_in_use; // make sure we measure correctly the time spent in communication
bool timer_comm_build_in_use;
double log_time_sends;
double log_time_recvs;
double log_time_build_comm;
Expand Down

0 comments on commit 44944a3

Please sign in to comment.