Skip to content

Commit

Permalink
mpir: replace subcomm usage with subgroups
Browse files Browse the repository at this point in the history
Directly use the node-local information stored in MPIR_Process (local_rank,
local_size) rather than reading it from node_comm in MPIR_Process.comm_world.

One step toward removing subcomms.
  • Loading branch information
hzhou committed Aug 23, 2024
1 parent 17e1dc6 commit 914755c
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 23 deletions.
7 changes: 1 addition & 6 deletions src/mpi/comm/comm_split_type_nbhd.c
Original file line number Diff line number Diff line change
Expand Up @@ -474,12 +474,7 @@ static int network_split_by_min_memsize(MPIR_Comm * comm_ptr, int key, long min_
if (min_mem_size == 0 || topo_type == MPIR_NETTOPO_TYPE__INVALID) {
*newcomm_ptr = NULL;
} else {
int num_ranks_node;
if (MPIR_Process.comm_world->node_comm != NULL) {
num_ranks_node = MPIR_Comm_size(MPIR_Process.comm_world->node_comm);
} else {
num_ranks_node = 1;
}
int num_ranks_node = MPIR_Process.local_size;
memory_per_process = total_memory_size / num_ranks_node;
mpi_errno = network_split_by_minsize(comm_ptr, key, min_mem_size / memory_per_process,
newcomm_ptr);
Expand Down
9 changes: 3 additions & 6 deletions src/mpi/init/init_async.c
Original file line number Diff line number Diff line change
Expand Up @@ -179,17 +179,14 @@ static int get_thread_affinity(bool * apply_affinity, int **p_thread_affinity, i
}

global_rank = MPIR_Process.rank;
local_rank =
(MPIR_Process.comm_world->node_comm) ? MPIR_Process.comm_world->node_comm->rank : 0;
local_rank = MPIR_Process.local_rank;
if (have_cliques) {
/* If local cliques > 1, using local_size from node_comm will have conflict on thread idx.
/* If local cliques > 1, using local_size will have conflict on thread idx.
* In multiple nodes case, this would cost extra memory for allocating thread affinity on every
* node, but it is okay to solve progress thread oversubscription. */
local_size = MPIR_Process.comm_world->local_size;
} else {
local_size =
(MPIR_Process.comm_world->node_comm) ? MPIR_Process.comm_world->
node_comm->local_size : 1;
local_size = MPIR_Process.local_size;
}

async_threads_per_node = local_size;
Expand Down
21 changes: 10 additions & 11 deletions src/util/mpir_nodemap.c
Original file line number Diff line number Diff line change
Expand Up @@ -436,14 +436,14 @@ int MPIR_nodeid_init(void)
utarray_resize(MPIR_Process.node_hostnames, MPIR_Process.num_nodes, MPL_MEM_OTHER);
char *allhostnames = (char *) utarray_eltptr(MPIR_Process.node_hostnames, 0);

if (MPIR_Process.local_rank == 0) {
MPIR_Comm *node_roots_comm = MPIR_Process.comm_world->node_roots_comm;
if (node_roots_comm == NULL) {
/* num_external == comm->remote_size */
node_roots_comm = MPIR_Process.comm_world;
}
MPIR_Comm *world_comm = MPIR_Process.comm_world;
int local_rank = world_comm->subgroups[MPIR_SUBGROUP_NODE].rank;
int local_size = world_comm->subgroups[MPIR_SUBGROUP_NODE].size;

if (local_rank == 0) {
int inter_rank = world_comm->subgroups[MPIR_SUBGROUP_NODE_CROSS].rank;

char *my_hostname = allhostnames + MAX_HOSTNAME_LEN * node_roots_comm->rank;
char *my_hostname = allhostnames + MAX_HOSTNAME_LEN * inter_rank;
int ret = gethostname(my_hostname, MAX_HOSTNAME_LEN);
char strerrbuf[MPIR_STRERROR_BUF_SIZE] ATTRIBUTE((unused));
MPIR_ERR_CHKANDJUMP2(ret == -1, mpi_errno, MPI_ERR_OTHER,
Expand All @@ -453,14 +453,13 @@ int MPIR_nodeid_init(void)

mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, MAX_HOSTNAME_LEN, MPI_CHAR,
allhostnames, MAX_HOSTNAME_LEN, MPI_CHAR,
node_roots_comm, MPIR_SUBGROUP_NONE, MPIR_ERR_NONE);
world_comm, MPIR_SUBGROUP_NODE_CROSS, MPIR_ERR_NONE);
MPIR_ERR_CHECK(mpi_errno);
}

MPIR_Comm *node_comm = MPIR_Process.comm_world->node_comm;
if (node_comm) {
if (local_size > 1) {
mpi_errno = MPIR_Bcast_impl(allhostnames, MAX_HOSTNAME_LEN * MPIR_Process.num_nodes,
MPI_CHAR, 0, node_comm, MPIR_SUBGROUP_NONE, MPIR_ERR_NONE);
MPI_CHAR, 0, world_comm, MPIR_SUBGROUP_NODE, MPIR_ERR_NONE);
MPIR_ERR_CHECK(mpi_errno);
}

Expand Down

0 comments on commit 914755c

Please sign in to comment.