Skip to content

Commit

Permalink
Improve 4P2H topology on Rome (#243)
Browse files Browse the repository at this point in the history
1. Use bi-directional rings
2. GPU search is sorted by PCI device ID to get consistent results
  • Loading branch information
wenkaidu authored Jul 28, 2020
1 parent e7a10aa commit d1e20b4
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 35 deletions.
3 changes: 1 addition & 2 deletions src/graph/connect.cc
Original file line number Diff line number Diff line change
Expand Up @@ -289,9 +289,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl
memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));

int nc = 0;
int nc = nChannels*2;
if (comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_CR8G) nc = nChannels*3;
else if (comm->topo->nodes[NET].count != 0 && comm->topo->type == RCCL_TOPO_4P2H_ROME) nc = nChannels*4;
int end = std::min((int)ncclMaxNchannels(), std::max(nc, ncclMinNchannels()));

// Duplication should be complete now
Expand Down
76 changes: 43 additions & 33 deletions src/graph/search.cc
Original file line number Diff line number Diff line change
Expand Up @@ -783,7 +783,7 @@ static bool getGpuNetCount(struct ncclTopoSystem* system, int id, int *ngpu, int
}

static bool findGpuByXGMI(struct ncclTopoSystem* system, int cpu1, int cpu2, int *gpu1, int *gpu2, int ex1, int ex2) {
int n, m;
int n, m, idx, gid;
int ngpus = system->nodes[GPU].count;
*gpu1 = -1; *gpu2 = -1;
int c1, c2;
Expand All @@ -794,6 +794,7 @@ static bool findGpuByXGMI(struct ncclTopoSystem* system, int cpu1, int cpu2, int
if (system->nodes[GPU].nodes[n].paths[CPU][c1].count != 2) continue;
struct ncclTopoNode* node = system->nodes[GPU].nodes+n;
if (node->paths[GPU] == NULL) continue;
idx = -1; gid = 0;
for (m = 0; m < ngpus; m++) {
if (system->nodes[GPU].nodes[m].gpu.dev == ex2) continue;
if (system->nodes[GPU].nodes[m].paths[CPU][c2].count != 2) continue;
Expand All @@ -802,13 +803,18 @@ static bool findGpuByXGMI(struct ncclTopoSystem* system, int cpu1, int cpu2, int
if (link->remNode->gpu.dev == system->nodes[GPU].nodes[m].gpu.dev) break;
}
if (!link->remNode) continue;
if (link->type == LINK_NVL) break;
if (link->type == LINK_NVL) {
if (idx == -1 || (idx != -1 && system->nodes[GPU].nodes[m].id < gid)) {
idx = m;
gid = system->nodes[GPU].nodes[m].id;
}
}
}
if (m < ngpus) break;
if (idx != -1) break;
}
if (n < ngpus) {
*gpu1 = system->nodes[GPU].nodes[n].gpu.dev;
*gpu2 = system->nodes[GPU].nodes[m].gpu.dev;
*gpu2 = system->nodes[GPU].nodes[idx].gpu.dev;
return true;
}
return false;
Expand Down Expand Up @@ -842,16 +848,16 @@ static bool validate4P1H(struct ncclTopoSystem* system, int *hive) {
}

static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) {
static const char *ringBase = "6 7 4 5 1 0 3 2";
static const char *ringBase = "6 7 4 5 1 0 3 2|7 6 2 3 0 1 5 4";
static char ringRemap[64];
int id[8], dist[8];
int i;

*str = 0;
int ngpus = system->nodes[GPU].count;
int ncpus = system->nodes[CPU].count;
// 8 GPUs and 4 numa nodes on multi node only
if (ngpus != 8 || ncpus != 4 || !system->nodes[NET].count)
// 8 GPUs and 4 numa nodes only
if (ngpus != 8 || ncpus != 4)
return ncclSuccess;
// only valid on Rome
int arch, vendor, model;
Expand Down Expand Up @@ -942,34 +948,38 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
}
graph->speedIntra = graph->speedInter = system->maxWidth;
if (system->nodes[NET].count) {
int *intra, *used;
graph->nChannels = system->nodes[NET].count;
NCCLCHECK(ncclCalloc(&intra, ngpus));
NCCLCHECK(ncclCalloc(&used,system->nodes[NET].count));
for (int n = 0; n < system->nodes[NET].count; n++) {
graph->inter[n*2] = graph->inter[n*2+1] = n;
// do not change ring order for 4P2H on Rome
if (system->type == RCCL_TOPO_4P2H_ROME) continue;
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
struct ncclTopoLinkList* paths = net->paths[GPU];
// find the first unsed GPU that is closest to NIC
int f, m;
for (f = 0; f < ngpus; f++) {
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break;
if(j >= n) break;
}
for (int i = 0; i < ngpus; i++) {
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break;
if (j < n) continue;
if (paths[i].count < paths[f].count) f = i;
// do not change ring order for multi node 4P2H on Rome
if (system->type == RCCL_TOPO_4P2H_ROME) {
for (int n = 0; n < graph->nChannels; n++)
graph->inter[n*2] = graph->inter[n*2+1] = n%system->nodes[NET].count;
} else {
int *intra, *used;
graph->nChannels = system->nodes[NET].count;
NCCLCHECK(ncclCalloc(&intra, ngpus));
NCCLCHECK(ncclCalloc(&used,system->nodes[NET].count));
for (int n = 0; n < system->nodes[NET].count; n++) {
graph->inter[n*2] = graph->inter[n*2+1] = n;
struct ncclTopoNode* net = system->nodes[NET].nodes+n;
struct ncclTopoLinkList* paths = net->paths[GPU];
// find the first unsed GPU that is closest to NIC
int f, m;
for (f = 0; f < ngpus; f++) {
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break;
if(j >= n) break;
}
for (int i = 0; i < ngpus; i++) {
int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break;
if (j < n) continue;
if (paths[i].count < paths[f].count) f = i;
}
for (m = 0; m<ngpus; m++) if (graph->intra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break;
used[n] = graph->intra[n*ngpus+m];
for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)];
for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i];
}
for (m = 0; m<ngpus; m++) if (graph->intra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break;
used[n] = graph->intra[n*ngpus+m];
for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)];
for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i];
free(used);
free(intra);
}
free(used);
free(intra);
}
if (graph->nChannels) return ncclSuccess;
}
Expand Down

0 comments on commit d1e20b4

Please sign in to comment.