From 42955f5f4fcc75d14662a1d7cbdf84cf6b75507b Mon Sep 17 00:00:00 2001 From: Wenkai Du Date: Thu, 17 Sep 2020 16:01:45 -0700 Subject: [PATCH] Limit P2P channels on Rome --- src/enqueue.cc | 2 +- src/graph/paths.cc | 13 ++++++++++--- src/graph/search.cc | 9 ++++++--- src/group.cc | 2 +- src/init.cc | 8 ++++---- 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/src/enqueue.cc b/src/enqueue.cc index a5438b5b2..152490905 100644 --- a/src/enqueue.cc +++ b/src/enqueue.cc @@ -90,7 +90,7 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList, ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) { // Only launch blocks where we have work to do. - for (int c=0; c<comm->p2pnChannels; c++) { + for (int c=0; c<std::max(comm->nChannels, comm->p2pnChannels); c++) { if (comm->channels[c].collCount) params->gridDim.x = c+1; } diff --git a/src/graph/paths.cc b/src/graph/paths.cc index 10c729647..edd98df62 100644 --- a/src/graph/paths.cc +++ b/src/graph/paths.cc @@ -510,9 +510,16 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) { } } - // Round to next pow2 nChannelsPerPeer and nChannels - comm->p2pnChannelsPerPeer = nextPow2(minChannels); - comm->p2pnChannels = nextPow2(comm->p2pnChannels); + if (comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_4P2H_ROME) { + // Adjust P2P channels on Rome + comm->p2pnChannelsPerPeer = 2; + comm->p2pnChannels = 2; + } + else { + // Round to next pow2 nChannelsPerPeer and nChannels + comm->p2pnChannelsPerPeer = nextPow2(minChannels); + comm->p2pnChannels = nextPow2(comm->p2pnChannels); + } // Init channels that weren't used so far for (int c=comm->nChannels; c<comm->p2pnChannels; c++) NCCLCHECK(initChannel(comm, c)); diff --git a/src/graph/search.cc b/src/graph/search.cc index 6ba948272..689640555 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -892,14 +892,18 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) { *str = 0; int ngpus = 
system->nodes[GPU].count; int ncpus = system->nodes[CPU].count; - // 8 GPUs and 4 numa nodes only - if (ngpus != 8 || (ncpus != 4 && ncpus != 8)) + // 8 GPUs only + if (ngpus != 8) return ncclSuccess; // only valid on Rome int arch, vendor, model; NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model)); if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME) return ncclSuccess; + system->type = RCCL_TOPO_4P2H_ROME; + // 4 or 8 numa nodes only + if (ncpus != 4 && ncpus != 8) + return ncclSuccess; // number of GPUs and NICs on each numa node is used as first screening pattern char pattern[256]; for (i = 0; i < ncpus; i++) { @@ -998,7 +1002,6 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) { } ringRemap[i] = 0; *str = ringRemap; - system->type = RCCL_TOPO_4P2H_ROME; INFO(NCCL_GRAPH, "Use 4P2H on Rome: %s", ringRemap); return ncclSuccess; } diff --git a/src/group.cc b/src/group.cc index 0385ccf44..fc6163534 100644 --- a/src/group.cc +++ b/src/group.cc @@ -304,7 +304,7 @@ ncclResult_t ncclGroupEnd() { *args->init.newcomm = NULL; } else { struct ncclComm* comm = args->coll.comm; - for (int c=0; c<comm->p2pnChannels; c++) { + for (int c=0; c<std::max(comm->nChannels, comm->p2pnChannels); c++) { struct ncclChannel* channel = comm->channels+c; for (int i=0; i<channel->collCount; i++) { channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0; diff --git a/src/init.cc b/src/init.cc index 63abfcdfb..939dda414 100644 --- a/src/init.cc +++ b/src/init.cc @@ -242,7 +242,7 @@ static ncclResult_t commFree(ncclComm_t comm) { free(prof); CUDACHECK(hipFree(comm->hostDevComm.devProf)); - for (int channel=0; channel<comm->p2pnChannels; channel++) { + for (int channel=0; channel<std::max(comm->nChannels, comm->p2pnChannels); channel++) { if (comm->channels[channel].send_byte) INFO(NCCL_INIT, "# [%03d:%02d] Proxy Send %6.2f GB/s (%ld bytes %d measurements)", comm->rank, channel, (comm->channels[channel].bw_count) ? 
(float)comm->channels[channel].bw_cumulative/comm->channels[channel].bw_count : 0, @@ -368,11 +368,11 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) { static ncclResult_t devCommSetup(ncclComm_t comm) { // Duplicate the channels on the device - NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->p2pnChannels)); - NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->p2pnChannels)); + NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, std::max(comm->nChannels, comm->p2pnChannels))); + NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, std::max(comm->nChannels, comm->p2pnChannels))); // Copy userRanks and peers - for (int r=0; r<comm->p2pnChannels; r++) { + for (int r=0; r<std::max(comm->nChannels, comm->p2pnChannels); r++) { NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks)); }