From 42955f5f4fcc75d14662a1d7cbdf84cf6b75507b Mon Sep 17 00:00:00 2001
From: Wenkai Du <wenkai.du@amd.com>
Date: Thu, 17 Sep 2020 16:01:45 -0700
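Subject: [PATCH] Limit P2P channels on Rome

When the topology type is RCCL_TOPO_4P2H_ROME and there are no NET nodes,
set p2pnChannels and p2pnChannelsPerPeer to 2 instead of rounding up to the
next power of two.

parseRome4P2H now sets RCCL_TOPO_4P2H_ROME for every 8-GPU Rome system,
before the 4-or-8 numa node check, rather than only after a ring remap
pattern has been matched.

Loops and device allocations that were bounded by p2pnChannels now use
max(nChannels, p2pnChannels).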

---
 src/enqueue.cc      |  2 +-
 src/graph/paths.cc  | 13 ++++++++++---
 src/graph/search.cc |  9 ++++++---
 src/group.cc        |  2 +-
 src/init.cc         |  8 ++++----
 5 files changed, 22 insertions(+), 12 deletions(-)
diff --git a/src/enqueue.cc b/src/enqueue.cc
index a5438b5b2..152490905 100644
--- a/src/enqueue.cc
+++ b/src/enqueue.cc
@@ -90,7 +90,7 @@ ncclResult_t ncclLaunchCooperativeKernelMultiDevice(hipLaunchParams *paramsList,
 
 ncclResult_t setupLaunch(struct ncclComm* comm, hipLaunchParams* params) {
   // Only launch blocks where we have work to do.
-  for (int c=0; c<comm->p2pnChannels; c++) {
+  for (int c=0; c<std::max(comm->nChannels, comm->p2pnChannels); c++) {
     if (comm->channels[c].collCount) params->gridDim.x = c+1;
   }
 
diff --git a/src/graph/paths.cc b/src/graph/paths.cc
index 10c729647..edd98df62 100644
--- a/src/graph/paths.cc
+++ b/src/graph/paths.cc
@@ -510,9 +510,16 @@ ncclResult_t ncclTopoComputeP2pChannels(struct ncclComm* comm) {
     }
   }
 
-  // Round to next pow2 nChannelsPerPeer and nChannels
-  comm->p2pnChannelsPerPeer = nextPow2(minChannels);
-  comm->p2pnChannels = nextPow2(comm->p2pnChannels);
+  if (comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_4P2H_ROME) {
+    // Rome with no NET nodes: fix P2P at 2 channels total and 2 per peer (skip pow2 rounding)
+    comm->p2pnChannelsPerPeer = 2;
+    comm->p2pnChannels = 2;
+  }
+  else {
+    // Round to next pow2 nChannelsPerPeer and nChannels
+    comm->p2pnChannelsPerPeer = nextPow2(minChannels);
+    comm->p2pnChannels = nextPow2(comm->p2pnChannels);
+  }
 
   // Init channels that weren't used so far
   for (int c=comm->nChannels; c<comm->p2pnChannels; c++) NCCLCHECK(initChannel(comm, c));
diff --git a/src/graph/search.cc b/src/graph/search.cc
index 6ba948272..689640555 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -892,14 +892,18 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) {
   *str = 0;
   int ngpus = system->nodes[GPU].count;
   int ncpus = system->nodes[CPU].count;
-  // 8 GPUs and 4 numa nodes only
-  if (ngpus != 8 || (ncpus != 4 && ncpus != 8))
+  // 8 GPUs only
+  if (ngpus != 8)
     return ncclSuccess;
   // only valid on Rome
   int arch, vendor, model;
   NCCLCHECK(ncclTopoCpuType(system, &arch, &vendor, &model));
   if (arch != NCCL_TOPO_CPU_ARCH_X86 || vendor != NCCL_TOPO_CPU_VENDOR_AMD || model != NCCL_TOPO_CPU_TYPE_ROME)
     return ncclSuccess;
+  system->type = RCCL_TOPO_4P2H_ROME;
+  // type is set above for any 8-GPU Rome; the ring remap below needs 4 or 8 numa nodes
+  if (ncpus != 4 && ncpus != 8)
+    return ncclSuccess;
   // number of GPUs and NICs on each numa node is used as first screening pattern
   char pattern[256];
   for (i = 0; i < ncpus; i++) {
@@ -998,7 +1002,6 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) {
   }
   ringRemap[i] = 0;
   *str = ringRemap;
-  system->type = RCCL_TOPO_4P2H_ROME;
   INFO(NCCL_GRAPH, "Use 4P2H on Rome: %s", ringRemap);
   return ncclSuccess;
 }
diff --git a/src/group.cc b/src/group.cc
index 0385ccf44..fc6163534 100644
--- a/src/group.cc
+++ b/src/group.cc
@@ -304,7 +304,7 @@ ncclResult_t ncclGroupEnd() {
         *args->init.newcomm = NULL;
       } else {
         struct ncclComm* comm = args->coll.comm;
-        for (int c=0; c<comm->p2pnChannels; c++) {
+        for (int c=0; c<std::max(comm->nChannels, comm->p2pnChannels); c++) {
           struct ncclChannel* channel = comm->channels+c;
           for (int i=0; i<channel->collCount; i++) {
             channel->collectives[(channel->collStart + i)%NCCL_MAX_OPS].active = 0;
diff --git a/src/init.cc b/src/init.cc
index 63abfcdfb..939dda414 100644
--- a/src/init.cc
+++ b/src/init.cc
@@ -242,7 +242,7 @@ static ncclResult_t commFree(ncclComm_t comm) {
   free(prof);
   CUDACHECK(hipFree(comm->hostDevComm.devProf));
 
-  for (int channel=0; channel<comm->p2pnChannels; channel++) {
+  for (int channel=0; channel<std::max(comm->nChannels, comm->p2pnChannels); channel++) {
     if (comm->channels[channel].send_byte) INFO(NCCL_INIT, "# [%03d:%02d] Proxy Send %6.2f GB/s (%ld bytes %d measurements)",
       comm->rank, channel, (comm->channels[channel].bw_count) ?
       (float)comm->channels[channel].bw_cumulative/comm->channels[channel].bw_count : 0,
@@ -368,11 +368,11 @@ static ncclResult_t commAlloc(ncclComm_t* comret, int ndev, int rank) {
 
 static ncclResult_t devCommSetup(ncclComm_t comm) {
   // Duplicate the channels on the device
-  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, comm->p2pnChannels));
-  NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, comm->p2pnChannels));
+  NCCLCHECK(ncclCudaCalloc(&comm->hostDevComm.channels, std::max(comm->nChannels, comm->p2pnChannels)));
+  NCCLCHECK(ncclCudaMemcpy(comm->hostDevComm.channels, comm->channels, std::max(comm->nChannels, comm->p2pnChannels)));
 
   // Copy userRanks and peers
-  for (int r=0; r<comm->p2pnChannels; r++) {
+  for (int r=0; r<std::max(comm->nChannels, comm->p2pnChannels); r++) {
     NCCLCHECK(ncclCudaMemcpy(comm->channels[r].ring.devUserRanks, comm->channels[r].ring.userRanks, comm->nRanks));
   }