Merge pull request #186 from wenkaidu/v2.6.4

Merge with NCCL 2.6.4
ROCm · Apr 2, 2020 · 3cbe5c8 · 3cbe5c8
2 parents ebc823e + 6f54b23
commit 3cbe5c8
Show file tree

Hide file tree

Showing 70 changed files with 4,733 additions and 2,043 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -111,6 +111,7 @@ set(CC_SOURCES
  src/graph/connect.cc
  src/graph/tuning.cc
  src/graph/topo.cc
+ src/graph/xml.cc
  src/collectives/all_reduce.cc
  src/collectives/all_gather.cc
  src/collectives/reduce.cc
@@ -122,6 +123,7 @@ set(CC_SOURCES
  src/misc/utils.cc
  src/misc/ibvwrap.cc
  src/misc/nvmlwrap_stub.cc
+ src/transport/coll_net.cc
  src/transport/net.cc
  src/transport/net_ib.cc
  src/transport/net_socket.cc

diff --git a/Makefile b/Makefile
@@ -1,6 +1,5 @@
 #
 # Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
-# Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
 #
 # See LICENSE.txt for license information
 #

diff --git a/makefiles/common.mk b/makefiles/common.mk
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #

diff --git a/makefiles/version.mk b/makefiles/version.mk
@@ -1,6 +1,6 @@
 ##### version
 NCCL_MAJOR := 2
-NCCL_MINOR := 5
-NCCL_PATCH := 7
+NCCL_MINOR := 6
+NCCL_PATCH := 4
 NCCL_SUFFIX :=
 PKG_REVISION := 1
diff --git a/src/Makefile b/src/Makefile
@@ -1,6 +1,5 @@
 #
-# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
-# Modifications Copyright (c) 2015-2020, Advanced Micro Devices, Inc. All rights reserved.
+# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
 #
 # See LICENSE.txt for license information
 #
@@ -12,9 +11,9 @@ include ../makefiles/version.mk
 INCEXPORTS := nccl.h nccl_net.h
 LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc \
  misc/nvmlwrap.cc misc/ibvwrap.cc misc/utils.cc misc/argcheck.cc \
- transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
+ transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
  collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
- graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc
+ graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc
 
 ##### lib files
 LIBNAME := libnccl.so

diff --git a/src/bootstrap.cc b/src/bootstrap.cc
@@ -1,6 +1,5 @@
 /*************************************************************************
  * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
- * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
  *
  * See LICENSE.txt for license information
  ************************************************************************/

diff --git a/src/channel.cc b/src/channel.cc
@@ -7,24 +7,32 @@
 
 #include "channel.h"
 #include "param.h"
+#include "graph.h"
 
-NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
+#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
+#define DEFAULT_BUFFER_SIZE_BYTES_ARM (1LL << 20) /* 1MiB */
+
+NCCL_PARAM(Buffsize, "BUFFSIZE", -2);
 
 ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
  struct ncclChannel* channel = comm->channels+channelid;
  channel->id = channelid;
 
  // Setup intermediate buffering
- channel->buffSize = ncclParamBuffsize();
+ int buffSize = ncclParamBuffsize();
+ int cpuArch, cpuVendor, cpuModel;
+ NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
+ channel->buffSize = buffSize != -2 ? buffSize :
+ cpuArch == NCCL_TOPO_CPU_ARCH_ARM ? DEFAULT_BUFFER_SIZE_BYTES_ARM : DEFAULT_BUFFER_SIZE_BYTES;
 
  // Ring index to user rank table.
  NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
  NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));
 
  // Communication structures with peers.
- NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks));
- NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks));
- for (size_t i=0; i<comm->nRanks; ++i) {
+ NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network)
+ NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1));
+ for (size_t i=0; i<comm->nRanks+1; ++i) {
  channel->peers[i].send.comm = comm;
  channel->peers[i].recv.comm = comm;
  }
@@ -43,9 +51,13 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
  CUDACHECK(hipFree(channel->ring.devUserRanks));
 
  // Free transport proxy resources
- for (int r=0; r<nRanks; r++) {
+ // Note: free all send resources first due to CollNet arrangement
+ for (int r=0; r<nRanks+1; r++) {
  struct ncclPeer* peer = channel->peers+r;
  if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
+ }
+ for (int r=0; r<nRanks+1; r++) {
+ struct ncclPeer* peer = channel->peers+r;
  if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
  }
 

diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h
@@ -72,6 +72,10 @@ template<int UNROLL, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }
 
+template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllGatherCollNetKernel(struct CollectiveArgs* args) { }
+
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
@@ -135,6 +139,10 @@ template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }
 
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllGatherCollNetLLKernel(struct CollectiveArgs* args) { }
+
 #include "prims_ll128.h"
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
@@ -200,3 +208,7 @@ __device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { }
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllGatherCollNetLL128Kernel(struct CollectiveArgs* args) { }
diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h
@@ -163,6 +163,63 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
  } while(0);
 }
 
+template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllReduceCollNetKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->bid;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ const ssize_t size = args->N;
+ const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
+ int chunkSize = args->lastChunkSize;
+ const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = args->nChannels*chunkSize;
+
+ if (loopSize > size) {
+ chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+ }
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+ if (blockIdx.x < args->nChannels) { // first half of the channels do reduce
+ struct ncclTree* tree = &channel->collTreeUp;
+ ncclPrimitives<UNROLL, 1, 1, T, 1, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Up
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ prims.send(thisInput+offset, nelem);
+ } else {
+ prims.recvReduceSend(thisInput+offset, nelem);
+ }
+ }
+ }
+
+ if (blockIdx.x >= args->nChannels) { // second half of the channels do broadcast
+ struct ncclTree* tree = &channel->collTreeDn;
+ ncclPrimitives<UNROLL, 1, 1, T, 1, 1, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Down
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ prims.send(thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ prims.recv(thisOutput+offset, nelem);
+ } else {
+ prims.recvCopySend(thisOutput+offset, nelem);
+ }
+ }
+ }
+}
+
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
@@ -298,6 +355,62 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
  } while(0);
 }
 
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllReduceCollNetLLKernel(struct CollectiveArgs* args) {
+ const int tid = threadIdx.x;
+ const int nthreads = args->nThreads;
+ const int bid = args->bid;
+ struct ncclDevComm* comm = args->comm;
+ struct ncclChannel* channel = comm->channels+blockIdx.x;
+ const ssize_t size = args->N;
+ ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
+ const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
+ const ssize_t loopSize = args->nChannels*chunkSize;
+
+ if (loopSize > size) {
+ chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
+ }
+
+ // Compute pointers
+ const T * __restrict__ thisInput = (const T*)args->ThisInput;
+ T * __restrict__ thisOutput = (T*)args->ThisOutput;
+
+ if (blockIdx.x < args->nChannels) { // first half of the channels do reduce
+ struct ncclTree* tree = &channel->collTreeUp;
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Up
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ LLprims.send(thisInput+offset, nelem);
+ } else {
+ LLprims.recvReduceSend(thisInput+offset, nelem);
+ }
+ }
+ }
+
+ if (blockIdx.x >= args->nChannels) { // second half of the channels do broadcast
+ struct ncclTree* tree = &channel->collTreeDn;
+ ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
+ for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
+ // Down
+ ssize_t offset = gridOffset + bid*chunkSize;
+ int nelem = min(chunkSize, size-offset);
+ if (tree->up == -1) {
+ LLprims.send(thisOutput+offset, nelem);
+ } else if (tree->down[0] == -1) {
+ LLprims.recv(thisOutput+offset, nelem);
+ } else {
+ LLprims.recvCopySend(thisOutput+offset, nelem);
+ }
+ }
+ }
+}
+
 #include "prims_ll128.h"
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
@@ -437,3 +550,7 @@ __device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
  }
  }
 }
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclAllReduceCollNetLL128Kernel(struct CollectiveArgs* args) { }
diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h
@@ -73,6 +73,10 @@ template<int UNROLL, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }
 
+template<int UNROLL, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclBroadcastCollNetKernel(struct CollectiveArgs* args) { }
+
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
@@ -122,6 +126,10 @@ template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }
 
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclBroadcastCollNetLLKernel(struct CollectiveArgs* args) { }
+
 #include "prims_ll128.h"
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
@@ -171,3 +179,7 @@ __device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
 template<int UNUSED, class FUNC, typename T>
 __attribute__((noinline))
 __device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { }
+
+template<int UNUSED, class FUNC, typename T>
+__attribute__((noinline))
+__device__ void ncclBroadcastCollNetLL128Kernel(struct CollectiveArgs* args) { }
diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h
@@ -1,4 +1,3 @@
-#include "hip/hip_runtime.h"
 /*************************************************************************
  * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
  * Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
@@ -51,7 +50,8 @@ static inline __device__ void exitIfAbortBarrier(int abort) {
 
 #define NCCL_FUNC4(coll, op, dtype) \
  NCCL_FUNC5(coll##Tree, op, dtype), \
- NCCL_FUNC5(coll##Ring, op, dtype)
+ NCCL_FUNC5(coll##Ring, op, dtype), \
+ NCCL_FUNC5(coll##CollNet, op, dtype)
 
 // Must be consistent with ncclDataType_t
 #define NCCL_FUNCS3A(coll, op) \
@@ -133,24 +133,30 @@ struct Caller<f, f + 1>{
 inline
 __device__
 void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept {
- if (c->funcIndex < 240) {
- if (c->funcIndex % 6 == 0) ncclBroadcastTreeLL_copy_i8(&c->args);
- else if (c->funcIndex % 6 == 1) ncclBroadcastTreeLL128_copy_i8(&c->args);
- else if (c->funcIndex % 6 == 2) ncclBroadcastTree_copy_i8(&c->args);
- else if (c->funcIndex % 6 == 3) ncclBroadcastRingLL_copy_i8(&c->args);
- else if (c->funcIndex % 6 == 4) ncclBroadcastRingLL128_copy_i8(&c->args);
- else ncclBroadcastRing_copy_i8(&c->args);
+ if (c->funcIndex < 360) {
+ if (c->funcIndex % 9 == 0) ncclBroadcastTreeLL_copy_i8(&c->args);
+ else if (c->funcIndex % 9 == 1) ncclBroadcastTreeLL128_copy_i8(&c->args);
+ else if (c->funcIndex % 9 == 2) ncclBroadcastTree_copy_i8(&c->args);
+ else if (c->funcIndex % 9 == 3) ncclBroadcastRingLL_copy_i8(&c->args);
+ else if (c->funcIndex % 9 == 4) ncclBroadcastRingLL128_copy_i8(&c->args);
+ else if (c->funcIndex % 9 == 5) ncclBroadcastRing_copy_i8(&c->args);
+ else if (c->funcIndex % 9 == 6) ncclBroadcastCollNetLL_copy_i8(&c->args);
+ else if (c->funcIndex % 9 == 7) ncclBroadcastCollNetLL128_copy_i8(&c->args);
+ else ncclBroadcastCollNet_copy_i8(&c->args);
  }
- else if (c->funcIndex < 480) Caller<240, 480>::call(c);
- else if (c->funcIndex < 720) {
- if (c->funcIndex % 6 == 0) ncclAllGatherTreeLL_copy_i8(&c->args);
- else if (c->funcIndex % 6 == 1) ncclAllGatherTreeLL128_copy_i8(&c->args);
- else if (c->funcIndex % 6 == 2) ncclAllGatherTree_copy_i8(&c->args);
- else if (c->funcIndex % 6 == 3) ncclAllGatherRingLL_copy_i8(&c->args);
- else if (c->funcIndex % 6 == 4) ncclAllGatherRingLL128_copy_i8(&c->args);
- else ncclAllGatherRing_copy_i8(&c->args);
+ else if (c->funcIndex < 720) Caller<360, 720>::call(c);
+ else if (c->funcIndex < 1080) {
+ if (c->funcIndex % 9 == 0) ncclAllGatherTreeLL_copy_i8(&c->args);
+ else if (c->funcIndex % 9 == 1) ncclAllGatherTreeLL128_copy_i8(&c->args);
+ else if (c->funcIndex % 9 == 2) ncclAllGatherTree_copy_i8(&c->args);
+ else if (c->funcIndex % 9 == 3) ncclAllGatherRingLL_copy_i8(&c->args);
+ else if (c->funcIndex % 9 == 4) ncclAllGatherRingLL128_copy_i8(&c->args);
+ else if (c->funcIndex % 9 == 5) ncclAllGatherRing_copy_i8(&c->args);
+ else if (c->funcIndex % 9 == 6) ncclAllGatherCollNetLL_copy_i8(&c->args);
+ else if (c->funcIndex % 9 == 7) ncclAllGatherCollNetLL128_copy_i8(&c->args);
+ else ncclAllGatherCollNet_copy_i8(&c->args);
  }
- else Caller<720, 1200>::call(c);
+ else Caller<1080, 1800>::call(c);
 }
 
 static __device__ void load_parallel(void* dst, void* src, size_t size, int tid, uint32_t* abortCount) {
@@ -274,7 +280,8 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclDevComm* comm) { \
 
 #define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
  IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \
- IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING)
+ IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING) \
+ IMPL_COLL4(coll##CollNet, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_COLLNET)
 
 #define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
  IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \