Skip to content

Commit

Permalink
Merge pull request #186 from wenkaidu/v2.6.4
Browse files Browse the repository at this point in the history
Merge with NCCL 2.6.4
  • Loading branch information
wenkaidu committed Apr 2, 2020
2 parents ebc823e + 6f54b23 commit 3cbe5c8
Show file tree
Hide file tree
Showing 70 changed files with 4,733 additions and 2,043 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ set(CC_SOURCES
src/graph/connect.cc
src/graph/tuning.cc
src/graph/topo.cc
src/graph/xml.cc
src/collectives/all_reduce.cc
src/collectives/all_gather.cc
src/collectives/reduce.cc
Expand All @@ -122,6 +123,7 @@ set(CC_SOURCES
src/misc/utils.cc
src/misc/ibvwrap.cc
src/misc/nvmlwrap_stub.cc
src/transport/coll_net.cc
src/transport/net.cc
src/transport/net_ib.cc
src/transport/net_socket.cc
Expand Down
1 change: 0 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
# Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE.txt for license information
#
Expand Down
2 changes: 1 addition & 1 deletion makefiles/common.mk
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
Expand Down
4 changes: 2 additions & 2 deletions makefiles/version.mk
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 5
NCCL_PATCH := 7
NCCL_MINOR := 6
NCCL_PATCH := 4
NCCL_SUFFIX :=
PKG_REVISION := 1
7 changes: 3 additions & 4 deletions src/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#
# Copyright (c) 2015-2019, NVIDIA CORPORATION. All rights reserved.
# Modifications Copyright (c) 2015-2020, Advanced Micro Devices, Inc. All rights reserved.
# Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
#
# See LICENSE.txt for license information
#
Expand All @@ -12,9 +11,9 @@ include ../makefiles/version.mk
INCEXPORTS := nccl.h nccl_net.h
LIBSRCFILES := init.cc channel.cc bootstrap.cc transport.cc enqueue.cc group.cc debug.cc \
misc/nvmlwrap.cc misc/ibvwrap.cc misc/utils.cc misc/argcheck.cc \
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc \
transport/p2p.cc transport/shm.cc transport/net.cc transport/net_socket.cc transport/net_ib.cc transport/coll_net.cc \
collectives/all_reduce.cc collectives/all_gather.cc collectives/broadcast.cc collectives/reduce.cc collectives/reduce_scatter.cc \
graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc
graph/topo.cc graph/paths.cc graph/search.cc graph/connect.cc graph/rings.cc graph/trees.cc graph/tuning.cc graph/xml.cc

##### lib files
LIBNAME := libnccl.so
Expand Down
1 change: 0 additions & 1 deletion src/bootstrap.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
/*************************************************************************
* Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/
Expand Down
24 changes: 18 additions & 6 deletions src/channel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,24 +7,32 @@

#include "channel.h"
#include "param.h"
#include "graph.h"

NCCL_PARAM(Buffsize, "BUFFSIZE", DEFAULT_BUFFER_SIZE_BYTES);
#define DEFAULT_BUFFER_SIZE_BYTES (1LL << 22) /* 4MiB */
#define DEFAULT_BUFFER_SIZE_BYTES_ARM (1LL << 20) /* 1MiB */

NCCL_PARAM(Buffsize, "BUFFSIZE", -2);

ncclResult_t initChannel(struct ncclComm* comm, int channelid) {
struct ncclChannel* channel = comm->channels+channelid;
channel->id = channelid;

// Setup intermediate buffering
channel->buffSize = ncclParamBuffsize();
int buffSize = ncclParamBuffsize();
int cpuArch, cpuVendor, cpuModel;
NCCLCHECK(ncclTopoCpuType(comm->topo, &cpuArch, &cpuVendor, &cpuModel));
channel->buffSize = buffSize != -2 ? buffSize :
cpuArch == NCCL_TOPO_CPU_ARCH_ARM ? DEFAULT_BUFFER_SIZE_BYTES_ARM : DEFAULT_BUFFER_SIZE_BYTES;

// Ring index to user rank table.
NCCLCHECK(ncclCudaCalloc(&channel->ring.devUserRanks, comm->nRanks));
NCCLCHECK(ncclCalloc(&channel->ring.userRanks, comm->nRanks));

// Communication structures with peers.
NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks));
NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks));
for (size_t i=0; i<comm->nRanks; ++i) {
NCCLCHECK(ncclCudaCalloc(&channel->devPeers, comm->nRanks+1)); // The extra one rank is for collnet root (i.e. network)
NCCLCHECK(ncclCalloc(&channel->peers, comm->nRanks+1));
for (size_t i=0; i<comm->nRanks+1; ++i) {
channel->peers[i].send.comm = comm;
channel->peers[i].recv.comm = comm;
}
Expand All @@ -43,9 +51,13 @@ ncclResult_t freeChannel(struct ncclChannel* channel, int nRanks) {
CUDACHECK(hipFree(channel->ring.devUserRanks));

// Free transport proxy resources
for (int r=0; r<nRanks; r++) {
// Note: free all send resources first due to CollNet arrangement
for (int r=0; r<nRanks+1; r++) {
struct ncclPeer* peer = channel->peers+r;
if (peer->send.transportResources) NCCLCHECK(peer->send.transportComm->free(peer->send.transportResources));
}
for (int r=0; r<nRanks+1; r++) {
struct ncclPeer* peer = channel->peers+r;
if (peer->recv.transportResources) NCCLCHECK(peer->recv.transportComm->free(peer->recv.transportResources));
}

Expand Down
12 changes: 12 additions & 0 deletions src/collectives/device/all_gather.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherTreeKernel(struct CollectiveArgs* args) { }

template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherCollNetKernel(struct CollectiveArgs* args) { }

template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherRingLLKernel(struct CollectiveArgs* args) {
Expand Down Expand Up @@ -135,6 +139,10 @@ template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherTreeLLKernel(struct CollectiveArgs* args) { }

template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherCollNetLLKernel(struct CollectiveArgs* args) { }

#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
Expand Down Expand Up @@ -200,3 +208,7 @@ __device__ void ncclAllGatherRingLL128Kernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherTreeLL128Kernel(struct CollectiveArgs* args) { }

template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllGatherCollNetLL128Kernel(struct CollectiveArgs* args) { }
117 changes: 117 additions & 0 deletions src/collectives/device/all_reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,63 @@ __device__ void ncclAllReduceTreeKernel(struct CollectiveArgs* args) {
} while(0);
}

template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceCollNetKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->bid;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
const ssize_t size = args->N;
const int stepSize = channel->buffSize / (sizeof(T)*NCCL_STEPS);
int chunkSize = args->lastChunkSize;
const ssize_t minChunkSize = nthreads*8*sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nChannels*chunkSize;

if (loopSize > size) {
chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
}

// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;

if (blockIdx.x < args->nChannels) { // first half of the channels do reduce
struct ncclTree* tree = &channel->collTreeUp;
ncclPrimitives<UNROLL, 1, 1, T, 1, 1, FUNC> prims(tid, args->nThreads, tree->down, &tree->up, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
prims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
prims.send(thisInput+offset, nelem);
} else {
prims.recvReduceSend(thisInput+offset, nelem);
}
}
}

if (blockIdx.x >= args->nChannels) { // second half of the channels do broadcast
struct ncclTree* tree = &channel->collTreeDn;
ncclPrimitives<UNROLL, 1, 1, T, 1, 1, FUNC> prims(tid, args->nThreads, &tree->up, tree->down, NULL, stepSize, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
prims.send(thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
prims.recv(thisOutput+offset, nelem);
} else {
prims.recvCopySend(thisOutput+offset, nelem);
}
}
}
}

template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceRingLLKernel(struct CollectiveArgs* args) {
Expand Down Expand Up @@ -298,6 +355,62 @@ __device__ void ncclAllReduceTreeLLKernel(struct CollectiveArgs* args) {
} while(0);
}

template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceCollNetLLKernel(struct CollectiveArgs* args) {
const int tid = threadIdx.x;
const int nthreads = args->nThreads;
const int bid = args->bid;
struct ncclDevComm* comm = args->comm;
struct ncclChannel* channel = comm->channels+blockIdx.x;
const ssize_t size = args->N;
ssize_t chunkSize = NCCL_LL_SLICE_LINES * sizeof(uint64_t) / sizeof(T);
const ssize_t minChunkSize = nthreads*sizeof(uint64_t) / sizeof(T);
const ssize_t loopSize = args->nChannels*chunkSize;

if (loopSize > size) {
chunkSize = DIVUP(size, args->nChannels*minChunkSize)*minChunkSize;
}

// Compute pointers
const T * __restrict__ thisInput = (const T*)args->ThisInput;
T * __restrict__ thisOutput = (T*)args->ThisOutput;

if (blockIdx.x < args->nChannels) { // first half of the channels do reduce
struct ncclTree* tree = &channel->collTreeUp;
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, tree->down, &tree->up, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Up
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
LLprims.recvReduceCopy(thisInput+offset, thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
LLprims.send(thisInput+offset, nelem);
} else {
LLprims.recvReduceSend(thisInput+offset, nelem);
}
}
}

if (blockIdx.x >= args->nChannels) { // second half of the channels do broadcast
struct ncclTree* tree = &channel->collTreeDn;
ncclLLPrimitives<T, FUNC, 1, 1> LLprims(tid, nthreads, &tree->up, tree->down, channel, comm, args->opCount);
for (ssize_t gridOffset = 0; gridOffset < size; gridOffset += loopSize) {
// Down
ssize_t offset = gridOffset + bid*chunkSize;
int nelem = min(chunkSize, size-offset);
if (tree->up == -1) {
LLprims.send(thisOutput+offset, nelem);
} else if (tree->down[0] == -1) {
LLprims.recv(thisOutput+offset, nelem);
} else {
LLprims.recvCopySend(thisOutput+offset, nelem);
}
}
}
}

#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
Expand Down Expand Up @@ -437,3 +550,7 @@ __device__ void ncclAllReduceTreeLL128Kernel(struct CollectiveArgs* args) {
}
}
}

template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclAllReduceCollNetLL128Kernel(struct CollectiveArgs* args) { }
12 changes: 12 additions & 0 deletions src/collectives/device/broadcast.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastTreeKernel(struct CollectiveArgs* args) { }

template<int UNROLL, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastCollNetKernel(struct CollectiveArgs* args) { }

template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastRingLLKernel(struct CollectiveArgs* args) {
Expand Down Expand Up @@ -122,6 +126,10 @@ template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastTreeLLKernel(struct CollectiveArgs* args) { }

template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastCollNetLLKernel(struct CollectiveArgs* args) { }

#include "prims_ll128.h"
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
Expand Down Expand Up @@ -171,3 +179,7 @@ __device__ void ncclBroadcastRingLL128Kernel(struct CollectiveArgs* args) {
template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastTreeLL128Kernel(struct CollectiveArgs* args) { }

template<int UNUSED, class FUNC, typename T>
__attribute__((noinline))
__device__ void ncclBroadcastCollNetLL128Kernel(struct CollectiveArgs* args) { }
45 changes: 26 additions & 19 deletions src/collectives/device/common.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
#include "hip/hip_runtime.h"
/*************************************************************************
* Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved.
* Modifications Copyright (c) 2019-2020 Advanced Micro Devices, Inc. All rights reserved.
Expand Down Expand Up @@ -51,7 +50,8 @@ static inline __device__ void exitIfAbortBarrier(int abort) {

#define NCCL_FUNC4(coll, op, dtype) \
NCCL_FUNC5(coll##Tree, op, dtype), \
NCCL_FUNC5(coll##Ring, op, dtype)
NCCL_FUNC5(coll##Ring, op, dtype), \
NCCL_FUNC5(coll##CollNet, op, dtype)

// Must be consistent with ncclDataType_t
#define NCCL_FUNCS3A(coll, op) \
Expand Down Expand Up @@ -133,24 +133,30 @@ struct Caller<f, f + 1>{
inline
__device__
void NCCL_CALL_FUNCTIONS(struct ncclColl* const c) noexcept {
if (c->funcIndex < 240) {
if (c->funcIndex % 6 == 0) ncclBroadcastTreeLL_copy_i8(&c->args);
else if (c->funcIndex % 6 == 1) ncclBroadcastTreeLL128_copy_i8(&c->args);
else if (c->funcIndex % 6 == 2) ncclBroadcastTree_copy_i8(&c->args);
else if (c->funcIndex % 6 == 3) ncclBroadcastRingLL_copy_i8(&c->args);
else if (c->funcIndex % 6 == 4) ncclBroadcastRingLL128_copy_i8(&c->args);
else ncclBroadcastRing_copy_i8(&c->args);
if (c->funcIndex < 360) {
if (c->funcIndex % 9 == 0) ncclBroadcastTreeLL_copy_i8(&c->args);
else if (c->funcIndex % 9 == 1) ncclBroadcastTreeLL128_copy_i8(&c->args);
else if (c->funcIndex % 9 == 2) ncclBroadcastTree_copy_i8(&c->args);
else if (c->funcIndex % 9 == 3) ncclBroadcastRingLL_copy_i8(&c->args);
else if (c->funcIndex % 9 == 4) ncclBroadcastRingLL128_copy_i8(&c->args);
else if (c->funcIndex % 9 == 5) ncclBroadcastRing_copy_i8(&c->args);
else if (c->funcIndex % 9 == 6) ncclBroadcastCollNetLL_copy_i8(&c->args);
else if (c->funcIndex % 9 == 7) ncclBroadcastCollNetLL128_copy_i8(&c->args);
else ncclBroadcastCollNet_copy_i8(&c->args);
}
else if (c->funcIndex < 480) Caller<240, 480>::call(c);
else if (c->funcIndex < 720) {
if (c->funcIndex % 6 == 0) ncclAllGatherTreeLL_copy_i8(&c->args);
else if (c->funcIndex % 6 == 1) ncclAllGatherTreeLL128_copy_i8(&c->args);
else if (c->funcIndex % 6 == 2) ncclAllGatherTree_copy_i8(&c->args);
else if (c->funcIndex % 6 == 3) ncclAllGatherRingLL_copy_i8(&c->args);
else if (c->funcIndex % 6 == 4) ncclAllGatherRingLL128_copy_i8(&c->args);
else ncclAllGatherRing_copy_i8(&c->args);
else if (c->funcIndex < 720) Caller<360, 720>::call(c);
else if (c->funcIndex < 1080) {
if (c->funcIndex % 9 == 0) ncclAllGatherTreeLL_copy_i8(&c->args);
else if (c->funcIndex % 9 == 1) ncclAllGatherTreeLL128_copy_i8(&c->args);
else if (c->funcIndex % 9 == 2) ncclAllGatherTree_copy_i8(&c->args);
else if (c->funcIndex % 9 == 3) ncclAllGatherRingLL_copy_i8(&c->args);
else if (c->funcIndex % 9 == 4) ncclAllGatherRingLL128_copy_i8(&c->args);
else if (c->funcIndex % 9 == 5) ncclAllGatherRing_copy_i8(&c->args);
else if (c->funcIndex % 9 == 6) ncclAllGatherCollNetLL_copy_i8(&c->args);
else if (c->funcIndex % 9 == 7) ncclAllGatherCollNetLL128_copy_i8(&c->args);
else ncclAllGatherCollNet_copy_i8(&c->args);
}
else Caller<720, 1200>::call(c);
else Caller<1080, 1800>::call(c);
}

static __device__ void load_parallel(void* dst, void* src, size_t size, int tid, uint32_t* abortCount) {
Expand Down Expand Up @@ -274,7 +280,8 @@ __global__ void NCCL_KERN_NAME(coll, op, dtype)(struct ncclDevComm* comm) { \

#define IMPL_COLL3(coll, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType) \
IMPL_COLL4(coll##Tree, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_TREE) \
IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING)
IMPL_COLL4(coll##Ring, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_RING) \
IMPL_COLL4(coll##CollNet, op, ncclFunc, dtype, ctype, ncclColl, ncclOp, ncclType, NCCL_ALGO_COLLNET)

#define IMPL_COLL2(coll, op, ncclFunc, ncclColl, ncclOp) \
IMPL_COLL3(coll, op, ncclFunc, i8, int8_t, ncclColl, ncclOp, ncclInt8) \
Expand Down
Loading

0 comments on commit 3cbe5c8

Please sign in to comment.