Skip to content

Commit

Permalink
Fix share memory collision in multi-communicator case. (#93)
Browse files Browse the repository at this point in the history
Current SHM object name would only use pidHash and ranks as
identification, which would collide each other when program runs with
multiple communicators. Here we added commId info into pidHash, it makes
'pidHash'es of different communicators keeping in same process will be
distincted with each other.

Ported from original commit: https://github.com/lowintelligence/nccl/commits/shm
  • Loading branch information
wenkaidu authored Jul 2, 2019
1 parent e6a0da4 commit 949d680
Show file tree
Hide file tree
Showing 7 changed files with 24 additions and 12 deletions.
2 changes: 1 addition & 1 deletion src/include/transport.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ struct ncclTransportComm {

struct ncclTransport {
const char name[4];
ncclResult_t (*fillInfo)(ncclTinfo_t*, int);
ncclResult_t (*fillInfo)(ncclTinfo_t*, int, uint64_t);
ncclResult_t (*canConnect)(ncclTvalue_t*, ncclTinfo_t*, ncclTinfo_t*);
ncclResult_t (*getRings)(int, int*, int*, ncclTvalue_t*, int*, int*, int*, int, int*);
struct ncclTransportComm send;
Expand Down
1 change: 1 addition & 0 deletions src/include/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <stdint.h>

ncclResult_t getHostName(char* hostname, int maxlen);
uint64_t getnHash(const char* string, int n);
uint64_t getHostHash();
uint64_t getPidHash();

Expand Down
10 changes: 6 additions & 4 deletions src/init.cu
Original file line number Diff line number Diff line change
Expand Up @@ -292,9 +292,9 @@ static void showVersion() {
}
}

static ncclResult_t fillInfo(struct ncclInfo* info, int rank) {
static ncclResult_t fillInfo(struct ncclInfo* info, int rank, uint64_t commHash) {
for (int t=0; t<NTRANSPORTS; t++) {
NCCLCHECK(ncclTransports[t].fillInfo(info->tinfo+t, rank));
NCCLCHECK(ncclTransports[t].fillInfo(info->tinfo+t, rank, commHash));
}
return ncclSuccess;
}
Expand Down Expand Up @@ -517,11 +517,13 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* comm
int rank = comm->rank;
int nranks = comm->nRanks;
void* commState;
uint64_t commHash = getnHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
TRACE(NCCL_INIT, "comm %p, commHash %lu, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
NCCLCHECK(bootstrapInit(commId, rank, nranks, &commState));

struct ncclInfo* allInfo;
NCCLCHECK(ncclCalloc(&allInfo, nranks));
NCCLCHECK(fillInfo(allInfo+rank, rank));
NCCLCHECK(fillInfo(allInfo+rank, rank, commHash));
NCCLCHECK(bootstrapAllGather(commState, allInfo, sizeof(struct ncclInfo)));

int* connectTransport;
Expand Down Expand Up @@ -741,7 +743,7 @@ static ncclResult_t initTransportsAll(struct ncclComm** comms, const int* devs,
NCCLCHECK(ncclCalloc(&allInfo, nranks));
for (int rank=0; rank<nranks; rank++) {
CUDACHECK(hipSetDevice(devs[rank]));
NCCLCHECK(fillInfo(allInfo+rank, rank));
NCCLCHECK(fillInfo(allInfo+rank, rank, 0));
}

int* connectTransport;
Expand Down
9 changes: 9 additions & 0 deletions src/misc/utils.cu
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,15 @@ uint64_t getHash(const char* string) {
return result;
}

uint64_t getnHash(const char* string, int n) {
// Based on DJB2, result = result * 33 + char
uint64_t result = 9527;
for (int c = 0; c < n; c++) {
result = ((result << 5) + result) + string[c];
}
return result;
}

/* Generate a hash of the unique identifying string for this host
* that will be unique for both bare-metal and container instances
* Equivalent of a hash of;
Expand Down
2 changes: 1 addition & 1 deletion src/transport/net.cu
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ struct netRecvResources {

/* Fill information necessary to exchange between ranks to choose whether or not
* to use this transport */
ncclResult_t netFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
ncclResult_t netFillInfo(ncclTinfo_t* opaqueInfo, int rank, uint64_t commHash) {
struct netInfo* info = (struct netInfo*)opaqueInfo;
static_assert(sizeof(struct netInfo) <= sizeof(ncclTinfo_t), "NET Info too large");
info->rank = rank;
Expand Down
6 changes: 3 additions & 3 deletions src/transport/p2p.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,13 @@ struct p2pConnectInfo {

/* Fill information necessary to exchange between ranks to choose whether or not
* to use this transport */
ncclResult_t p2pFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
ncclResult_t p2pFillInfo(ncclTinfo_t* opaqueInfo, int rank, uint64_t commHash) {
struct p2pInfo* info = (struct p2pInfo*)opaqueInfo;
static_assert(sizeof(struct p2pInfo) <= sizeof(ncclTinfo_t), "p2p Info too large");
info->rank = rank;
CUDACHECK(hipGetDevice(&info->cudaDev));
info->hostHash=getHostHash();
info->pidHash=getPidHash();
info->hostHash=getHostHash()+commHash;
info->pidHash=getPidHash()+commHash;

// Get PCI Bus Id. We need to get the bus ID through CUDA first, since the
// cudaDev is a CUDA runtime dev number which could be different from the
Expand Down
6 changes: 3 additions & 3 deletions src/transport/shm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,13 @@ struct shmRecvResources {

/* Fill information necessary to exchange between ranks to choose whether or not
* to use this transport */
ncclResult_t shmFillInfo(ncclTinfo_t* opaqueInfo, int rank) {
ncclResult_t shmFillInfo(ncclTinfo_t* opaqueInfo, int rank, uint64_t commHash) {
struct shmInfo* info = (struct shmInfo*)opaqueInfo;
static_assert(sizeof(struct shmInfo) <= sizeof(ncclTinfo_t), "shm Info too large");
info->rank = rank;
CUDACHECK(hipGetDevice(&info->cudaDev));
info->hostHash=getHostHash();
info->pidHash=getPidHash();
info->hostHash=getHostHash()+commHash;
info->pidHash=getPidHash()+commHash;
return ncclSuccess;
}

Expand Down

0 comments on commit 949d680

Please sign in to comment.