Skip to content

Commit

Permalink
DMA-BUF support opt-in (#905) (#907)
Browse files Browse the repository at this point in the history
* DMA-BUF support opt-in (#905)

* Added the dmaBufSupport opt-in check to every part of the code that should invoke it

* added to the original commit
  • Loading branch information
akolliasAMD committed Oct 6, 2023
1 parent f9012b6 commit 3d014cc
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 11 deletions.
11 changes: 5 additions & 6 deletions src/init.cc
Original file line number Diff line number Diff line change
Expand Up @@ -425,11 +425,10 @@ NCCL_PARAM(GdrCopyFifoEnable, "GDRCOPY_FIFO_ENABLE", 1);
NCCL_PARAM(WorkFifoDepth, "WORK_FIFO_DEPTH", 64<<10);
enum ncclLaunchMode ncclParamLaunchMode;

NCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 0);

// Detect DMA-BUF support
static ncclResult_t dmaBufSupported(struct ncclComm* comm) {
if (ncclParamDmaBufEnable() == 0 || comm->ncclNet->regMrDmaBuf == NULL || rocmLibraryInit() != ncclSuccess) return ncclInternalError;
if (comm->ncclNet->regMrDmaBuf == NULL || rocmLibraryInit() != ncclSuccess) return ncclInternalError;
#if CUDA_VERSION >= 11070
int flag = 0;
CUdevice dev;
Expand Down Expand Up @@ -1727,7 +1726,7 @@ constexpr nvtxPayloadSchemaEntry_t CommInitRankSchema[] = {
NCCL_API(ncclResult_t, ncclCommInitRank, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank);
ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank) {
// Load the CUDA driver and dlsym hooks (can fail on old drivers)
if (ncclParamDmaBufEnable()) rocmLibraryInit();
rocmLibraryInit();

int cudaDev;
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
Expand All @@ -1743,7 +1742,7 @@ ncclResult_t ncclCommInitRank(ncclComm_t* newcomm, int nranks, ncclUniqueId comm
NCCL_API(ncclResult_t, ncclCommInitRankMulti, ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int virtualId);
ncclResult_t ncclCommInitRankMulti(ncclComm_t* newcomm, int nranks, ncclUniqueId commId, int myrank, int virtualId) {
// Load the CUDA driver and dlsym hooks (can fail on old drivers)
if (ncclParamDmaBufEnable()) rocmLibraryInit();
rocmLibraryInit();

int cudaDev;
ncclConfig_t config = NCCL_CONFIG_INITIALIZER;
Expand All @@ -1770,7 +1769,7 @@ ncclResult_t ncclCommInitAll(ncclComm_t* comms, int ndev, const int* devlist) {
NVTX3_FUNC_WITH_PARAMS(CommInitAll, CommInitAllSchema, ndev)

// Load the CUDA driver and dlsym hooks (can fail on old drivers)
if (ncclParamDmaBufEnable()) (void) rocmLibraryInit();
rocmLibraryInit();

NCCLCHECKGOTO(PtrCheck(comms, "CommInitAll", "comms"), ret, fail);
if (ndev < 0) {
Expand Down Expand Up @@ -1834,7 +1833,7 @@ ncclResult_t ncclCommInitRankConfig(ncclComm_t *newcomm, int nranks, ncclUniqueI
ncclConfig_t *internalConfigPtr = NULL;
NCCLCHECK(ncclGroupStartInternal());

if (ncclParamDmaBufEnable()) (void) rocmLibraryInit();
rocmLibraryInit();
CUDACHECKGOTO(cudaGetDevice(&cudaDev), ret, fail);

if (config == NULL)
Expand Down
18 changes: 13 additions & 5 deletions src/misc/rocmwrap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "debug.h"
#include "rocmwrap.h"
#include "hsa/hsa.h"
#include "param.h"

#include <dlfcn.h>
#include <sys/utsname.h>
Expand All @@ -17,7 +18,7 @@
#define DECLARE_ROCM_PFN(symbol) PFN_##symbol pfn_##symbol = nullptr

DECLARE_ROCM_PFN(hsa_amd_portable_export_dmabuf); // DMA-BUF support

NCCL_PARAM(DmaBufEnable, "DMABUF_ENABLE", 0);
/* ROCr Driver functions loaded with dlsym() */
DECLARE_ROCM_PFN(hsa_init);
DECLARE_ROCM_PFN(hsa_system_get_info);
Expand All @@ -28,14 +29,14 @@ static enum { hsaUninitialized, hsaInitializing, hsaInitialized, hsaError } hsaS
static void *hsaLib;
static uint16_t version_major, version_minor;
bool ncclCudaLaunchBlocking = false;
bool dmaBufSupport = false;

ncclResult_t rocmLibraryInit(void) {
do {
char* val = getenv("CUDA_LAUNCH_BLOCKING");
ncclCudaLaunchBlocking = val!=nullptr && val[0]!=0 && !(val[0]=='0' && val[1]==0);
} while (0);

bool dmaBufSupport = false;
hsa_status_t res;

if (hsaState == hsaInitialized)
Expand Down Expand Up @@ -108,14 +109,21 @@ ncclResult_t rocmLibraryInit(void) {

/* DMA-BUF support */
//ROCm support
if (ncclParamDmaBufEnable() == 0 ) {
INFO(NCCL_INIT, "Dmabuf feature disabled without NCCL_ENABLE_DMABUF_SUPPORT=1");
goto error;
}
res = pfn_hsa_system_get_info((hsa_system_info_t) 0x204, &dmaBufSupport);
if (res != HSA_STATUS_SUCCESS || !dmaBufSupport) INFO(NCCL_INIT, "Current version of ROCm does not support dmabuf feature.");
if (res != HSA_STATUS_SUCCESS || !dmaBufSupport) {
INFO(NCCL_INIT, "Current version of ROCm does not support dmabuf feature.");
goto error;
}
else {
pfn_hsa_amd_portable_export_dmabuf = (PFN_hsa_amd_portable_export_dmabuf) dlsym(hsaLib, "hsa_amd_portable_export_dmabuf");
if (pfn_hsa_amd_portable_export_dmabuf == NULL) {
WARN("Failed to load ROCr missing symbol hsa_amd_portable_export_dmabuf");
goto error;
}
}
else {
//check OS kernel support
struct utsname utsname;
Expand All @@ -126,7 +134,7 @@ ncclResult_t rocmLibraryInit(void) {
char buf[256];
int found_opt1 = 0;
int found_opt2 = 0;

//check for kernel name exists
if (uname(&utsname) == -1) INFO(NCCL_INIT,"Could not get kernel name");
//format and store the kernel conf file location
Expand Down
1 change: 1 addition & 0 deletions src/transport/net_ib.cc
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,7 @@ ncclResult_t ncclIbDmaBufSupport(int dev) {
static int dmaBufSupported = -1;
if (dmaBufSupported == -1) {
ncclResult_t res;
NCCLCHECKGOTO(rocmLibraryInit(), res, failure);
struct ibv_pd* pd;
struct ibv_context* ctx;
ctx = ncclIbDevs[dev].context;
Expand Down

0 comments on commit 3d014cc

Please sign in to comment.