diff --git a/CMakeLists.txt b/CMakeLists.txt index 511ba8099..538428755 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,7 +18,7 @@ option(BUILD_SHARED_LIBS "Build as shared library" option(BUILD_TESTS "Build unit test programs" OFF) option(COLLTRACE "Collective Trace Option" ON) option(ENABLE_MSCCL_KERNEL "Enable MSCCL while compiling" ON) -option(ENABLE_IFC "Enable indirect function call" OFF) +option(ENABLE_IFC "Enable indirect function call" ON) option(INSTALL_DEPENDENCIES "Force install dependencies" OFF) option(PROFILE "Enable profiling" OFF) option(TIMETRACE "Enable time-trace during compilation" OFF) diff --git a/src/collectives/device/all_gather.h b/src/collectives/device/all_gather.h index 26106d20c..e6c48b8a3 100644 --- a/src/collectives/device/all_gather.h +++ b/src/collectives/device/all_gather.h @@ -11,7 +11,7 @@ namespace { template -#ifdef USE_INDIRECT_FUNCTION_CALL +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) __device__ void runRing(ncclWorkElem *args) { #else __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) { diff --git a/src/collectives/device/all_reduce.h b/src/collectives/device/all_reduce.h index 658fc30b5..185ea74f6 100644 --- a/src/collectives/device/all_reduce.h +++ b/src/collectives/device/all_reduce.h @@ -15,7 +15,7 @@ namespace { template -#ifdef USE_INDIRECT_FUNCTION_CALL +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) __device__ void runRing(ncclWorkElem *args) { #else __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) { @@ -223,7 +223,7 @@ namespace { } template -#ifdef USE_INDIRECT_FUNCTION_CALL +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) __device__ void runTreeUpDown(ncclWorkElem *args) { #else __device__ __attribute__((noinline)) void runTreeUpDown(ncclWorkElem *args) { @@ -379,7 +379,7 @@ namespace { } template -#ifdef USE_INDIRECT_FUNCTION_CALL +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) __device__ void runTreeSplit(ncclWorkElem *args) { #else __device__ __attribute__((noinline)) void runTreeSplit(ncclWorkElem *args) { diff --git a/src/collectives/device/alltoall_pivot.h b/src/collectives/device/alltoall_pivot.h index c7912808e..693f22af2 100644 --- a/src/collectives/device/alltoall_pivot.h +++ b/src/collectives/device/alltoall_pivot.h @@ -10,7 +10,7 @@ namespace { template -#ifdef USE_INDIRECT_FUNCTION_CALL +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) __device__ void runRing(ncclWorkElem *args) { #else __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) { diff --git a/src/collectives/device/broadcast.h b/src/collectives/device/broadcast.h index 5dc72b5a4..6a149c9ee 100644 --- a/src/collectives/device/broadcast.h +++ b/src/collectives/device/broadcast.h @@ -10,7 +10,7 @@ namespace { template -#ifdef USE_INDIRECT_FUNCTION_CALL +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) __device__ void runRing(ncclWorkElem *args) { #else __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) { diff --git a/src/collectives/device/common.h b/src/collectives/device/common.h index 4acd171f4..5b3c47982 100644 --- a/src/collectives/device/common.h +++ b/src/collectives/device/common.h @@ -130,7 +130,7 @@ static const __device__ constexpr ncclKernelFunc_t ncclFuncs[]{ static_assert(FUNC_INDEX_P2P == 5410, "Wrong P2P function index"); static_assert(FUNC_INDEX_ALLTOALL_PIVOT == 5411, "Wrong AllToAllPivot function index"); -#ifndef USE_INDIRECT_FUNCTION_CALL +#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) template struct Caller { static __forceinline__ __device__ __host__ @@ -274,7 +274,7 @@ void NCCL_CALL_FUNCTIONS(unsigned short funcIndex) noexcept { template class ncclFunction { public: -#ifdef USE_INDIRECT_FUNCTION_CALL +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) __device__ __attribute__((noinline)) void run(struct ncclWorkElem* args) {} #else __device__ void run(struct ncclWorkElem* args) {} @@ -568,7 +568,7 @@ __forceinline__ __device__ void ncclKernel( if (ncclShmem.work.header.funcIndex == FnIndex) { RunWork().run(&ncclShmem.work); } else { -#ifdef USE_INDIRECT_FUNCTION_CALL +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) ncclFuncs[ncclShmem.work.header.funcIndex](); #else #ifdef ENABLE_LL128 @@ -627,7 +627,7 @@ __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDev // Examples : AllReduce, RING, LL, Sum, uint8 /* Functions for aggregation case */ -#ifdef USE_INDIRECT_FUNCTION_CALL +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) #define IMPL_COLL_FUNC(func, algo, proto, devredop, type) \ __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)() { \ RunWork, NCCL_ALGO_##algo, NCCL_PROTO_##proto>().run(&ncclShmem.work); \ diff --git a/src/collectives/device/onerank_reduce.cu b/src/collectives/device/onerank_reduce.cu index 0c40f9afc..f1f0fed9e 100644 --- a/src/collectives/device/onerank_reduce.cu +++ b/src/collectives/device/onerank_reduce.cu @@ -12,7 +12,7 @@ namespace { template -#ifdef USE_INDIRECT_FUNCTION_CALL +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) __device__ void oneRankReduce() { #else __device__ __attribute__((noinline)) void oneRankReduce() { @@ -48,7 +48,7 @@ namespace { } } -#ifdef USE_INDIRECT_FUNCTION_CALL +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) #define INSTANTIATE(devredop, type) \ __device__ void NCCL_ONERANK_REDUCE_NAME(devredop, type)() { \ oneRankReduce>(); \ diff --git a/src/collectives/device/op128.h b/src/collectives/device/op128.h index 5b694545c..586b50e37 100644 --- a/src/collectives/device/op128.h +++ b/src/collectives/device/op128.h @@ -124,7 +124,7 @@ union alignas(16) BytePack<16> { uint32_t u32[4]; uint64_t u64[2]; ulong2 ul2, native; -#ifndef USE_INDIRECT_FUNCTION_CALL +#if !defined(USE_INDIRECT_FUNCTION_CALL) || defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) inline __device__ BytePack<16>& operator=(BytePack<16> other) { u64[0] = other.u64[0]; u64[1] = other.u64[1]; diff --git a/src/collectives/device/reduce.h b/src/collectives/device/reduce.h index 491258ea6..811874676 100644 --- a/src/collectives/device/reduce.h +++ b/src/collectives/device/reduce.h @@ -11,7 +11,7 @@ namespace { template -#ifdef USE_INDIRECT_FUNCTION_CALL +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) __device__ void runRing(ncclWorkElem *args) { #else __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) { diff --git a/src/collectives/device/reduce_scatter.h b/src/collectives/device/reduce_scatter.h index 6a7caeba3..dd29d36c2 100644 --- a/src/collectives/device/reduce_scatter.h +++ b/src/collectives/device/reduce_scatter.h @@ -11,7 +11,7 @@ namespace { template -#ifdef USE_INDIRECT_FUNCTION_CALL +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) __device__ void runRing(ncclWorkElem *args) { #else __device__ __attribute__((noinline)) void runRing(ncclWorkElem *args) { diff --git a/src/collectives/device/sendrecv.h b/src/collectives/device/sendrecv.h index 86c72f457..5c5b5b72e 100644 --- a/src/collectives/device/sendrecv.h +++ b/src/collectives/device/sendrecv.h @@ -174,7 +174,7 @@ struct RunWork { } } -#ifdef USE_INDIRECT_FUNCTION_CALL +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) __device__ void run(ncclWork *work) { #else __device__ __attribute__((noinline)) void run(ncclWork *work) { diff --git a/src/include/collectives.h b/src/include/collectives.h index bda4be7f7..cd0684181 100644 --- a/src/include/collectives.h +++ b/src/include/collectives.h @@ -39,7 +39,7 @@ struct ncclDevRedOpFull { nccl##func##algo##proto /* Declare all collective operations */ -#ifdef USE_INDIRECT_FUNCTION_CALL +#if defined(USE_INDIRECT_FUNCTION_CALL) && !defined(__gfx940__) && !defined(__gfx941__) && !defined(__gfx942__) #define DECL5(func, algo, proto, devredop, type) \ extern __device__ void NCCL_FUNC_NAME(func, algo, proto, devredop, type)(); \ extern __global__ void NCCL_KERN_NAME(func, algo, proto, devredop, type)(struct ncclDevComm* comm, uint64_t channelMask, struct ncclWork* workHead); \ diff --git a/src/init.cc b/src/init.cc index 8041c7a5b..36a8db183 100644 --- a/src/init.cc +++ b/src/init.cc @@ -1688,6 +1688,7 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { int* parentRanks = NULL; int cudaArch; int64_t stackSize = rcclParamStackSizeOverride() ? rcclParamStackSizeOverride() : maxLocalSizeBytes; + hipDeviceProp_t devProp; CUDACHECKGOTO(cudaSetDevice(cudaDev), res, fail); CUDACHECKGOTO(cudaDeviceGetAttribute(&archMajor, cudaDevAttrComputeCapabilityMajor, cudaDev), res, fail); @@ -1698,7 +1699,8 @@ static ncclResult_t ncclCommInitRankFunc(struct ncclAsyncJob* job_) { // Set the maximum kernel stack size of all kernels to avoid // a CUDA memory reconfig on load (c.f. NVSHMEM issue) #ifdef USE_INDIRECT_FUNCTION_CALL - if (stackSize > 0 && ncclParamSetStackSize() == 1) { + CUDACHECK(hipGetDeviceProperties(&devProp, 0)); + if (stackSize > 0 && ncclParamSetStackSize() == 1 && strcmp(devProp.gcnArchName,"gfx940") != 0 && strcmp(devProp.gcnArchName, "gfx941") != 0 && strcmp(devProp.gcnArchName, "gfx942") != 0) { INFO(NCCL_INIT, "Setting cudaLimitStackSize to %zi maxLocalSizeBytes %zi", stackSize, maxLocalSizeBytes); CUDACHECKIGNORE(cudaDeviceSetLimit(cudaLimitStackSize, stackSize)); }