From a1a090c822716a714fc28169c49c4a73963a0ae3 Mon Sep 17 00:00:00 2001 From: Chuck Fossen Date: Tue, 29 Oct 2024 09:43:21 -0500 Subject: [PATCH] prov/cxi: Enable use of dmabuf by default for ROCR Remove unneeded disable_dmabuf_cuda and disable_dmabuf_rocr environment variables. Set FI_HMEM_ROCR_USE_DMABUF to enabled by default. Update CXI man page. NETCASSINI-6844 Signed-off-by: Chuck Fossen --- man/fi_cxi.7.md | 14 +++++++------- prov/cxi/src/cxip_info.c | 18 +++++------------- prov/cxi/src/cxip_iomm.c | 6 ------ 3 files changed, 12 insertions(+), 26 deletions(-) diff --git a/man/fi_cxi.7.md b/man/fi_cxi.7.md index 3be247ae462..384026f0192 100644 --- a/man/fi_cxi.7.md +++ b/man/fi_cxi.7.md @@ -445,14 +445,14 @@ faults but requires all buffers to be backed by physical memory. Copy-on-write semantics are broken when using pinned memory. See the Fork section for more information. -The CXI provider supports DMABUF for device memory registration. If the ROCR -and CUDA libraries support it, the CXI provider will default to use DMA-buf. +The CXI provider supports DMABUF for device memory registration. +DMABUF is supported in ROCm 5.6+ and CUDA 11.7+ with nvidia open source driver +525+. +Both *FI_HMEM_ROCR_USE_DMABUF* and *FI_HMEM_CUDA_USE_DMABUF* are disabled by +default in libfabric core but the CXI provider enables +*FI_HMEM_ROCR_USE_DMABUF* by default if not specifically set. There may be situations with CUDA that may double the BAR consumption. -Until this is fixed in the CUDA stack, the environment variable -*FI_CXI_DISABLE_DMABUF_CUDA* can be used to fall back to the nvidia -peer-memory interface. -Also, *FI_CXI_DISABLE_DMABUF_ROCR* can be used to fall back to the amdgpu -peer-memory interface. +Until this is fixed in the CUDA stack, CUDA DMABUF will be disabled by default. 
## Translation Cache diff --git a/prov/cxi/src/cxip_info.c b/prov/cxi/src/cxip_info.c index e195a459624..af94964ab5f 100644 --- a/prov/cxi/src/cxip_info.c +++ b/prov/cxi/src/cxip_info.c @@ -608,8 +608,6 @@ struct cxip_environment cxip_env = { .force_odp = false, .ats = false, .iotlb = true, - .disable_dmabuf_cuda = false, - .disable_dmabuf_rocr = false, .ats_mlock_mode = CXIP_ATS_MLOCK_ALL, .fork_safe_requested = false, .rx_match_mode = CXIP_PTLTE_DEFAULT_MODE, @@ -770,17 +768,11 @@ static void cxip_env_init(void) "Enables the NIC IOTLB (default %d).", cxip_env.iotlb); fi_param_get_bool(&cxip_prov, "iotlb", &cxip_env.iotlb); - fi_param_define(&cxip_prov, "disable_dmabuf_cuda", FI_PARAM_BOOL, - "Disables the DMABUF interface for CUDA (default %d).", - cxip_env.disable_dmabuf_cuda); - fi_param_get_bool(&cxip_prov, "disable_dmabuf_cuda", - &cxip_env.disable_dmabuf_cuda); - - fi_param_define(&cxip_prov, "disable_dmabuf_rocr", FI_PARAM_BOOL, - "Disables the DMABUF interface for ROCR (default %d).", - cxip_env.disable_dmabuf_rocr); - fi_param_get_bool(&cxip_prov, "disable_dmabuf_rocr", - &cxip_env.disable_dmabuf_rocr); + /* Use ROCR DMABUF by default - honors the env if already set */ + ret = setenv("FI_HMEM_ROCR_USE_DMABUF", "1", 0); + if (ret) + CXIP_INFO("Could not enable FI_HMEM_ROCR_USE_DMABUF ret:%d %s\n", + ret, fi_strerror(errno)); /* Disable cuda DMABUF by default - honors the env if already set */ ret = setenv("FI_HMEM_CUDA_USE_DMABUF", "0", 0); diff --git a/prov/cxi/src/cxip_iomm.c b/prov/cxi/src/cxip_iomm.c index 69975cfb06a..b998bd34aee 100644 --- a/prov/cxi/src/cxip_iomm.c +++ b/prov/cxi/src/cxip_iomm.c @@ -28,12 +28,6 @@ static int cxip_dmabuf_hints(enum fi_hmem_iface iface, void *iov_base, return -FI_ENOSYS; } - if (iface == FI_HMEM_CUDA && cxip_env.disable_dmabuf_cuda) - return FI_SUCCESS; - - if (iface == FI_HMEM_ROCR && cxip_env.disable_dmabuf_rocr) - return FI_SUCCESS; - ret = ofi_hmem_get_base_addr(iface, iov_base, len, (void*)&base, &size); if (ret) 
return ret;