From 96894a6f949601bfebfa2cb91bf0e9cc736b7039 Mon Sep 17 00:00:00 2001
From: Ken Raffenetti
Date: Tue, 19 Dec 2023 12:52:27 -0600
Subject: [PATCH 1/3] mpl/cuda: Fix potential crash in memory hooks

MPL intercepts calls that free CUDA device memory so it can execute its
own hooks before calling cudaFree. If the application calls cudaFree
before the hooks are initialized, the internal function pointer for
cudaFree is NULL and the application will segfault. Check and
initialize the memory hooks, if needed, before executing them.
---
 src/mpl/src/gpu/mpl_gpu_cuda.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/mpl/src/gpu/mpl_gpu_cuda.c b/src/mpl/src/gpu/mpl_gpu_cuda.c
index c2aefec82a0..8aca8349a47 100644
--- a/src/mpl/src/gpu/mpl_gpu_cuda.c
+++ b/src/mpl/src/gpu/mpl_gpu_cuda.c
@@ -508,6 +508,11 @@ __attribute__ ((visibility("default")))
 CUresult CUDAAPI cuMemFree(CUdeviceptr dptr)
 {
     CUresult result;
+    if (!gpu_initialized) {
+        int ret = MPL_gpu_init(0);
+        assert(ret == 0);
+    }
+
     gpu_free_hooks_cb((void *) dptr);
     result = sys_cuMemFree(dptr);
     return (result);
@@ -519,6 +524,11 @@ __attribute__ ((visibility("default")))
 cudaError_t CUDARTAPI cudaFree(void *dptr)
 {
     cudaError_t result;
+    if (!gpu_initialized) {
+        int ret = MPL_gpu_init(0);
+        assert(ret == 0);
+    }
+
     gpu_free_hooks_cb(dptr);
     result = sys_cudaFree(dptr);
     return result;

From f8c71c35ca044fb53be07e2e43b989934923f4f2 Mon Sep 17 00:00:00 2001
From: Ken Raffenetti
Date: Fri, 12 Jan 2024 16:30:07 -0600
Subject: [PATCH 2/3] mpl/cuda: Only initialize hooks from free functions

Instead of initializing all GPU infrastructure, just set up the hooks.
This still avoids crashes and won't consume resources if the user is
simply allocating and freeing device memory not for use with MPI.
---
 src/mpl/src/gpu/mpl_gpu_cuda.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/mpl/src/gpu/mpl_gpu_cuda.c b/src/mpl/src/gpu/mpl_gpu_cuda.c
index 8aca8349a47..a73b00be79b 100644
--- a/src/mpl/src/gpu/mpl_gpu_cuda.c
+++ b/src/mpl/src/gpu/mpl_gpu_cuda.c
@@ -508,9 +508,8 @@ __attribute__ ((visibility("default")))
 CUresult CUDAAPI cuMemFree(CUdeviceptr dptr)
 {
     CUresult result;
-    if (!gpu_initialized) {
-        int ret = MPL_gpu_init(0);
-        assert(ret == 0);
+    if (!sys_cuMemFree) {
+        gpu_mem_hook_init();
     }
 
     gpu_free_hooks_cb((void *) dptr);
@@ -524,9 +523,8 @@ __attribute__ ((visibility("default")))
 cudaError_t CUDARTAPI cudaFree(void *dptr)
 {
     cudaError_t result;
-    if (!gpu_initialized) {
-        int ret = MPL_gpu_init(0);
-        assert(ret == 0);
+    if (!sys_cudaFree) {
+        gpu_mem_hook_init();
     }
 
     gpu_free_hooks_cb(dptr);

From 447e4f9e836a21f4024b204b4b608c14ec8b8dc7 Mon Sep 17 00:00:00 2001
From: Ken Raffenetti
Date: Fri, 12 Jan 2024 09:23:04 -0600
Subject: [PATCH 3/3] mpl/cuda: Add thread safety to memory hooks

CUDA memory allocation functions are thread-safe, so our hooks need to
be as well. Use a static MPL "initlock" to avoid the need for
initialization, since the hooks are installed at link-time.
---
 src/mpl/src/gpu/mpl_gpu_cuda.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/mpl/src/gpu/mpl_gpu_cuda.c b/src/mpl/src/gpu/mpl_gpu_cuda.c
index a73b00be79b..9255a79cf83 100644
--- a/src/mpl/src/gpu/mpl_gpu_cuda.c
+++ b/src/mpl/src/gpu/mpl_gpu_cuda.c
@@ -32,6 +32,7 @@ static CUresult CUDAAPI(*sys_cuMemFree) (CUdeviceptr dptr);
 static cudaError_t CUDARTAPI(*sys_cudaFree) (void *dptr);
 
 static int gpu_mem_hook_init();
+static MPL_initlock_t free_hook_mutex = MPL_INITLOCK_INITIALIZER;
 
 int MPL_gpu_get_dev_count(int *dev_cnt, int *dev_id, int *subdevice_id)
 {
@@ -359,7 +360,9 @@ int MPL_gpu_init(int debug_summary)
      * in cuda, such as cudaFree and cuMemFree, to track user behaviors on
      * the memory buffer and invalidate cached handle/buffer respectively
      * for result correctness. */
+    MPL_initlock_lock(&free_hook_mutex);
     gpu_mem_hook_init();
+    MPL_initlock_unlock(&free_hook_mutex);
     gpu_initialized = 1;
 
     if (MPL_gpu_info.debug_summary) {
@@ -388,11 +391,13 @@ int MPL_gpu_finalize(void)
     MPL_free(global_to_local_map);
 
     gpu_free_hook_s *prev;
+    MPL_initlock_lock(&free_hook_mutex);
     while (free_hook_chain) {
         prev = free_hook_chain;
         free_hook_chain = free_hook_chain->next;
         MPL_free(prev);
     }
+    MPL_initlock_unlock(&free_hook_mutex);
 
     /* Reset initialization state */
     gpu_initialized = 0;
@@ -483,6 +488,7 @@ static int gpu_mem_hook_init()
     assert(sys_cuMemFree);
     sys_cudaFree = (void *) dlsym(libcudart_handle, "cudaFree");
     assert(sys_cudaFree);
+
     return MPL_SUCCESS;
 }
 
@@ -492,12 +498,15 @@ int MPL_gpu_free_hook_register(void (*free_hook) (void *dptr))
     assert(hook_obj);
     hook_obj->free_hook = free_hook;
     hook_obj->next = NULL;
+
+    MPL_initlock_lock(&free_hook_mutex);
     if (!free_hook_chain)
         free_hook_chain = hook_obj;
     else {
         hook_obj->next = free_hook_chain;
         free_hook_chain = hook_obj;
     }
+    MPL_initlock_unlock(&free_hook_mutex);
 
     return MPL_SUCCESS;
 }
@@ -508,12 +517,15 @@ __attribute__ ((visibility("default")))
 CUresult CUDAAPI cuMemFree(CUdeviceptr dptr)
 {
     CUresult result;
+    MPL_initlock_lock(&free_hook_mutex);
     if (!sys_cuMemFree) {
         gpu_mem_hook_init();
     }
 
     gpu_free_hooks_cb((void *) dptr);
     result = sys_cuMemFree(dptr);
+
+    MPL_initlock_unlock(&free_hook_mutex);
     return (result);
 }
 
@@ -523,12 +535,15 @@ __attribute__ ((visibility("default")))
 cudaError_t CUDARTAPI cudaFree(void *dptr)
 {
     cudaError_t result;
+    MPL_initlock_lock(&free_hook_mutex);
     if (!sys_cudaFree) {
         gpu_mem_hook_init();
     }
 
     gpu_free_hooks_cb(dptr);
     result = sys_cudaFree(dptr);
+
+    MPL_initlock_unlock(&free_hook_mutex);
     return result;
 }
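
For reference, the pattern the series converges on (a shadowing cudaFree exported with default visibility that lazily resolves the real symbol and serializes hook execution behind a lock) can be sketched outside MPL with plain POSIX primitives. This is an illustrative sketch only, not MPL code: real_cudaFree, hook_init, and my_free_hook are hypothetical names, a pthread mutex stands in for MPL's initlock, and dlsym(RTLD_NEXT, ...) replaces MPL's dlopen of libcudart.

/* free_interpose.c -- illustrative sketch, not part of the patches above.
 * Shadows cudaFree, lazily resolves the real implementation, and
 * serializes hook execution, mirroring the approach in the series. */
#define _GNU_SOURCE             /* for RTLD_NEXT */
#include <assert.h>
#include <dlfcn.h>
#include <pthread.h>
#include <cuda_runtime.h>

static cudaError_t (*real_cudaFree) (void *dptr);      /* resolved lazily */
static pthread_mutex_t hook_mutex = PTHREAD_MUTEX_INITIALIZER;

static void hook_init(void)
{
    /* Find the next definition of cudaFree in link order (libcudart's). */
    real_cudaFree = (cudaError_t (*)(void *)) dlsym(RTLD_NEXT, "cudaFree");
    assert(real_cudaFree);
}

static void my_free_hook(void *dptr)
{
    /* e.g. invalidate any cached state that references dptr */
    (void) dptr;
}

/* Exported with default visibility so it shadows the runtime's symbol. */
__attribute__ ((visibility("default")))
cudaError_t cudaFree(void *dptr)
{
    cudaError_t result;

    pthread_mutex_lock(&hook_mutex);
    if (!real_cudaFree) {       /* first call may arrive before any init */
        hook_init();
    }
    my_free_hook(dptr);
    result = real_cudaFree(dptr);
    pthread_mutex_unlock(&hook_mutex);

    return result;
}

Compiled and linked ahead of the CUDA runtime (with -ldl, -lpthread, and -lcudart on the link line), every cudaFree issued by the application would pass through my_free_hook before the memory is released, with the first call safe even if it arrives before any explicit initialization.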