From 96894a6f949601bfebfa2cb91bf0e9cc736b7039 Mon Sep 17 00:00:00 2001
From: Ken Raffenetti
Date: Tue, 19 Dec 2023 12:52:27 -0600
Subject: [PATCH 1/3] mpl/cuda: Fix potential crash in memory hooks

MPL intercepts calls that free CUDA device memory so it can execute its
own hooks before calling cudaFree. If the application calls cudaFree
before the hooks are initialized, the internal function pointer for
cudaFree is NULL and the application will segfault. Check and
initialize the memory hooks, if needed, before executing them.
---
 src/mpl/src/gpu/mpl_gpu_cuda.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/mpl/src/gpu/mpl_gpu_cuda.c b/src/mpl/src/gpu/mpl_gpu_cuda.c
index c2aefec82a0..8aca8349a47 100644
--- a/src/mpl/src/gpu/mpl_gpu_cuda.c
+++ b/src/mpl/src/gpu/mpl_gpu_cuda.c
@@ -508,6 +508,11 @@ __attribute__ ((visibility("default")))
 CUresult CUDAAPI cuMemFree(CUdeviceptr dptr)
 {
     CUresult result;
+    if (!gpu_initialized) {
+        int ret = MPL_gpu_init(0);
+        assert(ret == 0);
+    }
+
     gpu_free_hooks_cb((void *) dptr);
     result = sys_cuMemFree(dptr);
     return (result);
@@ -519,6 +524,11 @@ __attribute__ ((visibility("default")))
 cudaError_t CUDARTAPI cudaFree(void *dptr)
 {
     cudaError_t result;
+    if (!gpu_initialized) {
+        int ret = MPL_gpu_init(0);
+        assert(ret == 0);
+    }
+
     gpu_free_hooks_cb(dptr);
     result = sys_cudaFree(dptr);
     return result;

From f8c71c35ca044fb53be07e2e43b989934923f4f2 Mon Sep 17 00:00:00 2001
From: Ken Raffenetti
Date: Fri, 12 Jan 2024 16:30:07 -0600
Subject: [PATCH 2/3] mpl/cuda: Only initialize hooks from free functions

Instead of initializing all GPU infrastructure, just set up the hooks.
This still avoids crashes and won't consume resources if the user is
simply allocating and freeing device memory not for use with MPI.
---
 src/mpl/src/gpu/mpl_gpu_cuda.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/mpl/src/gpu/mpl_gpu_cuda.c b/src/mpl/src/gpu/mpl_gpu_cuda.c
index 8aca8349a47..a73b00be79b 100644
--- a/src/mpl/src/gpu/mpl_gpu_cuda.c
+++ b/src/mpl/src/gpu/mpl_gpu_cuda.c
@@ -508,9 +508,8 @@ __attribute__ ((visibility("default")))
 CUresult CUDAAPI cuMemFree(CUdeviceptr dptr)
 {
     CUresult result;
-    if (!gpu_initialized) {
-        int ret = MPL_gpu_init(0);
-        assert(ret == 0);
+    if (!sys_cuMemFree) {
+        gpu_mem_hook_init();
     }
 
     gpu_free_hooks_cb((void *) dptr);
@@ -524,9 +523,8 @@ __attribute__ ((visibility("default")))
 cudaError_t CUDARTAPI cudaFree(void *dptr)
 {
     cudaError_t result;
-    if (!gpu_initialized) {
-        int ret = MPL_gpu_init(0);
-        assert(ret == 0);
+    if (!sys_cudaFree) {
+        gpu_mem_hook_init();
     }
 
     gpu_free_hooks_cb(dptr);

From 447e4f9e836a21f4024b204b4b608c14ec8b8dc7 Mon Sep 17 00:00:00 2001
From: Ken Raffenetti
Date: Fri, 12 Jan 2024 09:23:04 -0600
Subject: [PATCH 3/3] mpl/cuda: Add thread safety to memory hooks

CUDA memory allocation functions are thread-safe, so our hooks need to
be as well. Use a static MPL "initlock" to avoid the need for
initialization, since the hooks are installed at link-time.
---
 src/mpl/src/gpu/mpl_gpu_cuda.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/src/mpl/src/gpu/mpl_gpu_cuda.c b/src/mpl/src/gpu/mpl_gpu_cuda.c
index a73b00be79b..9255a79cf83 100644
--- a/src/mpl/src/gpu/mpl_gpu_cuda.c
+++ b/src/mpl/src/gpu/mpl_gpu_cuda.c
@@ -32,6 +32,7 @@ static CUresult CUDAAPI(*sys_cuMemFree) (CUdeviceptr dptr);
 static cudaError_t CUDARTAPI(*sys_cudaFree) (void *dptr);
 
 static int gpu_mem_hook_init();
+static MPL_initlock_t free_hook_mutex = MPL_INITLOCK_INITIALIZER;
 
 int MPL_gpu_get_dev_count(int *dev_cnt, int *dev_id, int *subdevice_id)
 {
@@ -359,7 +360,9 @@ int MPL_gpu_init(int debug_summary)
      * in cuda, such as cudaFree and cuMemFree, to track user behaviors on
      * the memory buffer and invalidate cached handle/buffer respectively
      * for result correctness. */
+    MPL_initlock_lock(&free_hook_mutex);
     gpu_mem_hook_init();
+    MPL_initlock_unlock(&free_hook_mutex);
     gpu_initialized = 1;
 
     if (MPL_gpu_info.debug_summary) {
@@ -388,11 +391,13 @@ int MPL_gpu_finalize(void)
     MPL_free(global_to_local_map);
 
     gpu_free_hook_s *prev;
+    MPL_initlock_lock(&free_hook_mutex);
     while (free_hook_chain) {
         prev = free_hook_chain;
         free_hook_chain = free_hook_chain->next;
         MPL_free(prev);
     }
+    MPL_initlock_unlock(&free_hook_mutex);
 
     /* Reset initialization state */
     gpu_initialized = 0;
@@ -483,6 +488,7 @@ static int gpu_mem_hook_init()
     assert(sys_cuMemFree);
     sys_cudaFree = (void *) dlsym(libcudart_handle, "cudaFree");
     assert(sys_cudaFree);
+
     return MPL_SUCCESS;
 }
 
@@ -492,12 +498,15 @@ int MPL_gpu_free_hook_register(void (*free_hook) (void *dptr))
     assert(hook_obj);
     hook_obj->free_hook = free_hook;
     hook_obj->next = NULL;
+
+    MPL_initlock_lock(&free_hook_mutex);
     if (!free_hook_chain)
         free_hook_chain = hook_obj;
     else {
         hook_obj->next = free_hook_chain;
         free_hook_chain = hook_obj;
     }
+    MPL_initlock_unlock(&free_hook_mutex);
 
     return MPL_SUCCESS;
 }
@@ -508,12 +517,15 @@ __attribute__ ((visibility("default")))
 CUresult CUDAAPI cuMemFree(CUdeviceptr dptr)
 {
     CUresult result;
+    MPL_initlock_lock(&free_hook_mutex);
     if (!sys_cuMemFree) {
         gpu_mem_hook_init();
     }
 
     gpu_free_hooks_cb((void *) dptr);
     result = sys_cuMemFree(dptr);
+
+    MPL_initlock_unlock(&free_hook_mutex);
     return (result);
 }
 
@@ -523,12 +535,15 @@ __attribute__ ((visibility("default")))
 cudaError_t CUDARTAPI cudaFree(void *dptr)
 {
     cudaError_t result;
+    MPL_initlock_lock(&free_hook_mutex);
     if (!sys_cudaFree) {
         gpu_mem_hook_init();
     }
 
     gpu_free_hooks_cb(dptr);
     result = sys_cudaFree(dptr);
+
+    MPL_initlock_unlock(&free_hook_mutex);
     return result;
 }
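
For reference, the pattern the series converges on (a shadowing cudaFree exported with default visibility that lazily resolves the real symbol and serializes hook execution behind a lock) can be sketched outside MPL with plain POSIX primitives. This is an illustrative sketch only, not MPL code: real_cudaFree, hook_init, and my_free_hook are hypothetical names, a pthread mutex stands in for MPL's initlock, and dlsym(RTLD_NEXT, ...) replaces MPL's dlopen of libcudart.

/* free_interpose.c -- illustrative sketch, not part of the patches above.
 * Shadows cudaFree, lazily resolves the real implementation, and
 * serializes hook execution, mirroring the approach in the series. */
#define _GNU_SOURCE             /* for RTLD_NEXT */
#include <assert.h>
#include <dlfcn.h>
#include <pthread.h>
#include <cuda_runtime.h>

static cudaError_t (*real_cudaFree) (void *dptr);      /* resolved lazily */
static pthread_mutex_t hook_mutex = PTHREAD_MUTEX_INITIALIZER;

static void hook_init(void)
{
    /* Find the next definition of cudaFree in link order (libcudart's). */
    real_cudaFree = (cudaError_t (*)(void *)) dlsym(RTLD_NEXT, "cudaFree");
    assert(real_cudaFree);
}

static void my_free_hook(void *dptr)
{
    /* e.g. invalidate any cached state that references dptr */
    (void) dptr;
}

/* Exported with default visibility so it shadows the runtime's symbol. */
__attribute__ ((visibility("default")))
cudaError_t cudaFree(void *dptr)
{
    cudaError_t result;

    pthread_mutex_lock(&hook_mutex);
    if (!real_cudaFree) {       /* first call may arrive before any init */
        hook_init();
    }
    my_free_hook(dptr);
    result = real_cudaFree(dptr);
    pthread_mutex_unlock(&hook_mutex);

    return result;
}

Compiled and linked ahead of the CUDA runtime (with -ldl, -lpthread, and -lcudart on the link line), every cudaFree issued by the application would pass through my_free_hook before the memory is released, with the first call safe even if it arrives before any explicit initialization.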