TL/CUDA: enable lazy init

openucx · Mar 29, 2023 · 915c5ac · 915c5ac
1 parent d0e3ca4
commit 915c5ac
Show file tree

Hide file tree

Showing 13 changed files with 433 additions and 221 deletions.
diff --git a/src/components/tl/cuda/allgather/allgather.c b/src/components/tl/cuda/allgather/allgather.c
@@ -43,6 +43,12 @@ ucc_status_t ucc_tl_cuda_allgather_init(ucc_base_coll_args_t *coll_args,
                                         ucc_coll_task_t **task_p)
 {
     ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
+    ucc_status_t status;
+
+    status = ucc_tl_cuda_comm_init(team);
+    if (ucc_unlikely(status != UCC_OK)) {
+        return status;
+    }
 
     if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
         return ucc_tl_cuda_allgather_linear_init(coll_args, tl_team, task_p);

diff --git a/src/components/tl/cuda/allgatherv/allgatherv.c b/src/components/tl/cuda/allgatherv/allgatherv.c
@@ -46,6 +46,12 @@ ucc_status_t ucc_tl_cuda_allgatherv_init(ucc_base_coll_args_t *coll_args,
                                          ucc_coll_task_t **    task_p)
 {
     ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
+    ucc_status_t status;
+
+    status = ucc_tl_cuda_comm_init(team);
+    if (ucc_unlikely(status != UCC_OK)) {
+        return status;
+    }
 
     if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
         return ucc_tl_cuda_allgatherv_linear_init(coll_args, tl_team, task_p);

diff --git a/src/components/tl/cuda/reduce_scatter/reduce_scatter.c b/src/components/tl/cuda/reduce_scatter/reduce_scatter.c
@@ -47,7 +47,12 @@ ucc_status_t ucc_tl_cuda_reduce_scatter_init(ucc_base_coll_args_t *coll_args,
                                              ucc_coll_task_t **task_p)
 {
     ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
+    ucc_status_t status;
 
+    status = ucc_tl_cuda_comm_init(team);
+    if (ucc_unlikely(status != UCC_OK)) {
+        return status;
+    }
     if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
         return ucc_tl_cuda_reduce_scatter_linear_init(coll_args, tl_team,
                                                       task_p);

diff --git a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c
@@ -50,7 +50,12 @@ ucc_status_t ucc_tl_cuda_reduce_scatterv_init(ucc_base_coll_args_t *coll_args,
                                              ucc_coll_task_t **task_p)
 {
     ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
+    ucc_status_t status;
 
+    status = ucc_tl_cuda_comm_init(team);
+    if (ucc_unlikely(status != UCC_OK)) {
+        return status;
+    }
     if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
         return ucc_tl_cuda_reduce_scatterv_linear_init(coll_args, tl_team,
                                                        task_p);

diff --git a/src/components/tl/cuda/tl_cuda.c b/src/components/tl/cuda/tl_cuda.c
@@ -59,6 +59,11 @@ static ucs_config_field_t ucc_tl_cuda_context_config_table[] = {
     {"", "", NULL, ucc_offsetof(ucc_tl_cuda_context_config_t, super),
      UCC_CONFIG_TYPE_TABLE(ucc_tl_context_config_table)},
 
+    {"LAZY_INIT", "yes",
+     "Initialize team on first collective",
+     ucc_offsetof(ucc_tl_cuda_context_config_t, lazy_init),
+     UCC_CONFIG_TYPE_BOOL},
+
     {NULL}};
 
 ucc_status_t ucc_tl_cuda_get_context_attr(const ucc_base_context_t *context,

diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h
@@ -53,6 +53,14 @@
         (ucc_tl_cuda_shm_barrier_t *)_bar;                                     \
     })
 
+#define GET_RANK_ID(_ids, _trank, _max_concurrent)                             \
+    ({ /* stride: rank id + (_max_concurrent - 1) extra IPC event handles */   \
+        size_t _rank_id_size = sizeof(ucc_tl_cuda_rank_id_t) +                 \
+            ((_max_concurrent) - 1) * sizeof(cudaIpcEventHandle_t);            \
+        void *_rank_id = PTR_OFFSET(_ids, (_trank) * _rank_id_size);           \
+        (ucc_tl_cuda_rank_id_t*)_rank_id;                                      \
+    })
+
 #ifdef HAVE_PROFILING_TL_CUDA
 #include "utils/profile/ucc_profile.h"
 #else
@@ -81,6 +89,7 @@ typedef struct ucc_tl_cuda_lib_config {
 
 typedef struct ucc_tl_cuda_context_config {
     ucc_tl_context_config_t super;
+    int                     lazy_init; /* init team on first collective */
 } ucc_tl_cuda_context_config_t;
 
 typedef struct ucc_tl_cuda_lib {
@@ -93,8 +102,6 @@ UCC_CLASS_DECLARE(ucc_tl_cuda_lib_t, const ucc_base_lib_params_t *,
 typedef struct ucc_tl_cuda_context {
     ucc_tl_context_t             super;
     ucc_tl_cuda_context_config_t cfg;
-    int                          device;
-    ucc_tl_cuda_device_pci_id_t  device_id;
     ucc_tl_cuda_topo_t          *topo;
     ucc_mpool_t                  req_mp;
     tl_cuda_ep_hash_t           *ipc_cache;
@@ -127,6 +134,7 @@ typedef struct ucc_tl_cuda_rank_id {
     ucc_tl_cuda_device_pci_id_t pci_id;
     ucc_tl_cuda_mem_info_t      scratch_info;
     int                         shm;
+    cudaIpcEventHandle_t        ev_handle[1]; /* max concurrent */
 } ucc_tl_cuda_rank_id_t;
 
 typedef struct ucc_tl_cuda_sync {
@@ -152,20 +160,32 @@ typedef struct ucc_tl_cuda_scratch {
     ucc_tl_cuda_mem_info_t rem_info[UCC_TL_CUDA_MAX_PEERS];
 } ucc_tl_cuda_scratch_t;
 
+enum { /* NOTE(review): presumably the values of ucc_tl_cuda_team_t.state */
+    TL_CUDA_STATE_READY,
+    TL_CUDA_STATE_SHM_ID_EXCHANGE,
+    TL_CUDA_STATE_COMM_INIT,
+    TL_CUDA_STATE_ERROR
+};
+
 typedef struct ucc_tl_cuda_team {
-    ucc_tl_team_t              super;
-    uint32_t                   seq_num;
-    ucc_tl_cuda_team_topo_t   *topo;
-    ucc_tl_cuda_sync_t        *sync;
-    ucc_tl_cuda_sync_state_t  *sync_state;
-    ucc_tl_cuda_shm_barrier_t *bar;
-    ucc_tl_cuda_scratch_t      scratch;
-    cudaStream_t               stream;
-    ucc_tl_cuda_rank_id_t     *ids;
-    ucc_team_oob_coll_t        oob;
-    void                      *oob_req;
+    ucc_tl_team_t                super;
+    int                          state; /* presumably TL_CUDA_STATE_* */
+    uint32_t                     seq_num;
+    int                          device; /* debug-checked vs cudaGetDevice() */
+    ucc_tl_cuda_device_pci_id_t  device_id;
+    ucc_tl_cuda_team_topo_t     *topo;
+    ucc_tl_cuda_sync_t          *sync;
+    ucc_tl_cuda_sync_state_t    *sync_state;
+    ucc_tl_cuda_shm_barrier_t   *bar;
+    ucc_tl_cuda_scratch_t        scratch;
+    cudaStream_t                 stream;
+    ucc_tl_cuda_rank_id_t       *ids;
+    ucc_team_oob_coll_t          oob;
+    void                        *oob_req;
 } ucc_tl_cuda_team_t;
 
+ucc_status_t ucc_tl_cuda_comm_init(ucc_tl_cuda_team_t *team);
+
 UCC_CLASS_DECLARE(ucc_tl_cuda_team_t, ucc_base_context_t *,
                   const ucc_base_team_params_t *);
 

diff --git a/src/components/tl/cuda/tl_cuda_coll.c b/src/components/tl/cuda/tl_cuda_coll.c
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */
@@ -12,24 +12,6 @@
 #include "reduce_scatter/reduce_scatter.h"
 #include "reduce_scatterv/reduce_scatterv.h"
 #include "utils/arch/cpu.h"
-#include "utils/arch/cuda_def.h"
-
-
-#if ENABLE_DEBUG == 1
-/* TODO: possible need to check CUDA context */
-#define UCC_TL_CUDA_CHECK_DEVICE_MATCH(_team) do {                             \
-    int _dev;                                                                  \
-    CUDA_CHECK(cudaGetDevice(&_dev));                                          \
-    if (_dev != UCC_TL_CUDA_TEAM_CTX(_team)->device) {                         \
-        tl_error(UCC_TL_TEAM_LIB(_team), "CUDA device mismatch, "              \
-                 "current device %d, team device %d\n", _dev,                  \
-                 UCC_TL_CUDA_TEAM_CTX(_team)->device);                         \
-        return UCC_ERR_INVALID_PARAM;                                          \
-    }                                                                          \
-} while(0)
-#else
-#define UCC_TL_CUDA_CHECK_DEVICE_MATCH(_team)
-#endif
 
 const char *
     ucc_tl_cuda_default_alg_select_str[UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR] = {

diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */
@@ -9,11 +9,28 @@
 
 #include "tl_cuda.h"
 #include "components/mc/ucc_mc.h"
+#include "utils/arch/cuda_def.h"
 
 #define UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR 4
 extern const char
     *ucc_tl_cuda_default_alg_select_str[UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR];
 
+#if ENABLE_DEBUG == 1
+/* Debug-only device check. TODO: may also need to check the CUDA context */
+#define UCC_TL_CUDA_CHECK_DEVICE_MATCH(_team) do {                             \
+    int _dev;                                                                  \
+    CUDA_CHECK(cudaGetDevice(&_dev));                                          \
+    if (_dev != (_team)->device) {                                             \
+        tl_error(UCC_TL_TEAM_LIB(_team), "CUDA device mismatch, "              \
+                 "current device %d, team device %d\n", _dev,                  \
+                 (_team)->device);                                             \
+        return UCC_ERR_INVALID_PARAM;                                          \
+    }                                                                          \
+} while(0)
+#else
+#define UCC_TL_CUDA_CHECK_DEVICE_MATCH(_team)
+#endif
+
 #define TASK_TEAM(_task)                                                       \
     (ucc_derived_of((_task)->super.team, ucc_tl_cuda_team_t))
 
@@ -85,6 +102,11 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args,
     ucc_tl_cuda_task_t *task;
     ucc_status_t        status;
 
+    status = ucc_tl_cuda_comm_init(team);
+    if (ucc_unlikely(status != UCC_OK)) {
+        return status;
+    }
+    UCC_TL_CUDA_CHECK_DEVICE_MATCH(team);
     if (!ucc_coll_args_is_predefined_dt(&coll_args->args, trank)) {
         return UCC_ERR_NOT_SUPPORTED;
     }

diff --git a/src/components/tl/cuda/tl_cuda_context.c b/src/components/tl/cuda/tl_cuda_context.c
@@ -20,8 +20,6 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_context_t,
     ucc_status_t status;
     int num_devices;
     cudaError_t cuda_st;
-    CUcontext cu_ctx;
-    CUresult cu_st;
 
     UCC_CLASS_CALL_SUPER_INIT(ucc_tl_context_t, &tl_cuda_config->super,
                               params->context);
@@ -37,13 +35,6 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_context_t,
         return UCC_ERR_NO_RESOURCE;
     }
 
-    cu_st = cuCtxGetCurrent(&cu_ctx);
-    if (cu_ctx == NULL || cu_st != CUDA_SUCCESS) {
-        tl_debug(self->super.super.lib,
-                 "cannot create CUDA TL context without active CUDA context");
-        return UCC_ERR_NO_RESOURCE;
-    }
-
     status = ucc_mpool_init(&self->req_mp, 0, sizeof(ucc_tl_cuda_task_t), 0,
                             UCC_CACHE_LINE_SIZE, 8, UINT_MAX,
                             &ucc_coll_task_mpool_ops, params->thread_mode,
@@ -54,18 +45,12 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_context_t,
         return status;
     }
 
-    CUDA_CHECK_GOTO(cudaGetDevice(&self->device), free_mpool, status);
     status = ucc_tl_cuda_topo_create(self->super.super.lib, &self->topo);
     if (status != UCC_OK) {
         tl_error(self->super.super.lib,
                  "failed to initialize tl_cuda_topo");
         goto free_mpool;
     }
-    status = ucc_tl_cuda_topo_get_pci_id(self->device, &self->device_id);
-    if (status != UCC_OK) {
-        tl_error(self->super.super.lib, "failed to get pci id");
-        goto free_mpool;
-    }
 
     self->ipc_cache = kh_init(tl_cuda_ep_hash);
     tl_debug(self->super.super.lib, "initialized tl context: %p", self);