Skip to content

Commit

Permalink
TL/CUDA: enable lazy init
Browse files Browse the repository at this point in the history
  • Loading branch information
Sergei-Lebedev committed Mar 29, 2023
1 parent d0e3ca4 commit 915c5ac
Show file tree
Hide file tree
Showing 13 changed files with 433 additions and 221 deletions.
6 changes: 6 additions & 0 deletions src/components/tl/cuda/allgather/allgather.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ ucc_status_t ucc_tl_cuda_allgather_init(ucc_base_coll_args_t *coll_args,
ucc_coll_task_t **task_p)
{
ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
ucc_status_t status;

status = ucc_tl_cuda_comm_init(team);
if (ucc_unlikely(status != UCC_OK)) {
return status;
}

if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
return ucc_tl_cuda_allgather_linear_init(coll_args, tl_team, task_p);
Expand Down
6 changes: 6 additions & 0 deletions src/components/tl/cuda/allgatherv/allgatherv.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ ucc_status_t ucc_tl_cuda_allgatherv_init(ucc_base_coll_args_t *coll_args,
ucc_coll_task_t ** task_p)
{
ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
ucc_status_t status;

status = ucc_tl_cuda_comm_init(team);
if (ucc_unlikely(status != UCC_OK)) {
return status;
}

if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
return ucc_tl_cuda_allgatherv_linear_init(coll_args, tl_team, task_p);
Expand Down
5 changes: 5 additions & 0 deletions src/components/tl/cuda/reduce_scatter/reduce_scatter.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,12 @@ ucc_status_t ucc_tl_cuda_reduce_scatter_init(ucc_base_coll_args_t *coll_args,
ucc_coll_task_t **task_p)
{
ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
ucc_status_t status;

status = ucc_tl_cuda_comm_init(team);
if (ucc_unlikely(status != UCC_OK)) {
return status;
}
if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
return ucc_tl_cuda_reduce_scatter_linear_init(coll_args, tl_team,
task_p);
Expand Down
5 changes: 5 additions & 0 deletions src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,12 @@ ucc_status_t ucc_tl_cuda_reduce_scatterv_init(ucc_base_coll_args_t *coll_args,
ucc_coll_task_t **task_p)
{
ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
ucc_status_t status;

status = ucc_tl_cuda_comm_init(team);
if (ucc_unlikely(status != UCC_OK)) {
return status;
}
if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
return ucc_tl_cuda_reduce_scatterv_linear_init(coll_args, tl_team,
task_p);
Expand Down
5 changes: 5 additions & 0 deletions src/components/tl/cuda/tl_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,11 @@ static ucs_config_field_t ucc_tl_cuda_context_config_table[] = {
{"", "", NULL, ucc_offsetof(ucc_tl_cuda_context_config_t, super),
UCC_CONFIG_TYPE_TABLE(ucc_tl_context_config_table)},

{"LAZY_INIT", "yes",
"Initialize team on first collective",
ucc_offsetof(ucc_tl_cuda_context_config_t, lazy_init),
UCC_CONFIG_TYPE_BOOL},

{NULL}};

ucc_status_t ucc_tl_cuda_get_context_attr(const ucc_base_context_t *context,
Expand Down
46 changes: 33 additions & 13 deletions src/components/tl/cuda/tl_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,14 @@
(ucc_tl_cuda_shm_barrier_t *)_bar; \
})

/*
 * Returns a pointer (ucc_tl_cuda_rank_id_t *) to the rank-id record of team
 * rank _trank inside the packed buffer _ids.
 *
 * Records are variable-sized: each one is a ucc_tl_cuda_rank_id_t whose
 * trailing ev_handle[1] array is extended to _max_concurrent entries, so the
 * stride between records is
 *     sizeof(ucc_tl_cuda_rank_id_t) +
 *     (_max_concurrent - 1) * sizeof(cudaIpcEventHandle_t)
 * and plain array indexing (_ids[_trank]) would use the wrong stride.
 *
 * _ids            - base address of the packed record buffer
 * _trank          - index of the record to locate
 * _max_concurrent - number of ev_handle entries stored per record
 *
 * Every argument is parenthesized on use so that expression arguments
 * (e.g. "rank + 1" or "a ? b : c") expand with the intended precedence.
 * Relies on the GNU statement-expression extension, like the neighbouring
 * macros in this header.
 */
#define GET_RANK_ID(_ids, _trank, _max_concurrent)                             \
    ({                                                                         \
        size_t _rank_id_size =                                                 \
            sizeof(ucc_tl_cuda_rank_id_t) +                                    \
            ((_max_concurrent) - 1) * sizeof(cudaIpcEventHandle_t);            \
        void *_rank_id = PTR_OFFSET((_ids), (_trank) * _rank_id_size);         \
        (ucc_tl_cuda_rank_id_t *)_rank_id;                                     \
    })

#ifdef HAVE_PROFILING_TL_CUDA
#include "utils/profile/ucc_profile.h"
#else
Expand Down Expand Up @@ -81,6 +89,7 @@ typedef struct ucc_tl_cuda_lib_config {

/*
 * Context-level configuration of the CUDA TL. Its fields are filled from
 * ucc_tl_cuda_context_config_table.
 */
typedef struct ucc_tl_cuda_context_config {
/* Generic TL context configuration (registered as the nested "" table). */
ucc_tl_context_config_t super;
/*
 * Value of the boolean LAZY_INIT option (default "yes"):
 * "Initialize team on first collective".
 */
int lazy_init;
} ucc_tl_cuda_context_config_t;

typedef struct ucc_tl_cuda_lib {
Expand All @@ -93,8 +102,6 @@ UCC_CLASS_DECLARE(ucc_tl_cuda_lib_t, const ucc_base_lib_params_t *,
typedef struct ucc_tl_cuda_context {
ucc_tl_context_t super;
ucc_tl_cuda_context_config_t cfg;
int device;
ucc_tl_cuda_device_pci_id_t device_id;
ucc_tl_cuda_topo_t *topo;
ucc_mpool_t req_mp;
tl_cuda_ep_hash_t *ipc_cache;
Expand Down Expand Up @@ -127,6 +134,7 @@ typedef struct ucc_tl_cuda_rank_id {
ucc_tl_cuda_device_pci_id_t pci_id;
ucc_tl_cuda_mem_info_t scratch_info;
int shm;
cudaIpcEventHandle_t ev_handle[1]; /* max concurrent */
} ucc_tl_cuda_rank_id_t;

typedef struct ucc_tl_cuda_sync {
Expand All @@ -152,20 +160,32 @@ typedef struct ucc_tl_cuda_scratch {
ucc_tl_cuda_mem_info_t rem_info[UCC_TL_CUDA_MAX_PEERS];
} ucc_tl_cuda_scratch_t;

/*
 * Initialization states of a TL/CUDA team.
 * NOTE(review): presumably these are the values stored in
 * ucc_tl_cuda_team_t::state and advanced by ucc_tl_cuda_comm_init(); the
 * actual transitions are not visible in this header - confirm against the
 * team implementation.
 */
enum {
/* First enumerator, so its value is 0. */
TL_CUDA_STATE_READY,
/* NOTE(review): name suggests the shared-memory id exchange is pending - confirm. */
TL_CUDA_STATE_SHM_ID_EXCHANGE,
/* NOTE(review): name suggests communicator initialization is pending - confirm. */
TL_CUDA_STATE_COMM_INIT,
/* NOTE(review): name suggests a failed initialization - confirm. */
TL_CUDA_STATE_ERROR
};

typedef struct ucc_tl_cuda_team {
ucc_tl_team_t super;
uint32_t seq_num;
ucc_tl_cuda_team_topo_t *topo;
ucc_tl_cuda_sync_t *sync;
ucc_tl_cuda_sync_state_t *sync_state;
ucc_tl_cuda_shm_barrier_t *bar;
ucc_tl_cuda_scratch_t scratch;
cudaStream_t stream;
ucc_tl_cuda_rank_id_t *ids;
ucc_team_oob_coll_t oob;
void *oob_req;
ucc_tl_team_t super;
int state;
uint32_t seq_num;
int device;
ucc_tl_cuda_device_pci_id_t device_id;
ucc_tl_cuda_team_topo_t *topo;
ucc_tl_cuda_sync_t *sync;
ucc_tl_cuda_sync_state_t *sync_state;
ucc_tl_cuda_shm_barrier_t *bar;
ucc_tl_cuda_scratch_t scratch;
cudaStream_t stream;
ucc_tl_cuda_rank_id_t *ids;
ucc_team_oob_coll_t oob;
void *oob_req;
} ucc_tl_cuda_team_t;

ucc_status_t ucc_tl_cuda_comm_init(ucc_tl_cuda_team_t *team);

UCC_CLASS_DECLARE(ucc_tl_cuda_team_t, ucc_base_context_t *,
const ucc_base_team_params_t *);

Expand Down
20 changes: 1 addition & 19 deletions src/components/tl/cuda/tl_cuda_coll.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand All @@ -12,24 +12,6 @@
#include "reduce_scatter/reduce_scatter.h"
#include "reduce_scatterv/reduce_scatterv.h"
#include "utils/arch/cpu.h"
#include "utils/arch/cuda_def.h"


#if ENABLE_DEBUG == 1
/*
 * Debug-only sanity check (compiled in when ENABLE_DEBUG == 1; expands to
 * nothing otherwise).
 *
 * Queries the current CUDA device with cudaGetDevice() and compares it with
 * the device recorded in the team's TL context
 * (UCC_TL_CUDA_TEAM_CTX(_team)->device). On mismatch it logs an error and
 * executes "return UCC_ERR_INVALID_PARAM" in the CALLING function, so the
 * macro may only be used as a statement inside a function that can return
 * that status value.
 *
 * NOTE(review): CUDA_CHECK is defined elsewhere; it presumably also returns
 * from the caller when cudaGetDevice() fails - confirm.
 */
/* TODO: possible need to check CUDA context */
#define UCC_TL_CUDA_CHECK_DEVICE_MATCH(_team) do { \
int _dev; \
CUDA_CHECK(cudaGetDevice(&_dev)); \
if (_dev != UCC_TL_CUDA_TEAM_CTX(_team)->device) { \
tl_error(UCC_TL_TEAM_LIB(_team), "CUDA device mismatch, " \
"current device %d, team device %d\n", _dev, \
UCC_TL_CUDA_TEAM_CTX(_team)->device); \
return UCC_ERR_INVALID_PARAM; \
} \
} while(0)
#else
#define UCC_TL_CUDA_CHECK_DEVICE_MATCH(_team)
#endif

const char *
ucc_tl_cuda_default_alg_select_str[UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR] = {
Expand Down
24 changes: 23 additions & 1 deletion src/components/tl/cuda/tl_cuda_coll.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2021-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand All @@ -9,11 +9,28 @@

#include "tl_cuda.h"
#include "components/mc/ucc_mc.h"
#include "utils/arch/cuda_def.h"

#define UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR 4
extern const char
*ucc_tl_cuda_default_alg_select_str[UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR];

#if ENABLE_DEBUG == 1
/*
 * Debug-only sanity check (compiled in when ENABLE_DEBUG == 1; expands to
 * nothing otherwise).
 *
 * Queries the current CUDA device with cudaGetDevice() and compares it with
 * the device recorded in the team itself ((_team)->device). On mismatch it
 * logs an error and executes "return UCC_ERR_INVALID_PARAM" in the CALLING
 * function, so the macro may only be used as a statement inside a function
 * that can return that status value.
 *
 * _team - pointer to a team object exposing a "device" member; it is also
 *         passed to UCC_TL_TEAM_LIB() to obtain the logging handle.
 *
 * NOTE(review): CUDA_CHECK is defined elsewhere; it presumably also returns
 * from the caller when cudaGetDevice() fails - confirm.
 */
/* TODO: possible need to check CUDA context */
#define UCC_TL_CUDA_CHECK_DEVICE_MATCH(_team) do { \
int _dev; \
CUDA_CHECK(cudaGetDevice(&_dev)); \
if (_dev != (_team)->device) { \
tl_error(UCC_TL_TEAM_LIB(_team), "CUDA device mismatch, " \
"current device %d, team device %d\n", _dev, \
(_team)->device); \
return UCC_ERR_INVALID_PARAM; \
} \
} while(0)
#else
#define UCC_TL_CUDA_CHECK_DEVICE_MATCH(_team)
#endif

#define TASK_TEAM(_task) \
(ucc_derived_of((_task)->super.team, ucc_tl_cuda_team_t))

Expand Down Expand Up @@ -85,6 +102,11 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args,
ucc_tl_cuda_task_t *task;
ucc_status_t status;

status = ucc_tl_cuda_comm_init(team);
if (ucc_unlikely(status != UCC_OK)) {
return status;
}
UCC_TL_CUDA_CHECK_DEVICE_MATCH(team);
if (!ucc_coll_args_is_predefined_dt(&coll_args->args, trank)) {
return UCC_ERR_NOT_SUPPORTED;
}
Expand Down
15 changes: 0 additions & 15 deletions src/components/tl/cuda/tl_cuda_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_context_t,
ucc_status_t status;
int num_devices;
cudaError_t cuda_st;
CUcontext cu_ctx;
CUresult cu_st;

UCC_CLASS_CALL_SUPER_INIT(ucc_tl_context_t, &tl_cuda_config->super,
params->context);
Expand All @@ -37,13 +35,6 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_context_t,
return UCC_ERR_NO_RESOURCE;
}

cu_st = cuCtxGetCurrent(&cu_ctx);
if (cu_ctx == NULL || cu_st != CUDA_SUCCESS) {
tl_debug(self->super.super.lib,
"cannot create CUDA TL context without active CUDA context");
return UCC_ERR_NO_RESOURCE;
}

status = ucc_mpool_init(&self->req_mp, 0, sizeof(ucc_tl_cuda_task_t), 0,
UCC_CACHE_LINE_SIZE, 8, UINT_MAX,
&ucc_coll_task_mpool_ops, params->thread_mode,
Expand All @@ -54,18 +45,12 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_context_t,
return status;
}

CUDA_CHECK_GOTO(cudaGetDevice(&self->device), free_mpool, status);
status = ucc_tl_cuda_topo_create(self->super.super.lib, &self->topo);
if (status != UCC_OK) {
tl_error(self->super.super.lib,
"failed to initialize tl_cuda_topo");
goto free_mpool;
}
status = ucc_tl_cuda_topo_get_pci_id(self->device, &self->device_id);
if (status != UCC_OK) {
tl_error(self->super.super.lib, "failed to get pci id");
goto free_mpool;
}

self->ipc_cache = kh_init(tl_cuda_ep_hash);
tl_debug(self->super.super.lib, "initialized tl context: %p", self);
Expand Down
Loading

0 comments on commit 915c5ac

Please sign in to comment.