From 189120d13e9c265f9352a96fe98566e7aaf9aaa1 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Fri, 22 Mar 2024 11:59:58 +0100 Subject: [PATCH] TL/CUDA: add linear bcast --- src/components/tl/cuda/Makefile.am | 5 ++ src/components/tl/cuda/bcast/bcast.c | 28 +++++++ src/components/tl/cuda/bcast/bcast.h | 43 +++++++++++ src/components/tl/cuda/bcast/bcast_linear.c | 86 +++++++++++++++++++++ src/components/tl/cuda/tl_cuda.c | 3 + src/components/tl/cuda/tl_cuda.h | 5 ++ src/components/tl/cuda/tl_cuda_coll.c | 16 ++++ src/components/tl/cuda/tl_cuda_coll.h | 2 +- 8 files changed, 187 insertions(+), 1 deletion(-) create mode 100644 src/components/tl/cuda/bcast/bcast.c create mode 100644 src/components/tl/cuda/bcast/bcast.h create mode 100644 src/components/tl/cuda/bcast/bcast_linear.c diff --git a/src/components/tl/cuda/Makefile.am b/src/components/tl/cuda/Makefile.am index e22796e6fa..c5a05f9c3b 100644 --- a/src/components/tl/cuda/Makefile.am +++ b/src/components/tl/cuda/Makefile.am @@ -27,6 +27,11 @@ alltoallv = \ alltoallv/alltoallv.c \ alltoallv/alltoallv_ce.c +bcast = \ + bcast/bcast.h \ + bcast/bcast.c \ + bcast/bcast_linear.c + reduce_scatter = \ reduce_scatter/reduce_scatter.h \ reduce_scatter/reduce_scatter.c \ diff --git a/src/components/tl/cuda/bcast/bcast.c b/src/components/tl/cuda/bcast/bcast.c new file mode 100644 index 0000000000..46623684fe --- /dev/null +++ b/src/components/tl/cuda/bcast/bcast.c @@ -0,0 +1,28 @@ +/** + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * See file LICENSE for terms. 
+ */
+
+#include "bcast.h"
+#include "components/mc/ucc_mc.h"
+
+ucc_base_coll_alg_info_t
+    ucc_tl_cuda_bcast_algs[UCC_TL_CUDA_BCAST_ALG_LAST + 1] = {
+        [UCC_TL_CUDA_BCAST_ALG_LINEAR] = {.id   = UCC_TL_CUDA_BCAST_ALG_LINEAR,
+                                          .name = "linear",
+                                          .desc = "linear bcast algorithm"},
+        [UCC_TL_CUDA_BCAST_ALG_LAST]   = {.id = 0, .name = NULL, .desc = NULL}};
+
+ucc_status_t ucc_tl_cuda_bcast_init(ucc_base_coll_args_t *coll_args,
+                                    ucc_base_team_t      *tl_team,
+                                    ucc_coll_task_t     **task_p)
+{
+    ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
+
+    /* Only the linear algorithm exists; it requires a fully connected topo */
+    if (!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
+        return UCC_ERR_NOT_SUPPORTED;
+    }
+    return ucc_tl_cuda_bcast_linear_init(coll_args, tl_team, task_p);
+}
diff --git a/src/components/tl/cuda/bcast/bcast.h b/src/components/tl/cuda/bcast/bcast.h
new file mode 100644
index 0000000000..17d07a529b
--- /dev/null
+++ b/src/components/tl/cuda/bcast/bcast.h
@@ -0,0 +1,43 @@
+/**
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#ifndef BCAST_H_
+#define BCAST_H_
+
+#include "tl_cuda.h"
+#include "tl_cuda_coll.h"
+
+enum
+{
+    UCC_TL_CUDA_BCAST_ALG_LINEAR,
+    UCC_TL_CUDA_BCAST_ALG_LAST
+};
+
+extern ucc_base_coll_alg_info_t
+    ucc_tl_cuda_bcast_algs[UCC_TL_CUDA_BCAST_ALG_LAST + 1];
+
+#define UCC_TL_CUDA_BCAST_DEFAULT_ALG_SELECT_STR "bcast:cuda:@0"
+
+ucc_status_t ucc_tl_cuda_bcast_init(ucc_base_coll_args_t *coll_args,
+                                    ucc_base_team_t      *tl_team,
+                                    ucc_coll_task_t     **task_p);
+
+ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args,
+                                           ucc_base_team_t      *tl_team,
+                                           ucc_coll_task_t     **task_p);
+
+static inline int ucc_tl_cuda_bcast_alg_from_str(const char *str)
+{
+    int alg = 0; /* index into ucc_tl_cuda_bcast_algs, compared by name */
+
+    while (alg < UCC_TL_CUDA_BCAST_ALG_LAST &&
+           strcasecmp(str, ucc_tl_cuda_bcast_algs[alg].name) != 0) {
+        alg++;
+    }
+    return alg; /* UCC_TL_CUDA_BCAST_ALG_LAST when no name matched */
+}
+
+#endif
diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c
new file mode 100644
index 0000000000..a1f5474501
--- /dev/null
+++ b/src/components/tl/cuda/bcast/bcast_linear.c
@@ -0,0 +1,86 @@
+/**
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "bcast/bcast.h"
+
+enum
+{
+    STAGE_SYNC,    /*< Wait for free SYNC segment */
+    STAGE_SETUP,   /*< Wait for memhandle setup to finish */
+    STAGE_COPIES,  /*< Linear algorithm is running */
+    STAGE_BARRIER, /*< Linear algorithm is done, waiting for
+                    *  other ranks to finish */
+};
+
+/* Task finalize callback: releases the task with ucc_tl_cuda_task_put() */
+ucc_status_t ucc_tl_cuda_bcast_linear_finalize(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
+
+    tl_trace(UCC_TASK_LIB(task), "finalizing task %p", task);
+    ucc_tl_cuda_task_put(task);
+    return UCC_OK;
+}
+
+/* Task progress callback.
+ * TODO: the stage machine is not implemented yet, so the task is only ever
+ * reported as UCC_INPROGRESS and never completes. */
+void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
+
+    task->super.status = UCC_INPROGRESS;
+}
+
+/* Task post callback: resets the stage and enqueues the task for progress */
+ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
+    ucc_tl_cuda_team_t *team = TASK_TEAM(task);
+    ucc_coll_args_t    *args = &TASK_ARGS(task);
+
+    task->bcast_linear.stage = STAGE_SYNC;
+    /* NOTE(review): bcast_linear has no buffer fields yet, so the source
+     * buffer is parked in the allgatherv_linear state - confirm this does not
+     * alias bcast_linear.stage and move it once bcast gets its own fields. */
+    task->allgatherv_linear.sbuf = args->src.info.buffer;
+
+    return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super);
+}
+
+/* Create a linear bcast task. Returns UCC_ERR_NOT_SUPPORTED unless the team
+ * topology is fully connected and the number of peers (team size - 1) fits
+ * in UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS. */
+ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args,
+                                           ucc_base_team_t      *tl_team,
+                                           ucc_coll_task_t     **task_p)
+{
+    ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
+    ucc_tl_cuda_task_t *task;
+    ucc_status_t        status;
+
+    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) ||
+        UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
+        return UCC_ERR_NOT_SUPPORTED;
+    }
+
+    status = ucc_tl_cuda_task_init(coll_args, team, &task);
+    if (ucc_unlikely(status != UCC_OK)) {
+        return status;
+    }
+
+    /* TODO: record datatype and buffers in task->bcast_linear once it has
+     * fields for them; only the stage is tracked so far. */
+    task->super.flags   |= UCC_COLL_TASK_FLAG_EXECUTOR;
+    task->super.post     = ucc_tl_cuda_bcast_linear_start;
+    task->super.progress = ucc_tl_cuda_bcast_linear_progress;
+    task->super.finalize = ucc_tl_cuda_bcast_linear_finalize;
+    task->bar            = TASK_BAR(task);
+
+    *task_p = &task->super;
+    return UCC_OK;
+}
+
diff --git a/src/components/tl/cuda/tl_cuda.c b/src/components/tl/cuda/tl_cuda.c index 98dccf26bf..18135fae00 100644 --- a/src/components/tl/cuda/tl_cuda.c +++ b/src/components/tl/cuda/tl_cuda.c @@ -9,6 +9,7 @@ #include "components/mc/base/ucc_mc_base.h" #include "allgather/allgather.h" #include "allgatherv/allgatherv.h" +#include "bcast/bcast.h" #include "reduce_scatter/reduce_scatter.h" #include "reduce_scatterv/reduce_scatterv.h" @@ -93,6 +94,8 @@ __attribute__((constructor)) static void tl_cuda_iface_init(void) ucc_tl_cuda_allgather_algs; ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_ALLGATHERV)] = ucc_tl_cuda_allgatherv_algs; + ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_BCAST)] = + ucc_tl_cuda_bcast_algs; ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_REDUCE_SCATTER)] = ucc_tl_cuda_reduce_scatter_algs; ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_REDUCE_SCATTERV)] = diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index 792100c80c..751dc4d8ad 100644 --- a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -27,6 +27,7 @@ #define UCC_TL_CUDA_SUPPORTED_COLLS \ (UCC_COLL_TYPE_ALLTOALL | UCC_COLL_TYPE_ALLTOALLV | \ UCC_COLL_TYPE_ALLGATHER | UCC_COLL_TYPE_ALLGATHERV | \ + UCC_COLL_TYPE_BCAST | \ UCC_COLL_TYPE_REDUCE_SCATTER | 
UCC_COLL_TYPE_REDUCE_SCATTERV) #define UCC_TL_CUDA_TEAM_LIB(_team) \ @@ -224,6 +225,10 @@ struct ucc_tl_cuda_task { size_t (*get_offset)(const ucc_tl_cuda_task_t *task, ucc_rank_t block); } allgatherv_linear; + + struct { + int stage; + } bcast_linear; struct { int stage; int num_frags; diff --git a/src/components/tl/cuda/tl_cuda_coll.c b/src/components/tl/cuda/tl_cuda_coll.c index 5d01cc1a94..42b33cdbcc 100644 --- a/src/components/tl/cuda/tl_cuda_coll.c +++ b/src/components/tl/cuda/tl_cuda_coll.c @@ -9,6 +9,7 @@ #include "alltoallv/alltoallv.h" #include "allgather/allgather.h" #include "allgatherv/allgatherv.h" +#include "bcast/bcast.h" #include "reduce_scatter/reduce_scatter.h" #include "reduce_scatterv/reduce_scatterv.h" #include "utils/arch/cpu.h" @@ -35,6 +36,7 @@ const char * ucc_tl_cuda_default_alg_select_str[UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR] = { UCC_TL_CUDA_ALLGATHER_DEFAULT_ALG_SELECT_STR, UCC_TL_CUDA_ALLGATHERV_DEFAULT_ALG_SELECT_STR, + UCC_TL_CUDA_BCAST_DEFAULT_ALG_SELECT_STR, UCC_TL_CUDA_REDUCE_SCATTER_DEFAULT_ALG_SELECT_STR, UCC_TL_CUDA_REDUCE_SCATTERV_DEFAULT_ALG_SELECT_STR}; @@ -78,6 +80,8 @@ ucc_status_t ucc_tl_cuda_coll_init(ucc_base_coll_args_t *coll_args, return ucc_tl_cuda_allgather_init(coll_args, team, task_h); case UCC_COLL_TYPE_ALLGATHERV: return ucc_tl_cuda_allgatherv_init(coll_args, team, task_h); + case UCC_COLL_TYPE_BCAST: + return ucc_tl_cuda_bcast_init(coll_args, team, task_h); case UCC_COLL_TYPE_REDUCE_SCATTER: return ucc_tl_cuda_reduce_scatter_init(coll_args, team, task_h); case UCC_COLL_TYPE_REDUCE_SCATTERV: @@ -134,6 +138,8 @@ static inline int alg_id_from_str(ucc_coll_type_t coll_type, const char *str) return ucc_tl_cuda_allgather_alg_from_str(str); case UCC_COLL_TYPE_ALLGATHERV: return ucc_tl_cuda_allgatherv_alg_from_str(str); + case UCC_COLL_TYPE_BCAST: + return ucc_tl_cuda_bcast_alg_from_str(str); default: break; } @@ -187,6 +193,16 @@ ucc_status_t ucc_tl_cuda_alg_id_to_init(int alg_id, const char *alg_id_str, break; }; 
break; + case UCC_COLL_TYPE_BCAST: + switch (alg_id) { + case UCC_TL_CUDA_BCAST_ALG_LINEAR: + *init = ucc_tl_cuda_bcast_linear_init; + break; + default: + status = UCC_ERR_INVALID_PARAM; + break; + }; + break; case UCC_COLL_TYPE_REDUCE_SCATTER: switch (alg_id) { case UCC_TL_CUDA_REDUCE_SCATTER_ALG_AUTO: diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h index 8b15cdf249..f450ff950c 100644 --- a/src/components/tl/cuda/tl_cuda_coll.h +++ b/src/components/tl/cuda/tl_cuda_coll.h @@ -10,7 +10,7 @@ #include "tl_cuda.h" #include "components/mc/ucc_mc.h" -#define UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR 4 +#define UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR 5 extern const char *ucc_tl_cuda_default_alg_select_str[UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR];