From 999b43ea4a5bf1a12450189907b2471457e7d974 Mon Sep 17 00:00:00 2001
From: Ilya Kryukov
Date: Fri, 22 Mar 2024 11:59:58 +0100
Subject: [PATCH 01/34] TL/CUDA: add linear bcast

---
 src/components/tl/cuda/Makefile.am          |  5 ++
 src/components/tl/cuda/bcast/bcast.c        | 28 +++++++
 src/components/tl/cuda/bcast/bcast.h        | 43 +++++++++++
 src/components/tl/cuda/bcast/bcast_linear.c | 86 +++++++++++++++++++++
 src/components/tl/cuda/tl_cuda.c            |  3 +
 src/components/tl/cuda/tl_cuda.h            |  5 ++
 src/components/tl/cuda/tl_cuda_coll.c       | 16 ++++
 src/components/tl/cuda/tl_cuda_coll.h       |  2 +-
 8 files changed, 187 insertions(+), 1 deletion(-)
 create mode 100644 src/components/tl/cuda/bcast/bcast.c
 create mode 100644 src/components/tl/cuda/bcast/bcast.h
 create mode 100644 src/components/tl/cuda/bcast/bcast_linear.c

diff --git a/src/components/tl/cuda/Makefile.am b/src/components/tl/cuda/Makefile.am
index e22796e6fa..c5a05f9c3b 100644
--- a/src/components/tl/cuda/Makefile.am
+++ b/src/components/tl/cuda/Makefile.am
@@ -27,6 +27,11 @@ alltoallv =              \
 	alltoallv/alltoallv.c    \
 	alltoallv/alltoallv_ce.c
 
+bcast =                  \
+	bcast/bcast.h        \
+	bcast/bcast.c        \
+	bcast/bcast_linear.c
+
 reduce_scatter =                    \
 	reduce_scatter/reduce_scatter.h \
 	reduce_scatter/reduce_scatter.c \
diff --git a/src/components/tl/cuda/bcast/bcast.c b/src/components/tl/cuda/bcast/bcast.c
new file mode 100644
index 0000000000..46623684fe
--- /dev/null
+++ b/src/components/tl/cuda/bcast/bcast.c
@@ -0,0 +1,28 @@
+/**
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "bcast.h"
+#include "components/mc/ucc_mc.h"
+
+ucc_base_coll_alg_info_t
+    ucc_tl_cuda_bcast_algs[UCC_TL_CUDA_BCAST_ALG_LAST + 1] = {
+        [UCC_TL_CUDA_BCAST_ALG_LINEAR] = {.id   = UCC_TL_CUDA_BCAST_ALG_LINEAR,
+                                          .name = "linear",
+                                          .desc = "linear bcast algorithm"},
+        [UCC_TL_CUDA_BCAST_ALG_LAST]   = {.id = 0, .name = NULL, .desc = NULL}};
+
+ucc_status_t ucc_tl_cuda_bcast_init(ucc_base_coll_args_t *coll_args,
+                                    ucc_base_team_t      *tl_team,
+                                    ucc_coll_task_t     **task_p)
+{
+    ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
+
+    if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
+        return ucc_tl_cuda_bcast_linear_init(coll_args, tl_team, task_p);
+    } else {
+        return UCC_ERR_NOT_SUPPORTED;
+    }
+}
diff --git a/src/components/tl/cuda/bcast/bcast.h b/src/components/tl/cuda/bcast/bcast.h
new file mode 100644
index 0000000000..17d07a529b
--- /dev/null
+++ b/src/components/tl/cuda/bcast/bcast.h
@@ -0,0 +1,43 @@
+/**
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#ifndef BCAST_H_
+#define BCAST_H_
+
+#include "tl_cuda.h"
+#include "tl_cuda_coll.h"
+
+enum
+{
+    UCC_TL_CUDA_BCAST_ALG_LINEAR,
+    UCC_TL_CUDA_BCAST_ALG_LAST
+};
+
+extern ucc_base_coll_alg_info_t
+    ucc_tl_cuda_bcast_algs[UCC_TL_CUDA_BCAST_ALG_LAST + 1];
+
+#define UCC_TL_CUDA_BCAST_DEFAULT_ALG_SELECT_STR "bcast:cuda:@0"
+
+ucc_status_t ucc_tl_cuda_bcast_init(ucc_base_coll_args_t *coll_args,
+                                    ucc_base_team_t      *tl_team,
+                                    ucc_coll_task_t     **task_p);
+
+ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args,
+                                           ucc_base_team_t      *tl_team,
+                                           ucc_coll_task_t     **task_p);
+
+static inline int ucc_tl_cuda_bcast_alg_from_str(const char *str)
+{
+    int i;
+    for (i = 0; i < UCC_TL_CUDA_BCAST_ALG_LAST; i++) {
+        if (0 == strcasecmp(str, ucc_tl_cuda_bcast_algs[i].name)) {
+            break;
+        }
+    }
+    return i;
+}
+
+#endif
diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c
new file mode 100644
index 0000000000..a1f5474501
--- /dev/null
+++ b/src/components/tl/cuda/bcast/bcast_linear.c
@@ -0,0 +1,86 @@
+/**
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "bcast/bcast.h"
+
+enum
+{
+    STAGE_SYNC,    /*< Wait for free SYNC segment */
+    STAGE_SETUP,   /*< Wait for memhandle setup to finish */
+    STAGE_COPIES,  /*< Linear algorithm is running */
+    STAGE_BARRIER, /*< Linear algorithm is done, waiting for
+                    * other ranks to finish */
+};
+
+ucc_status_t ucc_tl_cuda_bcast_linear_finalize(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
+
+    tl_trace(UCC_TASK_LIB(task), "finalizing task %p", task);
+    ucc_tl_cuda_task_put(task);
+    return UCC_OK;
+}
+
+void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
+    ucc_tl_cuda_team_t *team = TASK_TEAM(task);
+    ucc_status_t        st;
+
+    task->super.status = UCC_INPROGRESS;
+}
+
+ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_cuda_task_t *task  = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
+    ucc_tl_cuda_team_t *team  = TASK_TEAM(task);
+    ucc_coll_args_t    *args  = &TASK_ARGS(task);
+    ucc_rank_t          tsize = UCC_TL_TEAM_SIZE(team);
+    ucc_datatype_t      dt    = task->allgatherv_linear.dt;
+    ucc_rank_t          i;
+    size_t              send_size, frag_size, ssize;
+
+    task->bcast_linear.stage     = STAGE_SYNC;
+    task->allgatherv_linear.sbuf = args->src.info.buffer;
+
+
+    return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super);
+}
+
+ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args,
+                                           ucc_base_team_t      *tl_team,
+                                           ucc_coll_task_t     **task_p)
+{
+    ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
+    ucc_tl_cuda_task_t *task;
+    ucc_status_t        status;
+
+    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) ||
+                     UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
+        return UCC_ERR_NOT_SUPPORTED;
+    }
+
+    status = ucc_tl_cuda_task_init(coll_args, team, &task);
+    if (ucc_unlikely(status != UCC_OK)) {
+        return status;
+    }
+
+    // task->allgatherv_linear.get_count  = ucc_tl_cuda_allgather_get_count;
+    // task->allgatherv_linear.get_offset = ucc_tl_cuda_allgather_get_offset;
+    // task->allgatherv_linear.dt         = coll_args->args.dst.info.datatype;
+    // task->allgatherv_linear.sbuf       = coll_args->args.src.info.buffer;
+    // task->allgatherv_linear.rbuf       = coll_args->args.dst.info.buffer;
+
+    task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR;
+    task->super.post     = ucc_tl_cuda_allgatherv_linear_start;
+    task->super.progress = ucc_tl_cuda_allgatherv_linear_progress;
+    task->super.finalize = ucc_tl_cuda_allgatherv_linear_finalize;
+    task->bar            = TASK_BAR(task);
+
+    *task_p = &task->super;
+    return UCC_OK;
+}
+
diff --git a/src/components/tl/cuda/tl_cuda.c b/src/components/tl/cuda/tl_cuda.c
index 98dccf26bf..18135fae00 100644
--- a/src/components/tl/cuda/tl_cuda.c
+++ b/src/components/tl/cuda/tl_cuda.c
@@ -9,6 +9,7 @@
 #include "components/mc/base/ucc_mc_base.h"
 #include "allgather/allgather.h"
 #include "allgatherv/allgatherv.h"
+#include "bcast/bcast.h"
 #include "reduce_scatter/reduce_scatter.h"
 #include "reduce_scatterv/reduce_scatterv.h"
 
@@ -93,6 +94,8 @@ __attribute__((constructor)) static void tl_cuda_iface_init(void)
         ucc_tl_cuda_allgather_algs;
     ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_ALLGATHERV)] =
         ucc_tl_cuda_allgatherv_algs;
+    ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_BCAST)] =
+        ucc_tl_cuda_bcast_algs;
     ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_REDUCE_SCATTER)] =
         ucc_tl_cuda_reduce_scatter_algs;
     ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_REDUCE_SCATTERV)] =
diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h
index 792100c80c..751dc4d8ad 100644
--- a/src/components/tl/cuda/tl_cuda.h
+++ b/src/components/tl/cuda/tl_cuda.h
@@ -27,6 +27,7 @@
 #define UCC_TL_CUDA_SUPPORTED_COLLS                            \
     (UCC_COLL_TYPE_ALLTOALL | UCC_COLL_TYPE_ALLTOALLV |        \
      UCC_COLL_TYPE_ALLGATHER | UCC_COLL_TYPE_ALLGATHERV |      \
+     UCC_COLL_TYPE_BCAST |                                     \
      UCC_COLL_TYPE_REDUCE_SCATTER | UCC_COLL_TYPE_REDUCE_SCATTERV)
 
 #define UCC_TL_CUDA_TEAM_LIB(_team)                            \
@@ -224,6 +225,10 @@ struct ucc_tl_cuda_task {
             size_t (*get_offset)(const ucc_tl_cuda_task_t *task,
                                  ucc_rank_t block);
         } allgatherv_linear;
+
+        struct {
+            int stage;
+        } bcast_linear;
         struct {
             int stage;
             int num_frags;
diff --git a/src/components/tl/cuda/tl_cuda_coll.c b/src/components/tl/cuda/tl_cuda_coll.c
index 5d01cc1a94..42b33cdbcc 100644
--- a/src/components/tl/cuda/tl_cuda_coll.c
+++ b/src/components/tl/cuda/tl_cuda_coll.c
@@ -9,6 +9,7 @@
 #include "alltoallv/alltoallv.h"
 #include "allgather/allgather.h"
 #include "allgatherv/allgatherv.h"
+#include "bcast/bcast.h"
 #include "reduce_scatter/reduce_scatter.h"
 #include "reduce_scatterv/reduce_scatterv.h"
 #include "utils/arch/cpu.h"
@@ -35,6 +36,7 @@ const char *
     ucc_tl_cuda_default_alg_select_str[UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR] = {
         UCC_TL_CUDA_ALLGATHER_DEFAULT_ALG_SELECT_STR,
         UCC_TL_CUDA_ALLGATHERV_DEFAULT_ALG_SELECT_STR,
+        UCC_TL_CUDA_BCAST_DEFAULT_ALG_SELECT_STR,
         UCC_TL_CUDA_REDUCE_SCATTER_DEFAULT_ALG_SELECT_STR,
         UCC_TL_CUDA_REDUCE_SCATTERV_DEFAULT_ALG_SELECT_STR};
 
@@ -78,6 +80,8 @@ ucc_status_t ucc_tl_cuda_coll_init(ucc_base_coll_args_t *coll_args,
         return ucc_tl_cuda_allgather_init(coll_args, team, task_h);
     case UCC_COLL_TYPE_ALLGATHERV:
         return ucc_tl_cuda_allgatherv_init(coll_args, team, task_h);
+    case UCC_COLL_TYPE_BCAST:
+        return ucc_tl_cuda_bcast_init(coll_args, team, task_h);
     case UCC_COLL_TYPE_REDUCE_SCATTER:
         return ucc_tl_cuda_reduce_scatter_init(coll_args, team, task_h);
     case UCC_COLL_TYPE_REDUCE_SCATTERV:
@@ -134,6 +138,8 @@ static inline int alg_id_from_str(ucc_coll_type_t coll_type, const char *str)
         return ucc_tl_cuda_allgather_alg_from_str(str);
     case UCC_COLL_TYPE_ALLGATHERV:
         return ucc_tl_cuda_allgatherv_alg_from_str(str);
+    case UCC_COLL_TYPE_BCAST:
+        return ucc_tl_cuda_bcast_alg_from_str(str);
     default:
         break;
     }
@@ -187,6 +193,16 @@ ucc_status_t 
ucc_tl_cuda_alg_id_to_init(int alg_id, const char *alg_id_str, break; }; break; + case UCC_COLL_TYPE_BCAST: + switch (alg_id) { + case UCC_TL_CUDA_BCAST_ALG_LINEAR: + *init = ucc_tl_cuda_bcast_linear_init; + break; + default: + status = UCC_ERR_INVALID_PARAM; + break; + }; + break; case UCC_COLL_TYPE_REDUCE_SCATTER: switch (alg_id) { case UCC_TL_CUDA_REDUCE_SCATTER_ALG_AUTO: diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h index 8b15cdf249..f450ff950c 100644 --- a/src/components/tl/cuda/tl_cuda_coll.h +++ b/src/components/tl/cuda/tl_cuda_coll.h @@ -10,7 +10,7 @@ #include "tl_cuda.h" #include "components/mc/ucc_mc.h" -#define UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR 4 +#define UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR 5 extern const char *ucc_tl_cuda_default_alg_select_str[UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR]; From fceeb8f99eff1d792b6c6e12fc4051aff0e17609 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Fri, 22 Mar 2024 11:59:58 +0100 Subject: [PATCH 02/34] TL/CUDA: fix build --- src/components/tl/cuda/Makefile.am | 1 + src/components/tl/cuda/bcast/bcast_linear.c | 16 +++++++++------- src/components/tl/cuda/tl_cuda_context.c | 16 ++++++++-------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/components/tl/cuda/Makefile.am b/src/components/tl/cuda/Makefile.am index c5a05f9c3b..2136821b93 100644 --- a/src/components/tl/cuda/Makefile.am +++ b/src/components/tl/cuda/Makefile.am @@ -59,6 +59,7 @@ sources = \ $(allgatherv) \ $(alltoall) \ $(alltoallv) \ + $(bcast) \ $(reduce_scatter) \ $(reduce_scatterv) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index a1f5474501..db9bdb04e5 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -29,7 +29,8 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); ucc_tl_cuda_team_t *team = TASK_TEAM(task); ucc_status_t st; - + (void) team; + (void) st; task->super.status = UCC_INPROGRESS; } @@ -40,11 +41,12 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) ucc_coll_args_t * args = &TASK_ARGS(task); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); ucc_datatype_t dt = task->allgatherv_linear.dt; - ucc_rank_t i; - size_t send_size, frag_size, ssize; + (void) tsize; + (void) args; + (void) dt; task->bcast_linear.stage = STAGE_SYNC; - task->allgatherv_linear.sbuf = args->src.info.buffer; + // task->bcast_linear.sbuf = args->src.info.buffer; return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super); @@ -75,9 +77,9 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, // task->allgatherv_linear.rbuf = coll_args->args.dst.info.buffer; task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR; - task->super.post = ucc_tl_cuda_allgatherv_linear_start; - task->super.progress = ucc_tl_cuda_allgatherv_linear_progress; - task->super.finalize = ucc_tl_cuda_allgatherv_linear_finalize; + task->super.post = ucc_tl_cuda_bcast_linear_start; + task->super.progress = ucc_tl_cuda_bcast_linear_progress; + task->super.finalize = ucc_tl_cuda_bcast_linear_finalize; task->bar = TASK_BAR(task); *task_p = &task->super; diff --git a/src/components/tl/cuda/tl_cuda_context.c b/src/components/tl/cuda/tl_cuda_context.c index 4d89029680..ffb48336f5 100644 --- a/src/components/tl/cuda/tl_cuda_context.c +++ b/src/components/tl/cuda/tl_cuda_context.c @@ -20,8 +20,8 @@ 
UCC_CLASS_INIT_FUNC(ucc_tl_cuda_context_t, ucc_status_t status; int num_devices; cudaError_t cuda_st; - CUcontext cu_ctx; - CUresult cu_st; + // CUcontext cu_ctx; + // CUresult cu_st; UCC_CLASS_CALL_SUPER_INIT(ucc_tl_context_t, &tl_cuda_config->super, params->context); @@ -37,12 +37,12 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_context_t, return UCC_ERR_NO_RESOURCE; } - cu_st = cuCtxGetCurrent(&cu_ctx); - if (cu_ctx == NULL || cu_st != CUDA_SUCCESS) { - tl_debug(self->super.super.lib, - "cannot create CUDA TL context without active CUDA context"); - return UCC_ERR_NO_RESOURCE; - } + // cu_st = cuCtxGetCurrent(&cu_ctx); + // if (cu_ctx == NULL || cu_st != CUDA_SUCCESS) { + // tl_debug(self->super.super.lib, + // "cannot create CUDA TL context without active CUDA context"); + // return UCC_ERR_NO_RESOURCE; + // } status = ucc_mpool_init(&self->req_mp, 0, sizeof(ucc_tl_cuda_task_t), 0, UCC_CACHE_LINE_SIZE, 8, UINT_MAX, From dc65324e71a05d68bbf721d3bae28a988afb63cb Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Sun, 30 Jun 2024 17:29:41 +0200 Subject: [PATCH 03/34] TL/CUDA: wip --- src/components/tl/cuda/bcast/bcast_linear.c | 76 ++++++++++++++++++--- src/components/tl/cuda/tl_cuda.h | 5 ++ 2 files changed, 71 insertions(+), 10 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index db9bdb04e5..c557bf6fdc 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -8,13 +8,24 @@ enum { - STAGE_SYNC, /*< Wait for free SYNC segment */ - STAGE_SETUP, /*< Wait for memhandle setup to finish */ - STAGE_COPIES, /*< Linear algorithm is running */ - STAGE_BARRIER, /*< Linear algorithm is done, waiting for - * other ranks to finish */ + STAGE_COPY, // post copy task: copy block from src to scratch buffer + STAGE_WAIT_COPY, // wait for copy finishes + STAGE_WAIT_ALL, // wait for all others rank be on same step }; +static inline ucc_status_t ecopy(void *dst, void *src, size_t size, + ucc_ee_executor_t * exec, + ucc_ee_executor_task_t **etask) +{ + ucc_ee_executor_task_args_t exec_args = {0}; + + exec_args.task_type = UCC_EE_EXECUTOR_TASK_COPY; + exec_args.copy.dst = dst; + exec_args.copy.src = src; + exec_args.copy.len = size; + return ucc_ee_executor_task_post(exec, &exec_args, etask); +} + ucc_status_t ucc_tl_cuda_bcast_linear_finalize(ucc_coll_task_t *coll_task) { ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); @@ -26,12 +37,50 @@ ucc_status_t ucc_tl_cuda_bcast_linear_finalize(ucc_coll_task_t *coll_task) void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) { - ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); - ucc_tl_cuda_team_t *team = TASK_TEAM(task); + ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); + ucc_tl_cuda_team_t *team = TASK_TEAM(task); + ucc_rank_t trank = UCC_TL_TEAM_RANK(team); ucc_status_t st; - (void) team; - (void) st; + (void)team; + (void)st; + ucc_ee_executor_task_t *etask; + ucc_ee_executor_t *exec; + ucc_status_t status; + void * sbuf, *dbuf, *scratch_buf; task->super.status = UCC_INPROGRESS; + + // fall-through between cases is intentional + switch (task->bcast_linear.stage) + { + case STAGE_COPY: + // copy from src buffer to scratch + scratch_buf = TASK_SCRATCH(task, trank); + block_size = + status = ecopy(dbuf, sbuf, block_size); + break; + + default: + break; + } + + status = ucc_coll_task_get_executor(&task->super, &exec); + if (ucc_unlikely(status != UCC_OK)) { + return; + } + 
+ etask = &task->bcast_linear.exec_task; + + ucc_ee_executor_task_args_t exec_args = {0}; + + exec_args.task_type = UCC_EE_EXECUTOR_TASK_COPY; + exec_args.copy.dst = task->bcast_linear.dbuf; + exec_args.copy.src = task->bcast_linear.sbuf; + exec_args.copy.len = task->bcast_linear.size; + status = ucc_ee_executor_task_post(exec, &exec_args, etask); + if (ucc_unlikely(status != UCC_OK)) { + ucc_error("ucc_ee_executor_task_post failed"); + return; + } } ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) @@ -46,7 +95,10 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) (void) args; (void) dt; task->bcast_linear.stage = STAGE_SYNC; - // task->bcast_linear.sbuf = args->src.info.buffer; + ucc_info("bcast start"); + + task->bcast_linear.sbuf = args->src.info.buffer; + task->bcast_linear.rbuf = args->dst.info.buffer; return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super); @@ -60,6 +112,8 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, ucc_tl_cuda_task_t *task; ucc_status_t status; + ucc_info("bcast init"); + if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) || UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) { return UCC_ERR_NOT_SUPPORTED; @@ -82,6 +136,8 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, task->super.finalize = ucc_tl_cuda_bcast_linear_finalize; task->bar = TASK_BAR(task); + ucc_info("bcast init success"); + *task_p = &task->super; return UCC_OK; } diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index 751dc4d8ad..0d1a484187 100644 --- a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -228,6 +228,11 @@ struct ucc_tl_cuda_task { struct { int stage; + int step; + void * sbuf; + void * rbuf; + size_t size; + ucc_ee_executor_task_t *exec_task; } bcast_linear; struct { int stage; From 74375184889a86b854b2501c9ef490c3eb541580 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Mon, 1 Jul 2024 20:33:57 +0200 Subject: [PATCH 04/34] TL/CUDA: fix compilation --- src/components/tl/cuda/bcast/bcast_linear.c | 46 ++++++++------------- src/components/tl/cuda/tl_cuda.h | 1 + 2 files changed, 18 insertions(+), 29 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index c557bf6fdc..3e0f1fbfc8 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -43,44 +43,28 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_status_t st; (void)team; (void)st; - ucc_ee_executor_task_t *etask; ucc_ee_executor_t *exec; ucc_status_t status; - void * sbuf, *dbuf, *scratch_buf; + void * sbuf, *dbuf; task->super.status = UCC_INPROGRESS; + status = ucc_coll_task_get_executor(&task->super, &exec); + if (ucc_unlikely(status != UCC_OK)) { + return; + } + // fall-through between cases is intentional switch (task->bcast_linear.stage) { case STAGE_COPY: // copy from src buffer to scratch - scratch_buf = TASK_SCRATCH(task, trank); - block_size = - status = ecopy(dbuf, sbuf, block_size); + dbuf = TASK_SCRATCH(task, trank); + sbuf = task->bcast_linear.sbuf; + status = ecopy(dbuf, sbuf, task->bcast_linear.size, exec, &task->bcast_linear.exec_task); break; - default: break; } - - status = ucc_coll_task_get_executor(&task->super, &exec); - if (ucc_unlikely(status != UCC_OK)) { - return; - } - - etask = &task->bcast_linear.exec_task; - - ucc_ee_executor_task_args_t exec_args = 
{0}; - - exec_args.task_type = UCC_EE_EXECUTOR_TASK_COPY; - exec_args.copy.dst = task->bcast_linear.dbuf; - exec_args.copy.src = task->bcast_linear.sbuf; - exec_args.copy.len = task->bcast_linear.size; - status = ucc_ee_executor_task_post(exec, &exec_args, etask); - if (ucc_unlikely(status != UCC_OK)) { - ucc_error("ucc_ee_executor_task_post failed"); - return; - } } ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) @@ -89,13 +73,13 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) ucc_tl_cuda_team_t *team = TASK_TEAM(task); ucc_coll_args_t * args = &TASK_ARGS(task); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); - ucc_datatype_t dt = task->allgatherv_linear.dt; + ucc_datatype_t dt = task->bcast_linear.dt; (void) tsize; (void) args; (void) dt; - task->bcast_linear.stage = STAGE_SYNC; - ucc_info("bcast start"); + task->bcast_linear.stage = STAGE_COPY; + ucc_info("bcast start with dt: %ld", dt); task->bcast_linear.sbuf = args->src.info.buffer; task->bcast_linear.rbuf = args->dst.info.buffer; @@ -126,10 +110,14 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, // task->allgatherv_linear.get_count = ucc_tl_cuda_allgather_get_count; // task->allgatherv_linear.get_offset = ucc_tl_cuda_allgather_get_offset; - // task->allgatherv_linear.dt = coll_args->args.dst.info.datatype; + task->bcast_linear.dt = coll_args->args.src.info.datatype; + + ucc_info("bcast start with dt: %ld", task->bcast_linear.dt); + // task->allgatherv_linear.sbuf = coll_args->args.src.info.buffer; // task->allgatherv_linear.rbuf = coll_args->args.dst.info.buffer; + task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR; task->super.post = ucc_tl_cuda_bcast_linear_start; task->super.progress = ucc_tl_cuda_bcast_linear_progress; diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index 0d1a484187..f3bb3135d9 100644 --- a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -231,6 +231,7 @@ struct ucc_tl_cuda_task { int step; void * sbuf; void * rbuf; + ucc_datatype_t dt; size_t size; ucc_ee_executor_task_t *exec_task; } bcast_linear; From 62157cd684b8aba41c8f222122cb7604ca86f473 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Mon, 1 Jul 2024 22:03:05 +0200 Subject: [PATCH 05/34] TL/CUDA: calc size --- src/components/tl/cuda/bcast/bcast_linear.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 3e0f1fbfc8..48ea6159a0 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -61,6 +61,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) dbuf = TASK_SCRATCH(task, trank); sbuf = task->bcast_linear.sbuf; status = ecopy(dbuf, sbuf, task->bcast_linear.size, exec, &task->bcast_linear.exec_task); + task->bcast_linear.stage = STAGE_WAIT_COPY; break; default: break; @@ -111,9 +112,11 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, // task->allgatherv_linear.get_count = ucc_tl_cuda_allgather_get_count; // task->allgatherv_linear.get_offset = ucc_tl_cuda_allgather_get_offset; task->bcast_linear.dt = coll_args->args.src.info.datatype; - ucc_info("bcast start with dt: %ld", task->bcast_linear.dt); + task->bcast_linear.size = coll_args->args.src.info.count * ucc_dt_size(task->bcast_linear.dt); + ucc_info("bcast start with data size: %ld", task->bcast_linear.size); + // task->allgatherv_linear.sbuf = 
coll_args->args.src.info.buffer; // task->allgatherv_linear.rbuf = coll_args->args.dst.info.buffer; From a0f225e762ed8030c2e437c049cabc9fe0c3bc2d Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Mon, 1 Jul 2024 22:03:05 +0200 Subject: [PATCH 06/34] TL/CUDA: wip some logic for root --- src/components/tl/cuda/bcast/bcast_linear.c | 129 ++++++++++++++------ src/components/tl/cuda/tl_cuda.h | 1 - 2 files changed, 91 insertions(+), 39 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 48ea6159a0..e7822e5996 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -6,15 +6,34 @@ #include "bcast/bcast.h" -enum -{ - STAGE_COPY, // post copy task: copy block from src to scratch buffer +enum { + STAGE_UNDEF, + // root + STAGE_COPY, // post copy task: copy block from src to scratch buffer STAGE_WAIT_COPY, // wait for copy finishes - STAGE_WAIT_ALL, // wait for all others rank be on same step + STAGE_WAIT_ALL, // wait for all others rank be on same step + // non-root }; +// TODO: move out to common with allgather +static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, + int step_id) +{ + ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); + + return sync->seq_num[step_id]; +} + +static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, + int step, int step_id) +{ + ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); + + sync->seq_num[step_id] = step; +} + static inline ucc_status_t ecopy(void *dst, void *src, size_t size, - ucc_ee_executor_t * exec, + ucc_ee_executor_t *exec, ucc_ee_executor_task_t **etask) { ucc_ee_executor_task_args_t exec_args = {0}; @@ -40,31 +59,68 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); ucc_tl_cuda_team_t *team = TASK_TEAM(task); ucc_rank_t trank = UCC_TL_TEAM_RANK(team); + ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); ucc_status_t st; (void)team; (void)st; ucc_ee_executor_t *exec; + ucc_ee_executor_task_t *etask; ucc_status_t status; - void * sbuf, *dbuf; + void *sbuf, *dbuf; task->super.status = UCC_INPROGRESS; status = ucc_coll_task_get_executor(&task->super, &exec); if (ucc_unlikely(status != UCC_OK)) { return; } - - // fall-through between cases is intentional - switch (task->bcast_linear.stage) - { - case STAGE_COPY: - // copy from src buffer to scratch - dbuf = TASK_SCRATCH(task, trank); - sbuf = task->bcast_linear.sbuf; - status = ecopy(dbuf, sbuf, task->bcast_linear.size, exec, &task->bcast_linear.exec_task); - task->bcast_linear.stage = STAGE_WAIT_COPY; - break; - default: - break; + + if (trank == 0) { + // fall-through between cases is intentional + switch (task->bcast_linear.stage) { + case STAGE_COPY: + // copy from src buffer to scratch + dbuf = TASK_SCRATCH(task, trank); + sbuf = task->bcast_linear.sbuf; + status = ecopy(dbuf, sbuf, task->bcast_linear.size, exec, + &task->bcast_linear.exec_task); + task->bcast_linear.stage = STAGE_WAIT_COPY; + break; + case STAGE_WAIT_COPY: + etask = task->bcast_linear.exec_task; + if (etask) { + status = ucc_ee_executor_task_test(etask); + if (status == UCC_OK) { + ucc_ee_executor_task_finalize(etask); + task->bcast_linear.exec_task = NULL; + ucc_info("hello from rank: %d, copy done!", trank); + // signal others + ++task->bcast_linear.step; + set_rank_step(task, 0, task->bcast_linear.step, 0); + task->bcast_linear.stage = STAGE_WAIT_ALL; + } + } + break; + case 
STAGE_WAIT_ALL: + for (int i = 1; i < tsize; ++i) { + int other_rank_step = get_rank_step(task, i, 0); + ucc_info("rank %d, step: %d, my step: %d", i, other_rank_step, + task->bcast_linear.step); + if (other_rank_step < task->bcast_linear.step) { + ucc_info("rank %d is not ready", i); + return; + } + } + task->bcast_linear.stage = STAGE_COPY; + ucc_info("all others ready for next step"); + // TODO: remove + task->bcast_linear.stage = STAGE_UNDEF; + break; + default: + break; + } + // Root scenario + } else { + // others } } @@ -72,26 +128,25 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) { ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); ucc_tl_cuda_team_t *team = TASK_TEAM(task); - ucc_coll_args_t * args = &TASK_ARGS(task); + ucc_coll_args_t *args = &TASK_ARGS(task); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); ucc_datatype_t dt = task->bcast_linear.dt; - (void) tsize; - (void) args; - (void) dt; - task->bcast_linear.stage = STAGE_COPY; + (void)tsize; + (void)args; + (void)dt; + task->bcast_linear.stage = STAGE_COPY; ucc_info("bcast start with dt: %ld", dt); task->bcast_linear.sbuf = args->src.info.buffer; - task->bcast_linear.rbuf = args->dst.info.buffer; - + task->bcast_linear.step = 0; return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super); } ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t * tl_team, - ucc_coll_task_t ** task_p) + ucc_base_team_t *tl_team, + ucc_coll_task_t **task_p) { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); ucc_tl_cuda_task_t *task; @@ -100,7 +155,8 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, ucc_info("bcast init"); if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) || - UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) { + UCC_TL_TEAM_SIZE(team) - 1 > + UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) { return UCC_ERR_NOT_SUPPORTED; } @@ -111,25 +167,22 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, // task->allgatherv_linear.get_count = ucc_tl_cuda_allgather_get_count; // task->allgatherv_linear.get_offset = ucc_tl_cuda_allgather_get_offset; - task->bcast_linear.dt = coll_args->args.src.info.datatype; + task->bcast_linear.dt = coll_args->args.src.info.datatype; ucc_info("bcast start with dt: %ld", task->bcast_linear.dt); - task->bcast_linear.size = coll_args->args.src.info.count * ucc_dt_size(task->bcast_linear.dt); - ucc_info("bcast start with data size: %ld", task->bcast_linear.size); - // task->allgatherv_linear.sbuf = coll_args->args.src.info.buffer; // task->allgatherv_linear.rbuf = coll_args->args.dst.info.buffer; + task->bcast_linear.sbuf = coll_args->args.src.info.buffer; task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR; - task->super.post = ucc_tl_cuda_bcast_linear_start; - task->super.progress = ucc_tl_cuda_bcast_linear_progress; - task->super.finalize = ucc_tl_cuda_bcast_linear_finalize; - task->bar = TASK_BAR(task); + task->super.post = ucc_tl_cuda_bcast_linear_start; + task->super.progress = ucc_tl_cuda_bcast_linear_progress; + task->super.finalize = ucc_tl_cuda_bcast_linear_finalize; + task->bar = TASK_BAR(task); ucc_info("bcast init success"); *task_p = &task->super; return UCC_OK; } - diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index f3bb3135d9..c8f60258aa 100644 --- a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -230,7 +230,6 @@ struct 
ucc_tl_cuda_task { int stage; int step; void * sbuf; - void * rbuf; ucc_datatype_t dt; size_t size; ucc_ee_executor_task_t *exec_task; From 4187516289a32d7de3152faf030f60ae30cca102 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Tue, 2 Jul 2024 19:06:01 +0200 Subject: [PATCH 07/34] TL/CUDA: wip logic for client --- src/components/tl/cuda/bcast/bcast_linear.c | 78 +++++++++++++++++---- 1 file changed, 66 insertions(+), 12 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index e7822e5996..3a6f99005f 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -6,13 +6,17 @@ #include "bcast/bcast.h" -enum { - STAGE_UNDEF, +enum +{ + STAGE_DONE, // root STAGE_COPY, // post copy task: copy block from src to scratch buffer STAGE_WAIT_COPY, // wait for copy finishes STAGE_WAIT_ALL, // wait for all others rank be on same step // non-root + STAGE_WAIT_ROOT, + STAGE_CLIENT_COPY, + STAGE_CLIENT_COPY_WAIT, }; // TODO: move out to common with allgather @@ -33,7 +37,7 @@ static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, } static inline ucc_status_t ecopy(void *dst, void *src, size_t size, - ucc_ee_executor_t *exec, + ucc_ee_executor_t * exec, ucc_ee_executor_task_t **etask) { ucc_ee_executor_task_args_t exec_args = {0}; @@ -63,10 +67,10 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_status_t st; (void)team; (void)st; - ucc_ee_executor_t *exec; + ucc_ee_executor_t * exec; ucc_ee_executor_task_t *etask; ucc_status_t status; - void *sbuf, *dbuf; + void * sbuf, *dbuf; task->super.status = UCC_INPROGRESS; status = ucc_coll_task_get_executor(&task->super, &exec); @@ -113,7 +117,10 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.stage = STAGE_COPY; ucc_info("all others ready for next step"); // TODO: remove - task->bcast_linear.stage = STAGE_UNDEF; + task->bcast_linear.stage = STAGE_DONE; + break; + case STAGE_DONE: + task->super.status = UCC_OK; break; default: break; @@ -121,6 +128,43 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) // Root scenario } else { // others + switch (task->bcast_linear.stage) { + case STAGE_WAIT_ROOT: + /* code */ + if (get_rank_step(task, 0 /* root */, 0) > + task->bcast_linear.step) { + ucc_info("something from root is ready!"); + task->bcast_linear.stage = STAGE_CLIENT_COPY; + break; + } + break; + case STAGE_CLIENT_COPY: + dbuf = task->bcast_linear.sbuf; + sbuf = TASK_SCRATCH(task, + 0); // need to copy from root's scratch buffer + status = ecopy(dbuf, sbuf, task->bcast_linear.size, exec, + &task->bcast_linear.exec_task); + task->bcast_linear.stage = STAGE_CLIENT_COPY_WAIT; + break; + case STAGE_CLIENT_COPY_WAIT: + etask = task->bcast_linear.exec_task; + if (etask) { + status = ucc_ee_executor_task_test(etask); + if (status == UCC_OK) { + ucc_ee_executor_task_finalize(etask); + task->bcast_linear.exec_task = NULL; + ++task->bcast_linear.step; + set_rank_step(task, trank, task->bcast_linear.step, 0); + task->bcast_linear.stage = STAGE_DONE; // TODO: just for debug + } + } + break; + case STAGE_DONE: + task->super.status = UCC_OK; + break; + default: + break; + } } } @@ -128,15 +172,25 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) { ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); ucc_tl_cuda_team_t *team = TASK_TEAM(task); - ucc_coll_args_t *args = &TASK_ARGS(task); + ucc_rank_t trank = 
UCC_TL_TEAM_RANK(team); + ucc_coll_args_t * args = &TASK_ARGS(task); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); ucc_datatype_t dt = task->bcast_linear.dt; (void)tsize; (void)args; (void)dt; - task->bcast_linear.stage = STAGE_COPY; - ucc_info("bcast start with dt: %ld", dt); + + if (trank == 0) { + task->bcast_linear.stage = STAGE_COPY; + } else { + task->bcast_linear.stage = STAGE_WAIT_ROOT; + } + ucc_info("bcast start with dt: %ld and count: %ld", dt, args->src.info.count); + + task->bcast_linear.size = ucc_dt_size(dt) * args->src.info.count; + + ucc_info("bcast buffer size: %ld", task->bcast_linear.size); task->bcast_linear.sbuf = args->src.info.buffer; task->bcast_linear.step = 0; @@ -145,8 +199,8 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) } ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t *tl_team, - ucc_coll_task_t **task_p) + ucc_base_team_t * tl_team, + ucc_coll_task_t ** task_p) { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); ucc_tl_cuda_task_t *task; @@ -168,7 +222,7 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, // task->allgatherv_linear.get_count = ucc_tl_cuda_allgather_get_count; // task->allgatherv_linear.get_offset = ucc_tl_cuda_allgather_get_offset; task->bcast_linear.dt = coll_args->args.src.info.datatype; - ucc_info("bcast start with dt: %ld", task->bcast_linear.dt); + ucc_info("bcast init with dt: %ld", task->bcast_linear.dt); // task->allgatherv_linear.sbuf = coll_args->args.src.info.buffer; // task->allgatherv_linear.rbuf = coll_args->args.dst.info.buffer; From 3f0e4d9c882c58d6dcdbfe497e866601f0bccc4a Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Wed, 3 Jul 2024 13:20:13 +0200 Subject: [PATCH 08/34] TL/CUDA: added barrier to sync stages --- src/components/tl/cuda/bcast/bcast_linear.c | 97 ++++++++++++++++----- 1 file changed, 74 insertions(+), 23 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 3a6f99005f..41e224acff 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -6,9 +6,10 @@ #include "bcast/bcast.h" -enum -{ +enum { STAGE_DONE, + STAGE_SYNC, + STAGE_SETUP, // root STAGE_COPY, // post copy task: copy block from src to scratch buffer STAGE_WAIT_COPY, // wait for copy finishes @@ -36,8 +37,33 @@ static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, sync->seq_num[step_id] = step; } +ucc_status_t ucc_tl_cuda_bcast_linear_setup_start(ucc_tl_cuda_task_t *task) +{ + ucc_tl_cuda_team_t *team = TASK_TEAM(task); + ucc_rank_t trank = UCC_TL_TEAM_RANK(team); + ucc_status_t status; + + set_rank_step(task, trank, 0, 0); + ucc_memory_cpu_store_fence(); + status = ucc_tl_cuda_shm_barrier_start(UCC_TL_TEAM_RANK(team), task->bar); + if (ucc_unlikely(status != UCC_OK)) { + goto exit_err; + } + + return UCC_OK; + +exit_err: + return status; +} + +ucc_status_t ucc_tl_cuda_bcast_linear_setup_test(ucc_tl_cuda_task_t *task) +{ + ucc_tl_cuda_team_t *team = TASK_TEAM(task); + return ucc_tl_cuda_shm_barrier_test(UCC_TL_TEAM_RANK(team), task->bar); +} + static inline ucc_status_t ecopy(void *dst, void *src, size_t size, - ucc_ee_executor_t * exec, + ucc_ee_executor_t *exec, ucc_ee_executor_task_t **etask) { ucc_ee_executor_task_args_t exec_args = {0}; @@ -67,10 +93,10 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_status_t st; (void)team; (void)st; - ucc_ee_executor_t * exec; + 
ucc_ee_executor_t *exec; ucc_ee_executor_task_t *etask; ucc_status_t status; - void * sbuf, *dbuf; + void *sbuf, *dbuf; task->super.status = UCC_INPROGRESS; status = ucc_coll_task_get_executor(&task->super, &exec); @@ -78,6 +104,38 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) return; } + switch (task->bcast_linear.stage) { + case STAGE_SYNC: + ucc_info("sync"); + if (ucc_tl_cuda_get_sync(task) != UCC_OK) { + task->super.status = UCC_INPROGRESS; + return; + } + task->bcast_linear.step = 0; + ucc_info("setup"); + st = ucc_tl_cuda_bcast_linear_setup_start(task); + if (st != UCC_OK) { + task->super.status = st; + return; + } + task->bcast_linear.stage = STAGE_SETUP; + case STAGE_SETUP: + ucc_info("test"); + st = ucc_tl_cuda_bcast_linear_setup_test(task); + if (st != UCC_OK) { + task->super.status = st; + return; + } + ucc_tl_cuda_put_sync(task); + if (trank == 0 /* root */) { + task->bcast_linear.stage = STAGE_COPY; + } else { + task->bcast_linear.stage = STAGE_WAIT_ROOT; + } + default: + break; + } + if (trank == 0) { // fall-through between cases is intentional switch (task->bcast_linear.stage) { @@ -141,7 +199,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) case STAGE_CLIENT_COPY: dbuf = task->bcast_linear.sbuf; sbuf = TASK_SCRATCH(task, - 0); // need to copy from root's scratch buffer + 0); // need to copy from root's scratch buffer status = ecopy(dbuf, sbuf, task->bcast_linear.size, exec, &task->bcast_linear.exec_task); task->bcast_linear.stage = STAGE_CLIENT_COPY_WAIT; @@ -155,7 +213,8 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.exec_task = NULL; ++task->bcast_linear.step; set_rank_step(task, trank, task->bcast_linear.step, 0); - task->bcast_linear.stage = STAGE_DONE; // TODO: just for debug + task->bcast_linear.stage = + STAGE_DONE; // TODO: just for debug } } break; @@ -172,8 +231,7 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) { ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); ucc_tl_cuda_team_t *team = TASK_TEAM(task); - ucc_rank_t trank = UCC_TL_TEAM_RANK(team); - ucc_coll_args_t * args = &TASK_ARGS(task); + ucc_coll_args_t *args = &TASK_ARGS(task); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); ucc_datatype_t dt = task->bcast_linear.dt; @@ -181,12 +239,10 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) (void)args; (void)dt; - if (trank == 0) { - task->bcast_linear.stage = STAGE_COPY; - } else { - task->bcast_linear.stage = STAGE_WAIT_ROOT; - } - ucc_info("bcast start with dt: %ld and count: %ld", dt, args->src.info.count); + task->bcast_linear.stage = STAGE_SYNC; + + ucc_info("bcast start with dt: %s and count: %ld", ucc_datatype_str(dt), + args->src.info.count); task->bcast_linear.size = ucc_dt_size(dt) * args->src.info.count; @@ -199,8 +255,8 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) } ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t * tl_team, - ucc_coll_task_t ** task_p) + ucc_base_team_t *tl_team, + ucc_coll_task_t **task_p) { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); ucc_tl_cuda_task_t *task; @@ -219,13 +275,8 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, return status; } - // task->allgatherv_linear.get_count = ucc_tl_cuda_allgather_get_count; - // task->allgatherv_linear.get_offset = ucc_tl_cuda_allgather_get_offset; task->bcast_linear.dt = 
coll_args->args.src.info.datatype; - ucc_info("bcast init with dt: %ld", task->bcast_linear.dt); - - // task->allgatherv_linear.sbuf = coll_args->args.src.info.buffer; - // task->allgatherv_linear.rbuf = coll_args->args.dst.info.buffer; + ucc_info("bcast init with dt: %s", ucc_datatype_str(task->bcast_linear.dt)); task->bcast_linear.sbuf = coll_args->args.src.info.buffer; From cd13b04071147cc724d3537ad5c89222c2ae800c Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Wed, 3 Jul 2024 15:27:34 +0200 Subject: [PATCH 09/34] TL/CUDA: non zero root --- src/components/tl/cuda/bcast/bcast_linear.c | 30 ++++++++++----------- src/components/tl/cuda/tl_cuda.h | 1 + 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 41e224acff..f6712c2c46 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -106,13 +106,13 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) switch (task->bcast_linear.stage) { case STAGE_SYNC: - ucc_info("sync"); + // ucc_info("sync"); if (ucc_tl_cuda_get_sync(task) != UCC_OK) { task->super.status = UCC_INPROGRESS; return; } task->bcast_linear.step = 0; - ucc_info("setup"); + // ucc_info("setup"); st = ucc_tl_cuda_bcast_linear_setup_start(task); if (st != UCC_OK) { task->super.status = st; @@ -120,14 +120,14 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) } task->bcast_linear.stage = STAGE_SETUP; case STAGE_SETUP: - ucc_info("test"); + // ucc_info("test"); st = ucc_tl_cuda_bcast_linear_setup_test(task); if (st != UCC_OK) { task->super.status = st; return; } ucc_tl_cuda_put_sync(task); - if (trank == 0 /* root */) { + if (trank == task->bcast_linear.root) { task->bcast_linear.stage = STAGE_COPY; } else { task->bcast_linear.stage = STAGE_WAIT_ROOT; @@ -136,7 +136,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) break; } - if (trank == 0) { + if (trank == task->bcast_linear.root) { // fall-through between cases is intentional switch (task->bcast_linear.stage) { case STAGE_COPY: @@ -154,26 +154,25 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) if (status == UCC_OK) { ucc_ee_executor_task_finalize(etask); task->bcast_linear.exec_task = NULL; - ucc_info("hello from rank: %d, copy done!", trank); // signal others ++task->bcast_linear.step; - set_rank_step(task, 0, task->bcast_linear.step, 0); + set_rank_step(task, task->bcast_linear.root, task->bcast_linear.step, 0); task->bcast_linear.stage = STAGE_WAIT_ALL; } } break; case STAGE_WAIT_ALL: - for (int i = 1; i < tsize; ++i) { + for (int i = 0; i < tsize; ++i) { int other_rank_step = get_rank_step(task, i, 0); - ucc_info("rank %d, step: %d, my step: %d", i, other_rank_step, - task->bcast_linear.step); + // ucc_info("rank %d, step: %d, my step: %d", i, other_rank_step, + // task->bcast_linear.step); if (other_rank_step < task->bcast_linear.step) { - ucc_info("rank %d is not ready", i); + // ucc_info("rank %d is not ready", i); return; } } task->bcast_linear.stage = STAGE_COPY; - ucc_info("all others ready for next step"); + // ucc_info("all others ready for next step"); // TODO: remove task->bcast_linear.stage = STAGE_DONE; break; @@ -189,9 +188,9 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) switch (task->bcast_linear.stage) { case STAGE_WAIT_ROOT: /* code */ - if (get_rank_step(task, 0 /* root */, 0) > + if (get_rank_step(task, task->bcast_linear.root, 0) > 
task->bcast_linear.step) { - ucc_info("something from root is ready!"); + // ucc_info("something from root is ready!"); task->bcast_linear.stage = STAGE_CLIENT_COPY; break; } @@ -199,7 +198,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) case STAGE_CLIENT_COPY: dbuf = task->bcast_linear.sbuf; sbuf = TASK_SCRATCH(task, - 0); // need to copy from root's scratch buffer + task->bcast_linear.root); // need to copy from root's scratch buffer status = ecopy(dbuf, sbuf, task->bcast_linear.size, exec, &task->bcast_linear.exec_task); task->bcast_linear.stage = STAGE_CLIENT_COPY_WAIT; @@ -275,6 +274,7 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, return status; } + task->bcast_linear.root = coll_args->args.root; task->bcast_linear.dt = coll_args->args.src.info.datatype; ucc_info("bcast init with dt: %s", ucc_datatype_str(task->bcast_linear.dt)); diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index c8f60258aa..38608973ac 100644 --- a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -231,6 +231,7 @@ struct ucc_tl_cuda_task { int step; void * sbuf; ucc_datatype_t dt; + ucc_rank_t root; size_t size; ucc_ee_executor_task_t *exec_task; } bcast_linear; From 54b84dc92e83ad1b56431647b35c7629687d55bf Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Wed, 3 Jul 2024 15:27:56 +0200 Subject: [PATCH 10/34] TL/CUDA: revert commented --- src/components/tl/cuda/tl_cuda_context.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/components/tl/cuda/tl_cuda_context.c b/src/components/tl/cuda/tl_cuda_context.c index ffb48336f5..4d89029680 100644 --- a/src/components/tl/cuda/tl_cuda_context.c +++ b/src/components/tl/cuda/tl_cuda_context.c @@ -20,8 +20,8 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_context_t, ucc_status_t status; int num_devices; cudaError_t cuda_st; - // CUcontext cu_ctx; - // CUresult cu_st; + CUcontext cu_ctx; + CUresult cu_st; UCC_CLASS_CALL_SUPER_INIT(ucc_tl_context_t, &tl_cuda_config->super, params->context); @@ -37,12 +37,12 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_context_t, return UCC_ERR_NO_RESOURCE; } - // cu_st = cuCtxGetCurrent(&cu_ctx); - // if (cu_ctx == NULL || cu_st != CUDA_SUCCESS) { - // tl_debug(self->super.super.lib, - // "cannot create CUDA TL context without active CUDA context"); - // return UCC_ERR_NO_RESOURCE; - // } + cu_st = cuCtxGetCurrent(&cu_ctx); + if (cu_ctx == NULL || cu_st != CUDA_SUCCESS) { + tl_debug(self->super.super.lib, + "cannot create CUDA TL context without active CUDA context"); + return UCC_ERR_NO_RESOURCE; + } status = ucc_mpool_init(&self->req_mp, 0, sizeof(ucc_tl_cuda_task_t), 0, UCC_CACHE_LINE_SIZE, 8, UINT_MAX, From 7989e60e5cb84563770bf5001c7f98e09044bcf7 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Wed, 3 Jul 2024 17:55:21 +0200 Subject: [PATCH 11/34] TL/CUDA: wip multistep --- src/components/tl/cuda/bcast/bcast_linear.c | 66 ++++++++++++++++----- src/components/tl/cuda/tl_cuda.h | 1 + 2 files changed, 52 insertions(+), 15 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index f6712c2c46..743ddab245 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -62,6 +62,17 @@ ucc_status_t ucc_tl_cuda_bcast_linear_setup_test(ucc_tl_cuda_task_t *task) return ucc_tl_cuda_shm_barrier_test(UCC_TL_TEAM_RANK(team), task->bar); } +static inline size_t get_scratch_size(ucc_tl_cuda_team_t *team, + ucc_datatype_t 
dt) +{ + size_t dt_size = ucc_dt_size(dt); + ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); + + ucc_assert((dt_size > 0) && (tsize > 0)); + + return UCC_TL_CUDA_TEAM_LIB(team)->cfg.scratch_size; +} + static inline ucc_status_t ecopy(void *dst, void *src, size_t size, ucc_ee_executor_t *exec, ucc_ee_executor_task_t **etask) @@ -90,6 +101,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_tl_cuda_team_t *team = TASK_TEAM(task); ucc_rank_t trank = UCC_TL_TEAM_RANK(team); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); + ucc_datatype_t dt = task->bcast_linear.dt; ucc_status_t st; (void)team; (void)st; @@ -136,14 +148,23 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) break; } + size_t scratch_size = get_scratch_size(team, dt); + size_t chunk_size = task->bcast_linear.step < task->bcast_linear.num_steps + ? ucc_min(scratch_size, task->bcast_linear.size) + : task->bcast_linear.size - + (task->bcast_linear.step - 1) * scratch_size; + size_t offset_buff = task->bcast_linear.step * scratch_size; + + // ucc_info("chunk_size: %ld", chunk_size); + if (trank == task->bcast_linear.root) { // fall-through between cases is intentional switch (task->bcast_linear.stage) { case STAGE_COPY: // copy from src buffer to scratch dbuf = TASK_SCRATCH(task, trank); - sbuf = task->bcast_linear.sbuf; - status = ecopy(dbuf, sbuf, task->bcast_linear.size, exec, + sbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); + status = ecopy(dbuf, sbuf, chunk_size, exec, &task->bcast_linear.exec_task); task->bcast_linear.stage = STAGE_WAIT_COPY; break; @@ -156,7 +177,8 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.exec_task = NULL; // signal others ++task->bcast_linear.step; - set_rank_step(task, task->bcast_linear.root, task->bcast_linear.step, 0); + set_rank_step(task, task->bcast_linear.root, + task->bcast_linear.step, 0); task->bcast_linear.stage = STAGE_WAIT_ALL; } } @@ -173,8 +195,11 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) } task->bcast_linear.stage = STAGE_COPY; // ucc_info("all others ready for next step"); - // TODO: remove - task->bcast_linear.stage = STAGE_DONE; + if (task->bcast_linear.stage < task->bcast_linear.num_steps) { + task->bcast_linear.stage = STAGE_COPY; + } else { + task->bcast_linear.stage = STAGE_DONE; + } break; case STAGE_DONE: task->super.status = UCC_OK; @@ -196,11 +221,13 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) } break; case STAGE_CLIENT_COPY: - dbuf = task->bcast_linear.sbuf; - sbuf = TASK_SCRATCH(task, - task->bcast_linear.root); // need to copy from root's scratch buffer - status = ecopy(dbuf, sbuf, task->bcast_linear.size, exec, - &task->bcast_linear.exec_task); + dbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); + sbuf = TASK_SCRATCH( + task, + task->bcast_linear + .root); // need to copy from root's scratch buffer + status = ecopy(dbuf, sbuf, chunk_size, exec, + &task->bcast_linear.exec_task); task->bcast_linear.stage = STAGE_CLIENT_COPY_WAIT; break; case STAGE_CLIENT_COPY_WAIT: @@ -212,8 +239,14 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.exec_task = NULL; ++task->bcast_linear.step; set_rank_step(task, trank, task->bcast_linear.step, 0); - task->bcast_linear.stage = - STAGE_DONE; // TODO: just for debug + // task->bcast_linear.stage = + // STAGE_DONE; // TODO: just for debug + if (task->bcast_linear.stage < + task->bcast_linear.num_steps) { + task->bcast_linear.stage = STAGE_COPY; + } else { + 
task->bcast_linear.stage = STAGE_DONE; + } } } break; @@ -235,7 +268,6 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) ucc_datatype_t dt = task->bcast_linear.dt; (void)tsize; - (void)args; (void)dt; task->bcast_linear.stage = STAGE_SYNC; @@ -244,8 +276,12 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) args->src.info.count); task->bcast_linear.size = ucc_dt_size(dt) * args->src.info.count; + size_t scratch_size = get_scratch_size(team, dt); + task->bcast_linear.num_steps = + ucc_div_round_up(task->bcast_linear.size, scratch_size); - ucc_info("bcast buffer size: %ld", task->bcast_linear.size); + ucc_info("bcast buffer size: %ld, num_steps: %d", task->bcast_linear.size, + task->bcast_linear.num_steps); task->bcast_linear.sbuf = args->src.info.buffer; task->bcast_linear.step = 0; @@ -275,7 +311,7 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, } task->bcast_linear.root = coll_args->args.root; - task->bcast_linear.dt = coll_args->args.src.info.datatype; + task->bcast_linear.dt = coll_args->args.src.info.datatype; ucc_info("bcast init with dt: %s", ucc_datatype_str(task->bcast_linear.dt)); task->bcast_linear.sbuf = coll_args->args.src.info.buffer; diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index 38608973ac..096bab1a0e 100644 --- a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -233,6 +233,7 @@ struct ucc_tl_cuda_task { ucc_datatype_t dt; ucc_rank_t root; size_t size; + int num_steps; ucc_ee_executor_task_t *exec_task; } bcast_linear; struct { From e400efdc35126c11a86d509926f8ef99b910a6a3 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Thu, 4 Jul 2024 16:55:44 +0200 Subject: [PATCH 12/34] TL/CUDA: fix step check --- src/components/tl/cuda/bcast/bcast_linear.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 743ddab245..ad55cfc1a3 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -195,7 +195,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) } task->bcast_linear.stage = STAGE_COPY; // ucc_info("all others ready for next step"); - if (task->bcast_linear.stage < task->bcast_linear.num_steps) { + if (task->bcast_linear.step < task->bcast_linear.num_steps) { task->bcast_linear.stage = STAGE_COPY; } else { task->bcast_linear.stage = STAGE_DONE; @@ -241,9 +241,9 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) set_rank_step(task, trank, task->bcast_linear.step, 0); // task->bcast_linear.stage = // STAGE_DONE; // TODO: just for debug - if (task->bcast_linear.stage < + if (task->bcast_linear.step < task->bcast_linear.num_steps) { - task->bcast_linear.stage = STAGE_COPY; + task->bcast_linear.stage = STAGE_WAIT_ROOT; } else { task->bcast_linear.stage = STAGE_DONE; } From 2c94cbff6f3156548a95e978fcdb88a9c59cfc3d Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Thu, 4 Jul 2024 17:37:30 +0200 Subject: [PATCH 13/34] TL/CUDA: minor cleanup --- src/components/tl/cuda/bcast/bcast_linear.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index ad55cfc1a3..67dd083043 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -62,14 +62,8 @@ ucc_status_t 
ucc_tl_cuda_bcast_linear_setup_test(ucc_tl_cuda_task_t *task) return ucc_tl_cuda_shm_barrier_test(UCC_TL_TEAM_RANK(team), task->bar); } -static inline size_t get_scratch_size(ucc_tl_cuda_team_t *team, - ucc_datatype_t dt) +static inline size_t get_raw_scratch_size(ucc_tl_cuda_team_t *team) { - size_t dt_size = ucc_dt_size(dt); - ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); - - ucc_assert((dt_size > 0) && (tsize > 0)); - return UCC_TL_CUDA_TEAM_LIB(team)->cfg.scratch_size; } @@ -101,8 +95,8 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_tl_cuda_team_t *team = TASK_TEAM(task); ucc_rank_t trank = UCC_TL_TEAM_RANK(team); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); - ucc_datatype_t dt = task->bcast_linear.dt; - ucc_status_t st; + // ucc_datatype_t dt = task->bcast_linear.dt; + ucc_status_t st; (void)team; (void)st; ucc_ee_executor_t *exec; @@ -148,7 +142,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) break; } - size_t scratch_size = get_scratch_size(team, dt); + size_t scratch_size = get_raw_scratch_size(team); size_t chunk_size = task->bcast_linear.step < task->bcast_linear.num_steps ? ucc_min(scratch_size, task->bcast_linear.size) : task->bcast_linear.size - @@ -239,8 +233,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.exec_task = NULL; ++task->bcast_linear.step; set_rank_step(task, trank, task->bcast_linear.step, 0); - // task->bcast_linear.stage = - // STAGE_DONE; // TODO: just for debug if (task->bcast_linear.step < task->bcast_linear.num_steps) { task->bcast_linear.stage = STAGE_WAIT_ROOT; @@ -276,7 +268,7 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) args->src.info.count); task->bcast_linear.size = ucc_dt_size(dt) * args->src.info.count; - size_t scratch_size = get_scratch_size(team, dt); + size_t scratch_size = get_raw_scratch_size(team); task->bcast_linear.num_steps = ucc_div_round_up(task->bcast_linear.size, scratch_size); From 3ff154bb5d01d3d24d6d5cea653e41cf8ecfb2ff Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Fri, 5 Jul 2024 15:23:33 +0200 Subject: [PATCH 14/34] TL/CUDA: removed breaks --- src/components/tl/cuda/bcast/bcast_linear.c | 94 +++++++++++++-------- 1 file changed, 59 insertions(+), 35 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 67dd083043..57377699b4 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -96,17 +96,15 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_rank_t trank = UCC_TL_TEAM_RANK(team); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); // ucc_datatype_t dt = task->bcast_linear.dt; - ucc_status_t st; - (void)team; - (void)st; + ucc_ee_executor_t *exec; ucc_ee_executor_task_t *etask; - ucc_status_t status; + ucc_status_t st; void *sbuf, *dbuf; task->super.status = UCC_INPROGRESS; - status = ucc_coll_task_get_executor(&task->super, &exec); - if (ucc_unlikely(status != UCC_OK)) { + st = ucc_coll_task_get_executor(&task->super, &exec); + if (ucc_unlikely(st != UCC_OK)) { return; } @@ -152,21 +150,28 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) // ucc_info("chunk_size: %ld", chunk_size); if (trank == task->bcast_linear.root) { + // Root scenario // fall-through between cases is intentional switch (task->bcast_linear.stage) { case STAGE_COPY: // copy from src buffer to scratch - dbuf = TASK_SCRATCH(task, trank); - sbuf = PTR_OFFSET(task->bcast_linear.sbuf, 
offset_buff); - status = ecopy(dbuf, sbuf, chunk_size, exec, - &task->bcast_linear.exec_task); + dbuf = TASK_SCRATCH(task, trank); + sbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); + st = ecopy(dbuf, sbuf, chunk_size, exec, + &task->bcast_linear.exec_task); + if (st != UCC_OK) + { + ucc_error("failed to post ecopy task"); + task->super.status = st; + return; + } task->bcast_linear.stage = STAGE_WAIT_COPY; - break; + // break; case STAGE_WAIT_COPY: etask = task->bcast_linear.exec_task; if (etask) { - status = ucc_ee_executor_task_test(etask); - if (status == UCC_OK) { + st = ucc_ee_executor_task_test(etask); + if (st == UCC_OK) { ucc_ee_executor_task_finalize(etask); task->bcast_linear.exec_task = NULL; // signal others @@ -175,33 +180,42 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.step, 0); task->bcast_linear.stage = STAGE_WAIT_ALL; } + else + { + // ucc_info("not ready"); + return; + } } - break; + else + { + ucc_info("etask is nullptr"); + return; + } + // break; case STAGE_WAIT_ALL: for (int i = 0; i < tsize; ++i) { - int other_rank_step = get_rank_step(task, i, 0); - // ucc_info("rank %d, step: %d, my step: %d", i, other_rank_step, - // task->bcast_linear.step); - if (other_rank_step < task->bcast_linear.step) { - // ucc_info("rank %d is not ready", i); + if (get_rank_step(task, i, 0) < task->bcast_linear.step) { + // rank is not ready, lets wait return; } } task->bcast_linear.stage = STAGE_COPY; // ucc_info("all others ready for next step"); if (task->bcast_linear.step < task->bcast_linear.num_steps) { + // go to next iteration task->bcast_linear.stage = STAGE_COPY; + return; } else { + // finish task->bcast_linear.stage = STAGE_DONE; } - break; + // break; case STAGE_DONE: task->super.status = UCC_OK; break; default: break; } - // Root scenario } else { // others switch (task->bcast_linear.stage) { @@ -213,22 +227,26 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.stage = STAGE_CLIENT_COPY; break; } - break; + else + { + return; + } + // break; case STAGE_CLIENT_COPY: dbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); sbuf = TASK_SCRATCH( task, task->bcast_linear .root); // need to copy from root's scratch buffer - status = ecopy(dbuf, sbuf, chunk_size, exec, + st = ecopy(dbuf, sbuf, chunk_size, exec, &task->bcast_linear.exec_task); task->bcast_linear.stage = STAGE_CLIENT_COPY_WAIT; - break; + // break; case STAGE_CLIENT_COPY_WAIT: etask = task->bcast_linear.exec_task; if (etask) { - status = ucc_ee_executor_task_test(etask); - if (status == UCC_OK) { + st = ucc_ee_executor_task_test(etask); + if (st == UCC_OK) { ucc_ee_executor_task_finalize(etask); task->bcast_linear.exec_task = NULL; ++task->bcast_linear.step; @@ -236,12 +254,21 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) if (task->bcast_linear.step < task->bcast_linear.num_steps) { task->bcast_linear.stage = STAGE_WAIT_ROOT; + return; } else { task->bcast_linear.stage = STAGE_DONE; } } + else + { + return; + } } - break; + else + { + return; + } + // break; case STAGE_DONE: task->super.status = UCC_OK; break; @@ -253,14 +280,11 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) { - ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); - ucc_tl_cuda_team_t *team = TASK_TEAM(task); - ucc_coll_args_t *args = &TASK_ARGS(task); - ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); - ucc_datatype_t dt = 
task->bcast_linear.dt; - - (void)tsize; - (void)dt; + ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); + ucc_tl_cuda_team_t *team = TASK_TEAM(task); + ucc_coll_args_t *args = &TASK_ARGS(task); + // ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); + ucc_datatype_t dt = task->bcast_linear.dt; task->bcast_linear.stage = STAGE_SYNC; From 162680879de7aefc30f7c69e02d764a6e90802fe Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Sat, 6 Jul 2024 17:16:04 +0200 Subject: [PATCH 15/34] TL/CUDA: fix linter --- src/components/tl/cuda/bcast/bcast_linear.c | 49 +++++++++------------ 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 57377699b4..ed419f7b00 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -6,7 +6,8 @@ #include "bcast/bcast.h" -enum { +enum +{ STAGE_DONE, STAGE_SYNC, STAGE_SETUP, @@ -68,7 +69,7 @@ static inline size_t get_raw_scratch_size(ucc_tl_cuda_team_t *team) } static inline ucc_status_t ecopy(void *dst, void *src, size_t size, - ucc_ee_executor_t *exec, + ucc_ee_executor_t * exec, ucc_ee_executor_task_t **etask) { ucc_ee_executor_task_args_t exec_args = {0}; @@ -97,10 +98,10 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); // ucc_datatype_t dt = task->bcast_linear.dt; - ucc_ee_executor_t *exec; + ucc_ee_executor_t * exec; ucc_ee_executor_task_t *etask; ucc_status_t st; - void *sbuf, *dbuf; + void * sbuf, *dbuf; task->super.status = UCC_INPROGRESS; st = ucc_coll_task_get_executor(&task->super, &exec); @@ -158,9 +159,8 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) dbuf = TASK_SCRATCH(task, trank); sbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); st = ecopy(dbuf, sbuf, chunk_size, exec, - &task->bcast_linear.exec_task); - if (st != UCC_OK) - { + &task->bcast_linear.exec_task); + if (st != UCC_OK) { ucc_error("failed to post ecopy task"); task->super.status = st; return; @@ -179,15 +179,11 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) set_rank_step(task, task->bcast_linear.root, task->bcast_linear.step, 0); task->bcast_linear.stage = STAGE_WAIT_ALL; - } - else - { + } else { // ucc_info("not ready"); return; } - } - else - { + } else { ucc_info("etask is nullptr"); return; } @@ -226,9 +222,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) // ucc_info("something from root is ready!"); task->bcast_linear.stage = STAGE_CLIENT_COPY; break; - } - else - { + } else { return; } // break; @@ -238,8 +232,13 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task, task->bcast_linear .root); // need to copy from root's scratch buffer - st = ecopy(dbuf, sbuf, chunk_size, exec, - &task->bcast_linear.exec_task); + st = ecopy(dbuf, sbuf, chunk_size, exec, + &task->bcast_linear.exec_task); + if (st != UCC_OK) { + ucc_error("failed to post ecopy task at client"); + task->super.status = st; + return; + } task->bcast_linear.stage = STAGE_CLIENT_COPY_WAIT; // break; case STAGE_CLIENT_COPY_WAIT: @@ -258,14 +257,10 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) } else { task->bcast_linear.stage = STAGE_DONE; } - } - else - { + } else { return; } - } - else - { + } else { return; } // break; @@ -282,7 +277,7 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) { ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, 
ucc_tl_cuda_task_t); ucc_tl_cuda_team_t *team = TASK_TEAM(task); - ucc_coll_args_t *args = &TASK_ARGS(task); + ucc_coll_args_t * args = &TASK_ARGS(task); // ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); ucc_datatype_t dt = task->bcast_linear.dt; @@ -306,8 +301,8 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) } ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t *tl_team, - ucc_coll_task_t **task_p) + ucc_base_team_t * tl_team, + ucc_coll_task_t ** task_p) { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); ucc_tl_cuda_task_t *task; From bb555de7049d3742eb0ba80c789210194e5ba3ef Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Thu, 11 Jul 2024 20:55:47 +0200 Subject: [PATCH 16/34] TL/CUDA: double buffering --- src/components/tl/cuda/bcast/bcast_linear.c | 68 ++++++++------------- 1 file changed, 24 insertions(+), 44 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index ed419f7b00..6099d3e306 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -96,7 +96,13 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_tl_cuda_team_t *team = TASK_TEAM(task); ucc_rank_t trank = UCC_TL_TEAM_RANK(team); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); - // ucc_datatype_t dt = task->bcast_linear.dt; + size_t half_scratch_size = get_raw_scratch_size(team) / 2; + size_t chunk_size = + task->bcast_linear.step < task->bcast_linear.num_steps + ? ucc_min(half_scratch_size, task->bcast_linear.size) + : task->bcast_linear.size - + (task->bcast_linear.step - 1) * half_scratch_size; + size_t offset_buff = task->bcast_linear.step * half_scratch_size; ucc_ee_executor_t * exec; ucc_ee_executor_task_t *etask; @@ -111,21 +117,18 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) switch (task->bcast_linear.stage) { case STAGE_SYNC: - // ucc_info("sync"); if (ucc_tl_cuda_get_sync(task) != UCC_OK) { task->super.status = UCC_INPROGRESS; return; } task->bcast_linear.step = 0; - // ucc_info("setup"); - st = ucc_tl_cuda_bcast_linear_setup_start(task); + st = ucc_tl_cuda_bcast_linear_setup_start(task); if (st != UCC_OK) { task->super.status = st; return; } task->bcast_linear.stage = STAGE_SETUP; case STAGE_SETUP: - // ucc_info("test"); st = ucc_tl_cuda_bcast_linear_setup_test(task); if (st != UCC_OK) { task->super.status = st; @@ -141,22 +144,14 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) break; } - size_t scratch_size = get_raw_scratch_size(team); - size_t chunk_size = task->bcast_linear.step < task->bcast_linear.num_steps - ? 
ucc_min(scratch_size, task->bcast_linear.size) - : task->bcast_linear.size - - (task->bcast_linear.step - 1) * scratch_size; - size_t offset_buff = task->bcast_linear.step * scratch_size; - - // ucc_info("chunk_size: %ld", chunk_size); - if (trank == task->bcast_linear.root) { // Root scenario // fall-through between cases is intentional switch (task->bcast_linear.stage) { case STAGE_COPY: // copy from src buffer to scratch - dbuf = TASK_SCRATCH(task, trank); + dbuf = PTR_OFFSET(TASK_SCRATCH(task, trank), + task->bcast_linear.step % 2 * half_scratch_size); sbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); st = ecopy(dbuf, sbuf, chunk_size, exec, &task->bcast_linear.exec_task); @@ -166,7 +161,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) return; } task->bcast_linear.stage = STAGE_WAIT_COPY; - // break; case STAGE_WAIT_COPY: etask = task->bcast_linear.exec_task; if (etask) { @@ -180,23 +174,22 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.step, 0); task->bcast_linear.stage = STAGE_WAIT_ALL; } else { - // ucc_info("not ready"); + // not ready return; } } else { ucc_info("etask is nullptr"); return; } - // break; case STAGE_WAIT_ALL: for (int i = 0; i < tsize; ++i) { - if (get_rank_step(task, i, 0) < task->bcast_linear.step) { + // need to wait until all ranks complete step - 1, because of double buffering + if (get_rank_step(task, i, 0) < task->bcast_linear.step - 1) { // rank is not ready, lets wait return; } } task->bcast_linear.stage = STAGE_COPY; - // ucc_info("all others ready for next step"); if (task->bcast_linear.step < task->bcast_linear.num_steps) { // go to next iteration task->bcast_linear.stage = STAGE_COPY; @@ -205,7 +198,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) // finish task->bcast_linear.stage = STAGE_DONE; } - // break; case STAGE_DONE: task->super.status = UCC_OK; break; @@ -213,7 +205,8 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) break; } } else { - // others + // clients + // fall-through between cases is intentional switch (task->bcast_linear.stage) { case STAGE_WAIT_ROOT: /* code */ @@ -225,14 +218,12 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) } else { return; } - // break; case STAGE_CLIENT_COPY: dbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); - sbuf = TASK_SCRATCH( - task, - task->bcast_linear - .root); // need to copy from root's scratch buffer - st = ecopy(dbuf, sbuf, chunk_size, exec, + // need to copy from root's scratch buffer + sbuf = PTR_OFFSET(TASK_SCRATCH(task, task->bcast_linear.root), + task->bcast_linear.step % 2 * chunk_size); + st = ecopy(dbuf, sbuf, chunk_size, exec, &task->bcast_linear.exec_task); if (st != UCC_OK) { ucc_error("failed to post ecopy task at client"); @@ -240,7 +231,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) return; } task->bcast_linear.stage = STAGE_CLIENT_COPY_WAIT; - // break; case STAGE_CLIENT_COPY_WAIT: etask = task->bcast_linear.exec_task; if (etask) { @@ -263,7 +253,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) } else { return; } - // break; case STAGE_DONE: task->super.status = UCC_OK; break; @@ -278,20 +267,17 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); ucc_tl_cuda_team_t *team = TASK_TEAM(task); ucc_coll_args_t * args = &TASK_ARGS(task); - // ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); - ucc_datatype_t dt 
= task->bcast_linear.dt; + ucc_datatype_t dt = task->bcast_linear.dt; + size_t half_scratch_size = get_raw_scratch_size(team) / 2; task->bcast_linear.stage = STAGE_SYNC; - ucc_info("bcast start with dt: %s and count: %ld", ucc_datatype_str(dt), - args->src.info.count); - task->bcast_linear.size = ucc_dt_size(dt) * args->src.info.count; - size_t scratch_size = get_raw_scratch_size(team); task->bcast_linear.num_steps = - ucc_div_round_up(task->bcast_linear.size, scratch_size); + ucc_div_round_up(task->bcast_linear.size, half_scratch_size); - ucc_info("bcast buffer size: %ld, num_steps: %d", task->bcast_linear.size, + ucc_info("bcast dt: %s, buffer size: %ld, num_steps: %d", + ucc_datatype_str(dt), task->bcast_linear.size, task->bcast_linear.num_steps); task->bcast_linear.sbuf = args->src.info.buffer; @@ -308,8 +294,6 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, ucc_tl_cuda_task_t *task; ucc_status_t status; - ucc_info("bcast init"); - if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) || UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) { @@ -323,8 +307,6 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, task->bcast_linear.root = coll_args->args.root; task->bcast_linear.dt = coll_args->args.src.info.datatype; - ucc_info("bcast init with dt: %s", ucc_datatype_str(task->bcast_linear.dt)); - task->bcast_linear.sbuf = coll_args->args.src.info.buffer; task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR; @@ -333,8 +315,6 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, task->super.finalize = ucc_tl_cuda_bcast_linear_finalize; task->bar = TASK_BAR(task); - ucc_info("bcast init success"); - *task_p = &task->super; return UCC_OK; } From 190f1e386c5a3a7b57249ababb32f1ce727816a2 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Fri, 2 Aug 2024 17:28:48 +0200 Subject: [PATCH 17/34] TL/CUDA: moved get/set rank step --- .../tl/cuda/allgatherv/allgatherv_linear.c | 16 ---------------- src/components/tl/cuda/bcast/bcast_linear.c | 17 ----------------- .../reduce_scatterv/reduce_scatterv_linear.c | 16 ---------------- src/components/tl/cuda/tl_cuda_coll.h | 17 +++++++++++++++++ src/components/tl/cuda/tl_cuda_ring.h | 16 ---------------- 5 files changed, 17 insertions(+), 65 deletions(-) diff --git a/src/components/tl/cuda/allgatherv/allgatherv_linear.c b/src/components/tl/cuda/allgatherv/allgatherv_linear.c index 0fca5c6af6..41499c136d 100644 --- a/src/components/tl/cuda/allgatherv/allgatherv_linear.c +++ b/src/components/tl/cuda/allgatherv/allgatherv_linear.c @@ -55,22 +55,6 @@ enum * other ranks to finish */ }; -static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, - int step_id) -{ - ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); - - return sync->seq_num[step_id]; -} - -static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, - int step, int step_id) -{ - ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); - - sync->seq_num[step_id] = step; -} - ucc_status_t ucc_tl_cuda_allgatherv_linear_finalize(ucc_coll_task_t *coll_task) { ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 6099d3e306..6ee642eac7 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -21,23 +21,6 @@ enum STAGE_CLIENT_COPY_WAIT, }; -// TODO: move out to common with allgather -static inline 
int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, - int step_id) -{ - ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); - - return sync->seq_num[step_id]; -} - -static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, - int step, int step_id) -{ - ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); - - sync->seq_num[step_id] = step; -} - ucc_status_t ucc_tl_cuda_bcast_linear_setup_start(ucc_tl_cuda_task_t *task) { ucc_tl_cuda_team_t *team = TASK_TEAM(task); diff --git a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c index 6a1ec5b22c..3e0595fa44 100644 --- a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c +++ b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c @@ -59,22 +59,6 @@ enum * other ranks to finish */ }; -static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, - int step_id) -{ - ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); - - return sync->seq_num[step_id]; -} - -static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, - int step, int step_id) -{ - ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); - - sync->seq_num[step_id] = step; -} - ucc_status_t ucc_tl_cuda_reduce_scatterv_linear_finalize(ucc_coll_task_t *coll_task) { diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h index f450ff950c..55b86e2cee 100644 --- a/src/components/tl/cuda/tl_cuda_coll.h +++ b/src/components/tl/cuda/tl_cuda_coll.h @@ -156,4 +156,21 @@ ucc_status_t ucc_tl_cuda_alg_id_to_init(int alg_id, const char *alg_id_str, ucc_memory_type_t mem_type, ucc_base_coll_init_fn_t *init); +// common utils function for collectives: +static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, + int step_id) +{ + ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); + + return sync->seq_num[step_id]; +} + +static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, + int step, int step_id) +{ + ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); + + sync->seq_num[step_id] = step; +} + #endif diff --git a/src/components/tl/cuda/tl_cuda_ring.h b/src/components/tl/cuda/tl_cuda_ring.h index cc2d3c95db..621e074184 100644 --- a/src/components/tl/cuda/tl_cuda_ring.h +++ b/src/components/tl/cuda/tl_cuda_ring.h @@ -83,20 +83,4 @@ static inline ucc_rank_t get_recv_block(ucc_tl_cuda_team_t *team, return ring->ring[(ring->iring[trank] + tsize - step - 1) % tsize]; } -static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, - int ring_id) -{ - ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); - - return sync->seq_num[ring_id]; -} - -static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, - int step, int ring_id) -{ - ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); - - sync->seq_num[ring_id] = step; -} - #endif From d76629c1443c97fc32650f31f78287f717f75267 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Fri, 2 Aug 2024 18:02:10 +0200 Subject: [PATCH 18/34] TL/CUDA: changed logs to debug lvl --- src/components/tl/cuda/bcast/bcast_linear.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 6ee642eac7..762edba4fb 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -161,7 +161,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) return; } } else { - 
ucc_info("etask is nullptr"); + ucc_debug("etask is nullptr"); return; } case STAGE_WAIT_ALL: @@ -195,7 +195,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) /* code */ if (get_rank_step(task, task->bcast_linear.root, 0) > task->bcast_linear.step) { - // ucc_info("something from root is ready!"); task->bcast_linear.stage = STAGE_CLIENT_COPY; break; } else { @@ -259,9 +258,9 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) task->bcast_linear.num_steps = ucc_div_round_up(task->bcast_linear.size, half_scratch_size); - ucc_info("bcast dt: %s, buffer size: %ld, num_steps: %d", - ucc_datatype_str(dt), task->bcast_linear.size, - task->bcast_linear.num_steps); + ucc_debug("bcast linear dt: %s, buffer size: %ld, num_steps: %d", + ucc_datatype_str(dt), task->bcast_linear.size, + task->bcast_linear.num_steps); task->bcast_linear.sbuf = args->src.info.buffer; task->bcast_linear.step = 0; From e444cda0efc83a0b657cc32c6fa233de42522870 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Fri, 2 Aug 2024 18:11:16 +0200 Subject: [PATCH 19/34] TL/CUDA: minor cleanups --- src/components/tl/cuda/bcast/bcast_linear.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 762edba4fb..177119d503 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -16,9 +16,9 @@ enum STAGE_WAIT_COPY, // wait for copy finishes STAGE_WAIT_ALL, // wait for all others rank be on same step // non-root - STAGE_WAIT_ROOT, - STAGE_CLIENT_COPY, - STAGE_CLIENT_COPY_WAIT, + STAGE_WAIT_ROOT, // clients wait while root writes to own scratch buffer + STAGE_CLIENT_COPY, // clients submit copy task + STAGE_CLIENT_COPY_WAIT, // clients wait completion of copy from root's scratch }; ucc_status_t ucc_tl_cuda_bcast_linear_setup_start(ucc_tl_cuda_task_t *task) @@ -91,6 +91,8 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_ee_executor_task_t *etask; ucc_status_t st; void * sbuf, *dbuf; + int i; + task->super.status = UCC_INPROGRESS; st = ucc_coll_task_get_executor(&task->super, &exec); @@ -165,7 +167,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) return; } case STAGE_WAIT_ALL: - for (int i = 0; i < tsize; ++i) { + for (i = 0; i < tsize; ++i) { // need to wait until all ranks complete step - 1, because of double buffering if (get_rank_step(task, i, 0) < task->bcast_linear.step - 1) { // rank is not ready, lets wait @@ -192,7 +194,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) // fall-through between cases is intentional switch (task->bcast_linear.stage) { case STAGE_WAIT_ROOT: - /* code */ if (get_rank_step(task, task->bcast_linear.root, 0) > task->bcast_linear.step) { task->bcast_linear.stage = STAGE_CLIENT_COPY; @@ -201,8 +202,8 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) return; } case STAGE_CLIENT_COPY: - dbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); // need to copy from root's scratch buffer + dbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); sbuf = PTR_OFFSET(TASK_SCRATCH(task, task->bcast_linear.root), task->bcast_linear.step % 2 * chunk_size); st = ecopy(dbuf, sbuf, chunk_size, exec, From 1ca88667b44623718d6c895c17453ac650eae400 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Wed, 21 Aug 2024 15:21:09 +0200 Subject: [PATCH 20/34] TL/CUDA: addressed comments --- 
src/components/tl/cuda/allgather/allgather.c | 2 +- .../tl/cuda/allgather/allgather_linear.c | 2 +- .../tl/cuda/allgatherv/allgatherv.c | 2 +- .../tl/cuda/allgatherv/allgatherv_linear.c | 2 +- src/components/tl/cuda/bcast/bcast.c | 2 +- src/components/tl/cuda/bcast/bcast_linear.c | 23 ++++++++----------- .../tl/cuda/reduce_scatter/reduce_scatter.c | 2 +- .../reduce_scatter/reduce_scatter_linear.c | 2 +- .../tl/cuda/reduce_scatterv/reduce_scatterv.c | 2 +- .../reduce_scatterv/reduce_scatterv_linear.c | 2 +- src/components/tl/cuda/tl_cuda.h | 1 - src/components/tl/cuda/tl_cuda_team_topo.h | 2 +- 12 files changed, 20 insertions(+), 24 deletions(-) diff --git a/src/components/tl/cuda/allgather/allgather.c b/src/components/tl/cuda/allgather/allgather.c index 01996da4da..1e64c0a582 100644 --- a/src/components/tl/cuda/allgather/allgather.c +++ b/src/components/tl/cuda/allgather/allgather.c @@ -44,7 +44,7 @@ ucc_status_t ucc_tl_cuda_allgather_init(ucc_base_coll_args_t *coll_args, { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); - if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) { + if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) { return ucc_tl_cuda_allgather_linear_init(coll_args, tl_team, task_p); } else { return ucc_tl_cuda_allgather_ring_init(coll_args, tl_team, task_p); diff --git a/src/components/tl/cuda/allgather/allgather_linear.c b/src/components/tl/cuda/allgather/allgather_linear.c index ed228d1683..fefc774628 100644 --- a/src/components/tl/cuda/allgather/allgather_linear.c +++ b/src/components/tl/cuda/allgather/allgather_linear.c @@ -15,7 +15,7 @@ ucc_status_t ucc_tl_cuda_allgather_linear_init(ucc_base_coll_args_t *coll_args, ucc_tl_cuda_task_t *task; ucc_status_t status; - if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) || + if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) || UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) { return UCC_ERR_NOT_SUPPORTED; } diff --git a/src/components/tl/cuda/allgatherv/allgatherv.c b/src/components/tl/cuda/allgatherv/allgatherv.c index 5a8f78c481..76da65fa65 100644 --- a/src/components/tl/cuda/allgatherv/allgatherv.c +++ b/src/components/tl/cuda/allgatherv/allgatherv.c @@ -47,7 +47,7 @@ ucc_status_t ucc_tl_cuda_allgatherv_init(ucc_base_coll_args_t *coll_args, { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); - if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) { + if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) { return ucc_tl_cuda_allgatherv_linear_init(coll_args, tl_team, task_p); } else { return ucc_tl_cuda_allgatherv_ring_init(coll_args, tl_team, task_p); diff --git a/src/components/tl/cuda/allgatherv/allgatherv_linear.c b/src/components/tl/cuda/allgatherv/allgatherv_linear.c index 41499c136d..1f02ad37bd 100644 --- a/src/components/tl/cuda/allgatherv/allgatherv_linear.c +++ b/src/components/tl/cuda/allgatherv/allgatherv_linear.c @@ -416,7 +416,7 @@ ucc_status_t ucc_tl_cuda_allgatherv_linear_init(ucc_base_coll_args_t *coll_args, ucc_tl_cuda_task_t *task; ucc_status_t status; - if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) || + if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) || UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) { return UCC_ERR_NOT_SUPPORTED; } diff --git a/src/components/tl/cuda/bcast/bcast.c b/src/components/tl/cuda/bcast/bcast.c index 46623684fe..d687d924a0 100644 --- a/src/components/tl/cuda/bcast/bcast.c +++ 
b/src/components/tl/cuda/bcast/bcast.c @@ -20,7 +20,7 @@ ucc_status_t ucc_tl_cuda_bcast_init(ucc_base_coll_args_t *coll_args, { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); - if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) { + if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) { return ucc_tl_cuda_bcast_linear_init(coll_args, tl_team, task_p); } else { return UCC_ERR_NOT_SUPPORTED; diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 177119d503..f0ba528635 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -6,8 +6,7 @@ #include "bcast/bcast.h" -enum -{ +enum { STAGE_DONE, STAGE_SYNC, STAGE_SETUP, @@ -52,7 +51,7 @@ static inline size_t get_raw_scratch_size(ucc_tl_cuda_team_t *team) } static inline ucc_status_t ecopy(void *dst, void *src, size_t size, - ucc_ee_executor_t * exec, + ucc_ee_executor_t *exec, ucc_ee_executor_task_t **etask) { ucc_ee_executor_task_args_t exec_args = {0}; @@ -86,11 +85,10 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) : task->bcast_linear.size - (task->bcast_linear.step - 1) * half_scratch_size; size_t offset_buff = task->bcast_linear.step * half_scratch_size; - - ucc_ee_executor_t * exec; + ucc_ee_executor_t *exec; ucc_ee_executor_task_t *etask; ucc_status_t st; - void * sbuf, *dbuf; + void *sbuf, *dbuf; int i; task->super.status = UCC_INPROGRESS; @@ -103,7 +101,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) switch (task->bcast_linear.stage) { case STAGE_SYNC: if (ucc_tl_cuda_get_sync(task) != UCC_OK) { - task->super.status = UCC_INPROGRESS; return; } task->bcast_linear.step = 0; @@ -139,7 +136,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.step % 2 * half_scratch_size); sbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); st = ecopy(dbuf, sbuf, chunk_size, exec, - &task->bcast_linear.exec_task); + &task->bcast_linear.exec_task); if (st != UCC_OK) { ucc_error("failed to post ecopy task"); task->super.status = st; @@ -207,7 +204,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) sbuf = PTR_OFFSET(TASK_SCRATCH(task, task->bcast_linear.root), task->bcast_linear.step % 2 * chunk_size); st = ecopy(dbuf, sbuf, chunk_size, exec, - &task->bcast_linear.exec_task); + &task->bcast_linear.exec_task); if (st != UCC_OK) { ucc_error("failed to post ecopy task at client"); task->super.status = st; @@ -249,7 +246,7 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) { ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); ucc_tl_cuda_team_t *team = TASK_TEAM(task); - ucc_coll_args_t * args = &TASK_ARGS(task); + ucc_coll_args_t *args = &TASK_ARGS(task); ucc_datatype_t dt = task->bcast_linear.dt; size_t half_scratch_size = get_raw_scratch_size(team) / 2; @@ -270,14 +267,14 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) } ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t * tl_team, - ucc_coll_task_t ** task_p) + ucc_base_team_t *tl_team, + ucc_coll_task_t **task_p) { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); ucc_tl_cuda_task_t *task; ucc_status_t status; - if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) || + if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) || UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) { 
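/*
 * Illustrative sketch (not code from this series): every linear algorithm
 * touched in this commit repeats the same eligibility gate.  Assuming the
 * TL/CUDA headers, it reduces to the predicate below -- the team topology
 * must be fully connected (a direct NVLink path between every GPU pair) and
 * the number of remote peers must fit into one executor multi-op launch.
 * The helper name is hypothetical.
 */
static inline int ucc_tl_cuda_linear_alg_supported(ucc_tl_cuda_team_t *team)
{
    return ucc_tl_cuda_team_topo_is_fully_connected(team->topo) &&
           (UCC_TL_TEAM_SIZE(team) - 1 <= UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS);
}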
return UCC_ERR_NOT_SUPPORTED; diff --git a/src/components/tl/cuda/reduce_scatter/reduce_scatter.c b/src/components/tl/cuda/reduce_scatter/reduce_scatter.c index 468fd68338..1e1d75c3ed 100644 --- a/src/components/tl/cuda/reduce_scatter/reduce_scatter.c +++ b/src/components/tl/cuda/reduce_scatter/reduce_scatter.c @@ -48,7 +48,7 @@ ucc_status_t ucc_tl_cuda_reduce_scatter_init(ucc_base_coll_args_t *coll_args, { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); - if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) { + if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) { return ucc_tl_cuda_reduce_scatter_linear_init(coll_args, tl_team, task_p); } else { diff --git a/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c b/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c index 46efbdb051..9a025267ca 100644 --- a/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c +++ b/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c @@ -19,7 +19,7 @@ ucc_status_t ucc_tl_cuda_reduce_scatter_linear_init(ucc_base_coll_args_t *coll_a return UCC_ERR_NOT_SUPPORTED; } - if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) || + if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) || UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) { return UCC_ERR_NOT_SUPPORTED; } diff --git a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c index d85e2c8dd3..d954e38e9e 100644 --- a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c +++ b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c @@ -51,7 +51,7 @@ ucc_status_t ucc_tl_cuda_reduce_scatterv_init(ucc_base_coll_args_t *coll_args, { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); - if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) { + if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) { return ucc_tl_cuda_reduce_scatterv_linear_init(coll_args, tl_team, task_p); } else { diff --git a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c index 3e0595fa44..d719632853 100644 --- a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c +++ b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c @@ -432,7 +432,7 @@ ucc_tl_cuda_reduce_scatterv_linear_init(ucc_base_coll_args_t *coll_args, return UCC_ERR_NOT_SUPPORTED; } - if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) || + if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) || UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) { return UCC_ERR_NOT_SUPPORTED; } diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index 096bab1a0e..b31a2db070 100644 --- a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -225,7 +225,6 @@ struct ucc_tl_cuda_task { size_t (*get_offset)(const ucc_tl_cuda_task_t *task, ucc_rank_t block); } allgatherv_linear; - struct { int stage; int step; diff --git a/src/components/tl/cuda/tl_cuda_team_topo.h b/src/components/tl/cuda/tl_cuda_team_topo.h index 96b6d63a5b..1d7d19ad1c 100644 --- a/src/components/tl/cuda/tl_cuda_team_topo.h +++ b/src/components/tl/cuda/tl_cuda_team_topo.h @@ -51,7 +51,7 @@ ucc_tl_cuda_team_topo_is_direct(const ucc_tl_team_t *team, } static inline int -ucc_tl_cuda_team_topo_is_fully_conntected(const ucc_tl_cuda_team_topo_t *topo) 
+ucc_tl_cuda_team_topo_is_fully_connected(const ucc_tl_cuda_team_topo_t *topo) { return topo->is_fully_connected; } From 2af5ea55096e0739ed061bbb5b6f3cd12c2db00b Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Fri, 23 Aug 2024 15:09:20 +0200 Subject: [PATCH 21/34] TL/CUDA: removed done stage --- src/components/tl/cuda/bcast/bcast_linear.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index f0ba528635..8db6e43115 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -7,7 +7,6 @@ #include "bcast/bcast.h" enum { - STAGE_DONE, STAGE_SYNC, STAGE_SETUP, // root @@ -178,11 +177,9 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) return; } else { // finish - task->bcast_linear.stage = STAGE_DONE; + task->super.status = UCC_OK; + break; } - case STAGE_DONE: - task->super.status = UCC_OK; - break; default: break; } @@ -225,7 +222,9 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.stage = STAGE_WAIT_ROOT; return; } else { - task->bcast_linear.stage = STAGE_DONE; + // Done + task->super.status = UCC_OK; + break; } } else { return; @@ -233,9 +232,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) } else { return; } - case STAGE_DONE: - task->super.status = UCC_OK; - break; default: break; } From 38489b3dfed669adf9438fd0d8faf82ff01dfa3f Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Fri, 23 Aug 2024 15:44:24 +0200 Subject: [PATCH 22/34] TL/CUDA: added unit test --- test/gtest/coll/test_bcast.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/gtest/coll/test_bcast.cc b/test/gtest/coll/test_bcast.cc index 6d80816a31..69f697a508 100644 --- a/test/gtest/coll/test_bcast.cc +++ b/test/gtest/coll/test_bcast.cc @@ -276,6 +276,8 @@ ucc_job_env_t two_step_env = {{"UCC_CL_HIER_TUNE", "bcast:@2step:0-inf:inf"}, {"UCC_CLS", "all"}}; ucc_job_env_t dbt_env = {{"UCC_TL_UCP_TUNE", "bcast:@dbt:0-inf:inf"}, {"UCC_CLS", "basic"}}; +ucc_job_env_t cuda_env = {{"UCC_TL_CUDA_TUNE", "bcast:cuda:@0"}, + {"UCC_CLS", "basic"}}; INSTANTIATE_TEST_CASE_P( , test_bcast_alg, ::testing::Combine( @@ -285,6 +287,10 @@ INSTANTIATE_TEST_CASE_P( #else ::testing::Values(UCC_MEMORY_TYPE_HOST), #endif +#ifdef HAVE_CUDA + ::testing::Values(two_step_env, dbt_env, cuda_env), //env +#else ::testing::Values(two_step_env, dbt_env), //env +#endif ::testing::Values(8, 65536), // count - ::testing::Values(15,16))); // n_procs + ::testing::Values(15, 16))); // n_procs From b9272bcb44dc96e60d8c722db9a432403f48a36c Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Tue, 3 Sep 2024 12:43:10 +0200 Subject: [PATCH 23/34] TL/CUDA: addressed comments --- src/components/tl/cuda/bcast/bcast_linear.c | 1 + src/components/tl/cuda/tl_cuda.h | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 8db6e43115..bbb5b667df 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -94,6 +94,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) st = ucc_coll_task_get_executor(&task->super, &exec); if (ucc_unlikely(st != UCC_OK)) { + task->status = st; return; } diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index b31a2db070..d86dc7cc66 100644 --- 
a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -226,13 +226,13 @@ struct ucc_tl_cuda_task { ucc_rank_t block); } allgatherv_linear; struct { - int stage; - int step; - void * sbuf; + int stage; + int step; + void *sbuf; ucc_datatype_t dt; ucc_rank_t root; - size_t size; - int num_steps; + size_t size; + int num_steps; ucc_ee_executor_task_t *exec_task; } bcast_linear; struct { From 2c7ae3209293b6dfe382557c7906ec7e0271ce9e Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Tue, 3 Sep 2024 13:00:41 +0200 Subject: [PATCH 24/34] TL/CUDA: fix formatting --- src/components/tl/cuda/bcast/bcast_linear.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index bbb5b667df..3ea31d4ba4 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -73,17 +73,17 @@ ucc_status_t ucc_tl_cuda_bcast_linear_finalize(ucc_coll_task_t *coll_task) void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) { - ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); - ucc_tl_cuda_team_t *team = TASK_TEAM(task); - ucc_rank_t trank = UCC_TL_TEAM_RANK(team); - ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); + ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); + ucc_tl_cuda_team_t *team = TASK_TEAM(task); + ucc_rank_t trank = UCC_TL_TEAM_RANK(team); + ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); size_t half_scratch_size = get_raw_scratch_size(team) / 2; - size_t chunk_size = + size_t chunk_size = task->bcast_linear.step < task->bcast_linear.num_steps ? ucc_min(half_scratch_size, task->bcast_linear.size) : task->bcast_linear.size - (task->bcast_linear.step - 1) * half_scratch_size; - size_t offset_buff = task->bcast_linear.step * half_scratch_size; + size_t offset_buff = task->bcast_linear.step * half_scratch_size; ucc_ee_executor_t *exec; ucc_ee_executor_task_t *etask; ucc_status_t st; From 5b8c8c78e2c6afc6c50753a01187266407e1ac7d Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Tue, 3 Sep 2024 13:37:09 +0200 Subject: [PATCH 25/34] TL/CUDA: fixed compilation --- src/components/tl/cuda/bcast/bcast_linear.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 3ea31d4ba4..992dce9470 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -94,7 +94,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) st = ucc_coll_task_get_executor(&task->super, &exec); if (ucc_unlikely(st != UCC_OK)) { - task->status = st; + task->super.status = st; return; } From 87c44241a153a95cb4a13fb9a10061807f54aa0b Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Fri, 25 Oct 2024 16:40:46 +0200 Subject: [PATCH 26/34] TL/CUDA: fix include --- src/components/tl/cuda/bcast/bcast_linear.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 992dce9470..f21548f1b8 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -4,7 +4,7 @@ * See file LICENSE for terms. 
*/ -#include "bcast/bcast.h" +#include "bcast.h" enum { STAGE_SYNC, From 60ea28981a63e1419743ab46001e646427f59fd6 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Mon, 28 Oct 2024 11:48:01 +0100 Subject: [PATCH 27/34] TL/CUDA: removed returns --- src/components/tl/cuda/bcast/bcast_linear.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index f21548f1b8..52c34ce56b 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -227,11 +227,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->super.status = UCC_OK; break; } - } else { - return; } - } else { - return; } default: break; From a4887f7fd576810eee172a6b1771c4327c89e962 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Wed, 6 Nov 2024 15:13:41 +0100 Subject: [PATCH 28/34] TL/CUDA: active set support --- src/components/tl/cuda/bcast/bcast_linear.c | 20 +++- src/components/tl/cuda/tl_cuda.h | 18 +-- src/components/tl/cuda/tl_cuda_coll.c | 13 +++ src/components/tl/cuda/tl_cuda_coll.h | 122 +++++++++++++++++--- src/components/tl/cuda/tl_cuda_team.c | 9 +- src/components/tl/ucp/bcast/bcast_knomial.c | 2 +- 6 files changed, 153 insertions(+), 31 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 52c34ce56b..2fab96d7d7 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -27,6 +27,7 @@ ucc_status_t ucc_tl_cuda_bcast_linear_setup_start(ucc_tl_cuda_task_t *task) set_rank_step(task, trank, 0, 0); ucc_memory_cpu_store_fence(); + // initiate barrier wait while all ranks set theirs steps to 0 status = ucc_tl_cuda_shm_barrier_start(UCC_TL_TEAM_RANK(team), task->bar); if (ucc_unlikely(status != UCC_OK)) { goto exit_err; @@ -76,7 +77,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); ucc_tl_cuda_team_t *team = TASK_TEAM(task); ucc_rank_t trank = UCC_TL_TEAM_RANK(team); - ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); + ucc_rank_t tsize = (ucc_rank_t)task->subset.map.ep_num; size_t half_scratch_size = get_raw_scratch_size(team) / 2; size_t chunk_size = task->bcast_linear.step < task->bcast_linear.num_steps @@ -89,6 +90,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_status_t st; void *sbuf, *dbuf; int i; + ucc_rank_t peer; task->super.status = UCC_INPROGRESS; @@ -100,7 +102,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) switch (task->bcast_linear.stage) { case STAGE_SYNC: - if (ucc_tl_cuda_get_sync(task) != UCC_OK) { + if (ucc_tl_cuda_get_sync_root(task, task->bcast_linear.root) != UCC_OK) { return; } task->bcast_linear.step = 0; @@ -116,7 +118,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->super.status = st; return; } - ucc_tl_cuda_put_sync(task); if (trank == task->bcast_linear.root) { task->bcast_linear.stage = STAGE_COPY; } else { @@ -165,8 +166,17 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) } case STAGE_WAIT_ALL: for (i = 0; i < tsize; ++i) { + if (UCC_COLL_ARGS_ACTIVE_SET(&TASK_ARGS(task))) + { + // eval phys rank from virt + peer = ucc_ep_map_eval(task->subset.map, i); + } + else + { + peer = i; + } // need to wait until all ranks complete step - 1, because of double buffering - if (get_rank_step(task, i, 0) < task->bcast_linear.step - 1) { + if 
(get_rank_step(task, peer, 0) < task->bcast_linear.step - 1) { // rank is not ready, lets wait return; } @@ -178,6 +188,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) return; } else { // finish + ucc_tl_cuda_put_sync_root(task, task->bcast_linear.root); task->super.status = UCC_OK; break; } @@ -286,7 +297,6 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, task->super.post = ucc_tl_cuda_bcast_linear_start; task->super.progress = ucc_tl_cuda_bcast_linear_progress; task->super.finalize = ucc_tl_cuda_bcast_linear_finalize; - task->bar = TASK_BAR(task); *task_p = &task->super; return UCC_OK; diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index d86dc7cc66..7931777318 100644 --- a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -12,6 +12,7 @@ #include "components/tl/ucc_tl_log.h" #include "components/mc/ucc_mc.h" #include "utils/ucc_mpool.h" +#include "utils/ucc_datastruct.h" #include "tl_cuda_ep_hash.h" #include "tl_cuda_topo.h" #include "tl_cuda_team_topo.h" @@ -108,6 +109,7 @@ typedef uint32_t ucc_tl_cuda_sync_state_t; typedef struct ucc_tl_cuda_shm_barrier { ucc_rank_t size; ucc_rank_t count; + uint32_t tag; int sense; ucc_status_t state[UCC_TL_CUDA_MAX_PEERS]; int local_sense[UCC_TL_CUDA_MAX_PEERS]; @@ -155,11 +157,12 @@ typedef struct ucc_tl_cuda_scratch { typedef struct ucc_tl_cuda_team { ucc_tl_team_t super; - uint32_t seq_num; + uint32_t seq_num; // counter for launched collectives (tasks) for this team + uint32_t seq_num_active_set; // active set ucc_tl_cuda_team_topo_t *topo; - ucc_tl_cuda_sync_t *sync; - ucc_tl_cuda_sync_state_t *sync_state; - ucc_tl_cuda_shm_barrier_t *bar; + ucc_tl_cuda_sync_t *sync; // pointer to shared mem + ucc_tl_cuda_sync_state_t *sync_state; // to track if sync segment of shared memory is used by what task? 
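/*
 * Illustrative sketch (not code from this series): with the control segment
 * doubled, each per-collective resource -- sync slot, barrier, scratch --
 * sits in one of two banks of max_concurrent entries.  Regular collectives
 * index the first bank, active-set (subset) collectives the second, which is
 * what the seq_num / seq_num_active_set split encodes.  The helper name
 * below is hypothetical.
 */
static inline uint32_t ucc_tl_cuda_slot_id(uint32_t seq_num,
                                           uint32_t max_concurrent,
                                           int      is_active_set)
{
    uint32_t slot = seq_num % max_concurrent;      /* position inside the bank */
    return is_active_set ? max_concurrent + slot   /* second half of arrays    */
                         : slot;                   /* first half               */
}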
+ ucc_tl_cuda_shm_barrier_t *bar; // pointer to first barrier in array of [0; 2 * max_concurrent) first max_concurrent for normal mode and second for active set ucc_tl_cuda_scratch_t scratch; cudaStream_t stream; ucc_tl_cuda_rank_id_t *ids; @@ -173,9 +176,10 @@ UCC_CLASS_DECLARE(ucc_tl_cuda_team_t, ucc_base_context_t *, typedef struct ucc_tl_cuda_task ucc_tl_cuda_task_t; struct ucc_tl_cuda_task { ucc_coll_task_t super; - uint32_t seq_num; - uint32_t coll_id; - ucc_tl_cuda_shm_barrier_t *bar; + uint32_t seq_num; // sequential number of collective (started task) in team + uint32_t coll_id; // index of collective in flight [0; max_concurrent) + ucc_tl_cuda_shm_barrier_t *bar; // pointer to barrier that reserved for with task in cuda team + ucc_subset_t subset; // information about mapping of active set if it present union { struct { int stage; diff --git a/src/components/tl/cuda/tl_cuda_coll.c b/src/components/tl/cuda/tl_cuda_coll.c index 42b33cdbcc..ab2a5ff733 100644 --- a/src/components/tl/cuda/tl_cuda_coll.c +++ b/src/components/tl/cuda/tl_cuda_coll.c @@ -93,6 +93,19 @@ ucc_status_t ucc_tl_cuda_coll_init(ucc_base_coll_args_t *coll_args, } } +ucc_status_t ucc_tl_cuda_shm_barrier_init_root(ucc_rank_t size, ucc_rank_t rank, ucc_rank_t root, + ucc_tl_cuda_shm_barrier_t *barrier) +{ + if (rank == root) { + barrier->size = size; + barrier->count = 0; + barrier->sense = 0; + } + barrier->state[rank] = UCC_OK; + barrier->local_sense[rank] = 1; + return UCC_OK; +} + ucc_status_t ucc_tl_cuda_shm_barrier_init(ucc_rank_t size, ucc_rank_t rank, ucc_tl_cuda_shm_barrier_t *barrier) { diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h index 55b86e2cee..01e41af1f7 100644 --- a/src/components/tl/cuda/tl_cuda_coll.h +++ b/src/components/tl/cuda/tl_cuda_coll.h @@ -7,6 +7,8 @@ #ifndef UCC_TL_CUDA_COLL_H_ #define UCC_TL_CUDA_COLL_H_ +#include + #include "tl_cuda.h" #include "components/mc/ucc_mc.h" @@ -50,6 +52,19 @@ static inline void ucc_tl_cuda_task_reset(ucc_tl_cuda_task_t *task) task->super.status = UCC_INPROGRESS; } +ucc_status_t ucc_tl_cuda_shm_barrier_init_root(ucc_rank_t size, ucc_rank_t rank, ucc_rank_t root, + ucc_tl_cuda_shm_barrier_t *barrier); + +ucc_status_t ucc_tl_cuda_shm_barrier_init(ucc_rank_t size, ucc_rank_t rank, + ucc_tl_cuda_shm_barrier_t *barrier); + +ucc_status_t ucc_tl_cuda_shm_barrier_start(ucc_rank_t rank, + ucc_tl_cuda_shm_barrier_t *barrier); + +ucc_status_t ucc_tl_cuda_shm_barrier_test(ucc_rank_t rank, + ucc_tl_cuda_shm_barrier_t *barrier); + + static inline ucc_tl_cuda_task_t *ucc_tl_cuda_task_get(ucc_tl_cuda_team_t *team) { ucc_tl_cuda_context_t *ctx = UCC_TL_CUDA_TEAM_CTX(team); @@ -79,11 +94,13 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args, ucc_tl_cuda_team_t *team, ucc_tl_cuda_task_t **task_h) { - ucc_rank_t trank = UCC_TL_TEAM_RANK(team); - ucc_tl_cuda_lib_t *lib = UCC_TL_CUDA_TEAM_LIB(team); - uint32_t max_concurrent = lib->cfg.max_concurrent; - ucc_tl_cuda_task_t *task; - ucc_status_t status; + ucc_rank_t trank = UCC_TL_TEAM_RANK(team); + ucc_tl_cuda_lib_t *lib = UCC_TL_CUDA_TEAM_LIB(team); + uint32_t max_concurrent = lib->cfg.max_concurrent; + ucc_tl_cuda_shm_barrier_t *curr_bar; + ucc_tl_cuda_task_t *task; + ucc_status_t status; + uint32_t i; if (!ucc_coll_args_is_predefined_dt(&coll_args->args, trank)) { return UCC_ERR_NOT_SUPPORTED; @@ -100,19 +117,74 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args, return status; } - task->seq_num = team->seq_num++; - task->coll_id = task->seq_num % 
max_concurrent; + /* active set */ + if (UCC_COLL_ARGS_ACTIVE_SET(&coll_args->args)) { + ucc_assert(coll_args->args.coll_type == UCC_COLL_TYPE_BCAST); + + task->subset.map = ucc_active_set_to_ep_map(&coll_args->args); + task->subset.myrank = UCC_TL_TEAM_RANK(team); + // root + if (task->subset.myrank == coll_args->args.root) { + bool found = false; + /* search first free barrier in active set pool */ + for (i = 0; i < max_concurrent; ++i) { + curr_bar = UCC_TL_CUDA_TEAM_BARRIER(team, max_concurrent + i); + if (curr_bar->tag == 0) { + // free + task->bar = curr_bar; + // set user specified tag to mark that this barrier is used by this task + task->bar->tag = coll_args->args.tag; + status = ucc_tl_cuda_shm_barrier_init_root(task->subset.map.ep_num, task->subset.myrank, coll_args->args.root, task->bar); + if (ucc_unlikely(status != UCC_OK)) { + ucc_error("failed to init root barrier"); + } + + found = true; + break; + } + } + ucc_assert(found); + } else { + /* pool barrier while root mark any of it with tag */ + bool found = false; + // TODO: get rid of inf loop? + while (!found) + { + for (i = 0; i < max_concurrent; ++i) { + curr_bar = UCC_TL_CUDA_TEAM_BARRIER(team, max_concurrent + i); + if (curr_bar->tag == coll_args->args.tag) { + task->bar = curr_bar; + // TODO: pass root rank??? + status = ucc_tl_cuda_shm_barrier_init_root(task->subset.map.ep_num, task->subset.myrank, coll_args->args.root, task->bar); + if (ucc_unlikely(status != UCC_OK)) { + ucc_error("failed to init peer barrier"); + } + + found = true; + break; + } + } + } + } + task->seq_num = team->seq_num_active_set++; + task->coll_id = task->seq_num % max_concurrent + max_concurrent; + } else { + task->seq_num = team->seq_num++; + task->coll_id = task->seq_num % max_concurrent; + task->bar = TASK_BAR(task); + } *task_h = task; return UCC_OK; } -static inline ucc_status_t ucc_tl_cuda_get_sync(ucc_tl_cuda_task_t *task) +// check if segment for current task is available and barrier is available (completed from prev iteration) +static inline ucc_status_t ucc_tl_cuda_get_sync_root(ucc_tl_cuda_task_t *task, ucc_rank_t root) { ucc_tl_cuda_team_t *team = TASK_TEAM(task); volatile ucc_tl_cuda_sync_state_t *state = &team->sync_state[task->coll_id]; - if ((UCC_TL_TEAM_RANK(team) == 0) && (*state == 0)) { + if ((UCC_TL_TEAM_RANK(team) == root) && (*state == 0)) { *state = task->seq_num; } if ((*state != task->seq_num) || @@ -122,6 +194,31 @@ static inline ucc_status_t ucc_tl_cuda_get_sync(ucc_tl_cuda_task_t *task) return UCC_OK; } +static inline void ucc_tl_cuda_put_sync_root(ucc_tl_cuda_task_t *task, ucc_rank_t root) +{ + ucc_tl_cuda_team_t *team = TASK_TEAM(task); + ucc_tl_cuda_sync_state_t *state = &team->sync_state[task->coll_id]; + + if (UCC_TL_TEAM_RANK(team) == root) { + ucc_assert(*state == task->seq_num); + *state = 0; + } +} + +static inline ucc_status_t ucc_tl_cuda_get_sync(ucc_tl_cuda_task_t *task) +{ + ucc_tl_cuda_team_t *team = TASK_TEAM(task); + volatile ucc_tl_cuda_sync_state_t *state = &team->sync_state[task->coll_id]; + + if ((UCC_TL_TEAM_RANK(team) == 0) && (*state == 0)) { + *state = task->seq_num; + } + if ((*state != task->seq_num) || (task->bar->state[UCC_TL_TEAM_RANK(team)] != UCC_OK)) { + return UCC_INPROGRESS; + } + return UCC_OK; +} + static inline void ucc_tl_cuda_put_sync(ucc_tl_cuda_task_t *task) { ucc_tl_cuda_team_t *team = TASK_TEAM(task); @@ -142,14 +239,7 @@ ucc_status_t ucc_tl_cuda_coll_init(ucc_base_coll_args_t *coll_args, ucc_status_t ucc_tl_cuda_coll_finalize(ucc_coll_task_t *coll_task); -ucc_status_t 
ucc_tl_cuda_shm_barrier_init(ucc_rank_t size, ucc_rank_t rank, - ucc_tl_cuda_shm_barrier_t *barrier); -ucc_status_t ucc_tl_cuda_shm_barrier_start(ucc_rank_t rank, - ucc_tl_cuda_shm_barrier_t *barrier); - -ucc_status_t ucc_tl_cuda_shm_barrier_test(ucc_rank_t rank, - ucc_tl_cuda_shm_barrier_t *barrier); ucc_status_t ucc_tl_cuda_alg_id_to_init(int alg_id, const char *alg_id_str, ucc_coll_type_t coll_type, diff --git a/src/components/tl/cuda/tl_cuda_team.c b/src/components/tl/cuda/tl_cuda_team.c index 64123a8cea..e85537f861 100644 --- a/src/components/tl/cuda/tl_cuda_team.c +++ b/src/components/tl/cuda/tl_cuda_team.c @@ -45,7 +45,8 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_team_t, ucc_base_context_t *tl_context, return UCC_ERR_NO_MEMORY; } - scratch_size = lib->cfg.max_concurrent * lib->cfg.scratch_size; + // active set + scratch_size = 2 * lib->cfg.max_concurrent * lib->cfg.scratch_size; status = CUDA_FUNC(cudaMalloc(&self->scratch.loc, scratch_size)); if (status != UCC_OK) { tl_error(tl_context->lib, "failed to alloc scratch buffer"); @@ -64,6 +65,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_team_t, ucc_base_context_t *tl_context, lib->cfg.max_concurrent + sizeof(ucc_tl_cuda_shm_barrier_t) * lib->cfg.max_concurrent + sizeof(ucc_tl_cuda_sync_state_t) * lib->cfg.max_concurrent; + ctrl_size *= 2; // active sets shm_id = -1; self->sync = (void*)-1; @@ -79,8 +81,10 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_team_t, ucc_base_context_t *tl_context, memset(self->sync, 0, ctrl_size); self->bar = (ucc_tl_cuda_shm_barrier_t*)UCC_TL_CUDA_TEAM_SYNC(self, 0, lib->cfg.max_concurrent); - for (i = 0; i < lib->cfg.max_concurrent; i++) { + /* active set */ + for (i = 0; i < lib->cfg.max_concurrent * 2; i++) { bar = UCC_TL_CUDA_TEAM_BARRIER(self, i); + bar->tag = 0; // mark as free for (j = 0; j < UCC_TL_TEAM_SIZE(self); j++) { status = ucc_tl_cuda_shm_barrier_init(UCC_TL_TEAM_SIZE(self), j, bar); @@ -109,6 +113,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_team_t, ucc_base_context_t *tl_context, tl_debug(tl_context->lib, "posted tl team: %p", self); self->seq_num = 1; + self->seq_num_active_set = 1; return UCC_OK; free_devices: diff --git a/src/components/tl/ucp/bcast/bcast_knomial.c b/src/components/tl/ucp/bcast/bcast_knomial.c index 1ca08893e3..21f47bf293 100644 --- a/src/components/tl/ucp/bcast/bcast_knomial.c +++ b/src/components/tl/ucp/bcast/bcast_knomial.c @@ -22,7 +22,7 @@ void ucc_tl_ucp_bcast_knomial_progress(ucc_coll_task_t *coll_task) ucc_rank_t size = (ucc_rank_t)task->subset.map.ep_num; uint32_t radix = task->bcast_kn.radix; - ucc_rank_t root = (uint32_t)TASK_ARGS(task).root; + ucc_rank_t root = (uint32_t)TASK_ARGS(task).root; ucc_rank_t dist = task->bcast_kn.dist; void *buffer = TASK_ARGS(task).src.info.buffer; ucc_memory_type_t mtype = TASK_ARGS(task).src.info.mem_type; From 082c643d847214cb7373aebe1369c04cbcb62bfe Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Wed, 6 Nov 2024 15:29:57 +0100 Subject: [PATCH 29/34] TL/CUDA: fix build --- src/components/tl/cuda/tl_cuda_coll.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h index 01e41af1f7..d9c2ad2016 100644 --- a/src/components/tl/cuda/tl_cuda_coll.h +++ b/src/components/tl/cuda/tl_cuda_coll.h @@ -137,13 +137,18 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args, status = ucc_tl_cuda_shm_barrier_init_root(task->subset.map.ep_num, task->subset.myrank, coll_args->args.root, task->bar); if (ucc_unlikely(status != UCC_OK)) { ucc_error("failed to init root 
barrier"); + return UCC_ERR_NO_RESOURCE; } - found = true; break; } } ucc_assert(found); + if (!found) + { + ucc_error("Unable to find free barrier"); + return UCC_ERR_NO_RESOURCE; + } } else { /* pool barrier while root mark any of it with tag */ bool found = false; @@ -158,8 +163,8 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args, status = ucc_tl_cuda_shm_barrier_init_root(task->subset.map.ep_num, task->subset.myrank, coll_args->args.root, task->bar); if (ucc_unlikely(status != UCC_OK)) { ucc_error("failed to init peer barrier"); + return UCC_ERR_NO_RESOURCE; } - found = true; break; } From 168d2e154a44a8637b2877ada05704006e027877 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Wed, 6 Nov 2024 15:46:24 +0100 Subject: [PATCH 30/34] TL/CUDA: fixed comments --- src/components/tl/cuda/tl_cuda.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index 7931777318..e7d66eec8a 100644 --- a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -155,14 +155,15 @@ typedef struct ucc_tl_cuda_scratch { ucc_tl_cuda_mem_info_t rem_info[UCC_TL_CUDA_MAX_PEERS]; } ucc_tl_cuda_scratch_t; +// Team represents a communicator created within the CUDA context, typically using NVLink for inter-GPU communication typedef struct ucc_tl_cuda_team { ucc_tl_team_t super; - uint32_t seq_num; // counter for launched collectives (tasks) for this team - uint32_t seq_num_active_set; // active set + uint32_t seq_num; // Counter for the number of launched collective tasks for this team + uint32_t seq_num_active_set; // Counter for tasks in the active set (subset of tasks requiring special handling) ucc_tl_cuda_team_topo_t *topo; - ucc_tl_cuda_sync_t *sync; // pointer to shared mem - ucc_tl_cuda_sync_state_t *sync_state; // to track if sync segment of shared memory is used by what task? - ucc_tl_cuda_shm_barrier_t *bar; // pointer to first barrier in array of [0; 2 * max_concurrent) first max_concurrent for normal mode and second for active set + ucc_tl_cuda_sync_t *sync; // Pointer to shared memory segment for synchronization + ucc_tl_cuda_sync_state_t *sync_state; // Tracks the task currently using the sync segment of shared memory, if free - 0 + ucc_tl_cuda_shm_barrier_t *bar; // Pointer to the first barrier in an array of size [0; 2 * max_concurrent]. 
From 168d2e154a44a8637b2877ada05704006e027877 Mon Sep 17 00:00:00 2001
From: Ilya Kryukov
Date: Wed, 6 Nov 2024 15:46:24 +0100
Subject: [PATCH 30/34] TL/CUDA: fixed comments

---
 src/components/tl/cuda/tl_cuda.h | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h
index 7931777318..e7d66eec8a 100644
--- a/src/components/tl/cuda/tl_cuda.h
+++ b/src/components/tl/cuda/tl_cuda.h
@@ -155,14 +155,15 @@ typedef struct ucc_tl_cuda_scratch {
     ucc_tl_cuda_mem_info_t rem_info[UCC_TL_CUDA_MAX_PEERS];
 } ucc_tl_cuda_scratch_t;
 
+// Team represents a communicator created within the CUDA context, typically using NVLink for inter-GPU communication
 typedef struct ucc_tl_cuda_team {
     ucc_tl_team_t super;
-    uint32_t seq_num; // counter for launched collectives (tasks) for this team
-    uint32_t seq_num_active_set; // active set
+    uint32_t seq_num; // Counter for the number of launched collective tasks for this team
+    uint32_t seq_num_active_set; // Counter for collective tasks posted on an active set (a subset of the team's ranks)
     ucc_tl_cuda_team_topo_t *topo;
-    ucc_tl_cuda_sync_t *sync; // pointer to shared mem
-    ucc_tl_cuda_sync_state_t *sync_state; // to track if sync segment of shared memory is used by what task?
-    ucc_tl_cuda_shm_barrier_t *bar; // pointer to first barrier in array of [0; 2 * max_concurrent) first max_concurrent for normal mode and second for active set
+    ucc_tl_cuda_sync_t *sync; // Pointer to shared memory segment for synchronization
+    ucc_tl_cuda_sync_state_t *sync_state; // Tracks which task is currently using each sync segment of shared memory; 0 if the segment is free
+    ucc_tl_cuda_shm_barrier_t *bar; // Pointer to the first barrier in an array of 2 * max_concurrent barriers: the first max_concurrent are for normal mode, the second max_concurrent for active set mode
     ucc_tl_cuda_scratch_t scratch;
     cudaStream_t stream;
     ucc_tl_cuda_rank_id_t *ids;
@@ -173,13 +174,14 @@ typedef struct ucc_tl_cuda_team {
 UCC_CLASS_DECLARE(ucc_tl_cuda_team_t, ucc_base_context_t *,
                   const ucc_base_team_params_t *);
 
+// Task represents a collective operation that runs in the CUDA context, typically using NVLink for inter-GPU communication
 typedef struct ucc_tl_cuda_task ucc_tl_cuda_task_t;
 struct ucc_tl_cuda_task {
     ucc_coll_task_t super;
-    uint32_t seq_num; // sequential number of collective (started task) in team
-    uint32_t coll_id; // index of collective in flight [0; max_concurrent)
-    ucc_tl_cuda_shm_barrier_t *bar; // pointer to barrier that reserved for with task in cuda team
-    ucc_subset_t subset; // information about mapping of active set if it present
+    uint32_t seq_num; // Sequential identifier for each task started within the team
+    uint32_t coll_id; // Index of the collective task in flight, within the range [0; max_concurrent)
+    ucc_tl_cuda_shm_barrier_t *bar; // Pointer to the reserved barrier for this task in the CUDA team
+    ucc_subset_t subset; // Mapping information for the active set, if present
     union {
         struct {
            int stage;

From 8a7adc4dfbceef7647164005288e8cb55d09a4fe Mon Sep 17 00:00:00 2001
From: Ilya Kryukov
Date: Thu, 7 Nov 2024 16:33:09 +0100
Subject: [PATCH 31/34] TL/CUDA: select free bar using atomic

---
 src/components/tl/cuda/tl_cuda_coll.h | 13 +++++++++----
 src/utils/ucc_atomic.h                |  1 +
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h
index d9c2ad2016..5904ef3629 100644
--- a/src/components/tl/cuda/tl_cuda_coll.h
+++ b/src/components/tl/cuda/tl_cuda_coll.h
@@ -86,6 +86,12 @@ static inline ucc_tl_cuda_task_t *ucc_tl_cuda_task_get(ucc_tl_cuda_team_t *team)
 static inline void ucc_tl_cuda_task_put(ucc_tl_cuda_task_t *task)
 {
     UCC_TL_CUDA_PROFILE_REQUEST_FREE(task);
+
+    if (UCC_TL_TEAM_RANK(TASK_TEAM(task)) == task->bcast_linear.root) {
+        ucc_print("free bar!");
+        task->bar->tag = 0;
+    }
+
     ucc_mpool_put(task);
 }
 
@@ -129,7 +135,8 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args,
             /* search first free barrier in active set pool */
             for (i = 0; i < max_concurrent; ++i) {
                 curr_bar = UCC_TL_CUDA_TEAM_BARRIER(team, max_concurrent + i);
-                if (curr_bar->tag == 0) {
+                if (ucc_atomic_cswap32(&curr_bar->tag, 0, coll_args->args.tag) == 0) {
+                    ucc_print("found free barrier: %d", i);
                     // free
                     task->bar = curr_bar;
                     // set the user-specified tag to mark that this barrier is used by this task
@@ -143,10 +150,10 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args,
                     break;
                 }
             }
-            ucc_assert(found);
             if (!found)
             {
                 ucc_error("Unable to find free barrier");
+                ucc_assert(found);
                 return UCC_ERR_NO_RESOURCE;
             }
         } else {
@@ -244,8 +251,6 @@ ucc_status_t ucc_tl_cuda_coll_init(ucc_base_coll_args_t *coll_args,
 
 ucc_status_t ucc_tl_cuda_coll_finalize(ucc_coll_task_t *coll_task);
 
-
-
 ucc_status_t ucc_tl_cuda_alg_id_to_init(int alg_id, const char *alg_id_str,
                                         ucc_coll_type_t coll_type,
                                         ucc_memory_type_t mem_type,
diff --git a/src/utils/ucc_atomic.h b/src/utils/ucc_atomic.h
index d5aabd1829..71eb12e233 100644
--- a/src/utils/ucc_atomic.h
+++ b/src/utils/ucc_atomic.h
@@ -16,6 +16,7 @@
 #define ucc_atomic_add64 ucs_atomic_add64
 #define ucc_atomic_sub64 ucs_atomic_sub64
 #define ucc_atomic_cswap8 ucs_atomic_cswap8
+#define ucc_atomic_cswap32 ucs_atomic_cswap32
 #define ucc_atomic_cswap64 ucs_atomic_cswap64
 #define ucc_atomic_bool_cswap8 ucs_atomic_bool_cswap8
 #define ucc_atomic_bool_cswap64 ucs_atomic_bool_cswap64

From f2404db2abbe71d5388518ab0efcbf8f062705b3 Mon Sep 17 00:00:00 2001
From: Ilya Kryukov
Date: Fri, 8 Nov 2024 12:40:41 +0100
Subject: [PATCH 32/34] TL/CUDA: fix

---
 src/components/tl/cuda/tl_cuda_coll.h |  4 +++-
 src/components/tl/cuda/tl_cuda_team.c | 13 +++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h
index 5904ef3629..7e47deddc2 100644
--- a/src/components/tl/cuda/tl_cuda_coll.h
+++ b/src/components/tl/cuda/tl_cuda_coll.h
@@ -147,6 +147,7 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args,
                         return UCC_ERR_NO_RESOURCE;
                     }
                     found = true;
+                    task->coll_id = i + max_concurrent;
                     break;
                 }
             }
@@ -173,13 +174,14 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args,
                         return UCC_ERR_NO_RESOURCE;
                     }
                     found = true;
+                    task->coll_id = i + max_concurrent;
                     break;
                 }
             }
         }
     }
         task->seq_num = team->seq_num_active_set++;
-        task->coll_id = task->seq_num % max_concurrent + max_concurrent;
+        // task->coll_id = task->seq_num % max_concurrent + max_concurrent;
     } else {
         task->seq_num = team->seq_num++;
         task->coll_id = task->seq_num % max_concurrent;
diff --git a/src/components/tl/cuda/tl_cuda_team.c b/src/components/tl/cuda/tl_cuda_team.c
index e85537f861..be0286a7a0 100644
--- a/src/components/tl/cuda/tl_cuda_team.c
+++ b/src/components/tl/cuda/tl_cuda_team.c
@@ -80,7 +80,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_team_t, ucc_base_context_t *tl_context,
         }
         memset(self->sync, 0, ctrl_size);
         self->bar = (ucc_tl_cuda_shm_barrier_t*)UCC_TL_CUDA_TEAM_SYNC(self, 0,
-                                                      lib->cfg.max_concurrent);
+                                                      lib->cfg.max_concurrent * 2);
         /* active set */
         for (i = 0; i < lib->cfg.max_concurrent * 2; i++) {
             bar = UCC_TL_CUDA_TEAM_BARRIER(self, i);
@@ -142,7 +142,7 @@ UCC_CLASS_CLEANUP_FUNC(ucc_tl_cuda_team_t)
     }
     if (self->ids) {
         if (self->sync != (void*)-1) {
-            for (i = 0; i < lib->cfg.max_concurrent; i++) {
+            for (i = 0; i < lib->cfg.max_concurrent * 2; i++) {
                 for (j = 0; j < UCC_TL_TEAM_SIZE(self); j++) {
                     if (j == UCC_TL_TEAM_RANK(self)) {
                         continue;
@@ -273,14 +273,15 @@ ucc_status_t ucc_tl_cuda_team_create_test(ucc_base_team_t *tl_team)
             goto exit_err;
         }
         team->bar = (ucc_tl_cuda_shm_barrier_t*)UCC_TL_CUDA_TEAM_SYNC(team, 0,
-                                                      lib->cfg.max_concurrent);
+                                                      lib->cfg.max_concurrent * 2);
     }
     team->sync_state = (ucc_tl_cuda_sync_state_t*)PTR_OFFSET(team->bar,
                        sizeof(ucc_tl_cuda_shm_barrier_t) *
-                       lib->cfg.max_concurrent);
+                       lib->cfg.max_concurrent * 2);
     CUDA_CHECK_GOTO(cudaStreamCreateWithFlags(&team->stream, cudaStreamNonBlocking),
                     exit_err, status);
-    for (i = 0; i < lib->cfg.max_concurrent; i++) {
+    // second max_concurrent events are unused for bcast active set
+    for (i = 0; i < lib->cfg.max_concurrent * 2; i++) {
         sync = UCC_TL_CUDA_TEAM_SYNC(team, UCC_TL_TEAM_RANK(team), i);
         CUDA_CHECK_GOTO(cudaEventCreateWithFlags(&sync->ipc_event_local,
                                                  cudaEventDisableTiming |
@@ -308,7 +309,7 @@ ucc_status_t ucc_tl_cuda_team_create_test(ucc_base_team_t *tl_team)
         goto exit_err;
     }
 
-    for (i = 0; i < lib->cfg.max_concurrent; i++) {
+    for (i = 0; i < lib->cfg.max_concurrent * 2; i++) {
         sync = UCC_TL_CUDA_TEAM_SYNC(team, UCC_TL_TEAM_RANK(team), i);
         for (j = 0 ; j < UCC_TL_TEAM_SIZE(team); j++) {
             if (j == UCC_TL_TEAM_RANK(team)) {
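The tl_cuda_coll.h part of the patch above makes an active-set task take its coll_id from the barrier slot it actually claimed, so the barrier and the sync segment both land in the upper half of the doubled arrays, while normal-mode tasks keep rotating through the lower half; the seq_num-based assignment, which could disagree with the claimed slot, is commented out. A minimal sketch of that slot arithmetic, with illustrative names rather than UCC code:

    /* Illustrative slot arithmetic for the doubled barrier/sync arrays, not UCC code. */
    #include <assert.h>
    #include <stdint.h>

    /* Normal-mode collectives rotate through the lower half: [0, max_concurrent). */
    static inline uint32_t toy_slot_normal(uint32_t seq_num, uint32_t max_concurrent)
    {
        return seq_num % max_concurrent;
    }

    /* Active-set collectives use the slot of the barrier they claimed,
     * shifted into the upper half: [max_concurrent, 2 * max_concurrent). */
    static inline uint32_t toy_slot_active_set(uint32_t claimed_idx, uint32_t max_concurrent)
    {
        assert(claimed_idx < max_concurrent);
        return max_concurrent + claimed_idx;
    }

Deriving coll_id from the claimed index keeps the barrier and the sync segment used by one active-set collective consistent with each other.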
From f4f1e5a314d88eb3068198484401a86dd4f377e5 Mon Sep 17 00:00:00 2001
From: Ilya Kryukov
Date: Fri, 8 Nov 2024 16:00:57 +0100
Subject: [PATCH 33/34] TL/CUDA: replace free tag

---
 src/components/tl/cuda/tl_cuda.h      | 2 ++
 src/components/tl/cuda/tl_cuda_coll.h | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h
index e7d66eec8a..8a3d63ff41 100644
--- a/src/components/tl/cuda/tl_cuda.h
+++ b/src/components/tl/cuda/tl_cuda.h
@@ -106,6 +106,8 @@ UCC_CLASS_DECLARE(ucc_tl_cuda_context_t, const ucc_base_context_params_t *,
 
 typedef uint32_t ucc_tl_cuda_sync_state_t;
 
+#define UCC_TAG_FREE 0xFFFFFFFF
+
 typedef struct ucc_tl_cuda_shm_barrier {
     ucc_rank_t size;
     ucc_rank_t count;
diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h
index 7e47deddc2..e584c7f0a0 100644
--- a/src/components/tl/cuda/tl_cuda_coll.h
+++ b/src/components/tl/cuda/tl_cuda_coll.h
@@ -89,7 +89,7 @@ static inline void ucc_tl_cuda_task_put(ucc_tl_cuda_task_t *task)
 
     if (UCC_TL_TEAM_RANK(TASK_TEAM(task)) == task->bcast_linear.root) {
         ucc_print("free bar!");
-        task->bar->tag = 0;
+        task->bar->tag = UCC_TAG_FREE;
     }
 
     ucc_mpool_put(task);
@@ -135,7 +135,7 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args,
             /* search first free barrier in active set pool */
             for (i = 0; i < max_concurrent; ++i) {
                 curr_bar = UCC_TL_CUDA_TEAM_BARRIER(team, max_concurrent + i);
-                if (ucc_atomic_cswap32(&curr_bar->tag, 0, coll_args->args.tag) == 0) {
+                if (ucc_atomic_cswap32(&curr_bar->tag, UCC_TAG_FREE, coll_args->args.tag) == UCC_TAG_FREE) {
                     ucc_print("found free barrier: %d", i);
                     // free
                     task->bar = curr_bar;

From e9c1abe2a6bb7925f27ffe9169168b8366cd7512 Mon Sep 17 00:00:00 2001
From: Ilya Kryukov
Date: Fri, 8 Nov 2024 16:12:57 +0100
Subject: [PATCH 34/34] TL/CUDA: fix bar tag init val

---
 src/components/tl/cuda/tl_cuda_team.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/components/tl/cuda/tl_cuda_team.c b/src/components/tl/cuda/tl_cuda_team.c
index be0286a7a0..8fe668249a 100644
--- a/src/components/tl/cuda/tl_cuda_team.c
+++ b/src/components/tl/cuda/tl_cuda_team.c
@@ -84,7 +84,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_team_t, ucc_base_context_t *tl_context,
         /* active set */
         for (i = 0; i < lib->cfg.max_concurrent * 2; i++) {
             bar = UCC_TL_CUDA_TEAM_BARRIER(self, i);
-            bar->tag = 0; // mark as free
+            bar->tag = UCC_TAG_FREE; // mark as free
             for (j = 0; j < UCC_TL_TEAM_SIZE(self); j++) {
                 status = ucc_tl_cuda_shm_barrier_init(UCC_TL_TEAM_SIZE(self),
                                                       j, bar);