diff --git a/src/components/tl/cuda/Makefile.am b/src/components/tl/cuda/Makefile.am
index e22796e6fa..2136821b93 100644
--- a/src/components/tl/cuda/Makefile.am
+++ b/src/components/tl/cuda/Makefile.am
@@ -27,6 +27,11 @@ alltoallv = \
        alltoallv/alltoallv.c \
        alltoallv/alltoallv_ce.c
 
+bcast = \
+       bcast/bcast.h \
+       bcast/bcast.c \
+       bcast/bcast_linear.c
+
 reduce_scatter = \
        reduce_scatter/reduce_scatter.h \
        reduce_scatter/reduce_scatter.c \
@@ -54,6 +59,7 @@ sources = \
        $(allgatherv) \
        $(alltoall) \
        $(alltoallv) \
+       $(bcast) \
        $(reduce_scatter) \
        $(reduce_scatterv)
diff --git a/src/components/tl/cuda/allgather/allgather.c b/src/components/tl/cuda/allgather/allgather.c
index 01996da4da..1e64c0a582 100644
--- a/src/components/tl/cuda/allgather/allgather.c
+++ b/src/components/tl/cuda/allgather/allgather.c
@@ -44,7 +44,7 @@ ucc_status_t ucc_tl_cuda_allgather_init(ucc_base_coll_args_t *coll_args,
 {
     ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
 
-    if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
+    if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) {
         return ucc_tl_cuda_allgather_linear_init(coll_args, tl_team, task_p);
     } else {
         return ucc_tl_cuda_allgather_ring_init(coll_args, tl_team, task_p);
diff --git a/src/components/tl/cuda/allgather/allgather_linear.c b/src/components/tl/cuda/allgather/allgather_linear.c
index ed228d1683..fefc774628 100644
--- a/src/components/tl/cuda/allgather/allgather_linear.c
+++ b/src/components/tl/cuda/allgather/allgather_linear.c
@@ -15,7 +15,7 @@ ucc_status_t ucc_tl_cuda_allgather_linear_init(ucc_base_coll_args_t *coll_args,
     ucc_tl_cuda_task_t *task;
     ucc_status_t status;
 
-    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) ||
+    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) ||
                      UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
         return UCC_ERR_NOT_SUPPORTED;
     }
diff --git a/src/components/tl/cuda/allgatherv/allgatherv.c b/src/components/tl/cuda/allgatherv/allgatherv.c
index 5a8f78c481..76da65fa65 100644
--- a/src/components/tl/cuda/allgatherv/allgatherv.c
+++ b/src/components/tl/cuda/allgatherv/allgatherv.c
@@ -47,7 +47,7 @@ ucc_status_t ucc_tl_cuda_allgatherv_init(ucc_base_coll_args_t *coll_args,
 {
     ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
 
-    if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
+    if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) {
         return ucc_tl_cuda_allgatherv_linear_init(coll_args, tl_team, task_p);
     } else {
         return ucc_tl_cuda_allgatherv_ring_init(coll_args, tl_team, task_p);
diff --git a/src/components/tl/cuda/allgatherv/allgatherv_linear.c b/src/components/tl/cuda/allgatherv/allgatherv_linear.c
index 0fca5c6af6..1f02ad37bd 100644
--- a/src/components/tl/cuda/allgatherv/allgatherv_linear.c
+++ b/src/components/tl/cuda/allgatherv/allgatherv_linear.c
@@ -55,22 +55,6 @@ enum
  * other ranks to finish */
 };
 
-static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
-                                int step_id)
-{
-    ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);
-
-    return sync->seq_num[step_id];
-}
-
-static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
-                                 int step, int step_id)
-{
-    ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);
-
-    sync->seq_num[step_id] = step;
-}
-
 ucc_status_t ucc_tl_cuda_allgatherv_linear_finalize(ucc_coll_task_t *coll_task)
 {
     ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
@@ -432,7 +416,7 @@ ucc_status_t
 ucc_tl_cuda_allgatherv_linear_init(ucc_base_coll_args_t *coll_args,
     ucc_tl_cuda_task_t *task;
     ucc_status_t status;
 
-    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) ||
+    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) ||
                      UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
         return UCC_ERR_NOT_SUPPORTED;
     }
diff --git a/src/components/tl/cuda/bcast/bcast.c b/src/components/tl/cuda/bcast/bcast.c
new file mode 100644
index 0000000000..d687d924a0
--- /dev/null
+++ b/src/components/tl/cuda/bcast/bcast.c
@@ -0,0 +1,28 @@
+/**
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "bcast.h"
+#include "components/mc/ucc_mc.h"
+
+ucc_base_coll_alg_info_t
+    ucc_tl_cuda_bcast_algs[UCC_TL_CUDA_BCAST_ALG_LAST + 1] = {
+        [UCC_TL_CUDA_BCAST_ALG_LINEAR] = {.id = UCC_TL_CUDA_BCAST_ALG_LINEAR,
+                                          .name = "linear",
+                                          .desc = "linear bcast algorithm"},
+        [UCC_TL_CUDA_BCAST_ALG_LAST] = {.id = 0, .name = NULL, .desc = NULL}};
+
+ucc_status_t ucc_tl_cuda_bcast_init(ucc_base_coll_args_t *coll_args,
+                                    ucc_base_team_t *tl_team,
+                                    ucc_coll_task_t **task_p)
+{
+    ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
+
+    if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) {
+        return ucc_tl_cuda_bcast_linear_init(coll_args, tl_team, task_p);
+    } else {
+        return UCC_ERR_NOT_SUPPORTED;
+    }
+}
diff --git a/src/components/tl/cuda/bcast/bcast.h b/src/components/tl/cuda/bcast/bcast.h
new file mode 100644
index 0000000000..17d07a529b
--- /dev/null
+++ b/src/components/tl/cuda/bcast/bcast.h
@@ -0,0 +1,43 @@
+/**
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#ifndef BCAST_H_
+#define BCAST_H_
+
+#include "tl_cuda.h"
+#include "tl_cuda_coll.h"
+
+enum
+{
+    UCC_TL_CUDA_BCAST_ALG_LINEAR,
+    UCC_TL_CUDA_BCAST_ALG_LAST
+};
+
+extern ucc_base_coll_alg_info_t
+    ucc_tl_cuda_bcast_algs[UCC_TL_CUDA_BCAST_ALG_LAST + 1];
+
+#define UCC_TL_CUDA_BCAST_DEFAULT_ALG_SELECT_STR "bcast:cuda:@0"
+
+ucc_status_t ucc_tl_cuda_bcast_init(ucc_base_coll_args_t *coll_args,
+                                    ucc_base_team_t *tl_team,
+                                    ucc_coll_task_t **task_p);
+
+ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args,
+                                           ucc_base_team_t *tl_team,
+                                           ucc_coll_task_t **task_p);
+
+static inline int ucc_tl_cuda_bcast_alg_from_str(const char *str)
+{
+    int i;
+    for (i = 0; i < UCC_TL_CUDA_BCAST_ALG_LAST; i++) {
+        if (0 == strcasecmp(str, ucc_tl_cuda_bcast_algs[i].name)) {
+            break;
+        }
+    }
+    return i;
+}
+
+#endif
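Review aside: the `.name` strings registered in `ucc_tl_cuda_bcast_algs` above are what the algorithm-selection parser matches, which is how the gtest change at the end of this patch pins the new path with `UCC_TL_CUDA_TUNE=bcast:cuda:@0`. The standalone sketch below (not part of the patch; the two-entry table and names are hypothetical) illustrates the lookup convention used by `ucc_tl_cuda_bcast_alg_from_str`: the index of the terminating `_LAST` entry doubles as the "not found" result, so callers can range-check the return value.

    /* Standalone sketch of the alg_from_str lookup convention.
     * Table and names here are hypothetical stand-ins. */
    #include <stdio.h>
    #include <strings.h> /* strcasecmp */

    enum { ALG_LINEAR, ALG_LAST };

    static const char *alg_names[ALG_LAST] = {"linear"};

    static int alg_from_str(const char *str)
    {
        int i;

        for (i = 0; i < ALG_LAST; i++) {
            if (0 == strcasecmp(str, alg_names[i])) {
                break;
            }
        }
        return i; /* == ALG_LAST when nothing matched */
    }

    int main(void)
    {
        /* prints "0 1": a case-insensitive hit, then the sentinel */
        printf("%d %d\n", alg_from_str("LINEAR"), alg_from_str("ring"));
        return 0;
    }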
diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c
new file mode 100644
index 0000000000..992dce9470
--- /dev/null
+++ b/src/components/tl/cuda/bcast/bcast_linear.c
@@ -0,0 +1,297 @@
+/**
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "bcast/bcast.h"
+
+enum {
+    STAGE_SYNC,
+    STAGE_SETUP,
+    // root
+    STAGE_COPY,      // post the copy task: copy a block from src to the scratch buffer
+    STAGE_WAIT_COPY, // wait for the copy to finish
+    STAGE_WAIT_ALL,  // wait for all other ranks to reach the same step
+    // non-root
+    STAGE_WAIT_ROOT,        // clients wait while the root writes to its own scratch buffer
+    STAGE_CLIENT_COPY,      // clients submit the copy task
+    STAGE_CLIENT_COPY_WAIT, // clients wait for the copy from the root's scratch to complete
+};
+
+ucc_status_t ucc_tl_cuda_bcast_linear_setup_start(ucc_tl_cuda_task_t *task)
+{
+    ucc_tl_cuda_team_t *team = TASK_TEAM(task);
+    ucc_rank_t trank = UCC_TL_TEAM_RANK(team);
+    ucc_status_t status;
+
+    set_rank_step(task, trank, 0, 0);
+    ucc_memory_cpu_store_fence();
+    status = ucc_tl_cuda_shm_barrier_start(UCC_TL_TEAM_RANK(team), task->bar);
+    if (ucc_unlikely(status != UCC_OK)) {
+        goto exit_err;
+    }
+
+    return UCC_OK;
+
+exit_err:
+    return status;
+}
+
+ucc_status_t ucc_tl_cuda_bcast_linear_setup_test(ucc_tl_cuda_task_t *task)
+{
+    ucc_tl_cuda_team_t *team = TASK_TEAM(task);
+
+    return ucc_tl_cuda_shm_barrier_test(UCC_TL_TEAM_RANK(team), task->bar);
+}
+
+static inline size_t get_raw_scratch_size(ucc_tl_cuda_team_t *team)
+{
+    return UCC_TL_CUDA_TEAM_LIB(team)->cfg.scratch_size;
+}
+
+static inline ucc_status_t ecopy(void *dst, void *src, size_t size,
+                                 ucc_ee_executor_t *exec,
+                                 ucc_ee_executor_task_t **etask)
+{
+    ucc_ee_executor_task_args_t exec_args = {0};
+
+    exec_args.task_type = UCC_EE_EXECUTOR_TASK_COPY;
+    exec_args.copy.dst  = dst;
+    exec_args.copy.src  = src;
+    exec_args.copy.len  = size;
+    return ucc_ee_executor_task_post(exec, &exec_args, etask);
+}
+
+ucc_status_t ucc_tl_cuda_bcast_linear_finalize(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
+
+    tl_trace(UCC_TASK_LIB(task), "finalizing task %p", task);
+    ucc_tl_cuda_task_put(task);
+    return UCC_OK;
+}
+
+void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
+    ucc_tl_cuda_team_t *team = TASK_TEAM(task);
+    ucc_rank_t trank = UCC_TL_TEAM_RANK(team);
+    ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team);
+    size_t half_scratch_size = get_raw_scratch_size(team) / 2;
+    // every step except the last moves a full scratch half; the last step
+    // moves whatever remains of the message
+    size_t chunk_size =
+        task->bcast_linear.step < task->bcast_linear.num_steps - 1
+            ? ucc_min(half_scratch_size, task->bcast_linear.size)
+            : task->bcast_linear.size -
+                  (task->bcast_linear.num_steps - 1) * half_scratch_size;
+    size_t offset_buff = task->bcast_linear.step * half_scratch_size;
+    ucc_ee_executor_t *exec;
+    ucc_ee_executor_task_t *etask;
+    ucc_status_t st;
+    void *sbuf, *dbuf;
+    int i;
+
+    task->super.status = UCC_INPROGRESS;
+
+    st = ucc_coll_task_get_executor(&task->super, &exec);
+    if (ucc_unlikely(st != UCC_OK)) {
+        task->super.status = st;
+        return;
+    }
+
+    switch (task->bcast_linear.stage) {
+    case STAGE_SYNC:
+        if (ucc_tl_cuda_get_sync(task) != UCC_OK) {
+            return;
+        }
+        task->bcast_linear.step = 0;
+        st = ucc_tl_cuda_bcast_linear_setup_start(task);
+        if (st != UCC_OK) {
+            task->super.status = st;
+            return;
+        }
+        task->bcast_linear.stage = STAGE_SETUP;
+    case STAGE_SETUP:
+        st = ucc_tl_cuda_bcast_linear_setup_test(task);
+        if (st != UCC_OK) {
+            task->super.status = st;
+            return;
+        }
+        ucc_tl_cuda_put_sync(task);
+        if (trank == task->bcast_linear.root) {
+            task->bcast_linear.stage = STAGE_COPY;
+        } else {
+            task->bcast_linear.stage = STAGE_WAIT_ROOT;
+        }
+    default:
+        break;
+    }
+
+    if (trank == task->bcast_linear.root) {
+        // root scenario
+        // fall-through between cases is intentional
+        switch (task->bcast_linear.stage) {
+        case STAGE_COPY:
+            // copy the current chunk from the user source buffer to the
+            // active scratch half
+            dbuf = PTR_OFFSET(TASK_SCRATCH(task, trank),
+                              task->bcast_linear.step % 2 * half_scratch_size);
+            sbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff);
+            st = ecopy(dbuf, sbuf, chunk_size, exec,
+                       &task->bcast_linear.exec_task);
+            if (st != UCC_OK) {
+                ucc_error("failed to post ecopy task");
+                task->super.status = st;
+                return;
+            }
+            task->bcast_linear.stage = STAGE_WAIT_COPY;
+        case STAGE_WAIT_COPY:
+            etask = task->bcast_linear.exec_task;
+            if (etask) {
+                st = ucc_ee_executor_task_test(etask);
+                if (st == UCC_OK) {
+                    ucc_ee_executor_task_finalize(etask);
+                    task->bcast_linear.exec_task = NULL;
+                    // signal the other ranks
+                    ++task->bcast_linear.step;
+                    set_rank_step(task, task->bcast_linear.root,
+                                  task->bcast_linear.step, 0);
+                    task->bcast_linear.stage = STAGE_WAIT_ALL;
+                } else {
+                    // not ready yet
+                    return;
+                }
+            } else {
+                ucc_debug("etask is NULL");
+                return;
+            }
+        case STAGE_WAIT_ALL:
+            for (i = 0; i < tsize; ++i) {
+                // because of double buffering, a scratch half may only be
+                // overwritten once all ranks have completed step - 1
+                if (get_rank_step(task, i, 0) < task->bcast_linear.step - 1) {
+                    // rank is not ready, let's wait
+                    return;
+                }
+            }
+            if (task->bcast_linear.step < task->bcast_linear.num_steps) {
+                // go to the next iteration
+                task->bcast_linear.stage = STAGE_COPY;
+                return;
+            } else {
+                // finish
+                task->super.status = UCC_OK;
+                break;
+            }
+        default:
+            break;
+        }
+    } else {
+        // client scenario
+        // fall-through between cases is intentional
+        switch (task->bcast_linear.stage) {
+        case STAGE_WAIT_ROOT:
+            if (get_rank_step(task, task->bcast_linear.root, 0) >
+                task->bcast_linear.step) {
+                task->bcast_linear.stage = STAGE_CLIENT_COPY;
+                break;
+            } else {
+                return;
+            }
+        case STAGE_CLIENT_COPY:
+            // copy the current chunk from the root's scratch buffer into the
+            // local destination buffer; read at a half_scratch_size stride so
+            // the offset matches the half the root wrote for this step (the
+            // last chunk may be shorter than a full half)
+            dbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff);
+            sbuf = PTR_OFFSET(TASK_SCRATCH(task, task->bcast_linear.root),
+                              task->bcast_linear.step % 2 * half_scratch_size);
+            st = ecopy(dbuf, sbuf, chunk_size, exec,
+                       &task->bcast_linear.exec_task);
+            if (st != UCC_OK) {
+                ucc_error("failed to post ecopy task at client");
+                task->super.status = st;
+                return;
+            }
+            task->bcast_linear.stage = STAGE_CLIENT_COPY_WAIT;
+        case STAGE_CLIENT_COPY_WAIT:
+            etask = task->bcast_linear.exec_task;
+            if (etask) {
+                st = ucc_ee_executor_task_test(etask);
+                if (st == UCC_OK) {
+                    ucc_ee_executor_task_finalize(etask);
+                    task->bcast_linear.exec_task = NULL;
+                    ++task->bcast_linear.step;
+                    set_rank_step(task, trank, task->bcast_linear.step, 0);
+                    if (task->bcast_linear.step <
+                        task->bcast_linear.num_steps) {
+                        task->bcast_linear.stage = STAGE_WAIT_ROOT;
+                        return;
+                    } else {
+                        // done
+                        task->super.status = UCC_OK;
+                        break;
+                    }
+                } else {
+                    return;
+                }
+            } else {
+                return;
+            }
+        default:
+            break;
+        }
+    }
+}
+
+ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
+    ucc_tl_cuda_team_t *team = TASK_TEAM(task);
+    ucc_coll_args_t *args = &TASK_ARGS(task);
+    ucc_datatype_t dt = task->bcast_linear.dt;
+    size_t half_scratch_size = get_raw_scratch_size(team) / 2;
+
+    task->bcast_linear.stage = STAGE_SYNC;
+
+    task->bcast_linear.size = ucc_dt_size(dt) * args->src.info.count;
+    task->bcast_linear.num_steps =
+        ucc_div_round_up(task->bcast_linear.size, half_scratch_size);
+
+    ucc_debug("bcast linear dt: %s, buffer size: %zu, num_steps: %d",
+              ucc_datatype_str(dt), task->bcast_linear.size,
+              task->bcast_linear.num_steps);
+
+    task->bcast_linear.sbuf = args->src.info.buffer;
+    task->bcast_linear.step = 0;
+
+    return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super);
+}
+
+ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args,
+                                           ucc_base_team_t *tl_team,
+                                           ucc_coll_task_t **task_p)
+{
+    ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
+    ucc_tl_cuda_task_t *task;
+    ucc_status_t status;
+
+    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) ||
+                     UCC_TL_TEAM_SIZE(team) - 1 >
+                         UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
+        return UCC_ERR_NOT_SUPPORTED;
+    }
+
+    status = ucc_tl_cuda_task_init(coll_args, team, &task);
+    if (ucc_unlikely(status != UCC_OK)) {
+        return status;
+    }
+
+    task->bcast_linear.root = coll_args->args.root;
+    task->bcast_linear.dt   = coll_args->args.src.info.datatype;
+    task->bcast_linear.sbuf = coll_args->args.src.info.buffer;
+
+    task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR;
+    task->super.post     = ucc_tl_cuda_bcast_linear_start;
+    task->super.progress = ucc_tl_cuda_bcast_linear_progress;
+    task->super.finalize = ucc_tl_cuda_bcast_linear_finalize;
+    task->bar            = TASK_BAR(task);
+
+    *task_p = &task->super;
+    return UCC_OK;
+}
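Review aside: the progress function above moves the message in `num_steps` chunks of at most half the scratch buffer, alternating between the two scratch halves so the root can stage chunk `step` while clients are still draining chunk `step - 1`. A standalone sketch (not part of the patch; the 4 MB scratch and 5 MB message are assumed sizes, not UCC defaults) walks through the arithmetic:

    /* Sketch of the double-buffered chunking arithmetic. */
    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
        size_t scratch   = (size_t)4 << 20;  /* assumed scratch_size */
        size_t half      = scratch / 2;      /* one scratch half     */
        size_t size      = (size_t)5 << 20;  /* assumed message size */
        int    num_steps = (int)((size + half - 1) / half); /* ceil  */
        int    step;

        for (step = 0; step < num_steps; step++) {
            /* every step but the last moves a full half; the last
             * step moves the remainder, as in the ternary above */
            size_t chunk = step < num_steps - 1
                               ? MIN(half, size)
                               : size - (size_t)(num_steps - 1) * half;
            printf("step %d: offset %zu, chunk %zu, scratch half %d\n",
                   step, step * half, chunk, step % 2);
        }
        return 0; /* 3 steps: 2 MB, 2 MB, 1 MB, alternating halves */
    }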
diff --git a/src/components/tl/cuda/reduce_scatter/reduce_scatter.c b/src/components/tl/cuda/reduce_scatter/reduce_scatter.c
index 468fd68338..1e1d75c3ed 100644
--- a/src/components/tl/cuda/reduce_scatter/reduce_scatter.c
+++ b/src/components/tl/cuda/reduce_scatter/reduce_scatter.c
@@ -48,7 +48,7 @@ ucc_status_t ucc_tl_cuda_reduce_scatter_init(ucc_base_coll_args_t *coll_args,
 {
     ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
 
-    if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
+    if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) {
         return ucc_tl_cuda_reduce_scatter_linear_init(coll_args, tl_team,
                                                       task_p);
     } else {
diff --git a/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c b/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c
index 46efbdb051..9a025267ca 100644
--- a/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c
+++ b/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c
@@ -19,7 +19,7 @@ ucc_status_t ucc_tl_cuda_reduce_scatter_linear_init(ucc_base_coll_a
         return UCC_ERR_NOT_SUPPORTED;
     }
 
-    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) ||
+    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) ||
                      UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
         return UCC_ERR_NOT_SUPPORTED;
     }
diff --git a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c
index d85e2c8dd3..d954e38e9e 100644
--- a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c
+++ b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c
@@ -51,7 +51,7 @@ ucc_status_t ucc_tl_cuda_reduce_scatterv_init(ucc_base_coll_args_t *coll_args,
 {
     ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
 
-    if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
+    if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) {
         return ucc_tl_cuda_reduce_scatterv_linear_init(coll_args, tl_team,
                                                        task_p);
     } else {
diff --git a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c
index 6a1ec5b22c..d719632853 100644
--- a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c
+++ b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c
@@ -59,22 +59,6 @@ enum
  * other ranks to finish */
 };
 
-static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
-                                int step_id)
-{
-    ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);
-
-    return sync->seq_num[step_id];
-}
-
-static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
-                                 int step, int step_id)
-{
-    ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);
-
-    sync->seq_num[step_id] = step;
-}
-
 ucc_status_t
 ucc_tl_cuda_reduce_scatterv_linear_finalize(ucc_coll_task_t *coll_task)
 {
@@ -448,7 +432,7 @@ ucc_tl_cuda_reduce_scatterv_linear_init(ucc_base_coll_args_t *coll_args,
         return UCC_ERR_NOT_SUPPORTED;
     }
 
-    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) ||
+    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) ||
                      UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
         return UCC_ERR_NOT_SUPPORTED;
     }
diff --git a/src/components/tl/cuda/tl_cuda.c b/src/components/tl/cuda/tl_cuda.c
index 98dccf26bf..18135fae00 100644
--- a/src/components/tl/cuda/tl_cuda.c
+++ b/src/components/tl/cuda/tl_cuda.c
@@ -9,6 +9,7 @@
 #include "components/mc/base/ucc_mc_base.h"
 #include "allgather/allgather.h"
 #include "allgatherv/allgatherv.h"
+#include "bcast/bcast.h"
 #include "reduce_scatter/reduce_scatter.h"
 #include "reduce_scatterv/reduce_scatterv.h"
 
@@ -93,6 +94,8 @@ __attribute__((constructor)) static void tl_cuda_iface_init(void)
         ucc_tl_cuda_allgather_algs;
     ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_ALLGATHERV)] =
         ucc_tl_cuda_allgatherv_algs;
+    ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_BCAST)] =
+        ucc_tl_cuda_bcast_algs;
     ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_REDUCE_SCATTER)] =
         ucc_tl_cuda_reduce_scatter_algs;
     ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_REDUCE_SCATTERV)] =
diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h
index 792100c80c..d86dc7cc66 100644
--- a/src/components/tl/cuda/tl_cuda.h
+++ b/src/components/tl/cuda/tl_cuda.h
@@ -27,6 +27,7 @@
 #define UCC_TL_CUDA_SUPPORTED_COLLS \
     (UCC_COLL_TYPE_ALLTOALL | UCC_COLL_TYPE_ALLTOALLV | \
      UCC_COLL_TYPE_ALLGATHER | UCC_COLL_TYPE_ALLGATHERV | \
+     UCC_COLL_TYPE_BCAST | \
      UCC_COLL_TYPE_REDUCE_SCATTER | UCC_COLL_TYPE_REDUCE_SCATTERV)
 
 #define UCC_TL_CUDA_TEAM_LIB(_team) \
@@ -224,6 +225,16 @@ struct ucc_tl_cuda_task {
             size_t (*get_offset)(const ucc_tl_cuda_task_t *task,
                                  ucc_rank_t block);
         } allgatherv_linear;
+        struct {
+            int stage;
+            int step;
+            void *sbuf;
+            ucc_datatype_t dt;
+            ucc_rank_t root;
+            size_t size;
+            int num_steps;
+            ucc_ee_executor_task_t *exec_task;
+        } bcast_linear;
         struct {
             int stage;
             int num_frags;
diff --git a/src/components/tl/cuda/tl_cuda_coll.c b/src/components/tl/cuda/tl_cuda_coll.c
index 5d01cc1a94..42b33cdbcc 100644
--- a/src/components/tl/cuda/tl_cuda_coll.c
+++ b/src/components/tl/cuda/tl_cuda_coll.c
@@ -9,6 +9,7 @@
 #include "alltoallv/alltoallv.h"
 #include "allgather/allgather.h"
 #include "allgatherv/allgatherv.h"
+#include "bcast/bcast.h"
 #include "reduce_scatter/reduce_scatter.h"
 #include "reduce_scatterv/reduce_scatterv.h"
 #include "utils/arch/cpu.h"
@@ -35,6 +36,7 @@ const char
     *ucc_tl_cuda_default_alg_select_str[UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR] = {
         UCC_TL_CUDA_ALLGATHER_DEFAULT_ALG_SELECT_STR,
         UCC_TL_CUDA_ALLGATHERV_DEFAULT_ALG_SELECT_STR,
+        UCC_TL_CUDA_BCAST_DEFAULT_ALG_SELECT_STR,
         UCC_TL_CUDA_REDUCE_SCATTER_DEFAULT_ALG_SELECT_STR,
         UCC_TL_CUDA_REDUCE_SCATTERV_DEFAULT_ALG_SELECT_STR};
 
@@ -78,6 +80,8 @@ ucc_status_t ucc_tl_cuda_coll_init(ucc_base_coll_args_t *coll_args,
         return ucc_tl_cuda_allgather_init(coll_args, team, task_h);
     case UCC_COLL_TYPE_ALLGATHERV:
         return ucc_tl_cuda_allgatherv_init(coll_args, team, task_h);
+    case UCC_COLL_TYPE_BCAST:
+        return ucc_tl_cuda_bcast_init(coll_args, team, task_h);
     case UCC_COLL_TYPE_REDUCE_SCATTER:
         return ucc_tl_cuda_reduce_scatter_init(coll_args, team, task_h);
     case UCC_COLL_TYPE_REDUCE_SCATTERV:
@@ -134,6 +138,8 @@ static inline int alg_id_from_str(ucc_coll_type_t coll_type, const char *str)
         return ucc_tl_cuda_allgather_alg_from_str(str);
     case UCC_COLL_TYPE_ALLGATHERV:
         return ucc_tl_cuda_allgatherv_alg_from_str(str);
+    case UCC_COLL_TYPE_BCAST:
+        return ucc_tl_cuda_bcast_alg_from_str(str);
     default:
         break;
     }
@@ -187,6 +193,16 @@ ucc_status_t ucc_tl_cuda_alg_id_to_init(int alg_id, const char *alg_id_str,
             break;
         };
         break;
+    case UCC_COLL_TYPE_BCAST:
+        switch (alg_id) {
+        case UCC_TL_CUDA_BCAST_ALG_LINEAR:
+            *init = ucc_tl_cuda_bcast_linear_init;
+            break;
+        default:
+            status = UCC_ERR_INVALID_PARAM;
+            break;
+        };
+        break;
     case UCC_COLL_TYPE_REDUCE_SCATTER:
         switch (alg_id) {
         case UCC_TL_CUDA_REDUCE_SCATTER_ALG_AUTO:
diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h
index 8b15cdf249..55b86e2cee 100644
--- a/src/components/tl/cuda/tl_cuda_coll.h
+++ b/src/components/tl/cuda/tl_cuda_coll.h
@@ -10,7 +10,7 @@
 #include "tl_cuda.h"
 #include "components/mc/ucc_mc.h"
 
-#define UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR 4
+#define UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR 5
 
 extern const char
     *ucc_tl_cuda_default_alg_select_str[UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR];
@@ -156,4 +156,21 @@ ucc_status_t ucc_tl_cuda_alg_id_to_init(int alg_id, const char *alg_id_str,
                                         ucc_memory_type_t mem_type,
                                         ucc_base_coll_init_fn_t *init);
 
+// common utility functions shared by the collectives:
+static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
+                                int step_id)
+{
+    ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);
+
+    return sync->seq_num[step_id];
+}
+
+static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
+                                 int step, int step_id)
+{
+    ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);
+
+    sync->seq_num[step_id] = step;
+}
+
 #endif
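Review aside: `get_rank_step`/`set_rank_step` are now shared from tl_cuda_coll.h rather than duplicated in each linear collective (and removed from the ring header below). They implement a monotonic per-rank progress counter in the shared `seq_num` sync segment: a rank publishes the last step it completed, and peers poll that value before reusing a scratch half. The standalone sketch below (not part of the patch; single process, plain ints, a hypothetical 3-rank team — the real counters live in mapped shared memory behind CPU fences) shows the double-buffering readiness check the root performs in `STAGE_WAIT_ALL`:

    /* Sketch of the per-rank step-counter handshake. */
    #include <stdio.h>

    #define TSIZE 3

    static int seq_num[TSIZE]; /* step published by each rank */

    static int get_rank_step(int rank) { return seq_num[rank]; }
    static void set_rank_step(int rank, int step) { seq_num[rank] = step; }

    /* with double buffering, a scratch half can be reused only once
     * every rank has consumed the chunk from two steps ago */
    static int root_may_post(int step)
    {
        int i;

        for (i = 0; i < TSIZE; i++) {
            if (get_rank_step(i) < step - 1) {
                return 0;
            }
        }
        return 1;
    }

    int main(void)
    {
        set_rank_step(0, 2); /* root finished staging step 2 */
        set_rank_step(1, 2); /* client 1 consumed through 2  */
        set_rank_step(2, 1); /* client 2 still draining 1    */
        printf("post step 2? %d\n", root_may_post(2)); /* 1 */
        printf("post step 3? %d\n", root_may_post(3)); /* 0 */
        return 0;
    }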
diff --git a/src/components/tl/cuda/tl_cuda_ring.h b/src/components/tl/cuda/tl_cuda_ring.h
index cc2d3c95db..621e074184 100644
--- a/src/components/tl/cuda/tl_cuda_ring.h
+++ b/src/components/tl/cuda/tl_cuda_ring.h
@@ -83,20 +83,4 @@ static inline ucc_rank_t get_recv_block(ucc_tl_cuda_team_t *team,
     return ring->ring[(ring->iring[trank] + tsize - step - 1) % tsize];
 }
 
-static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
-                                int ring_id)
-{
-    ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);
-
-    return sync->seq_num[ring_id];
-}
-
-static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
-                                 int step, int ring_id)
-{
-    ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);
-
-    sync->seq_num[ring_id] = step;
-}
-
 #endif
diff --git a/src/components/tl/cuda/tl_cuda_team_topo.h b/src/components/tl/cuda/tl_cuda_team_topo.h
index 96b6d63a5b..1d7d19ad1c 100644
--- a/src/components/tl/cuda/tl_cuda_team_topo.h
+++ b/src/components/tl/cuda/tl_cuda_team_topo.h
@@ -51,7 +51,7 @@ ucc_tl_cuda_team_topo_is_direct(const ucc_tl_team_t *team,
 }
 
 static inline int
-ucc_tl_cuda_team_topo_is_fully_conntected(const ucc_tl_cuda_team_topo_t *topo)
+ucc_tl_cuda_team_topo_is_fully_connected(const ucc_tl_cuda_team_topo_t *topo)
 {
     return topo->is_fully_connected;
 }
diff --git a/test/gtest/coll/test_bcast.cc b/test/gtest/coll/test_bcast.cc
index 6d80816a31..69f697a508 100644
--- a/test/gtest/coll/test_bcast.cc
+++ b/test/gtest/coll/test_bcast.cc
@@ -276,6 +276,8 @@ ucc_job_env_t two_step_env = {{"UCC_CL_HIER_TUNE", "bcast:@2step:0-inf:inf"},
                               {"UCC_CLS", "all"}};
 ucc_job_env_t dbt_env = {{"UCC_TL_UCP_TUNE", "bcast:@dbt:0-inf:inf"},
                          {"UCC_CLS", "basic"}};
+ucc_job_env_t cuda_env = {{"UCC_TL_CUDA_TUNE", "bcast:cuda:@0"},
+                          {"UCC_CLS", "basic"}};
 INSTANTIATE_TEST_CASE_P(
     , test_bcast_alg,
     ::testing::Combine(
@@ -285,6 +287,10 @@ INSTANTIATE_TEST_CASE_P(
 #else
         ::testing::Values(UCC_MEMORY_TYPE_HOST),
 #endif
+#ifdef HAVE_CUDA
+        ::testing::Values(two_step_env, dbt_env, cuda_env), //env
+#else
         ::testing::Values(two_step_env, dbt_env), //env
+#endif
         ::testing::Values(8, 65536),  // count
-        ::testing::Values(15,16)));   // n_procs
+        ::testing::Values(15, 16)));  // n_procs