diff --git a/src/components/tl/cuda/Makefile.am b/src/components/tl/cuda/Makefile.am
index e22796e6fa..2136821b93 100644
--- a/src/components/tl/cuda/Makefile.am
+++ b/src/components/tl/cuda/Makefile.am
@@ -27,6 +27,11 @@ alltoallv = \
        alltoallv/alltoallv.c \
        alltoallv/alltoallv_ce.c
 
+bcast = \
+       bcast/bcast.h \
+       bcast/bcast.c \
+       bcast/bcast_linear.c
+
 reduce_scatter = \
        reduce_scatter/reduce_scatter.h \
        reduce_scatter/reduce_scatter.c \
@@ -54,6 +59,7 @@ sources = \
        $(allgatherv) \
        $(alltoall) \
        $(alltoallv) \
+       $(bcast) \
        $(reduce_scatter) \
        $(reduce_scatterv)
diff --git a/src/components/tl/cuda/allgather/allgather.c b/src/components/tl/cuda/allgather/allgather.c
index 01996da4da..1e64c0a582 100644
--- a/src/components/tl/cuda/allgather/allgather.c
+++ b/src/components/tl/cuda/allgather/allgather.c
@@ -44,7 +44,7 @@ ucc_status_t ucc_tl_cuda_allgather_init(ucc_base_coll_args_t *coll_args,
 {
     ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
 
-    if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
+    if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) {
         return ucc_tl_cuda_allgather_linear_init(coll_args, tl_team, task_p);
     } else {
         return ucc_tl_cuda_allgather_ring_init(coll_args, tl_team, task_p);
diff --git a/src/components/tl/cuda/allgather/allgather_linear.c b/src/components/tl/cuda/allgather/allgather_linear.c
index ed228d1683..fefc774628 100644
--- a/src/components/tl/cuda/allgather/allgather_linear.c
+++ b/src/components/tl/cuda/allgather/allgather_linear.c
@@ -15,7 +15,7 @@ ucc_status_t ucc_tl_cuda_allgather_linear_init(ucc_base_coll_args_t *coll_args,
     ucc_tl_cuda_task_t *task;
     ucc_status_t status;
 
-    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) ||
+    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) ||
                      UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
         return UCC_ERR_NOT_SUPPORTED;
     }
diff --git a/src/components/tl/cuda/allgatherv/allgatherv.c b/src/components/tl/cuda/allgatherv/allgatherv.c
index 5a8f78c481..76da65fa65 100644
--- a/src/components/tl/cuda/allgatherv/allgatherv.c
+++ b/src/components/tl/cuda/allgatherv/allgatherv.c
@@ -47,7 +47,7 @@ ucc_status_t ucc_tl_cuda_allgatherv_init(ucc_base_coll_args_t *coll_args,
 {
     ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
 
-    if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
+    if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) {
         return ucc_tl_cuda_allgatherv_linear_init(coll_args, tl_team, task_p);
     } else {
         return ucc_tl_cuda_allgatherv_ring_init(coll_args, tl_team, task_p);
diff --git a/src/components/tl/cuda/allgatherv/allgatherv_linear.c b/src/components/tl/cuda/allgatherv/allgatherv_linear.c
index 0fca5c6af6..1f02ad37bd 100644
--- a/src/components/tl/cuda/allgatherv/allgatherv_linear.c
+++ b/src/components/tl/cuda/allgatherv/allgatherv_linear.c
@@ -55,22 +55,6 @@ enum
  * other ranks to finish */
 };
 
-static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
-                                int step_id)
-{
-    ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);
-
-    return sync->seq_num[step_id];
-}
-
-static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
-                                 int step, int step_id)
-{
-    ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);
-
-    sync->seq_num[step_id] = step;
-}
-
 ucc_status_t ucc_tl_cuda_allgatherv_linear_finalize(ucc_coll_task_t *coll_task)
 {
     ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
@@ -432,7 +416,7 @@ ucc_status_t
 ucc_tl_cuda_allgatherv_linear_init(ucc_base_coll_args_t *coll_args,
     ucc_tl_cuda_task_t *task;
     ucc_status_t status;
 
-    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) ||
+    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) ||
                      UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
         return UCC_ERR_NOT_SUPPORTED;
     }
diff --git a/src/components/tl/cuda/bcast/bcast.c b/src/components/tl/cuda/bcast/bcast.c
new file mode 100644
index 0000000000..d687d924a0
--- /dev/null
+++ b/src/components/tl/cuda/bcast/bcast.c
@@ -0,0 +1,28 @@
+/**
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "bcast.h"
+#include "components/mc/ucc_mc.h"
+
+ucc_base_coll_alg_info_t
+    ucc_tl_cuda_bcast_algs[UCC_TL_CUDA_BCAST_ALG_LAST + 1] = {
+        [UCC_TL_CUDA_BCAST_ALG_LINEAR] = {.id = UCC_TL_CUDA_BCAST_ALG_LINEAR,
+                                          .name = "linear",
+                                          .desc = "linear bcast algorithm"},
+        [UCC_TL_CUDA_BCAST_ALG_LAST] = {.id = 0, .name = NULL, .desc = NULL}};
+
+ucc_status_t ucc_tl_cuda_bcast_init(ucc_base_coll_args_t *coll_args,
+                                    ucc_base_team_t *tl_team,
+                                    ucc_coll_task_t **task_p)
+{
+    ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
+
+    if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) {
+        return ucc_tl_cuda_bcast_linear_init(coll_args, tl_team, task_p);
+    } else {
+        return UCC_ERR_NOT_SUPPORTED;
+    }
+}
diff --git a/src/components/tl/cuda/bcast/bcast.h b/src/components/tl/cuda/bcast/bcast.h
new file mode 100644
index 0000000000..17d07a529b
--- /dev/null
+++ b/src/components/tl/cuda/bcast/bcast.h
@@ -0,0 +1,43 @@
+/**
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#ifndef BCAST_H_
+#define BCAST_H_
+
+#include "tl_cuda.h"
+#include "tl_cuda_coll.h"
+
+enum
+{
+    UCC_TL_CUDA_BCAST_ALG_LINEAR,
+    UCC_TL_CUDA_BCAST_ALG_LAST
+};
+
+extern ucc_base_coll_alg_info_t
+    ucc_tl_cuda_bcast_algs[UCC_TL_CUDA_BCAST_ALG_LAST + 1];
+
+#define UCC_TL_CUDA_BCAST_DEFAULT_ALG_SELECT_STR "bcast:cuda:@0"
+
+ucc_status_t ucc_tl_cuda_bcast_init(ucc_base_coll_args_t *coll_args,
+                                    ucc_base_team_t *tl_team,
+                                    ucc_coll_task_t **task_p);
+
+ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args,
+                                           ucc_base_team_t *tl_team,
+                                           ucc_coll_task_t **task_p);
+
+static inline int ucc_tl_cuda_bcast_alg_from_str(const char *str)
+{
+    int i;
+    for (i = 0; i < UCC_TL_CUDA_BCAST_ALG_LAST; i++) {
+        if (0 == strcasecmp(str, ucc_tl_cuda_bcast_algs[i].name)) {
+            break;
+        }
+    }
+    return i;
+}
+
+#endif
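Review aside: the `.name` strings registered in `ucc_tl_cuda_bcast_algs` above are what the algorithm-selection parser matches, which is how the gtest change at the end of this patch pins the new path with `UCC_TL_CUDA_TUNE=bcast:cuda:@0`. The standalone sketch below (not part of the patch; the two-entry table and names are hypothetical) illustrates the lookup convention used by `ucc_tl_cuda_bcast_alg_from_str`: the index of the terminating `_LAST` entry doubles as the "not found" result, so callers can range-check the return value.

    /* Standalone sketch of the alg_from_str lookup convention.
     * Table and names here are hypothetical stand-ins. */
    #include <stdio.h>
    #include <strings.h> /* strcasecmp */

    enum { ALG_LINEAR, ALG_LAST };

    static const char *alg_names[ALG_LAST] = {"linear"};

    static int alg_from_str(const char *str)
    {
        int i;

        for (i = 0; i < ALG_LAST; i++) {
            if (0 == strcasecmp(str, alg_names[i])) {
                break;
            }
        }
        return i; /* == ALG_LAST when nothing matched */
    }

    int main(void)
    {
        /* prints "0 1": a case-insensitive hit, then the sentinel */
        printf("%d %d\n", alg_from_str("LINEAR"), alg_from_str("ring"));
        return 0;
    }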
diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c
new file mode 100644
index 0000000000..992dce9470
--- /dev/null
+++ b/src/components/tl/cuda/bcast/bcast_linear.c
@@ -0,0 +1,297 @@
+/**
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "bcast/bcast.h"
+
+enum {
+    STAGE_SYNC,
+    STAGE_SETUP,
+    // root
+    STAGE_COPY,      // post the copy task: copy a block from src to the scratch buffer
+    STAGE_WAIT_COPY, // wait for the copy to finish
+    STAGE_WAIT_ALL,  // wait for all other ranks to reach the same step
+    // non-root
+    STAGE_WAIT_ROOT,        // clients wait while the root writes to its own scratch buffer
+    STAGE_CLIENT_COPY,      // clients submit the copy task
+    STAGE_CLIENT_COPY_WAIT, // clients wait for the copy from the root's scratch to complete
+};
+
+ucc_status_t ucc_tl_cuda_bcast_linear_setup_start(ucc_tl_cuda_task_t *task)
+{
+    ucc_tl_cuda_team_t *team = TASK_TEAM(task);
+    ucc_rank_t trank = UCC_TL_TEAM_RANK(team);
+    ucc_status_t status;
+
+    set_rank_step(task, trank, 0, 0);
+    ucc_memory_cpu_store_fence();
+    status = ucc_tl_cuda_shm_barrier_start(UCC_TL_TEAM_RANK(team), task->bar);
+    if (ucc_unlikely(status != UCC_OK)) {
+        goto exit_err;
+    }
+
+    return UCC_OK;
+
+exit_err:
+    return status;
+}
+
+ucc_status_t ucc_tl_cuda_bcast_linear_setup_test(ucc_tl_cuda_task_t *task)
+{
+    ucc_tl_cuda_team_t *team = TASK_TEAM(task);
+
+    return ucc_tl_cuda_shm_barrier_test(UCC_TL_TEAM_RANK(team), task->bar);
+}
+
+static inline size_t get_raw_scratch_size(ucc_tl_cuda_team_t *team)
+{
+    return UCC_TL_CUDA_TEAM_LIB(team)->cfg.scratch_size;
+}
+
+static inline ucc_status_t ecopy(void *dst, void *src, size_t size,
+                                 ucc_ee_executor_t *exec,
+                                 ucc_ee_executor_task_t **etask)
+{
+    ucc_ee_executor_task_args_t exec_args = {0};
+
+    exec_args.task_type = UCC_EE_EXECUTOR_TASK_COPY;
+    exec_args.copy.dst  = dst;
+    exec_args.copy.src  = src;
+    exec_args.copy.len  = size;
+    return ucc_ee_executor_task_post(exec, &exec_args, etask);
+}
+
+ucc_status_t ucc_tl_cuda_bcast_linear_finalize(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
+
+    tl_trace(UCC_TASK_LIB(task), "finalizing task %p", task);
+    ucc_tl_cuda_task_put(task);
+    return UCC_OK;
+}
+
+void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
+    ucc_tl_cuda_team_t *team = TASK_TEAM(task);
+    ucc_rank_t trank = UCC_TL_TEAM_RANK(team);
+    ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team);
+    size_t half_scratch_size = get_raw_scratch_size(team) / 2;
+    // every step except the last moves a full scratch half; the last step
+    // moves whatever remains of the message
+    size_t chunk_size =
+        task->bcast_linear.step < task->bcast_linear.num_steps - 1
+            ? ucc_min(half_scratch_size, task->bcast_linear.size)
+            : task->bcast_linear.size -
+                  (task->bcast_linear.num_steps - 1) * half_scratch_size;
+    size_t offset_buff = task->bcast_linear.step * half_scratch_size;
+    ucc_ee_executor_t *exec;
+    ucc_ee_executor_task_t *etask;
+    ucc_status_t st;
+    void *sbuf, *dbuf;
+    int i;
+
+    task->super.status = UCC_INPROGRESS;
+
+    st = ucc_coll_task_get_executor(&task->super, &exec);
+    if (ucc_unlikely(st != UCC_OK)) {
+        task->super.status = st;
+        return;
+    }
+
+    switch (task->bcast_linear.stage) {
+    case STAGE_SYNC:
+        if (ucc_tl_cuda_get_sync(task) != UCC_OK) {
+            return;
+        }
+        task->bcast_linear.step = 0;
+        st = ucc_tl_cuda_bcast_linear_setup_start(task);
+        if (st != UCC_OK) {
+            task->super.status = st;
+            return;
+        }
+        task->bcast_linear.stage = STAGE_SETUP;
+    case STAGE_SETUP:
+        st = ucc_tl_cuda_bcast_linear_setup_test(task);
+        if (st != UCC_OK) {
+            task->super.status = st;
+            return;
+        }
+        ucc_tl_cuda_put_sync(task);
+        if (trank == task->bcast_linear.root) {
+            task->bcast_linear.stage = STAGE_COPY;
+        } else {
+            task->bcast_linear.stage = STAGE_WAIT_ROOT;
+        }
+    default:
+        break;
+    }
+
+    if (trank == task->bcast_linear.root) {
+        // root scenario
+        // fall-through between cases is intentional
+        switch (task->bcast_linear.stage) {
+        case STAGE_COPY:
+            // copy the current chunk from the user source buffer to the
+            // active scratch half
+            dbuf = PTR_OFFSET(TASK_SCRATCH(task, trank),
+                              task->bcast_linear.step % 2 * half_scratch_size);
+            sbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff);
+            st = ecopy(dbuf, sbuf, chunk_size, exec,
+                       &task->bcast_linear.exec_task);
+            if (st != UCC_OK) {
+                ucc_error("failed to post ecopy task");
+                task->super.status = st;
+                return;
+            }
+            task->bcast_linear.stage = STAGE_WAIT_COPY;
+        case STAGE_WAIT_COPY:
+            etask = task->bcast_linear.exec_task;
+            if (etask) {
+                st = ucc_ee_executor_task_test(etask);
+                if (st == UCC_OK) {
+                    ucc_ee_executor_task_finalize(etask);
+                    task->bcast_linear.exec_task = NULL;
+                    // signal the other ranks
+                    ++task->bcast_linear.step;
+                    set_rank_step(task, task->bcast_linear.root,
+                                  task->bcast_linear.step, 0);
+                    task->bcast_linear.stage = STAGE_WAIT_ALL;
+                } else {
+                    // not ready yet
+                    return;
+                }
+            } else {
+                ucc_debug("etask is NULL");
+                return;
+            }
+        case STAGE_WAIT_ALL:
+            for (i = 0; i < tsize; ++i) {
+                // because of double buffering, a scratch half may only be
+                // overwritten once all ranks have completed step - 1
+                if (get_rank_step(task, i, 0) < task->bcast_linear.step - 1) {
+                    // rank is not ready, let's wait
+                    return;
+                }
+            }
+            if (task->bcast_linear.step < task->bcast_linear.num_steps) {
+                // go to the next iteration
+                task->bcast_linear.stage = STAGE_COPY;
+                return;
+            } else {
+                // finish
+                task->super.status = UCC_OK;
+                break;
+            }
+        default:
+            break;
+        }
+    } else {
+        // client scenario
+        // fall-through between cases is intentional
+        switch (task->bcast_linear.stage) {
+        case STAGE_WAIT_ROOT:
+            if (get_rank_step(task, task->bcast_linear.root, 0) >
+                task->bcast_linear.step) {
+                task->bcast_linear.stage = STAGE_CLIENT_COPY;
+                break;
+            } else {
+                return;
+            }
+        case STAGE_CLIENT_COPY:
+            // copy the current chunk from the root's scratch buffer into the
+            // local destination buffer; read at a half_scratch_size stride so
+            // the offset matches the half the root wrote for this step (the
+            // last chunk may be shorter than a full half)
+            dbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff);
+            sbuf = PTR_OFFSET(TASK_SCRATCH(task, task->bcast_linear.root),
+                              task->bcast_linear.step % 2 * half_scratch_size);
+            st = ecopy(dbuf, sbuf, chunk_size, exec,
+                       &task->bcast_linear.exec_task);
+            if (st != UCC_OK) {
+                ucc_error("failed to post ecopy task at client");
+                task->super.status = st;
+                return;
+            }
+            task->bcast_linear.stage = STAGE_CLIENT_COPY_WAIT;
+        case STAGE_CLIENT_COPY_WAIT:
+            etask = task->bcast_linear.exec_task;
+            if (etask) {
+                st = ucc_ee_executor_task_test(etask);
+                if (st == UCC_OK) {
+                    ucc_ee_executor_task_finalize(etask);
+                    task->bcast_linear.exec_task = NULL;
+                    ++task->bcast_linear.step;
+                    set_rank_step(task, trank, task->bcast_linear.step, 0);
+                    if (task->bcast_linear.step <
+                        task->bcast_linear.num_steps) {
+                        task->bcast_linear.stage = STAGE_WAIT_ROOT;
+                        return;
+                    } else {
+                        // done
+                        task->super.status = UCC_OK;
+                        break;
+                    }
+                } else {
+                    return;
+                }
+            } else {
+                return;
+            }
+        default:
+            break;
+        }
+    }
+}
+
+ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
+    ucc_tl_cuda_team_t *team = TASK_TEAM(task);
+    ucc_coll_args_t *args = &TASK_ARGS(task);
+    ucc_datatype_t dt = task->bcast_linear.dt;
+    size_t half_scratch_size = get_raw_scratch_size(team) / 2;
+
+    task->bcast_linear.stage = STAGE_SYNC;
+
+    task->bcast_linear.size = ucc_dt_size(dt) * args->src.info.count;
+    task->bcast_linear.num_steps =
+        ucc_div_round_up(task->bcast_linear.size, half_scratch_size);
+
+    ucc_debug("bcast linear dt: %s, buffer size: %zu, num_steps: %d",
+              ucc_datatype_str(dt), task->bcast_linear.size,
+              task->bcast_linear.num_steps);
+
+    task->bcast_linear.sbuf = args->src.info.buffer;
+    task->bcast_linear.step = 0;
+
+    return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super);
+}
+
+ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args,
+                                           ucc_base_team_t *tl_team,
+                                           ucc_coll_task_t **task_p)
+{
+    ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
+    ucc_tl_cuda_task_t *task;
+    ucc_status_t status;
+
+    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) ||
+                     UCC_TL_TEAM_SIZE(team) - 1 >
+                         UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
+        return UCC_ERR_NOT_SUPPORTED;
+    }
+
+    status = ucc_tl_cuda_task_init(coll_args, team, &task);
+    if (ucc_unlikely(status != UCC_OK)) {
+        return status;
+    }
+
+    task->bcast_linear.root = coll_args->args.root;
+    task->bcast_linear.dt   = coll_args->args.src.info.datatype;
+    task->bcast_linear.sbuf = coll_args->args.src.info.buffer;
+
+    task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR;
+    task->super.post     = ucc_tl_cuda_bcast_linear_start;
+    task->super.progress = ucc_tl_cuda_bcast_linear_progress;
+    task->super.finalize = ucc_tl_cuda_bcast_linear_finalize;
+    task->bar            = TASK_BAR(task);
+
+    *task_p = &task->super;
+    return UCC_OK;
+}
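Review aside: the progress function above moves the message in `num_steps` chunks of at most half the scratch buffer, alternating between the two scratch halves so the root can stage chunk `step` while clients are still draining chunk `step - 1`. A standalone sketch (not part of the patch; the 4 MB scratch and 5 MB message are assumed sizes, not UCC defaults) walks through the arithmetic:

    /* Sketch of the double-buffered chunking arithmetic. */
    #include <stdio.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
        size_t scratch   = (size_t)4 << 20;  /* assumed scratch_size */
        size_t half      = scratch / 2;      /* one scratch half     */
        size_t size      = (size_t)5 << 20;  /* assumed message size */
        int    num_steps = (int)((size + half - 1) / half); /* ceil  */
        int    step;

        for (step = 0; step < num_steps; step++) {
            /* every step but the last moves a full half; the last
             * step moves the remainder, as in the ternary above */
            size_t chunk = step < num_steps - 1
                               ? MIN(half, size)
                               : size - (size_t)(num_steps - 1) * half;
            printf("step %d: offset %zu, chunk %zu, scratch half %d\n",
                   step, step * half, chunk, step % 2);
        }
        return 0; /* 3 steps: 2 MB, 2 MB, 1 MB, alternating halves */
    }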
diff --git a/src/components/tl/cuda/reduce_scatter/reduce_scatter.c b/src/components/tl/cuda/reduce_scatter/reduce_scatter.c
index 468fd68338..1e1d75c3ed 100644
--- a/src/components/tl/cuda/reduce_scatter/reduce_scatter.c
+++ b/src/components/tl/cuda/reduce_scatter/reduce_scatter.c
@@ -48,7 +48,7 @@ ucc_status_t ucc_tl_cuda_reduce_scatter_init(ucc_base_coll_args_t *coll_args,
 {
     ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
 
-    if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
+    if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) {
         return ucc_tl_cuda_reduce_scatter_linear_init(coll_args, tl_team,
                                                       task_p);
     } else {
diff --git a/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c b/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c
index 46efbdb051..9a025267ca 100644
--- a/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c
+++ b/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c
@@ -19,7 +19,7 @@ ucc_status_t ucc_tl_cuda_reduce_scatter_linear_init(ucc_base_coll_a
         return UCC_ERR_NOT_SUPPORTED;
     }
 
-    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) ||
+    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) ||
                      UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
         return UCC_ERR_NOT_SUPPORTED;
     }
diff --git a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c
index d85e2c8dd3..d954e38e9e 100644
--- a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c
+++ b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c
@@ -51,7 +51,7 @@ ucc_status_t ucc_tl_cuda_reduce_scatterv_init(ucc_base_coll_args_t *coll_args,
 {
     ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
 
-    if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
+    if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) {
         return ucc_tl_cuda_reduce_scatterv_linear_init(coll_args, tl_team,
                                                        task_p);
     } else {
diff --git a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c
index 6a1ec5b22c..d719632853 100644
--- a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c
+++ b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c
@@ -59,22 +59,6 @@ enum
  * other ranks to finish */
 };
 
-static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
-                                int step_id)
-{
-    ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);
-
-    return sync->seq_num[step_id];
-}
-
-static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
-                                 int step, int step_id)
-{
-    ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);
-
-    sync->seq_num[step_id] = step;
-}
-
 ucc_status_t
 ucc_tl_cuda_reduce_scatterv_linear_finalize(ucc_coll_task_t *coll_task)
 {
@@ -448,7 +432,7 @@ ucc_tl_cuda_reduce_scatterv_linear_init(ucc_base_coll_args_t *coll_args,
         return UCC_ERR_NOT_SUPPORTED;
     }
 
-    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) ||
+    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) ||
                      UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
         return UCC_ERR_NOT_SUPPORTED;
     }
diff --git a/src/components/tl/cuda/tl_cuda.c b/src/components/tl/cuda/tl_cuda.c
index 98dccf26bf..18135fae00 100644
--- a/src/components/tl/cuda/tl_cuda.c
+++ b/src/components/tl/cuda/tl_cuda.c
@@ -9,6 +9,7 @@
 #include "components/mc/base/ucc_mc_base.h"
 #include "allgather/allgather.h"
 #include "allgatherv/allgatherv.h"
+#include "bcast/bcast.h"
 #include "reduce_scatter/reduce_scatter.h"
 #include "reduce_scatterv/reduce_scatterv.h"
 
@@ -93,6 +94,8 @@ __attribute__((constructor)) static void tl_cuda_iface_init(void)
         ucc_tl_cuda_allgather_algs;
     ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_ALLGATHERV)] =
         ucc_tl_cuda_allgatherv_algs;
+    ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_BCAST)] =
+        ucc_tl_cuda_bcast_algs;
     ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_REDUCE_SCATTER)] =
         ucc_tl_cuda_reduce_scatter_algs;
     ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_REDUCE_SCATTERV)] =
diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h
index 792100c80c..d86dc7cc66 100644
--- a/src/components/tl/cuda/tl_cuda.h
+++ b/src/components/tl/cuda/tl_cuda.h
@@ -27,6 +27,7 @@
 #define UCC_TL_CUDA_SUPPORTED_COLLS \
     (UCC_COLL_TYPE_ALLTOALL | UCC_COLL_TYPE_ALLTOALLV | \
      UCC_COLL_TYPE_ALLGATHER | UCC_COLL_TYPE_ALLGATHERV | \
+     UCC_COLL_TYPE_BCAST | \
      UCC_COLL_TYPE_REDUCE_SCATTER | UCC_COLL_TYPE_REDUCE_SCATTERV)
 
 #define UCC_TL_CUDA_TEAM_LIB(_team) \
@@ -224,6 +225,16 @@ struct ucc_tl_cuda_task {
             size_t (*get_offset)(const ucc_tl_cuda_task_t *task,
                                  ucc_rank_t block);
         } allgatherv_linear;
+        struct {
+            int stage;
+            int step;
+            void *sbuf;
+            ucc_datatype_t dt;
+            ucc_rank_t root;
+            size_t size;
+            int num_steps;
+            ucc_ee_executor_task_t *exec_task;
+        } bcast_linear;
         struct {
             int stage;
             int num_frags;
diff --git a/src/components/tl/cuda/tl_cuda_coll.c b/src/components/tl/cuda/tl_cuda_coll.c
index 5d01cc1a94..42b33cdbcc 100644
--- a/src/components/tl/cuda/tl_cuda_coll.c
+++ b/src/components/tl/cuda/tl_cuda_coll.c
@@ -9,6 +9,7 @@
 #include "alltoallv/alltoallv.h"
 #include "allgather/allgather.h"
 #include "allgatherv/allgatherv.h"
+#include "bcast/bcast.h"
 #include "reduce_scatter/reduce_scatter.h"
 #include "reduce_scatterv/reduce_scatterv.h"
 #include "utils/arch/cpu.h"
@@ -35,6 +36,7 @@ const char
     *ucc_tl_cuda_default_alg_select_str[UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR] = {
         UCC_TL_CUDA_ALLGATHER_DEFAULT_ALG_SELECT_STR,
         UCC_TL_CUDA_ALLGATHERV_DEFAULT_ALG_SELECT_STR,
+        UCC_TL_CUDA_BCAST_DEFAULT_ALG_SELECT_STR,
         UCC_TL_CUDA_REDUCE_SCATTER_DEFAULT_ALG_SELECT_STR,
         UCC_TL_CUDA_REDUCE_SCATTERV_DEFAULT_ALG_SELECT_STR};
 
@@ -78,6 +80,8 @@ ucc_status_t ucc_tl_cuda_coll_init(ucc_base_coll_args_t *coll_args,
         return ucc_tl_cuda_allgather_init(coll_args, team, task_h);
     case UCC_COLL_TYPE_ALLGATHERV:
         return ucc_tl_cuda_allgatherv_init(coll_args, team, task_h);
+    case UCC_COLL_TYPE_BCAST:
+        return ucc_tl_cuda_bcast_init(coll_args, team, task_h);
     case UCC_COLL_TYPE_REDUCE_SCATTER:
         return ucc_tl_cuda_reduce_scatter_init(coll_args, team, task_h);
     case UCC_COLL_TYPE_REDUCE_SCATTERV:
@@ -134,6 +138,8 @@ static inline int alg_id_from_str(ucc_coll_type_t coll_type, const char *str)
         return ucc_tl_cuda_allgather_alg_from_str(str);
     case UCC_COLL_TYPE_ALLGATHERV:
         return ucc_tl_cuda_allgatherv_alg_from_str(str);
+    case UCC_COLL_TYPE_BCAST:
+        return ucc_tl_cuda_bcast_alg_from_str(str);
     default:
         break;
     }
@@ -187,6 +193,16 @@ ucc_status_t ucc_tl_cuda_alg_id_to_init(int alg_id, const char *alg_id_str,
             break;
         };
         break;
+    case UCC_COLL_TYPE_BCAST:
+        switch (alg_id) {
+        case UCC_TL_CUDA_BCAST_ALG_LINEAR:
+            *init = ucc_tl_cuda_bcast_linear_init;
+            break;
+        default:
+            status = UCC_ERR_INVALID_PARAM;
+            break;
+        };
+        break;
     case UCC_COLL_TYPE_REDUCE_SCATTER:
         switch (alg_id) {
         case UCC_TL_CUDA_REDUCE_SCATTER_ALG_AUTO:
diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h
index 8b15cdf249..55b86e2cee 100644
--- a/src/components/tl/cuda/tl_cuda_coll.h
+++ b/src/components/tl/cuda/tl_cuda_coll.h
@@ -10,7 +10,7 @@
 #include "tl_cuda.h"
 #include "components/mc/ucc_mc.h"
 
-#define UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR 4
+#define UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR 5
 
 extern const char
     *ucc_tl_cuda_default_alg_select_str[UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR];
@@ -156,4 +156,21 @@ ucc_status_t ucc_tl_cuda_alg_id_to_init(int alg_id, const char *alg_id_str,
                                         ucc_memory_type_t mem_type,
                                         ucc_base_coll_init_fn_t *init);
 
+// common utility functions shared by the collectives:
+static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
+                                int step_id)
+{
+    ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);
+
+    return sync->seq_num[step_id];
+}
+
+static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
+                                 int step, int step_id)
+{
+    ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);
+
+    sync->seq_num[step_id] = step;
+}
+
 #endif
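Review aside: `get_rank_step`/`set_rank_step` are now shared from tl_cuda_coll.h rather than duplicated in each linear collective (and removed from the ring header below). They implement a monotonic per-rank progress counter in the shared `seq_num` sync segment: a rank publishes the last step it completed, and peers poll that value before reusing a scratch half. The standalone sketch below (not part of the patch; single process, plain ints, a hypothetical 3-rank team — the real counters live in mapped shared memory behind CPU fences) shows the double-buffering readiness check the root performs in `STAGE_WAIT_ALL`:

    /* Sketch of the per-rank step-counter handshake. */
    #include <stdio.h>

    #define TSIZE 3

    static int seq_num[TSIZE]; /* step published by each rank */

    static int get_rank_step(int rank) { return seq_num[rank]; }
    static void set_rank_step(int rank, int step) { seq_num[rank] = step; }

    /* with double buffering, a scratch half can be reused only once
     * every rank has consumed the chunk from two steps ago */
    static int root_may_post(int step)
    {
        int i;

        for (i = 0; i < TSIZE; i++) {
            if (get_rank_step(i) < step - 1) {
                return 0;
            }
        }
        return 1;
    }

    int main(void)
    {
        set_rank_step(0, 2); /* root finished staging step 2 */
        set_rank_step(1, 2); /* client 1 consumed through 2  */
        set_rank_step(2, 1); /* client 2 still draining 1    */
        printf("post step 2? %d\n", root_may_post(2)); /* 1 */
        printf("post step 3? %d\n", root_may_post(3)); /* 0 */
        return 0;
    }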
diff --git a/src/components/tl/cuda/tl_cuda_ring.h b/src/components/tl/cuda/tl_cuda_ring.h
index cc2d3c95db..621e074184 100644
--- a/src/components/tl/cuda/tl_cuda_ring.h
+++ b/src/components/tl/cuda/tl_cuda_ring.h
@@ -83,20 +83,4 @@ static inline ucc_rank_t get_recv_block(ucc_tl_cuda_team_t *team,
     return ring->ring[(ring->iring[trank] + tsize - step - 1) % tsize];
 }
 
-static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
-                                int ring_id)
-{
-    ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);
-
-    return sync->seq_num[ring_id];
-}
-
-static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank,
-                                 int step, int ring_id)
-{
-    ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank);
-
-    sync->seq_num[ring_id] = step;
-}
-
 #endif
diff --git a/src/components/tl/cuda/tl_cuda_team_topo.h b/src/components/tl/cuda/tl_cuda_team_topo.h
index 96b6d63a5b..1d7d19ad1c 100644
--- a/src/components/tl/cuda/tl_cuda_team_topo.h
+++ b/src/components/tl/cuda/tl_cuda_team_topo.h
@@ -51,7 +51,7 @@ ucc_tl_cuda_team_topo_is_direct(const ucc_tl_team_t *team,
 }
 
 static inline int
-ucc_tl_cuda_team_topo_is_fully_conntected(const ucc_tl_cuda_team_topo_t *topo)
+ucc_tl_cuda_team_topo_is_fully_connected(const ucc_tl_cuda_team_topo_t *topo)
 {
     return topo->is_fully_connected;
 }
diff --git a/test/gtest/coll/test_bcast.cc b/test/gtest/coll/test_bcast.cc
index 6d80816a31..69f697a508 100644
--- a/test/gtest/coll/test_bcast.cc
+++ b/test/gtest/coll/test_bcast.cc
@@ -276,6 +276,8 @@ ucc_job_env_t two_step_env = {{"UCC_CL_HIER_TUNE", "bcast:@2step:0-inf:inf"},
                               {"UCC_CLS", "all"}};
 ucc_job_env_t dbt_env = {{"UCC_TL_UCP_TUNE", "bcast:@dbt:0-inf:inf"},
                          {"UCC_CLS", "basic"}};
+ucc_job_env_t cuda_env = {{"UCC_TL_CUDA_TUNE", "bcast:cuda:@0"},
+                          {"UCC_CLS", "basic"}};
 INSTANTIATE_TEST_CASE_P(
     , test_bcast_alg,
     ::testing::Combine(
@@ -285,6 +287,10 @@ INSTANTIATE_TEST_CASE_P(
 #else
         ::testing::Values(UCC_MEMORY_TYPE_HOST),
 #endif
+#ifdef HAVE_CUDA
+        ::testing::Values(two_step_env, dbt_env, cuda_env), //env
+#else
         ::testing::Values(two_step_env, dbt_env), //env
+#endif
         ::testing::Values(8, 65536),  // count
-        ::testing::Values(15,16)));   // n_procs
+        ::testing::Values(15, 16)));  // n_procs