From 999b43ea4a5bf1a12450189907b2471457e7d974 Mon Sep 17 00:00:00 2001
From: Ilya Kryukov
Date: Fri, 22 Mar 2024 11:59:58 +0100
Subject: [PATCH 01/34] TL/CUDA: add linear bcast

---
 src/components/tl/cuda/Makefile.am          |  5 ++
 src/components/tl/cuda/bcast/bcast.c        | 28 +++++++
 src/components/tl/cuda/bcast/bcast.h        | 43 +++++++++++
 src/components/tl/cuda/bcast/bcast_linear.c | 86 +++++++++++++++++++++
 src/components/tl/cuda/tl_cuda.c            |  3 +
 src/components/tl/cuda/tl_cuda.h            |  5 ++
 src/components/tl/cuda/tl_cuda_coll.c       | 16 ++++
 src/components/tl/cuda/tl_cuda_coll.h       |  2 +-
 8 files changed, 187 insertions(+), 1 deletion(-)
 create mode 100644 src/components/tl/cuda/bcast/bcast.c
 create mode 100644 src/components/tl/cuda/bcast/bcast.h
 create mode 100644 src/components/tl/cuda/bcast/bcast_linear.c

diff --git a/src/components/tl/cuda/Makefile.am b/src/components/tl/cuda/Makefile.am
index e22796e6fa..c5a05f9c3b 100644
--- a/src/components/tl/cuda/Makefile.am
+++ b/src/components/tl/cuda/Makefile.am
@@ -27,6 +27,11 @@ alltoallv =              \
 	alltoallv/alltoallv.c    \
 	alltoallv/alltoallv_ce.c
 
+bcast =                  \
+	bcast/bcast.h        \
+	bcast/bcast.c        \
+	bcast/bcast_linear.c
+
 reduce_scatter =                    \
 	reduce_scatter/reduce_scatter.h \
 	reduce_scatter/reduce_scatter.c \
diff --git a/src/components/tl/cuda/bcast/bcast.c b/src/components/tl/cuda/bcast/bcast.c
new file mode 100644
index 0000000000..46623684fe
--- /dev/null
+++ b/src/components/tl/cuda/bcast/bcast.c
@@ -0,0 +1,28 @@
+/**
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "bcast.h"
+#include "components/mc/ucc_mc.h"
+
+ucc_base_coll_alg_info_t
+    ucc_tl_cuda_bcast_algs[UCC_TL_CUDA_BCAST_ALG_LAST + 1] = {
+        [UCC_TL_CUDA_BCAST_ALG_LINEAR] = {.id   = UCC_TL_CUDA_BCAST_ALG_LINEAR,
+                                          .name = "linear",
+                                          .desc = "linear bcast algorithm"},
+        [UCC_TL_CUDA_BCAST_ALG_LAST]   = {.id = 0, .name = NULL, .desc = NULL}};
+
+ucc_status_t ucc_tl_cuda_bcast_init(ucc_base_coll_args_t *coll_args,
+                                    ucc_base_team_t      *tl_team,
+                                    ucc_coll_task_t     **task_p)
+{
+    ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
+
+    if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) {
+        return ucc_tl_cuda_bcast_linear_init(coll_args, tl_team, task_p);
+    } else {
+        return UCC_ERR_NOT_SUPPORTED;
+    }
+}
diff --git a/src/components/tl/cuda/bcast/bcast.h b/src/components/tl/cuda/bcast/bcast.h
new file mode 100644
index 0000000000..17d07a529b
--- /dev/null
+++ b/src/components/tl/cuda/bcast/bcast.h
@@ -0,0 +1,43 @@
+/**
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#ifndef BCAST_H_
+#define BCAST_H_
+
+#include "tl_cuda.h"
+#include "tl_cuda_coll.h"
+
+enum
+{
+    UCC_TL_CUDA_BCAST_ALG_LINEAR,
+    UCC_TL_CUDA_BCAST_ALG_LAST
+};
+
+extern ucc_base_coll_alg_info_t
+    ucc_tl_cuda_bcast_algs[UCC_TL_CUDA_BCAST_ALG_LAST + 1];
+
+#define UCC_TL_CUDA_BCAST_DEFAULT_ALG_SELECT_STR "bcast:cuda:@0"
+
+ucc_status_t ucc_tl_cuda_bcast_init(ucc_base_coll_args_t *coll_args,
+                                    ucc_base_team_t      *tl_team,
+                                    ucc_coll_task_t     **task_p);
+
+ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args,
+                                           ucc_base_team_t      *tl_team,
+                                           ucc_coll_task_t     **task_p);
+
+static inline int ucc_tl_cuda_bcast_alg_from_str(const char *str)
+{
+    int i;
+    for (i = 0; i < UCC_TL_CUDA_BCAST_ALG_LAST; i++) {
+        if (0 == strcasecmp(str, ucc_tl_cuda_bcast_algs[i].name)) {
+            break;
+        }
+    }
+    return i;
+}
+
+#endif
diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c
new file mode 100644
index 0000000000..a1f5474501
--- /dev/null
+++ b/src/components/tl/cuda/bcast/bcast_linear.c
@@ -0,0 +1,86 @@
+/**
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
+#include "bcast/bcast.h"
+
+enum
+{
+    STAGE_SYNC,    /*< Wait for free SYNC segment */
+    STAGE_SETUP,   /*< Wait for memhandle setup to finish */
+    STAGE_COPIES,  /*< Linear algorithm is running */
+    STAGE_BARRIER, /*< Linear algorithm is done, waiting for
+                    * other ranks to finish */
+};
+
+ucc_status_t ucc_tl_cuda_bcast_linear_finalize(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
+
+    tl_trace(UCC_TASK_LIB(task), "finalizing task %p", task);
+    ucc_tl_cuda_task_put(task);
+    return UCC_OK;
+}
+
+void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
+    ucc_tl_cuda_team_t *team = TASK_TEAM(task);
+    ucc_status_t        st;
+
+    task->super.status = UCC_INPROGRESS;
+}
+
+ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task)
+{
+    ucc_tl_cuda_task_t *task  = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
+    ucc_tl_cuda_team_t *team  = TASK_TEAM(task);
+    ucc_coll_args_t    *args  = &TASK_ARGS(task);
+    ucc_rank_t          tsize = UCC_TL_TEAM_SIZE(team);
+    ucc_datatype_t      dt    = task->allgatherv_linear.dt;
+    ucc_rank_t          i;
+    size_t              send_size, frag_size, ssize;
+
+    task->bcast_linear.stage     = STAGE_SYNC;
+    task->allgatherv_linear.sbuf = args->src.info.buffer;
+
+
+    return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super);
+}
+
+ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args,
+                                           ucc_base_team_t      *tl_team,
+                                           ucc_coll_task_t     **task_p)
+{
+    ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t);
+    ucc_tl_cuda_task_t *task;
+    ucc_status_t        status;
+
+    if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) ||
+                     UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
+        return UCC_ERR_NOT_SUPPORTED;
+    }
+
+    status = ucc_tl_cuda_task_init(coll_args, team, &task);
+    if (ucc_unlikely(status != UCC_OK)) {
+        return status;
+    }
+
+    // task->allgatherv_linear.get_count  = ucc_tl_cuda_allgather_get_count;
+    // task->allgatherv_linear.get_offset = ucc_tl_cuda_allgather_get_offset;
+    // task->allgatherv_linear.dt         = coll_args->args.dst.info.datatype;
+    // task->allgatherv_linear.sbuf       = coll_args->args.src.info.buffer;
+    // task->allgatherv_linear.rbuf       = coll_args->args.dst.info.buffer;
+
+    task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR;
+    task->super.post     = ucc_tl_cuda_allgatherv_linear_start;
+    task->super.progress = ucc_tl_cuda_allgatherv_linear_progress;
+    task->super.finalize = ucc_tl_cuda_allgatherv_linear_finalize;
+    task->bar            = TASK_BAR(task);
+
+    *task_p = &task->super;
+    return UCC_OK;
+}
+
diff --git a/src/components/tl/cuda/tl_cuda.c b/src/components/tl/cuda/tl_cuda.c
index 98dccf26bf..18135fae00 100644
--- a/src/components/tl/cuda/tl_cuda.c
+++ b/src/components/tl/cuda/tl_cuda.c
@@ -9,6 +9,7 @@
 #include "components/mc/base/ucc_mc_base.h"
 #include "allgather/allgather.h"
 #include "allgatherv/allgatherv.h"
+#include "bcast/bcast.h"
 #include "reduce_scatter/reduce_scatter.h"
 #include "reduce_scatterv/reduce_scatterv.h"
 
@@ -93,6 +94,8 @@ __attribute__((constructor)) static void tl_cuda_iface_init(void)
         ucc_tl_cuda_allgather_algs;
     ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_ALLGATHERV)] =
         ucc_tl_cuda_allgatherv_algs;
+    ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_BCAST)] =
+        ucc_tl_cuda_bcast_algs;
     ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_REDUCE_SCATTER)] =
         ucc_tl_cuda_reduce_scatter_algs;
     ucc_tl_cuda.super.alg_info[ucc_ilog2(UCC_COLL_TYPE_REDUCE_SCATTERV)] =
diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h
index 792100c80c..751dc4d8ad 100644
--- a/src/components/tl/cuda/tl_cuda.h
+++ b/src/components/tl/cuda/tl_cuda.h
@@ -27,6 +27,7 @@
 #define UCC_TL_CUDA_SUPPORTED_COLLS                            \
     (UCC_COLL_TYPE_ALLTOALL | UCC_COLL_TYPE_ALLTOALLV |        \
      UCC_COLL_TYPE_ALLGATHER | UCC_COLL_TYPE_ALLGATHERV |      \
+     UCC_COLL_TYPE_BCAST |                                     \
      UCC_COLL_TYPE_REDUCE_SCATTER | UCC_COLL_TYPE_REDUCE_SCATTERV)
 
 #define UCC_TL_CUDA_TEAM_LIB(_team)                            \
@@ -224,6 +225,10 @@ struct ucc_tl_cuda_task {
             size_t (*get_offset)(const ucc_tl_cuda_task_t *task,
                                  ucc_rank_t block);
         } allgatherv_linear;
+
+        struct {
+            int stage;
+        } bcast_linear;
         struct {
             int stage;
             int num_frags;
diff --git a/src/components/tl/cuda/tl_cuda_coll.c b/src/components/tl/cuda/tl_cuda_coll.c
index 5d01cc1a94..42b33cdbcc 100644
--- a/src/components/tl/cuda/tl_cuda_coll.c
+++ b/src/components/tl/cuda/tl_cuda_coll.c
@@ -9,6 +9,7 @@
 #include "alltoallv/alltoallv.h"
 #include "allgather/allgather.h"
 #include "allgatherv/allgatherv.h"
+#include "bcast/bcast.h"
 #include "reduce_scatter/reduce_scatter.h"
 #include "reduce_scatterv/reduce_scatterv.h"
 #include "utils/arch/cpu.h"
@@ -35,6 +36,7 @@ const char *
     ucc_tl_cuda_default_alg_select_str[UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR] = {
         UCC_TL_CUDA_ALLGATHER_DEFAULT_ALG_SELECT_STR,
         UCC_TL_CUDA_ALLGATHERV_DEFAULT_ALG_SELECT_STR,
+        UCC_TL_CUDA_BCAST_DEFAULT_ALG_SELECT_STR,
         UCC_TL_CUDA_REDUCE_SCATTER_DEFAULT_ALG_SELECT_STR,
         UCC_TL_CUDA_REDUCE_SCATTERV_DEFAULT_ALG_SELECT_STR};
 
@@ -78,6 +80,8 @@ ucc_status_t ucc_tl_cuda_coll_init(ucc_base_coll_args_t *coll_args,
         return ucc_tl_cuda_allgather_init(coll_args, team, task_h);
     case UCC_COLL_TYPE_ALLGATHERV:
         return ucc_tl_cuda_allgatherv_init(coll_args, team, task_h);
+    case UCC_COLL_TYPE_BCAST:
+        return ucc_tl_cuda_bcast_init(coll_args, team, task_h);
     case UCC_COLL_TYPE_REDUCE_SCATTER:
         return ucc_tl_cuda_reduce_scatter_init(coll_args, team, task_h);
     case UCC_COLL_TYPE_REDUCE_SCATTERV:
@@ -134,6 +138,8 @@ static inline int alg_id_from_str(ucc_coll_type_t coll_type, const char *str)
         return ucc_tl_cuda_allgather_alg_from_str(str);
     case UCC_COLL_TYPE_ALLGATHERV:
         return ucc_tl_cuda_allgatherv_alg_from_str(str);
+    case UCC_COLL_TYPE_BCAST:
+        return ucc_tl_cuda_bcast_alg_from_str(str);
     default:
         break;
     }
@@ -187,6 +193,16 @@ ucc_status_t 
ucc_tl_cuda_alg_id_to_init(int alg_id, const char *alg_id_str, break; }; break; + case UCC_COLL_TYPE_BCAST: + switch (alg_id) { + case UCC_TL_CUDA_BCAST_ALG_LINEAR: + *init = ucc_tl_cuda_bcast_linear_init; + break; + default: + status = UCC_ERR_INVALID_PARAM; + break; + }; + break; case UCC_COLL_TYPE_REDUCE_SCATTER: switch (alg_id) { case UCC_TL_CUDA_REDUCE_SCATTER_ALG_AUTO: diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h index 8b15cdf249..f450ff950c 100644 --- a/src/components/tl/cuda/tl_cuda_coll.h +++ b/src/components/tl/cuda/tl_cuda_coll.h @@ -10,7 +10,7 @@ #include "tl_cuda.h" #include "components/mc/ucc_mc.h" -#define UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR 4 +#define UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR 5 extern const char *ucc_tl_cuda_default_alg_select_str[UCC_TL_CUDA_N_DEFAULT_ALG_SELECT_STR]; From fceeb8f99eff1d792b6c6e12fc4051aff0e17609 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Fri, 22 Mar 2024 11:59:58 +0100 Subject: [PATCH 02/34] TL/CUDA: fix build --- src/components/tl/cuda/Makefile.am | 1 + src/components/tl/cuda/bcast/bcast_linear.c | 16 +++++++++------- src/components/tl/cuda/tl_cuda_context.c | 16 ++++++++-------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/components/tl/cuda/Makefile.am b/src/components/tl/cuda/Makefile.am index c5a05f9c3b..2136821b93 100644 --- a/src/components/tl/cuda/Makefile.am +++ b/src/components/tl/cuda/Makefile.am @@ -59,6 +59,7 @@ sources = \ $(allgatherv) \ $(alltoall) \ $(alltoallv) \ + $(bcast) \ $(reduce_scatter) \ $(reduce_scatterv) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index a1f5474501..db9bdb04e5 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -29,7 +29,8 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); ucc_tl_cuda_team_t *team = TASK_TEAM(task); ucc_status_t st; - + (void) team; + (void) st; task->super.status = UCC_INPROGRESS; } @@ -40,11 +41,12 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) ucc_coll_args_t * args = &TASK_ARGS(task); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); ucc_datatype_t dt = task->allgatherv_linear.dt; - ucc_rank_t i; - size_t send_size, frag_size, ssize; + (void) tsize; + (void) args; + (void) dt; task->bcast_linear.stage = STAGE_SYNC; - task->allgatherv_linear.sbuf = args->src.info.buffer; + // task->bcast_linear.sbuf = args->src.info.buffer; return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super); @@ -75,9 +77,9 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, // task->allgatherv_linear.rbuf = coll_args->args.dst.info.buffer; task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR; - task->super.post = ucc_tl_cuda_allgatherv_linear_start; - task->super.progress = ucc_tl_cuda_allgatherv_linear_progress; - task->super.finalize = ucc_tl_cuda_allgatherv_linear_finalize; + task->super.post = ucc_tl_cuda_bcast_linear_start; + task->super.progress = ucc_tl_cuda_bcast_linear_progress; + task->super.finalize = ucc_tl_cuda_bcast_linear_finalize; task->bar = TASK_BAR(task); *task_p = &task->super; diff --git a/src/components/tl/cuda/tl_cuda_context.c b/src/components/tl/cuda/tl_cuda_context.c index 4d89029680..ffb48336f5 100644 --- a/src/components/tl/cuda/tl_cuda_context.c +++ b/src/components/tl/cuda/tl_cuda_context.c @@ -20,8 +20,8 @@ 
UCC_CLASS_INIT_FUNC(ucc_tl_cuda_context_t, ucc_status_t status; int num_devices; cudaError_t cuda_st; - CUcontext cu_ctx; - CUresult cu_st; + // CUcontext cu_ctx; + // CUresult cu_st; UCC_CLASS_CALL_SUPER_INIT(ucc_tl_context_t, &tl_cuda_config->super, params->context); @@ -37,12 +37,12 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_context_t, return UCC_ERR_NO_RESOURCE; } - cu_st = cuCtxGetCurrent(&cu_ctx); - if (cu_ctx == NULL || cu_st != CUDA_SUCCESS) { - tl_debug(self->super.super.lib, - "cannot create CUDA TL context without active CUDA context"); - return UCC_ERR_NO_RESOURCE; - } + // cu_st = cuCtxGetCurrent(&cu_ctx); + // if (cu_ctx == NULL || cu_st != CUDA_SUCCESS) { + // tl_debug(self->super.super.lib, + // "cannot create CUDA TL context without active CUDA context"); + // return UCC_ERR_NO_RESOURCE; + // } status = ucc_mpool_init(&self->req_mp, 0, sizeof(ucc_tl_cuda_task_t), 0, UCC_CACHE_LINE_SIZE, 8, UINT_MAX, From dc65324e71a05d68bbf721d3bae28a988afb63cb Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Sun, 30 Jun 2024 17:29:41 +0200 Subject: [PATCH 03/34] TL/CUDA: wip --- src/components/tl/cuda/bcast/bcast_linear.c | 76 ++++++++++++++++++--- src/components/tl/cuda/tl_cuda.h | 5 ++ 2 files changed, 71 insertions(+), 10 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index db9bdb04e5..c557bf6fdc 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -8,13 +8,24 @@ enum { - STAGE_SYNC, /*< Wait for free SYNC segment */ - STAGE_SETUP, /*< Wait for memhandle setup to finish */ - STAGE_COPIES, /*< Linear algorithm is running */ - STAGE_BARRIER, /*< Linear algorithm is done, waiting for - * other ranks to finish */ + STAGE_COPY, // post copy task: copy block from src to scratch buffer + STAGE_WAIT_COPY, // wait for copy finishes + STAGE_WAIT_ALL, // wait for all others rank be on same step }; +static inline ucc_status_t ecopy(void *dst, void *src, size_t size, + ucc_ee_executor_t * exec, + ucc_ee_executor_task_t **etask) +{ + ucc_ee_executor_task_args_t exec_args = {0}; + + exec_args.task_type = UCC_EE_EXECUTOR_TASK_COPY; + exec_args.copy.dst = dst; + exec_args.copy.src = src; + exec_args.copy.len = size; + return ucc_ee_executor_task_post(exec, &exec_args, etask); +} + ucc_status_t ucc_tl_cuda_bcast_linear_finalize(ucc_coll_task_t *coll_task) { ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); @@ -26,12 +37,50 @@ ucc_status_t ucc_tl_cuda_bcast_linear_finalize(ucc_coll_task_t *coll_task) void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) { - ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); - ucc_tl_cuda_team_t *team = TASK_TEAM(task); + ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); + ucc_tl_cuda_team_t *team = TASK_TEAM(task); + ucc_rank_t trank = UCC_TL_TEAM_RANK(team); ucc_status_t st; - (void) team; - (void) st; + (void)team; + (void)st; + ucc_ee_executor_task_t *etask; + ucc_ee_executor_t *exec; + ucc_status_t status; + void * sbuf, *dbuf, *scratch_buf; task->super.status = UCC_INPROGRESS; + + // fall-through between cases is intentional + switch (task->bcast_linear.stage) + { + case STAGE_COPY: + // copy from src buffer to scratch + scratch_buf = TASK_SCRATCH(task, trank); + block_size = + status = ecopy(dbuf, sbuf, block_size); + break; + + default: + break; + } + + status = ucc_coll_task_get_executor(&task->super, &exec); + if (ucc_unlikely(status != UCC_OK)) { + return; + } + 
+ etask = &task->bcast_linear.exec_task; + + ucc_ee_executor_task_args_t exec_args = {0}; + + exec_args.task_type = UCC_EE_EXECUTOR_TASK_COPY; + exec_args.copy.dst = task->bcast_linear.dbuf; + exec_args.copy.src = task->bcast_linear.sbuf; + exec_args.copy.len = task->bcast_linear.size; + status = ucc_ee_executor_task_post(exec, &exec_args, etask); + if (ucc_unlikely(status != UCC_OK)) { + ucc_error("ucc_ee_executor_task_post failed"); + return; + } } ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) @@ -46,7 +95,10 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) (void) args; (void) dt; task->bcast_linear.stage = STAGE_SYNC; - // task->bcast_linear.sbuf = args->src.info.buffer; + ucc_info("bcast start"); + + task->bcast_linear.sbuf = args->src.info.buffer; + task->bcast_linear.rbuf = args->dst.info.buffer; return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super); @@ -60,6 +112,8 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, ucc_tl_cuda_task_t *task; ucc_status_t status; + ucc_info("bcast init"); + if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) || UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) { return UCC_ERR_NOT_SUPPORTED; @@ -82,6 +136,8 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, task->super.finalize = ucc_tl_cuda_bcast_linear_finalize; task->bar = TASK_BAR(task); + ucc_info("bcast init success"); + *task_p = &task->super; return UCC_OK; } diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index 751dc4d8ad..0d1a484187 100644 --- a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -228,6 +228,11 @@ struct ucc_tl_cuda_task { struct { int stage; + int step; + void * sbuf; + void * rbuf; + size_t size; + ucc_ee_executor_task_t *exec_task; } bcast_linear; struct { int stage; From 74375184889a86b854b2501c9ef490c3eb541580 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Mon, 1 Jul 2024 20:33:57 +0200 Subject: [PATCH 04/34] TL/CUDA: fix compilation --- src/components/tl/cuda/bcast/bcast_linear.c | 46 ++++++++------------- src/components/tl/cuda/tl_cuda.h | 1 + 2 files changed, 18 insertions(+), 29 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index c557bf6fdc..3e0f1fbfc8 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -43,44 +43,28 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_status_t st; (void)team; (void)st; - ucc_ee_executor_task_t *etask; ucc_ee_executor_t *exec; ucc_status_t status; - void * sbuf, *dbuf, *scratch_buf; + void * sbuf, *dbuf; task->super.status = UCC_INPROGRESS; + status = ucc_coll_task_get_executor(&task->super, &exec); + if (ucc_unlikely(status != UCC_OK)) { + return; + } + // fall-through between cases is intentional switch (task->bcast_linear.stage) { case STAGE_COPY: // copy from src buffer to scratch - scratch_buf = TASK_SCRATCH(task, trank); - block_size = - status = ecopy(dbuf, sbuf, block_size); + dbuf = TASK_SCRATCH(task, trank); + sbuf = task->bcast_linear.sbuf; + status = ecopy(dbuf, sbuf, task->bcast_linear.size, exec, &task->bcast_linear.exec_task); break; - default: break; } - - status = ucc_coll_task_get_executor(&task->super, &exec); - if (ucc_unlikely(status != UCC_OK)) { - return; - } - - etask = &task->bcast_linear.exec_task; - - ucc_ee_executor_task_args_t exec_args = 
{0}; - - exec_args.task_type = UCC_EE_EXECUTOR_TASK_COPY; - exec_args.copy.dst = task->bcast_linear.dbuf; - exec_args.copy.src = task->bcast_linear.sbuf; - exec_args.copy.len = task->bcast_linear.size; - status = ucc_ee_executor_task_post(exec, &exec_args, etask); - if (ucc_unlikely(status != UCC_OK)) { - ucc_error("ucc_ee_executor_task_post failed"); - return; - } } ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) @@ -89,13 +73,13 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) ucc_tl_cuda_team_t *team = TASK_TEAM(task); ucc_coll_args_t * args = &TASK_ARGS(task); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); - ucc_datatype_t dt = task->allgatherv_linear.dt; + ucc_datatype_t dt = task->bcast_linear.dt; (void) tsize; (void) args; (void) dt; - task->bcast_linear.stage = STAGE_SYNC; - ucc_info("bcast start"); + task->bcast_linear.stage = STAGE_COPY; + ucc_info("bcast start with dt: %ld", dt); task->bcast_linear.sbuf = args->src.info.buffer; task->bcast_linear.rbuf = args->dst.info.buffer; @@ -126,10 +110,14 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, // task->allgatherv_linear.get_count = ucc_tl_cuda_allgather_get_count; // task->allgatherv_linear.get_offset = ucc_tl_cuda_allgather_get_offset; - // task->allgatherv_linear.dt = coll_args->args.dst.info.datatype; + task->bcast_linear.dt = coll_args->args.src.info.datatype; + + ucc_info("bcast start with dt: %ld", task->bcast_linear.dt); + // task->allgatherv_linear.sbuf = coll_args->args.src.info.buffer; // task->allgatherv_linear.rbuf = coll_args->args.dst.info.buffer; + task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR; task->super.post = ucc_tl_cuda_bcast_linear_start; task->super.progress = ucc_tl_cuda_bcast_linear_progress; diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index 0d1a484187..f3bb3135d9 100644 --- a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -231,6 +231,7 @@ struct ucc_tl_cuda_task { int step; void * sbuf; void * rbuf; + ucc_datatype_t dt; size_t size; ucc_ee_executor_task_t *exec_task; } bcast_linear; From 62157cd684b8aba41c8f222122cb7604ca86f473 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Mon, 1 Jul 2024 22:03:05 +0200 Subject: [PATCH 05/34] TL/CUDA: calc size --- src/components/tl/cuda/bcast/bcast_linear.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 3e0f1fbfc8..48ea6159a0 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -61,6 +61,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) dbuf = TASK_SCRATCH(task, trank); sbuf = task->bcast_linear.sbuf; status = ecopy(dbuf, sbuf, task->bcast_linear.size, exec, &task->bcast_linear.exec_task); + task->bcast_linear.stage = STAGE_WAIT_COPY; break; default: break; @@ -111,9 +112,11 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, // task->allgatherv_linear.get_count = ucc_tl_cuda_allgather_get_count; // task->allgatherv_linear.get_offset = ucc_tl_cuda_allgather_get_offset; task->bcast_linear.dt = coll_args->args.src.info.datatype; - ucc_info("bcast start with dt: %ld", task->bcast_linear.dt); + task->bcast_linear.size = coll_args->args.src.info.count * ucc_dt_size(task->bcast_linear.dt); + ucc_info("bcast start with data size: %ld", task->bcast_linear.size); + // task->allgatherv_linear.sbuf = 
coll_args->args.src.info.buffer; // task->allgatherv_linear.rbuf = coll_args->args.dst.info.buffer; From a0f225e762ed8030c2e437c049cabc9fe0c3bc2d Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Mon, 1 Jul 2024 22:03:05 +0200 Subject: [PATCH 06/34] TL/CUDA: wip some logic for root --- src/components/tl/cuda/bcast/bcast_linear.c | 129 ++++++++++++++------ src/components/tl/cuda/tl_cuda.h | 1 - 2 files changed, 91 insertions(+), 39 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 48ea6159a0..e7822e5996 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -6,15 +6,34 @@ #include "bcast/bcast.h" -enum -{ - STAGE_COPY, // post copy task: copy block from src to scratch buffer +enum { + STAGE_UNDEF, + // root + STAGE_COPY, // post copy task: copy block from src to scratch buffer STAGE_WAIT_COPY, // wait for copy finishes - STAGE_WAIT_ALL, // wait for all others rank be on same step + STAGE_WAIT_ALL, // wait for all others rank be on same step + // non-root }; +// TODO: move out to common with allgather +static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, + int step_id) +{ + ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); + + return sync->seq_num[step_id]; +} + +static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, + int step, int step_id) +{ + ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); + + sync->seq_num[step_id] = step; +} + static inline ucc_status_t ecopy(void *dst, void *src, size_t size, - ucc_ee_executor_t * exec, + ucc_ee_executor_t *exec, ucc_ee_executor_task_t **etask) { ucc_ee_executor_task_args_t exec_args = {0}; @@ -40,31 +59,68 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); ucc_tl_cuda_team_t *team = TASK_TEAM(task); ucc_rank_t trank = UCC_TL_TEAM_RANK(team); + ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); ucc_status_t st; (void)team; (void)st; ucc_ee_executor_t *exec; + ucc_ee_executor_task_t *etask; ucc_status_t status; - void * sbuf, *dbuf; + void *sbuf, *dbuf; task->super.status = UCC_INPROGRESS; status = ucc_coll_task_get_executor(&task->super, &exec); if (ucc_unlikely(status != UCC_OK)) { return; } - - // fall-through between cases is intentional - switch (task->bcast_linear.stage) - { - case STAGE_COPY: - // copy from src buffer to scratch - dbuf = TASK_SCRATCH(task, trank); - sbuf = task->bcast_linear.sbuf; - status = ecopy(dbuf, sbuf, task->bcast_linear.size, exec, &task->bcast_linear.exec_task); - task->bcast_linear.stage = STAGE_WAIT_COPY; - break; - default: - break; + + if (trank == 0) { + // fall-through between cases is intentional + switch (task->bcast_linear.stage) { + case STAGE_COPY: + // copy from src buffer to scratch + dbuf = TASK_SCRATCH(task, trank); + sbuf = task->bcast_linear.sbuf; + status = ecopy(dbuf, sbuf, task->bcast_linear.size, exec, + &task->bcast_linear.exec_task); + task->bcast_linear.stage = STAGE_WAIT_COPY; + break; + case STAGE_WAIT_COPY: + etask = task->bcast_linear.exec_task; + if (etask) { + status = ucc_ee_executor_task_test(etask); + if (status == UCC_OK) { + ucc_ee_executor_task_finalize(etask); + task->bcast_linear.exec_task = NULL; + ucc_info("hello from rank: %d, copy done!", trank); + // signal others + ++task->bcast_linear.step; + set_rank_step(task, 0, task->bcast_linear.step, 0); + task->bcast_linear.stage = STAGE_WAIT_ALL; + } + } + break; + case 
STAGE_WAIT_ALL: + for (int i = 1; i < tsize; ++i) { + int other_rank_step = get_rank_step(task, i, 0); + ucc_info("rank %d, step: %d, my step: %d", i, other_rank_step, + task->bcast_linear.step); + if (other_rank_step < task->bcast_linear.step) { + ucc_info("rank %d is not ready", i); + return; + } + } + task->bcast_linear.stage = STAGE_COPY; + ucc_info("all others ready for next step"); + // TODO: remove + task->bcast_linear.stage = STAGE_UNDEF; + break; + default: + break; + } + // Root scenario + } else { + // others } } @@ -72,26 +128,25 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) { ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); ucc_tl_cuda_team_t *team = TASK_TEAM(task); - ucc_coll_args_t * args = &TASK_ARGS(task); + ucc_coll_args_t *args = &TASK_ARGS(task); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); ucc_datatype_t dt = task->bcast_linear.dt; - (void) tsize; - (void) args; - (void) dt; - task->bcast_linear.stage = STAGE_COPY; + (void)tsize; + (void)args; + (void)dt; + task->bcast_linear.stage = STAGE_COPY; ucc_info("bcast start with dt: %ld", dt); task->bcast_linear.sbuf = args->src.info.buffer; - task->bcast_linear.rbuf = args->dst.info.buffer; - + task->bcast_linear.step = 0; return ucc_progress_queue_enqueue(UCC_TL_CORE_CTX(team)->pq, &task->super); } ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t * tl_team, - ucc_coll_task_t ** task_p) + ucc_base_team_t *tl_team, + ucc_coll_task_t **task_p) { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); ucc_tl_cuda_task_t *task; @@ -100,7 +155,8 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, ucc_info("bcast init"); if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) || - UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) { + UCC_TL_TEAM_SIZE(team) - 1 > + UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) { return UCC_ERR_NOT_SUPPORTED; } @@ -111,25 +167,22 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, // task->allgatherv_linear.get_count = ucc_tl_cuda_allgather_get_count; // task->allgatherv_linear.get_offset = ucc_tl_cuda_allgather_get_offset; - task->bcast_linear.dt = coll_args->args.src.info.datatype; + task->bcast_linear.dt = coll_args->args.src.info.datatype; ucc_info("bcast start with dt: %ld", task->bcast_linear.dt); - task->bcast_linear.size = coll_args->args.src.info.count * ucc_dt_size(task->bcast_linear.dt); - ucc_info("bcast start with data size: %ld", task->bcast_linear.size); - // task->allgatherv_linear.sbuf = coll_args->args.src.info.buffer; // task->allgatherv_linear.rbuf = coll_args->args.dst.info.buffer; + task->bcast_linear.sbuf = coll_args->args.src.info.buffer; task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR; - task->super.post = ucc_tl_cuda_bcast_linear_start; - task->super.progress = ucc_tl_cuda_bcast_linear_progress; - task->super.finalize = ucc_tl_cuda_bcast_linear_finalize; - task->bar = TASK_BAR(task); + task->super.post = ucc_tl_cuda_bcast_linear_start; + task->super.progress = ucc_tl_cuda_bcast_linear_progress; + task->super.finalize = ucc_tl_cuda_bcast_linear_finalize; + task->bar = TASK_BAR(task); ucc_info("bcast init success"); *task_p = &task->super; return UCC_OK; } - diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index f3bb3135d9..c8f60258aa 100644 --- a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -230,7 +230,6 @@ struct 
ucc_tl_cuda_task { int stage; int step; void * sbuf; - void * rbuf; ucc_datatype_t dt; size_t size; ucc_ee_executor_task_t *exec_task; From 4187516289a32d7de3152faf030f60ae30cca102 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Tue, 2 Jul 2024 19:06:01 +0200 Subject: [PATCH 07/34] TL/CUDA: wip logic for client --- src/components/tl/cuda/bcast/bcast_linear.c | 78 +++++++++++++++++---- 1 file changed, 66 insertions(+), 12 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index e7822e5996..3a6f99005f 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -6,13 +6,17 @@ #include "bcast/bcast.h" -enum { - STAGE_UNDEF, +enum +{ + STAGE_DONE, // root STAGE_COPY, // post copy task: copy block from src to scratch buffer STAGE_WAIT_COPY, // wait for copy finishes STAGE_WAIT_ALL, // wait for all others rank be on same step // non-root + STAGE_WAIT_ROOT, + STAGE_CLIENT_COPY, + STAGE_CLIENT_COPY_WAIT, }; // TODO: move out to common with allgather @@ -33,7 +37,7 @@ static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, } static inline ucc_status_t ecopy(void *dst, void *src, size_t size, - ucc_ee_executor_t *exec, + ucc_ee_executor_t * exec, ucc_ee_executor_task_t **etask) { ucc_ee_executor_task_args_t exec_args = {0}; @@ -63,10 +67,10 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_status_t st; (void)team; (void)st; - ucc_ee_executor_t *exec; + ucc_ee_executor_t * exec; ucc_ee_executor_task_t *etask; ucc_status_t status; - void *sbuf, *dbuf; + void * sbuf, *dbuf; task->super.status = UCC_INPROGRESS; status = ucc_coll_task_get_executor(&task->super, &exec); @@ -113,7 +117,10 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.stage = STAGE_COPY; ucc_info("all others ready for next step"); // TODO: remove - task->bcast_linear.stage = STAGE_UNDEF; + task->bcast_linear.stage = STAGE_DONE; + break; + case STAGE_DONE: + task->super.status = UCC_OK; break; default: break; @@ -121,6 +128,43 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) // Root scenario } else { // others + switch (task->bcast_linear.stage) { + case STAGE_WAIT_ROOT: + /* code */ + if (get_rank_step(task, 0 /* root */, 0) > + task->bcast_linear.step) { + ucc_info("something from root is ready!"); + task->bcast_linear.stage = STAGE_CLIENT_COPY; + break; + } + break; + case STAGE_CLIENT_COPY: + dbuf = task->bcast_linear.sbuf; + sbuf = TASK_SCRATCH(task, + 0); // need to copy from root's scratch buffer + status = ecopy(dbuf, sbuf, task->bcast_linear.size, exec, + &task->bcast_linear.exec_task); + task->bcast_linear.stage = STAGE_CLIENT_COPY_WAIT; + break; + case STAGE_CLIENT_COPY_WAIT: + etask = task->bcast_linear.exec_task; + if (etask) { + status = ucc_ee_executor_task_test(etask); + if (status == UCC_OK) { + ucc_ee_executor_task_finalize(etask); + task->bcast_linear.exec_task = NULL; + ++task->bcast_linear.step; + set_rank_step(task, trank, task->bcast_linear.step, 0); + task->bcast_linear.stage = STAGE_DONE; // TODO: just for debug + } + } + break; + case STAGE_DONE: + task->super.status = UCC_OK; + break; + default: + break; + } } } @@ -128,15 +172,25 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) { ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); ucc_tl_cuda_team_t *team = TASK_TEAM(task); - ucc_coll_args_t *args = &TASK_ARGS(task); + ucc_rank_t trank = 
UCC_TL_TEAM_RANK(team); + ucc_coll_args_t * args = &TASK_ARGS(task); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); ucc_datatype_t dt = task->bcast_linear.dt; (void)tsize; (void)args; (void)dt; - task->bcast_linear.stage = STAGE_COPY; - ucc_info("bcast start with dt: %ld", dt); + + if (trank == 0) { + task->bcast_linear.stage = STAGE_COPY; + } else { + task->bcast_linear.stage = STAGE_WAIT_ROOT; + } + ucc_info("bcast start with dt: %ld and count: %ld", dt, args->src.info.count); + + task->bcast_linear.size = ucc_dt_size(dt) * args->src.info.count; + + ucc_info("bcast buffer size: %ld", task->bcast_linear.size); task->bcast_linear.sbuf = args->src.info.buffer; task->bcast_linear.step = 0; @@ -145,8 +199,8 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) } ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t *tl_team, - ucc_coll_task_t **task_p) + ucc_base_team_t * tl_team, + ucc_coll_task_t ** task_p) { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); ucc_tl_cuda_task_t *task; @@ -168,7 +222,7 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, // task->allgatherv_linear.get_count = ucc_tl_cuda_allgather_get_count; // task->allgatherv_linear.get_offset = ucc_tl_cuda_allgather_get_offset; task->bcast_linear.dt = coll_args->args.src.info.datatype; - ucc_info("bcast start with dt: %ld", task->bcast_linear.dt); + ucc_info("bcast init with dt: %ld", task->bcast_linear.dt); // task->allgatherv_linear.sbuf = coll_args->args.src.info.buffer; // task->allgatherv_linear.rbuf = coll_args->args.dst.info.buffer; From 3f0e4d9c882c58d6dcdbfe497e866601f0bccc4a Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Wed, 3 Jul 2024 13:20:13 +0200 Subject: [PATCH 08/34] TL/CUDA: added barrier to sync stages --- src/components/tl/cuda/bcast/bcast_linear.c | 97 ++++++++++++++++----- 1 file changed, 74 insertions(+), 23 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 3a6f99005f..41e224acff 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -6,9 +6,10 @@ #include "bcast/bcast.h" -enum -{ +enum { STAGE_DONE, + STAGE_SYNC, + STAGE_SETUP, // root STAGE_COPY, // post copy task: copy block from src to scratch buffer STAGE_WAIT_COPY, // wait for copy finishes @@ -36,8 +37,33 @@ static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, sync->seq_num[step_id] = step; } +ucc_status_t ucc_tl_cuda_bcast_linear_setup_start(ucc_tl_cuda_task_t *task) +{ + ucc_tl_cuda_team_t *team = TASK_TEAM(task); + ucc_rank_t trank = UCC_TL_TEAM_RANK(team); + ucc_status_t status; + + set_rank_step(task, trank, 0, 0); + ucc_memory_cpu_store_fence(); + status = ucc_tl_cuda_shm_barrier_start(UCC_TL_TEAM_RANK(team), task->bar); + if (ucc_unlikely(status != UCC_OK)) { + goto exit_err; + } + + return UCC_OK; + +exit_err: + return status; +} + +ucc_status_t ucc_tl_cuda_bcast_linear_setup_test(ucc_tl_cuda_task_t *task) +{ + ucc_tl_cuda_team_t *team = TASK_TEAM(task); + return ucc_tl_cuda_shm_barrier_test(UCC_TL_TEAM_RANK(team), task->bar); +} + static inline ucc_status_t ecopy(void *dst, void *src, size_t size, - ucc_ee_executor_t * exec, + ucc_ee_executor_t *exec, ucc_ee_executor_task_t **etask) { ucc_ee_executor_task_args_t exec_args = {0}; @@ -67,10 +93,10 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_status_t st; (void)team; (void)st; - ucc_ee_executor_t * exec; + 
ucc_ee_executor_t *exec; ucc_ee_executor_task_t *etask; ucc_status_t status; - void * sbuf, *dbuf; + void *sbuf, *dbuf; task->super.status = UCC_INPROGRESS; status = ucc_coll_task_get_executor(&task->super, &exec); @@ -78,6 +104,38 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) return; } + switch (task->bcast_linear.stage) { + case STAGE_SYNC: + ucc_info("sync"); + if (ucc_tl_cuda_get_sync(task) != UCC_OK) { + task->super.status = UCC_INPROGRESS; + return; + } + task->bcast_linear.step = 0; + ucc_info("setup"); + st = ucc_tl_cuda_bcast_linear_setup_start(task); + if (st != UCC_OK) { + task->super.status = st; + return; + } + task->bcast_linear.stage = STAGE_SETUP; + case STAGE_SETUP: + ucc_info("test"); + st = ucc_tl_cuda_bcast_linear_setup_test(task); + if (st != UCC_OK) { + task->super.status = st; + return; + } + ucc_tl_cuda_put_sync(task); + if (trank == 0 /* root */) { + task->bcast_linear.stage = STAGE_COPY; + } else { + task->bcast_linear.stage = STAGE_WAIT_ROOT; + } + default: + break; + } + if (trank == 0) { // fall-through between cases is intentional switch (task->bcast_linear.stage) { @@ -141,7 +199,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) case STAGE_CLIENT_COPY: dbuf = task->bcast_linear.sbuf; sbuf = TASK_SCRATCH(task, - 0); // need to copy from root's scratch buffer + 0); // need to copy from root's scratch buffer status = ecopy(dbuf, sbuf, task->bcast_linear.size, exec, &task->bcast_linear.exec_task); task->bcast_linear.stage = STAGE_CLIENT_COPY_WAIT; @@ -155,7 +213,8 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.exec_task = NULL; ++task->bcast_linear.step; set_rank_step(task, trank, task->bcast_linear.step, 0); - task->bcast_linear.stage = STAGE_DONE; // TODO: just for debug + task->bcast_linear.stage = + STAGE_DONE; // TODO: just for debug } } break; @@ -172,8 +231,7 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) { ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); ucc_tl_cuda_team_t *team = TASK_TEAM(task); - ucc_rank_t trank = UCC_TL_TEAM_RANK(team); - ucc_coll_args_t * args = &TASK_ARGS(task); + ucc_coll_args_t *args = &TASK_ARGS(task); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); ucc_datatype_t dt = task->bcast_linear.dt; @@ -181,12 +239,10 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) (void)args; (void)dt; - if (trank == 0) { - task->bcast_linear.stage = STAGE_COPY; - } else { - task->bcast_linear.stage = STAGE_WAIT_ROOT; - } - ucc_info("bcast start with dt: %ld and count: %ld", dt, args->src.info.count); + task->bcast_linear.stage = STAGE_SYNC; + + ucc_info("bcast start with dt: %s and count: %ld", ucc_datatype_str(dt), + args->src.info.count); task->bcast_linear.size = ucc_dt_size(dt) * args->src.info.count; @@ -199,8 +255,8 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) } ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t * tl_team, - ucc_coll_task_t ** task_p) + ucc_base_team_t *tl_team, + ucc_coll_task_t **task_p) { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); ucc_tl_cuda_task_t *task; @@ -219,13 +275,8 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, return status; } - // task->allgatherv_linear.get_count = ucc_tl_cuda_allgather_get_count; - // task->allgatherv_linear.get_offset = ucc_tl_cuda_allgather_get_offset; task->bcast_linear.dt = 
coll_args->args.src.info.datatype; - ucc_info("bcast init with dt: %ld", task->bcast_linear.dt); - - // task->allgatherv_linear.sbuf = coll_args->args.src.info.buffer; - // task->allgatherv_linear.rbuf = coll_args->args.dst.info.buffer; + ucc_info("bcast init with dt: %s", ucc_datatype_str(task->bcast_linear.dt)); task->bcast_linear.sbuf = coll_args->args.src.info.buffer; From cd13b04071147cc724d3537ad5c89222c2ae800c Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Wed, 3 Jul 2024 15:27:34 +0200 Subject: [PATCH 09/34] TL/CUDA: non zero root --- src/components/tl/cuda/bcast/bcast_linear.c | 30 ++++++++++----------- src/components/tl/cuda/tl_cuda.h | 1 + 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 41e224acff..f6712c2c46 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -106,13 +106,13 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) switch (task->bcast_linear.stage) { case STAGE_SYNC: - ucc_info("sync"); + // ucc_info("sync"); if (ucc_tl_cuda_get_sync(task) != UCC_OK) { task->super.status = UCC_INPROGRESS; return; } task->bcast_linear.step = 0; - ucc_info("setup"); + // ucc_info("setup"); st = ucc_tl_cuda_bcast_linear_setup_start(task); if (st != UCC_OK) { task->super.status = st; @@ -120,14 +120,14 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) } task->bcast_linear.stage = STAGE_SETUP; case STAGE_SETUP: - ucc_info("test"); + // ucc_info("test"); st = ucc_tl_cuda_bcast_linear_setup_test(task); if (st != UCC_OK) { task->super.status = st; return; } ucc_tl_cuda_put_sync(task); - if (trank == 0 /* root */) { + if (trank == task->bcast_linear.root) { task->bcast_linear.stage = STAGE_COPY; } else { task->bcast_linear.stage = STAGE_WAIT_ROOT; @@ -136,7 +136,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) break; } - if (trank == 0) { + if (trank == task->bcast_linear.root) { // fall-through between cases is intentional switch (task->bcast_linear.stage) { case STAGE_COPY: @@ -154,26 +154,25 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) if (status == UCC_OK) { ucc_ee_executor_task_finalize(etask); task->bcast_linear.exec_task = NULL; - ucc_info("hello from rank: %d, copy done!", trank); // signal others ++task->bcast_linear.step; - set_rank_step(task, 0, task->bcast_linear.step, 0); + set_rank_step(task, task->bcast_linear.root, task->bcast_linear.step, 0); task->bcast_linear.stage = STAGE_WAIT_ALL; } } break; case STAGE_WAIT_ALL: - for (int i = 1; i < tsize; ++i) { + for (int i = 0; i < tsize; ++i) { int other_rank_step = get_rank_step(task, i, 0); - ucc_info("rank %d, step: %d, my step: %d", i, other_rank_step, - task->bcast_linear.step); + // ucc_info("rank %d, step: %d, my step: %d", i, other_rank_step, + // task->bcast_linear.step); if (other_rank_step < task->bcast_linear.step) { - ucc_info("rank %d is not ready", i); + // ucc_info("rank %d is not ready", i); return; } } task->bcast_linear.stage = STAGE_COPY; - ucc_info("all others ready for next step"); + // ucc_info("all others ready for next step"); // TODO: remove task->bcast_linear.stage = STAGE_DONE; break; @@ -189,9 +188,9 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) switch (task->bcast_linear.stage) { case STAGE_WAIT_ROOT: /* code */ - if (get_rank_step(task, 0 /* root */, 0) > + if (get_rank_step(task, task->bcast_linear.root, 0) > 
task->bcast_linear.step) { - ucc_info("something from root is ready!"); + // ucc_info("something from root is ready!"); task->bcast_linear.stage = STAGE_CLIENT_COPY; break; } @@ -199,7 +198,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) case STAGE_CLIENT_COPY: dbuf = task->bcast_linear.sbuf; sbuf = TASK_SCRATCH(task, - 0); // need to copy from root's scratch buffer + task->bcast_linear.root); // need to copy from root's scratch buffer status = ecopy(dbuf, sbuf, task->bcast_linear.size, exec, &task->bcast_linear.exec_task); task->bcast_linear.stage = STAGE_CLIENT_COPY_WAIT; @@ -275,6 +274,7 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, return status; } + task->bcast_linear.root = coll_args->args.root; task->bcast_linear.dt = coll_args->args.src.info.datatype; ucc_info("bcast init with dt: %s", ucc_datatype_str(task->bcast_linear.dt)); diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index c8f60258aa..38608973ac 100644 --- a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -231,6 +231,7 @@ struct ucc_tl_cuda_task { int step; void * sbuf; ucc_datatype_t dt; + ucc_rank_t root; size_t size; ucc_ee_executor_task_t *exec_task; } bcast_linear; From 54b84dc92e83ad1b56431647b35c7629687d55bf Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Wed, 3 Jul 2024 15:27:56 +0200 Subject: [PATCH 10/34] TL/CUDA: revert commented --- src/components/tl/cuda/tl_cuda_context.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/components/tl/cuda/tl_cuda_context.c b/src/components/tl/cuda/tl_cuda_context.c index ffb48336f5..4d89029680 100644 --- a/src/components/tl/cuda/tl_cuda_context.c +++ b/src/components/tl/cuda/tl_cuda_context.c @@ -20,8 +20,8 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_context_t, ucc_status_t status; int num_devices; cudaError_t cuda_st; - // CUcontext cu_ctx; - // CUresult cu_st; + CUcontext cu_ctx; + CUresult cu_st; UCC_CLASS_CALL_SUPER_INIT(ucc_tl_context_t, &tl_cuda_config->super, params->context); @@ -37,12 +37,12 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_context_t, return UCC_ERR_NO_RESOURCE; } - // cu_st = cuCtxGetCurrent(&cu_ctx); - // if (cu_ctx == NULL || cu_st != CUDA_SUCCESS) { - // tl_debug(self->super.super.lib, - // "cannot create CUDA TL context without active CUDA context"); - // return UCC_ERR_NO_RESOURCE; - // } + cu_st = cuCtxGetCurrent(&cu_ctx); + if (cu_ctx == NULL || cu_st != CUDA_SUCCESS) { + tl_debug(self->super.super.lib, + "cannot create CUDA TL context without active CUDA context"); + return UCC_ERR_NO_RESOURCE; + } status = ucc_mpool_init(&self->req_mp, 0, sizeof(ucc_tl_cuda_task_t), 0, UCC_CACHE_LINE_SIZE, 8, UINT_MAX, From 7989e60e5cb84563770bf5001c7f98e09044bcf7 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Wed, 3 Jul 2024 17:55:21 +0200 Subject: [PATCH 11/34] TL/CUDA: wip multistep --- src/components/tl/cuda/bcast/bcast_linear.c | 66 ++++++++++++++++----- src/components/tl/cuda/tl_cuda.h | 1 + 2 files changed, 52 insertions(+), 15 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index f6712c2c46..743ddab245 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -62,6 +62,17 @@ ucc_status_t ucc_tl_cuda_bcast_linear_setup_test(ucc_tl_cuda_task_t *task) return ucc_tl_cuda_shm_barrier_test(UCC_TL_TEAM_RANK(team), task->bar); } +static inline size_t get_scratch_size(ucc_tl_cuda_team_t *team, + ucc_datatype_t 
dt) +{ + size_t dt_size = ucc_dt_size(dt); + ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); + + ucc_assert((dt_size > 0) && (tsize > 0)); + + return UCC_TL_CUDA_TEAM_LIB(team)->cfg.scratch_size; +} + static inline ucc_status_t ecopy(void *dst, void *src, size_t size, ucc_ee_executor_t *exec, ucc_ee_executor_task_t **etask) @@ -90,6 +101,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_tl_cuda_team_t *team = TASK_TEAM(task); ucc_rank_t trank = UCC_TL_TEAM_RANK(team); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); + ucc_datatype_t dt = task->bcast_linear.dt; ucc_status_t st; (void)team; (void)st; @@ -136,14 +148,23 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) break; } + size_t scratch_size = get_scratch_size(team, dt); + size_t chunk_size = task->bcast_linear.step < task->bcast_linear.num_steps + ? ucc_min(scratch_size, task->bcast_linear.size) + : task->bcast_linear.size - + (task->bcast_linear.step - 1) * scratch_size; + size_t offset_buff = task->bcast_linear.step * scratch_size; + + // ucc_info("chunk_size: %ld", chunk_size); + if (trank == task->bcast_linear.root) { // fall-through between cases is intentional switch (task->bcast_linear.stage) { case STAGE_COPY: // copy from src buffer to scratch dbuf = TASK_SCRATCH(task, trank); - sbuf = task->bcast_linear.sbuf; - status = ecopy(dbuf, sbuf, task->bcast_linear.size, exec, + sbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); + status = ecopy(dbuf, sbuf, chunk_size, exec, &task->bcast_linear.exec_task); task->bcast_linear.stage = STAGE_WAIT_COPY; break; @@ -156,7 +177,8 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.exec_task = NULL; // signal others ++task->bcast_linear.step; - set_rank_step(task, task->bcast_linear.root, task->bcast_linear.step, 0); + set_rank_step(task, task->bcast_linear.root, + task->bcast_linear.step, 0); task->bcast_linear.stage = STAGE_WAIT_ALL; } } @@ -173,8 +195,11 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) } task->bcast_linear.stage = STAGE_COPY; // ucc_info("all others ready for next step"); - // TODO: remove - task->bcast_linear.stage = STAGE_DONE; + if (task->bcast_linear.stage < task->bcast_linear.num_steps) { + task->bcast_linear.stage = STAGE_COPY; + } else { + task->bcast_linear.stage = STAGE_DONE; + } break; case STAGE_DONE: task->super.status = UCC_OK; @@ -196,11 +221,13 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) } break; case STAGE_CLIENT_COPY: - dbuf = task->bcast_linear.sbuf; - sbuf = TASK_SCRATCH(task, - task->bcast_linear.root); // need to copy from root's scratch buffer - status = ecopy(dbuf, sbuf, task->bcast_linear.size, exec, - &task->bcast_linear.exec_task); + dbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); + sbuf = TASK_SCRATCH( + task, + task->bcast_linear + .root); // need to copy from root's scratch buffer + status = ecopy(dbuf, sbuf, chunk_size, exec, + &task->bcast_linear.exec_task); task->bcast_linear.stage = STAGE_CLIENT_COPY_WAIT; break; case STAGE_CLIENT_COPY_WAIT: @@ -212,8 +239,14 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.exec_task = NULL; ++task->bcast_linear.step; set_rank_step(task, trank, task->bcast_linear.step, 0); - task->bcast_linear.stage = - STAGE_DONE; // TODO: just for debug + // task->bcast_linear.stage = + // STAGE_DONE; // TODO: just for debug + if (task->bcast_linear.stage < + task->bcast_linear.num_steps) { + task->bcast_linear.stage = STAGE_COPY; + } else { + 
task->bcast_linear.stage = STAGE_DONE; + } } } break; @@ -235,7 +268,6 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) ucc_datatype_t dt = task->bcast_linear.dt; (void)tsize; - (void)args; (void)dt; task->bcast_linear.stage = STAGE_SYNC; @@ -244,8 +276,12 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) args->src.info.count); task->bcast_linear.size = ucc_dt_size(dt) * args->src.info.count; + size_t scratch_size = get_scratch_size(team, dt); + task->bcast_linear.num_steps = + ucc_div_round_up(task->bcast_linear.size, scratch_size); - ucc_info("bcast buffer size: %ld", task->bcast_linear.size); + ucc_info("bcast buffer size: %ld, num_steps: %d", task->bcast_linear.size, + task->bcast_linear.num_steps); task->bcast_linear.sbuf = args->src.info.buffer; task->bcast_linear.step = 0; @@ -275,7 +311,7 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, } task->bcast_linear.root = coll_args->args.root; - task->bcast_linear.dt = coll_args->args.src.info.datatype; + task->bcast_linear.dt = coll_args->args.src.info.datatype; ucc_info("bcast init with dt: %s", ucc_datatype_str(task->bcast_linear.dt)); task->bcast_linear.sbuf = coll_args->args.src.info.buffer; diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index 38608973ac..096bab1a0e 100644 --- a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -233,6 +233,7 @@ struct ucc_tl_cuda_task { ucc_datatype_t dt; ucc_rank_t root; size_t size; + int num_steps; ucc_ee_executor_task_t *exec_task; } bcast_linear; struct { From e400efdc35126c11a86d509926f8ef99b910a6a3 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Thu, 4 Jul 2024 16:55:44 +0200 Subject: [PATCH 12/34] TL/CUDA: fix step check --- src/components/tl/cuda/bcast/bcast_linear.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 743ddab245..ad55cfc1a3 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -195,7 +195,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) } task->bcast_linear.stage = STAGE_COPY; // ucc_info("all others ready for next step"); - if (task->bcast_linear.stage < task->bcast_linear.num_steps) { + if (task->bcast_linear.step < task->bcast_linear.num_steps) { task->bcast_linear.stage = STAGE_COPY; } else { task->bcast_linear.stage = STAGE_DONE; @@ -241,9 +241,9 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) set_rank_step(task, trank, task->bcast_linear.step, 0); // task->bcast_linear.stage = // STAGE_DONE; // TODO: just for debug - if (task->bcast_linear.stage < + if (task->bcast_linear.step < task->bcast_linear.num_steps) { - task->bcast_linear.stage = STAGE_COPY; + task->bcast_linear.stage = STAGE_WAIT_ROOT; } else { task->bcast_linear.stage = STAGE_DONE; } From 2c94cbff6f3156548a95e978fcdb88a9c59cfc3d Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Thu, 4 Jul 2024 17:37:30 +0200 Subject: [PATCH 13/34] TL/CUDA: minor cleanup --- src/components/tl/cuda/bcast/bcast_linear.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index ad55cfc1a3..67dd083043 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -62,14 +62,8 @@ ucc_status_t 
ucc_tl_cuda_bcast_linear_setup_test(ucc_tl_cuda_task_t *task) return ucc_tl_cuda_shm_barrier_test(UCC_TL_TEAM_RANK(team), task->bar); } -static inline size_t get_scratch_size(ucc_tl_cuda_team_t *team, - ucc_datatype_t dt) +static inline size_t get_raw_scratch_size(ucc_tl_cuda_team_t *team) { - size_t dt_size = ucc_dt_size(dt); - ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); - - ucc_assert((dt_size > 0) && (tsize > 0)); - return UCC_TL_CUDA_TEAM_LIB(team)->cfg.scratch_size; } @@ -101,8 +95,8 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_tl_cuda_team_t *team = TASK_TEAM(task); ucc_rank_t trank = UCC_TL_TEAM_RANK(team); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); - ucc_datatype_t dt = task->bcast_linear.dt; - ucc_status_t st; + // ucc_datatype_t dt = task->bcast_linear.dt; + ucc_status_t st; (void)team; (void)st; ucc_ee_executor_t *exec; @@ -148,7 +142,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) break; } - size_t scratch_size = get_scratch_size(team, dt); + size_t scratch_size = get_raw_scratch_size(team); size_t chunk_size = task->bcast_linear.step < task->bcast_linear.num_steps ? ucc_min(scratch_size, task->bcast_linear.size) : task->bcast_linear.size - @@ -239,8 +233,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.exec_task = NULL; ++task->bcast_linear.step; set_rank_step(task, trank, task->bcast_linear.step, 0); - // task->bcast_linear.stage = - // STAGE_DONE; // TODO: just for debug if (task->bcast_linear.step < task->bcast_linear.num_steps) { task->bcast_linear.stage = STAGE_WAIT_ROOT; @@ -276,7 +268,7 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) args->src.info.count); task->bcast_linear.size = ucc_dt_size(dt) * args->src.info.count; - size_t scratch_size = get_scratch_size(team, dt); + size_t scratch_size = get_raw_scratch_size(team); task->bcast_linear.num_steps = ucc_div_round_up(task->bcast_linear.size, scratch_size); From 3ff154bb5d01d3d24d6d5cea653e41cf8ecfb2ff Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Fri, 5 Jul 2024 15:23:33 +0200 Subject: [PATCH 14/34] TL/CUDA: removed breaks --- src/components/tl/cuda/bcast/bcast_linear.c | 94 +++++++++++++-------- 1 file changed, 59 insertions(+), 35 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 67dd083043..57377699b4 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -96,17 +96,15 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_rank_t trank = UCC_TL_TEAM_RANK(team); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); // ucc_datatype_t dt = task->bcast_linear.dt; - ucc_status_t st; - (void)team; - (void)st; + ucc_ee_executor_t *exec; ucc_ee_executor_task_t *etask; - ucc_status_t status; + ucc_status_t st; void *sbuf, *dbuf; task->super.status = UCC_INPROGRESS; - status = ucc_coll_task_get_executor(&task->super, &exec); - if (ucc_unlikely(status != UCC_OK)) { + st = ucc_coll_task_get_executor(&task->super, &exec); + if (ucc_unlikely(st != UCC_OK)) { return; } @@ -152,21 +150,28 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) // ucc_info("chunk_size: %ld", chunk_size); if (trank == task->bcast_linear.root) { + // Root scenario // fall-through between cases is intentional switch (task->bcast_linear.stage) { case STAGE_COPY: // copy from src buffer to scratch - dbuf = TASK_SCRATCH(task, trank); - sbuf = PTR_OFFSET(task->bcast_linear.sbuf, 
offset_buff); - status = ecopy(dbuf, sbuf, chunk_size, exec, - &task->bcast_linear.exec_task); + dbuf = TASK_SCRATCH(task, trank); + sbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); + st = ecopy(dbuf, sbuf, chunk_size, exec, + &task->bcast_linear.exec_task); + if (st != UCC_OK) + { + ucc_error("failed to post ecopy task"); + task->super.status = st; + return; + } task->bcast_linear.stage = STAGE_WAIT_COPY; - break; + // break; case STAGE_WAIT_COPY: etask = task->bcast_linear.exec_task; if (etask) { - status = ucc_ee_executor_task_test(etask); - if (status == UCC_OK) { + st = ucc_ee_executor_task_test(etask); + if (st == UCC_OK) { ucc_ee_executor_task_finalize(etask); task->bcast_linear.exec_task = NULL; // signal others @@ -175,33 +180,42 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.step, 0); task->bcast_linear.stage = STAGE_WAIT_ALL; } + else + { + // ucc_info("not ready"); + return; + } } - break; + else + { + ucc_info("etask is nullptr"); + return; + } + // break; case STAGE_WAIT_ALL: for (int i = 0; i < tsize; ++i) { - int other_rank_step = get_rank_step(task, i, 0); - // ucc_info("rank %d, step: %d, my step: %d", i, other_rank_step, - // task->bcast_linear.step); - if (other_rank_step < task->bcast_linear.step) { - // ucc_info("rank %d is not ready", i); + if (get_rank_step(task, i, 0) < task->bcast_linear.step) { + // rank is not ready, lets wait return; } } task->bcast_linear.stage = STAGE_COPY; // ucc_info("all others ready for next step"); if (task->bcast_linear.step < task->bcast_linear.num_steps) { + // go to next iteration task->bcast_linear.stage = STAGE_COPY; + return; } else { + // finish task->bcast_linear.stage = STAGE_DONE; } - break; + // break; case STAGE_DONE: task->super.status = UCC_OK; break; default: break; } - // Root scenario } else { // others switch (task->bcast_linear.stage) { @@ -213,22 +227,26 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.stage = STAGE_CLIENT_COPY; break; } - break; + else + { + return; + } + // break; case STAGE_CLIENT_COPY: dbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); sbuf = TASK_SCRATCH( task, task->bcast_linear .root); // need to copy from root's scratch buffer - status = ecopy(dbuf, sbuf, chunk_size, exec, + st = ecopy(dbuf, sbuf, chunk_size, exec, &task->bcast_linear.exec_task); task->bcast_linear.stage = STAGE_CLIENT_COPY_WAIT; - break; + // break; case STAGE_CLIENT_COPY_WAIT: etask = task->bcast_linear.exec_task; if (etask) { - status = ucc_ee_executor_task_test(etask); - if (status == UCC_OK) { + st = ucc_ee_executor_task_test(etask); + if (st == UCC_OK) { ucc_ee_executor_task_finalize(etask); task->bcast_linear.exec_task = NULL; ++task->bcast_linear.step; @@ -236,12 +254,21 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) if (task->bcast_linear.step < task->bcast_linear.num_steps) { task->bcast_linear.stage = STAGE_WAIT_ROOT; + return; } else { task->bcast_linear.stage = STAGE_DONE; } } + else + { + return; + } } - break; + else + { + return; + } + // break; case STAGE_DONE: task->super.status = UCC_OK; break; @@ -253,14 +280,11 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) { - ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); - ucc_tl_cuda_team_t *team = TASK_TEAM(task); - ucc_coll_args_t *args = &TASK_ARGS(task); - ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); - ucc_datatype_t dt = 
task->bcast_linear.dt; - - (void)tsize; - (void)dt; + ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); + ucc_tl_cuda_team_t *team = TASK_TEAM(task); + ucc_coll_args_t *args = &TASK_ARGS(task); + // ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); + ucc_datatype_t dt = task->bcast_linear.dt; task->bcast_linear.stage = STAGE_SYNC; From 162680879de7aefc30f7c69e02d764a6e90802fe Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Sat, 6 Jul 2024 17:16:04 +0200 Subject: [PATCH 15/34] TL/CUDA: fix linter --- src/components/tl/cuda/bcast/bcast_linear.c | 49 +++++++++------------ 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 57377699b4..ed419f7b00 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -6,7 +6,8 @@ #include "bcast/bcast.h" -enum { +enum +{ STAGE_DONE, STAGE_SYNC, STAGE_SETUP, @@ -68,7 +69,7 @@ static inline size_t get_raw_scratch_size(ucc_tl_cuda_team_t *team) } static inline ucc_status_t ecopy(void *dst, void *src, size_t size, - ucc_ee_executor_t *exec, + ucc_ee_executor_t * exec, ucc_ee_executor_task_t **etask) { ucc_ee_executor_task_args_t exec_args = {0}; @@ -97,10 +98,10 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); // ucc_datatype_t dt = task->bcast_linear.dt; - ucc_ee_executor_t *exec; + ucc_ee_executor_t * exec; ucc_ee_executor_task_t *etask; ucc_status_t st; - void *sbuf, *dbuf; + void * sbuf, *dbuf; task->super.status = UCC_INPROGRESS; st = ucc_coll_task_get_executor(&task->super, &exec); @@ -158,9 +159,8 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) dbuf = TASK_SCRATCH(task, trank); sbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); st = ecopy(dbuf, sbuf, chunk_size, exec, - &task->bcast_linear.exec_task); - if (st != UCC_OK) - { + &task->bcast_linear.exec_task); + if (st != UCC_OK) { ucc_error("failed to post ecopy task"); task->super.status = st; return; @@ -179,15 +179,11 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) set_rank_step(task, task->bcast_linear.root, task->bcast_linear.step, 0); task->bcast_linear.stage = STAGE_WAIT_ALL; - } - else - { + } else { // ucc_info("not ready"); return; } - } - else - { + } else { ucc_info("etask is nullptr"); return; } @@ -226,9 +222,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) // ucc_info("something from root is ready!"); task->bcast_linear.stage = STAGE_CLIENT_COPY; break; - } - else - { + } else { return; } // break; @@ -238,8 +232,13 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task, task->bcast_linear .root); // need to copy from root's scratch buffer - st = ecopy(dbuf, sbuf, chunk_size, exec, - &task->bcast_linear.exec_task); + st = ecopy(dbuf, sbuf, chunk_size, exec, + &task->bcast_linear.exec_task); + if (st != UCC_OK) { + ucc_error("failed to post ecopy task at client"); + task->super.status = st; + return; + } task->bcast_linear.stage = STAGE_CLIENT_COPY_WAIT; // break; case STAGE_CLIENT_COPY_WAIT: @@ -258,14 +257,10 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) } else { task->bcast_linear.stage = STAGE_DONE; } - } - else - { + } else { return; } - } - else - { + } else { return; } // break; @@ -282,7 +277,7 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) { ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, 
ucc_tl_cuda_task_t); ucc_tl_cuda_team_t *team = TASK_TEAM(task); - ucc_coll_args_t *args = &TASK_ARGS(task); + ucc_coll_args_t * args = &TASK_ARGS(task); // ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); ucc_datatype_t dt = task->bcast_linear.dt; @@ -306,8 +301,8 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) } ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t *tl_team, - ucc_coll_task_t **task_p) + ucc_base_team_t * tl_team, + ucc_coll_task_t ** task_p) { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); ucc_tl_cuda_task_t *task; From bb555de7049d3742eb0ba80c789210194e5ba3ef Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Thu, 11 Jul 2024 20:55:47 +0200 Subject: [PATCH 16/34] TL/CUDA: double buffering --- src/components/tl/cuda/bcast/bcast_linear.c | 68 ++++++++------------- 1 file changed, 24 insertions(+), 44 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index ed419f7b00..6099d3e306 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -96,7 +96,13 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_tl_cuda_team_t *team = TASK_TEAM(task); ucc_rank_t trank = UCC_TL_TEAM_RANK(team); ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); - // ucc_datatype_t dt = task->bcast_linear.dt; + size_t half_scratch_size = get_raw_scratch_size(team) / 2; + size_t chunk_size = + task->bcast_linear.step < task->bcast_linear.num_steps + ? ucc_min(half_scratch_size, task->bcast_linear.size) + : task->bcast_linear.size - + (task->bcast_linear.step - 1) * half_scratch_size; + size_t offset_buff = task->bcast_linear.step * half_scratch_size; ucc_ee_executor_t * exec; ucc_ee_executor_task_t *etask; @@ -111,21 +117,18 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) switch (task->bcast_linear.stage) { case STAGE_SYNC: - // ucc_info("sync"); if (ucc_tl_cuda_get_sync(task) != UCC_OK) { task->super.status = UCC_INPROGRESS; return; } task->bcast_linear.step = 0; - // ucc_info("setup"); - st = ucc_tl_cuda_bcast_linear_setup_start(task); + st = ucc_tl_cuda_bcast_linear_setup_start(task); if (st != UCC_OK) { task->super.status = st; return; } task->bcast_linear.stage = STAGE_SETUP; case STAGE_SETUP: - // ucc_info("test"); st = ucc_tl_cuda_bcast_linear_setup_test(task); if (st != UCC_OK) { task->super.status = st; @@ -141,22 +144,14 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) break; } - size_t scratch_size = get_raw_scratch_size(team); - size_t chunk_size = task->bcast_linear.step < task->bcast_linear.num_steps - ? 
ucc_min(scratch_size, task->bcast_linear.size) - : task->bcast_linear.size - - (task->bcast_linear.step - 1) * scratch_size; - size_t offset_buff = task->bcast_linear.step * scratch_size; - - // ucc_info("chunk_size: %ld", chunk_size); - if (trank == task->bcast_linear.root) { // Root scenario // fall-through between cases is intentional switch (task->bcast_linear.stage) { case STAGE_COPY: // copy from src buffer to scratch - dbuf = TASK_SCRATCH(task, trank); + dbuf = PTR_OFFSET(TASK_SCRATCH(task, trank), + task->bcast_linear.step % 2 * half_scratch_size); sbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); st = ecopy(dbuf, sbuf, chunk_size, exec, &task->bcast_linear.exec_task); @@ -166,7 +161,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) return; } task->bcast_linear.stage = STAGE_WAIT_COPY; - // break; case STAGE_WAIT_COPY: etask = task->bcast_linear.exec_task; if (etask) { @@ -180,23 +174,22 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.step, 0); task->bcast_linear.stage = STAGE_WAIT_ALL; } else { - // ucc_info("not ready"); + // not ready return; } } else { ucc_info("etask is nullptr"); return; } - // break; case STAGE_WAIT_ALL: for (int i = 0; i < tsize; ++i) { - if (get_rank_step(task, i, 0) < task->bcast_linear.step) { + // need to wait until all ranks complete step - 1, because of double buffering + if (get_rank_step(task, i, 0) < task->bcast_linear.step - 1) { // rank is not ready, lets wait return; } } task->bcast_linear.stage = STAGE_COPY; - // ucc_info("all others ready for next step"); if (task->bcast_linear.step < task->bcast_linear.num_steps) { // go to next iteration task->bcast_linear.stage = STAGE_COPY; @@ -205,7 +198,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) // finish task->bcast_linear.stage = STAGE_DONE; } - // break; case STAGE_DONE: task->super.status = UCC_OK; break; @@ -213,7 +205,8 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) break; } } else { - // others + // clients + // fall-through between cases is intentional switch (task->bcast_linear.stage) { case STAGE_WAIT_ROOT: /* code */ @@ -225,14 +218,12 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) } else { return; } - // break; case STAGE_CLIENT_COPY: dbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); - sbuf = TASK_SCRATCH( - task, - task->bcast_linear - .root); // need to copy from root's scratch buffer - st = ecopy(dbuf, sbuf, chunk_size, exec, + // need to copy from root's scratch buffer + sbuf = PTR_OFFSET(TASK_SCRATCH(task, task->bcast_linear.root), + task->bcast_linear.step % 2 * chunk_size); + st = ecopy(dbuf, sbuf, chunk_size, exec, &task->bcast_linear.exec_task); if (st != UCC_OK) { ucc_error("failed to post ecopy task at client"); @@ -240,7 +231,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) return; } task->bcast_linear.stage = STAGE_CLIENT_COPY_WAIT; - // break; case STAGE_CLIENT_COPY_WAIT: etask = task->bcast_linear.exec_task; if (etask) { @@ -263,7 +253,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) } else { return; } - // break; case STAGE_DONE: task->super.status = UCC_OK; break; @@ -278,20 +267,17 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); ucc_tl_cuda_team_t *team = TASK_TEAM(task); ucc_coll_args_t * args = &TASK_ARGS(task); - // ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); - ucc_datatype_t dt 
= task->bcast_linear.dt; + ucc_datatype_t dt = task->bcast_linear.dt; + size_t half_scratch_size = get_raw_scratch_size(team) / 2; task->bcast_linear.stage = STAGE_SYNC; - ucc_info("bcast start with dt: %s and count: %ld", ucc_datatype_str(dt), - args->src.info.count); - task->bcast_linear.size = ucc_dt_size(dt) * args->src.info.count; - size_t scratch_size = get_raw_scratch_size(team); task->bcast_linear.num_steps = - ucc_div_round_up(task->bcast_linear.size, scratch_size); + ucc_div_round_up(task->bcast_linear.size, half_scratch_size); - ucc_info("bcast buffer size: %ld, num_steps: %d", task->bcast_linear.size, + ucc_info("bcast dt: %s, buffer size: %ld, num_steps: %d", + ucc_datatype_str(dt), task->bcast_linear.size, task->bcast_linear.num_steps); task->bcast_linear.sbuf = args->src.info.buffer; @@ -308,8 +294,6 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, ucc_tl_cuda_task_t *task; ucc_status_t status; - ucc_info("bcast init"); - if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) || UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) { @@ -323,8 +307,6 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, task->bcast_linear.root = coll_args->args.root; task->bcast_linear.dt = coll_args->args.src.info.datatype; - ucc_info("bcast init with dt: %s", ucc_datatype_str(task->bcast_linear.dt)); - task->bcast_linear.sbuf = coll_args->args.src.info.buffer; task->super.flags |= UCC_COLL_TASK_FLAG_EXECUTOR; @@ -333,8 +315,6 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, task->super.finalize = ucc_tl_cuda_bcast_linear_finalize; task->bar = TASK_BAR(task); - ucc_info("bcast init success"); - *task_p = &task->super; return UCC_OK; } From 190f1e386c5a3a7b57249ababb32f1ce727816a2 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Fri, 2 Aug 2024 17:28:48 +0200 Subject: [PATCH 17/34] TL/CUDA: moved get/set rank step --- .../tl/cuda/allgatherv/allgatherv_linear.c | 16 ---------------- src/components/tl/cuda/bcast/bcast_linear.c | 17 ----------------- .../reduce_scatterv/reduce_scatterv_linear.c | 16 ---------------- src/components/tl/cuda/tl_cuda_coll.h | 17 +++++++++++++++++ src/components/tl/cuda/tl_cuda_ring.h | 16 ---------------- 5 files changed, 17 insertions(+), 65 deletions(-) diff --git a/src/components/tl/cuda/allgatherv/allgatherv_linear.c b/src/components/tl/cuda/allgatherv/allgatherv_linear.c index 0fca5c6af6..41499c136d 100644 --- a/src/components/tl/cuda/allgatherv/allgatherv_linear.c +++ b/src/components/tl/cuda/allgatherv/allgatherv_linear.c @@ -55,22 +55,6 @@ enum * other ranks to finish */ }; -static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, - int step_id) -{ - ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); - - return sync->seq_num[step_id]; -} - -static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, - int step, int step_id) -{ - ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); - - sync->seq_num[step_id] = step; -} - ucc_status_t ucc_tl_cuda_allgatherv_linear_finalize(ucc_coll_task_t *coll_task) { ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 6099d3e306..6ee642eac7 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -21,23 +21,6 @@ enum STAGE_CLIENT_COPY_WAIT, }; -// TODO: move out to common with allgather -static inline 
int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, - int step_id) -{ - ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); - - return sync->seq_num[step_id]; -} - -static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, - int step, int step_id) -{ - ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); - - sync->seq_num[step_id] = step; -} - ucc_status_t ucc_tl_cuda_bcast_linear_setup_start(ucc_tl_cuda_task_t *task) { ucc_tl_cuda_team_t *team = TASK_TEAM(task); diff --git a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c index 6a1ec5b22c..3e0595fa44 100644 --- a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c +++ b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c @@ -59,22 +59,6 @@ enum * other ranks to finish */ }; -static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, - int step_id) -{ - ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); - - return sync->seq_num[step_id]; -} - -static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, - int step, int step_id) -{ - ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); - - sync->seq_num[step_id] = step; -} - ucc_status_t ucc_tl_cuda_reduce_scatterv_linear_finalize(ucc_coll_task_t *coll_task) { diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h index f450ff950c..55b86e2cee 100644 --- a/src/components/tl/cuda/tl_cuda_coll.h +++ b/src/components/tl/cuda/tl_cuda_coll.h @@ -156,4 +156,21 @@ ucc_status_t ucc_tl_cuda_alg_id_to_init(int alg_id, const char *alg_id_str, ucc_memory_type_t mem_type, ucc_base_coll_init_fn_t *init); +// common utils function for collectives: +static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, + int step_id) +{ + ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); + + return sync->seq_num[step_id]; +} + +static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, + int step, int step_id) +{ + ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); + + sync->seq_num[step_id] = step; +} + #endif diff --git a/src/components/tl/cuda/tl_cuda_ring.h b/src/components/tl/cuda/tl_cuda_ring.h index cc2d3c95db..621e074184 100644 --- a/src/components/tl/cuda/tl_cuda_ring.h +++ b/src/components/tl/cuda/tl_cuda_ring.h @@ -83,20 +83,4 @@ static inline ucc_rank_t get_recv_block(ucc_tl_cuda_team_t *team, return ring->ring[(ring->iring[trank] + tsize - step - 1) % tsize]; } -static inline int get_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, - int ring_id) -{ - ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); - - return sync->seq_num[ring_id]; -} - -static inline void set_rank_step(ucc_tl_cuda_task_t *task, ucc_rank_t rank, - int step, int ring_id) -{ - ucc_tl_cuda_sync_t *sync = TASK_SYNC(task, rank); - - sync->seq_num[ring_id] = step; -} - #endif From d76629c1443c97fc32650f31f78287f717f75267 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Fri, 2 Aug 2024 18:02:10 +0200 Subject: [PATCH 18/34] TL/CUDA: changed logs to debug lvl --- src/components/tl/cuda/bcast/bcast_linear.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 6ee642eac7..762edba4fb 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -161,7 +161,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) return; } } else { - 
ucc_info("etask is nullptr"); + ucc_debug("etask is nullptr"); return; } case STAGE_WAIT_ALL: @@ -195,7 +195,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) /* code */ if (get_rank_step(task, task->bcast_linear.root, 0) > task->bcast_linear.step) { - // ucc_info("something from root is ready!"); task->bcast_linear.stage = STAGE_CLIENT_COPY; break; } else { @@ -259,9 +258,9 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) task->bcast_linear.num_steps = ucc_div_round_up(task->bcast_linear.size, half_scratch_size); - ucc_info("bcast dt: %s, buffer size: %ld, num_steps: %d", - ucc_datatype_str(dt), task->bcast_linear.size, - task->bcast_linear.num_steps); + ucc_debug("bcast linear dt: %s, buffer size: %ld, num_steps: %d", + ucc_datatype_str(dt), task->bcast_linear.size, + task->bcast_linear.num_steps); task->bcast_linear.sbuf = args->src.info.buffer; task->bcast_linear.step = 0; From e444cda0efc83a0b657cc32c6fa233de42522870 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Fri, 2 Aug 2024 18:11:16 +0200 Subject: [PATCH 19/34] TL/CUDA: minor cleanups --- src/components/tl/cuda/bcast/bcast_linear.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 762edba4fb..177119d503 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -16,9 +16,9 @@ enum STAGE_WAIT_COPY, // wait for copy finishes STAGE_WAIT_ALL, // wait for all others rank be on same step // non-root - STAGE_WAIT_ROOT, - STAGE_CLIENT_COPY, - STAGE_CLIENT_COPY_WAIT, + STAGE_WAIT_ROOT, // clients wait while root writes to own scratch buffer + STAGE_CLIENT_COPY, // clients submit copy task + STAGE_CLIENT_COPY_WAIT, // clients wait completion of copy from root's scratch }; ucc_status_t ucc_tl_cuda_bcast_linear_setup_start(ucc_tl_cuda_task_t *task) @@ -91,6 +91,8 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_ee_executor_task_t *etask; ucc_status_t st; void * sbuf, *dbuf; + int i; + task->super.status = UCC_INPROGRESS; st = ucc_coll_task_get_executor(&task->super, &exec); @@ -165,7 +167,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) return; } case STAGE_WAIT_ALL: - for (int i = 0; i < tsize; ++i) { + for (i = 0; i < tsize; ++i) { // need to wait until all ranks complete step - 1, because of double buffering if (get_rank_step(task, i, 0) < task->bcast_linear.step - 1) { // rank is not ready, lets wait @@ -192,7 +194,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) // fall-through between cases is intentional switch (task->bcast_linear.stage) { case STAGE_WAIT_ROOT: - /* code */ if (get_rank_step(task, task->bcast_linear.root, 0) > task->bcast_linear.step) { task->bcast_linear.stage = STAGE_CLIENT_COPY; @@ -201,8 +202,8 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) return; } case STAGE_CLIENT_COPY: - dbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); // need to copy from root's scratch buffer + dbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); sbuf = PTR_OFFSET(TASK_SCRATCH(task, task->bcast_linear.root), task->bcast_linear.step % 2 * chunk_size); st = ecopy(dbuf, sbuf, chunk_size, exec, From 1ca88667b44623718d6c895c17453ac650eae400 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Wed, 21 Aug 2024 15:21:09 +0200 Subject: [PATCH 20/34] TL/CUDA: addressed comments --- 
src/components/tl/cuda/allgather/allgather.c | 2 +- .../tl/cuda/allgather/allgather_linear.c | 2 +- .../tl/cuda/allgatherv/allgatherv.c | 2 +- .../tl/cuda/allgatherv/allgatherv_linear.c | 2 +- src/components/tl/cuda/bcast/bcast.c | 2 +- src/components/tl/cuda/bcast/bcast_linear.c | 23 ++++++++----------- .../tl/cuda/reduce_scatter/reduce_scatter.c | 2 +- .../reduce_scatter/reduce_scatter_linear.c | 2 +- .../tl/cuda/reduce_scatterv/reduce_scatterv.c | 2 +- .../reduce_scatterv/reduce_scatterv_linear.c | 2 +- src/components/tl/cuda/tl_cuda.h | 1 - src/components/tl/cuda/tl_cuda_team_topo.h | 2 +- 12 files changed, 20 insertions(+), 24 deletions(-) diff --git a/src/components/tl/cuda/allgather/allgather.c b/src/components/tl/cuda/allgather/allgather.c index 01996da4da..1e64c0a582 100644 --- a/src/components/tl/cuda/allgather/allgather.c +++ b/src/components/tl/cuda/allgather/allgather.c @@ -44,7 +44,7 @@ ucc_status_t ucc_tl_cuda_allgather_init(ucc_base_coll_args_t *coll_args, { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); - if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) { + if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) { return ucc_tl_cuda_allgather_linear_init(coll_args, tl_team, task_p); } else { return ucc_tl_cuda_allgather_ring_init(coll_args, tl_team, task_p); diff --git a/src/components/tl/cuda/allgather/allgather_linear.c b/src/components/tl/cuda/allgather/allgather_linear.c index ed228d1683..fefc774628 100644 --- a/src/components/tl/cuda/allgather/allgather_linear.c +++ b/src/components/tl/cuda/allgather/allgather_linear.c @@ -15,7 +15,7 @@ ucc_status_t ucc_tl_cuda_allgather_linear_init(ucc_base_coll_args_t *coll_args, ucc_tl_cuda_task_t *task; ucc_status_t status; - if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) || + if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) || UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) { return UCC_ERR_NOT_SUPPORTED; } diff --git a/src/components/tl/cuda/allgatherv/allgatherv.c b/src/components/tl/cuda/allgatherv/allgatherv.c index 5a8f78c481..76da65fa65 100644 --- a/src/components/tl/cuda/allgatherv/allgatherv.c +++ b/src/components/tl/cuda/allgatherv/allgatherv.c @@ -47,7 +47,7 @@ ucc_status_t ucc_tl_cuda_allgatherv_init(ucc_base_coll_args_t *coll_args, { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); - if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) { + if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) { return ucc_tl_cuda_allgatherv_linear_init(coll_args, tl_team, task_p); } else { return ucc_tl_cuda_allgatherv_ring_init(coll_args, tl_team, task_p); diff --git a/src/components/tl/cuda/allgatherv/allgatherv_linear.c b/src/components/tl/cuda/allgatherv/allgatherv_linear.c index 41499c136d..1f02ad37bd 100644 --- a/src/components/tl/cuda/allgatherv/allgatherv_linear.c +++ b/src/components/tl/cuda/allgatherv/allgatherv_linear.c @@ -416,7 +416,7 @@ ucc_status_t ucc_tl_cuda_allgatherv_linear_init(ucc_base_coll_args_t *coll_args, ucc_tl_cuda_task_t *task; ucc_status_t status; - if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) || + if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) || UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) { return UCC_ERR_NOT_SUPPORTED; } diff --git a/src/components/tl/cuda/bcast/bcast.c b/src/components/tl/cuda/bcast/bcast.c index 46623684fe..d687d924a0 100644 --- a/src/components/tl/cuda/bcast/bcast.c +++ 
b/src/components/tl/cuda/bcast/bcast.c @@ -20,7 +20,7 @@ ucc_status_t ucc_tl_cuda_bcast_init(ucc_base_coll_args_t *coll_args, { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); - if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) { + if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) { return ucc_tl_cuda_bcast_linear_init(coll_args, tl_team, task_p); } else { return UCC_ERR_NOT_SUPPORTED; diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 177119d503..f0ba528635 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -6,8 +6,7 @@ #include "bcast/bcast.h" -enum -{ +enum { STAGE_DONE, STAGE_SYNC, STAGE_SETUP, @@ -52,7 +51,7 @@ static inline size_t get_raw_scratch_size(ucc_tl_cuda_team_t *team) } static inline ucc_status_t ecopy(void *dst, void *src, size_t size, - ucc_ee_executor_t * exec, + ucc_ee_executor_t *exec, ucc_ee_executor_task_t **etask) { ucc_ee_executor_task_args_t exec_args = {0}; @@ -86,11 +85,10 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) : task->bcast_linear.size - (task->bcast_linear.step - 1) * half_scratch_size; size_t offset_buff = task->bcast_linear.step * half_scratch_size; - - ucc_ee_executor_t * exec; + ucc_ee_executor_t *exec; ucc_ee_executor_task_t *etask; ucc_status_t st; - void * sbuf, *dbuf; + void *sbuf, *dbuf; int i; task->super.status = UCC_INPROGRESS; @@ -103,7 +101,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) switch (task->bcast_linear.stage) { case STAGE_SYNC: if (ucc_tl_cuda_get_sync(task) != UCC_OK) { - task->super.status = UCC_INPROGRESS; return; } task->bcast_linear.step = 0; @@ -139,7 +136,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.step % 2 * half_scratch_size); sbuf = PTR_OFFSET(task->bcast_linear.sbuf, offset_buff); st = ecopy(dbuf, sbuf, chunk_size, exec, - &task->bcast_linear.exec_task); + &task->bcast_linear.exec_task); if (st != UCC_OK) { ucc_error("failed to post ecopy task"); task->super.status = st; @@ -207,7 +204,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) sbuf = PTR_OFFSET(TASK_SCRATCH(task, task->bcast_linear.root), task->bcast_linear.step % 2 * chunk_size); st = ecopy(dbuf, sbuf, chunk_size, exec, - &task->bcast_linear.exec_task); + &task->bcast_linear.exec_task); if (st != UCC_OK) { ucc_error("failed to post ecopy task at client"); task->super.status = st; @@ -249,7 +246,7 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) { ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); ucc_tl_cuda_team_t *team = TASK_TEAM(task); - ucc_coll_args_t * args = &TASK_ARGS(task); + ucc_coll_args_t *args = &TASK_ARGS(task); ucc_datatype_t dt = task->bcast_linear.dt; size_t half_scratch_size = get_raw_scratch_size(team) / 2; @@ -270,14 +267,14 @@ ucc_status_t ucc_tl_cuda_bcast_linear_start(ucc_coll_task_t *coll_task) } ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, - ucc_base_team_t * tl_team, - ucc_coll_task_t ** task_p) + ucc_base_team_t *tl_team, + ucc_coll_task_t **task_p) { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); ucc_tl_cuda_task_t *task; ucc_status_t status; - if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) || + if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) || UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) { 
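/*
 * Illustrative sketch (not code from this series): every linear algorithm
 * touched in this commit repeats the same eligibility gate.  Assuming the
 * TL/CUDA headers, it reduces to the predicate below -- the team topology
 * must be fully connected (a direct NVLink path between every GPU pair) and
 * the number of remote peers must fit into one executor multi-op launch.
 * The helper name is hypothetical.
 */
static inline int ucc_tl_cuda_linear_alg_supported(ucc_tl_cuda_team_t *team)
{
    return ucc_tl_cuda_team_topo_is_fully_connected(team->topo) &&
           (UCC_TL_TEAM_SIZE(team) - 1 <= UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS);
}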
return UCC_ERR_NOT_SUPPORTED; diff --git a/src/components/tl/cuda/reduce_scatter/reduce_scatter.c b/src/components/tl/cuda/reduce_scatter/reduce_scatter.c index 468fd68338..1e1d75c3ed 100644 --- a/src/components/tl/cuda/reduce_scatter/reduce_scatter.c +++ b/src/components/tl/cuda/reduce_scatter/reduce_scatter.c @@ -48,7 +48,7 @@ ucc_status_t ucc_tl_cuda_reduce_scatter_init(ucc_base_coll_args_t *coll_args, { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); - if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) { + if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) { return ucc_tl_cuda_reduce_scatter_linear_init(coll_args, tl_team, task_p); } else { diff --git a/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c b/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c index 46efbdb051..9a025267ca 100644 --- a/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c +++ b/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c @@ -19,7 +19,7 @@ ucc_status_t ucc_tl_cuda_reduce_scatter_linear_init(ucc_base_coll_args_t *coll_a return UCC_ERR_NOT_SUPPORTED; } - if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) || + if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) || UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) { return UCC_ERR_NOT_SUPPORTED; } diff --git a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c index d85e2c8dd3..d954e38e9e 100644 --- a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c +++ b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv.c @@ -51,7 +51,7 @@ ucc_status_t ucc_tl_cuda_reduce_scatterv_init(ucc_base_coll_args_t *coll_args, { ucc_tl_cuda_team_t *team = ucc_derived_of(tl_team, ucc_tl_cuda_team_t); - if (ucc_tl_cuda_team_topo_is_fully_conntected(team->topo)) { + if (ucc_tl_cuda_team_topo_is_fully_connected(team->topo)) { return ucc_tl_cuda_reduce_scatterv_linear_init(coll_args, tl_team, task_p); } else { diff --git a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c index 3e0595fa44..d719632853 100644 --- a/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c +++ b/src/components/tl/cuda/reduce_scatterv/reduce_scatterv_linear.c @@ -432,7 +432,7 @@ ucc_tl_cuda_reduce_scatterv_linear_init(ucc_base_coll_args_t *coll_args, return UCC_ERR_NOT_SUPPORTED; } - if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) || + if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_connected(team->topo) || UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) { return UCC_ERR_NOT_SUPPORTED; } diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index 096bab1a0e..b31a2db070 100644 --- a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -225,7 +225,6 @@ struct ucc_tl_cuda_task { size_t (*get_offset)(const ucc_tl_cuda_task_t *task, ucc_rank_t block); } allgatherv_linear; - struct { int stage; int step; diff --git a/src/components/tl/cuda/tl_cuda_team_topo.h b/src/components/tl/cuda/tl_cuda_team_topo.h index 96b6d63a5b..1d7d19ad1c 100644 --- a/src/components/tl/cuda/tl_cuda_team_topo.h +++ b/src/components/tl/cuda/tl_cuda_team_topo.h @@ -51,7 +51,7 @@ ucc_tl_cuda_team_topo_is_direct(const ucc_tl_team_t *team, } static inline int -ucc_tl_cuda_team_topo_is_fully_conntected(const ucc_tl_cuda_team_topo_t *topo) 
+ucc_tl_cuda_team_topo_is_fully_connected(const ucc_tl_cuda_team_topo_t *topo) { return topo->is_fully_connected; } From 2af5ea55096e0739ed061bbb5b6f3cd12c2db00b Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Fri, 23 Aug 2024 15:09:20 +0200 Subject: [PATCH 21/34] TL/CUDA: removed done stage --- src/components/tl/cuda/bcast/bcast_linear.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index f0ba528635..8db6e43115 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -7,7 +7,6 @@ #include "bcast/bcast.h" enum { - STAGE_DONE, STAGE_SYNC, STAGE_SETUP, // root @@ -178,11 +177,9 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) return; } else { // finish - task->bcast_linear.stage = STAGE_DONE; + task->super.status = UCC_OK; + break; } - case STAGE_DONE: - task->super.status = UCC_OK; - break; default: break; } @@ -225,7 +222,9 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->bcast_linear.stage = STAGE_WAIT_ROOT; return; } else { - task->bcast_linear.stage = STAGE_DONE; + // Done + task->super.status = UCC_OK; + break; } } else { return; @@ -233,9 +232,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) } else { return; } - case STAGE_DONE: - task->super.status = UCC_OK; - break; default: break; } From 38489b3dfed669adf9438fd0d8faf82ff01dfa3f Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Fri, 23 Aug 2024 15:44:24 +0200 Subject: [PATCH 22/34] TL/CUDA: added unit test --- test/gtest/coll/test_bcast.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/gtest/coll/test_bcast.cc b/test/gtest/coll/test_bcast.cc index 6d80816a31..69f697a508 100644 --- a/test/gtest/coll/test_bcast.cc +++ b/test/gtest/coll/test_bcast.cc @@ -276,6 +276,8 @@ ucc_job_env_t two_step_env = {{"UCC_CL_HIER_TUNE", "bcast:@2step:0-inf:inf"}, {"UCC_CLS", "all"}}; ucc_job_env_t dbt_env = {{"UCC_TL_UCP_TUNE", "bcast:@dbt:0-inf:inf"}, {"UCC_CLS", "basic"}}; +ucc_job_env_t cuda_env = {{"UCC_TL_CUDA_TUNE", "bcast:cuda:@0"}, + {"UCC_CLS", "basic"}}; INSTANTIATE_TEST_CASE_P( , test_bcast_alg, ::testing::Combine( @@ -285,6 +287,10 @@ INSTANTIATE_TEST_CASE_P( #else ::testing::Values(UCC_MEMORY_TYPE_HOST), #endif +#ifdef HAVE_CUDA + ::testing::Values(two_step_env, dbt_env, cuda_env), //env +#else ::testing::Values(two_step_env, dbt_env), //env +#endif ::testing::Values(8, 65536), // count - ::testing::Values(15,16))); // n_procs + ::testing::Values(15, 16))); // n_procs From b9272bcb44dc96e60d8c722db9a432403f48a36c Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Tue, 3 Sep 2024 12:43:10 +0200 Subject: [PATCH 23/34] TL/CUDA: addressed comments --- src/components/tl/cuda/bcast/bcast_linear.c | 1 + src/components/tl/cuda/tl_cuda.h | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 8db6e43115..bbb5b667df 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -94,6 +94,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) st = ucc_coll_task_get_executor(&task->super, &exec); if (ucc_unlikely(st != UCC_OK)) { + task->status = st; return; } diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index b31a2db070..d86dc7cc66 100644 --- 
a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -226,13 +226,13 @@ struct ucc_tl_cuda_task { ucc_rank_t block); } allgatherv_linear; struct { - int stage; - int step; - void * sbuf; + int stage; + int step; + void *sbuf; ucc_datatype_t dt; ucc_rank_t root; - size_t size; - int num_steps; + size_t size; + int num_steps; ucc_ee_executor_task_t *exec_task; } bcast_linear; struct { From 2c7ae3209293b6dfe382557c7906ec7e0271ce9e Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Tue, 3 Sep 2024 13:00:41 +0200 Subject: [PATCH 24/34] TL/CUDA: fix formatting --- src/components/tl/cuda/bcast/bcast_linear.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index bbb5b667df..3ea31d4ba4 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -73,17 +73,17 @@ ucc_status_t ucc_tl_cuda_bcast_linear_finalize(ucc_coll_task_t *coll_task) void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) { - ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); - ucc_tl_cuda_team_t *team = TASK_TEAM(task); - ucc_rank_t trank = UCC_TL_TEAM_RANK(team); - ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); + ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); + ucc_tl_cuda_team_t *team = TASK_TEAM(task); + ucc_rank_t trank = UCC_TL_TEAM_RANK(team); + ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); size_t half_scratch_size = get_raw_scratch_size(team) / 2; - size_t chunk_size = + size_t chunk_size = task->bcast_linear.step < task->bcast_linear.num_steps ? ucc_min(half_scratch_size, task->bcast_linear.size) : task->bcast_linear.size - (task->bcast_linear.step - 1) * half_scratch_size; - size_t offset_buff = task->bcast_linear.step * half_scratch_size; + size_t offset_buff = task->bcast_linear.step * half_scratch_size; ucc_ee_executor_t *exec; ucc_ee_executor_task_t *etask; ucc_status_t st; From 5b8c8c78e2c6afc6c50753a01187266407e1ac7d Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Tue, 3 Sep 2024 13:37:09 +0200 Subject: [PATCH 25/34] TL/CUDA: fixed compilation --- src/components/tl/cuda/bcast/bcast_linear.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 3ea31d4ba4..992dce9470 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -94,7 +94,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) st = ucc_coll_task_get_executor(&task->super, &exec); if (ucc_unlikely(st != UCC_OK)) { - task->status = st; + task->super.status = st; return; } From 87c44241a153a95cb4a13fb9a10061807f54aa0b Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Fri, 25 Oct 2024 16:40:46 +0200 Subject: [PATCH 26/34] TL/CUDA: fix include --- src/components/tl/cuda/bcast/bcast_linear.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 992dce9470..f21548f1b8 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -4,7 +4,7 @@ * See file LICENSE for terms. 
*/ -#include "bcast/bcast.h" +#include "bcast.h" enum { STAGE_SYNC, From 60ea28981a63e1419743ab46001e646427f59fd6 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Mon, 28 Oct 2024 11:48:01 +0100 Subject: [PATCH 27/34] TL/CUDA: removed returns --- src/components/tl/cuda/bcast/bcast_linear.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index f21548f1b8..52c34ce56b 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -227,11 +227,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->super.status = UCC_OK; break; } - } else { - return; } - } else { - return; } default: break; From a4887f7fd576810eee172a6b1771c4327c89e962 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Wed, 6 Nov 2024 15:13:41 +0100 Subject: [PATCH 28/34] TL/CUDA: active set support --- src/components/tl/cuda/bcast/bcast_linear.c | 20 +++- src/components/tl/cuda/tl_cuda.h | 18 +-- src/components/tl/cuda/tl_cuda_coll.c | 13 +++ src/components/tl/cuda/tl_cuda_coll.h | 122 +++++++++++++++++--- src/components/tl/cuda/tl_cuda_team.c | 9 +- src/components/tl/ucp/bcast/bcast_knomial.c | 2 +- 6 files changed, 153 insertions(+), 31 deletions(-) diff --git a/src/components/tl/cuda/bcast/bcast_linear.c b/src/components/tl/cuda/bcast/bcast_linear.c index 52c34ce56b..2fab96d7d7 100644 --- a/src/components/tl/cuda/bcast/bcast_linear.c +++ b/src/components/tl/cuda/bcast/bcast_linear.c @@ -27,6 +27,7 @@ ucc_status_t ucc_tl_cuda_bcast_linear_setup_start(ucc_tl_cuda_task_t *task) set_rank_step(task, trank, 0, 0); ucc_memory_cpu_store_fence(); + // initiate barrier wait while all ranks set theirs steps to 0 status = ucc_tl_cuda_shm_barrier_start(UCC_TL_TEAM_RANK(team), task->bar); if (ucc_unlikely(status != UCC_OK)) { goto exit_err; @@ -76,7 +77,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t); ucc_tl_cuda_team_t *team = TASK_TEAM(task); ucc_rank_t trank = UCC_TL_TEAM_RANK(team); - ucc_rank_t tsize = UCC_TL_TEAM_SIZE(team); + ucc_rank_t tsize = (ucc_rank_t)task->subset.map.ep_num; size_t half_scratch_size = get_raw_scratch_size(team) / 2; size_t chunk_size = task->bcast_linear.step < task->bcast_linear.num_steps @@ -89,6 +90,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) ucc_status_t st; void *sbuf, *dbuf; int i; + ucc_rank_t peer; task->super.status = UCC_INPROGRESS; @@ -100,7 +102,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) switch (task->bcast_linear.stage) { case STAGE_SYNC: - if (ucc_tl_cuda_get_sync(task) != UCC_OK) { + if (ucc_tl_cuda_get_sync_root(task, task->bcast_linear.root) != UCC_OK) { return; } task->bcast_linear.step = 0; @@ -116,7 +118,6 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) task->super.status = st; return; } - ucc_tl_cuda_put_sync(task); if (trank == task->bcast_linear.root) { task->bcast_linear.stage = STAGE_COPY; } else { @@ -165,8 +166,17 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) } case STAGE_WAIT_ALL: for (i = 0; i < tsize; ++i) { + if (UCC_COLL_ARGS_ACTIVE_SET(&TASK_ARGS(task))) + { + // eval phys rank from virt + peer = ucc_ep_map_eval(task->subset.map, i); + } + else + { + peer = i; + } // need to wait until all ranks complete step - 1, because of double buffering - if (get_rank_step(task, i, 0) < task->bcast_linear.step - 1) { + if 
(get_rank_step(task, peer, 0) < task->bcast_linear.step - 1) { // rank is not ready, lets wait return; } @@ -178,6 +188,7 @@ void ucc_tl_cuda_bcast_linear_progress(ucc_coll_task_t *coll_task) return; } else { // finish + ucc_tl_cuda_put_sync_root(task, task->bcast_linear.root); task->super.status = UCC_OK; break; } @@ -286,7 +297,6 @@ ucc_status_t ucc_tl_cuda_bcast_linear_init(ucc_base_coll_args_t *coll_args, task->super.post = ucc_tl_cuda_bcast_linear_start; task->super.progress = ucc_tl_cuda_bcast_linear_progress; task->super.finalize = ucc_tl_cuda_bcast_linear_finalize; - task->bar = TASK_BAR(task); *task_p = &task->super; return UCC_OK; diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index d86dc7cc66..7931777318 100644 --- a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -12,6 +12,7 @@ #include "components/tl/ucc_tl_log.h" #include "components/mc/ucc_mc.h" #include "utils/ucc_mpool.h" +#include "utils/ucc_datastruct.h" #include "tl_cuda_ep_hash.h" #include "tl_cuda_topo.h" #include "tl_cuda_team_topo.h" @@ -108,6 +109,7 @@ typedef uint32_t ucc_tl_cuda_sync_state_t; typedef struct ucc_tl_cuda_shm_barrier { ucc_rank_t size; ucc_rank_t count; + uint32_t tag; int sense; ucc_status_t state[UCC_TL_CUDA_MAX_PEERS]; int local_sense[UCC_TL_CUDA_MAX_PEERS]; @@ -155,11 +157,12 @@ typedef struct ucc_tl_cuda_scratch { typedef struct ucc_tl_cuda_team { ucc_tl_team_t super; - uint32_t seq_num; + uint32_t seq_num; // counter for launched collectives (tasks) for this team + uint32_t seq_num_active_set; // active set ucc_tl_cuda_team_topo_t *topo; - ucc_tl_cuda_sync_t *sync; - ucc_tl_cuda_sync_state_t *sync_state; - ucc_tl_cuda_shm_barrier_t *bar; + ucc_tl_cuda_sync_t *sync; // pointer to shared mem + ucc_tl_cuda_sync_state_t *sync_state; // to track if sync segment of shared memory is used by what task? 
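/*
 * Illustrative sketch (not code from this series): with the control segment
 * doubled, each per-collective resource -- sync slot, barrier, scratch --
 * sits in one of two banks of max_concurrent entries.  Regular collectives
 * index the first bank, active-set (subset) collectives the second, which is
 * what the seq_num / seq_num_active_set split encodes.  The helper name
 * below is hypothetical.
 */
static inline uint32_t ucc_tl_cuda_slot_id(uint32_t seq_num,
                                           uint32_t max_concurrent,
                                           int      is_active_set)
{
    uint32_t slot = seq_num % max_concurrent;      /* position inside the bank */
    return is_active_set ? max_concurrent + slot   /* second half of arrays    */
                         : slot;                   /* first half               */
}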
+ ucc_tl_cuda_shm_barrier_t *bar; // pointer to first barrier in array of [0; 2 * max_concurrent) first max_concurrent for normal mode and second for active set ucc_tl_cuda_scratch_t scratch; cudaStream_t stream; ucc_tl_cuda_rank_id_t *ids; @@ -173,9 +176,10 @@ UCC_CLASS_DECLARE(ucc_tl_cuda_team_t, ucc_base_context_t *, typedef struct ucc_tl_cuda_task ucc_tl_cuda_task_t; struct ucc_tl_cuda_task { ucc_coll_task_t super; - uint32_t seq_num; - uint32_t coll_id; - ucc_tl_cuda_shm_barrier_t *bar; + uint32_t seq_num; // sequential number of collective (started task) in team + uint32_t coll_id; // index of collective in flight [0; max_concurrent) + ucc_tl_cuda_shm_barrier_t *bar; // pointer to barrier that reserved for with task in cuda team + ucc_subset_t subset; // information about mapping of active set if it present union { struct { int stage; diff --git a/src/components/tl/cuda/tl_cuda_coll.c b/src/components/tl/cuda/tl_cuda_coll.c index 42b33cdbcc..ab2a5ff733 100644 --- a/src/components/tl/cuda/tl_cuda_coll.c +++ b/src/components/tl/cuda/tl_cuda_coll.c @@ -93,6 +93,19 @@ ucc_status_t ucc_tl_cuda_coll_init(ucc_base_coll_args_t *coll_args, } } +ucc_status_t ucc_tl_cuda_shm_barrier_init_root(ucc_rank_t size, ucc_rank_t rank, ucc_rank_t root, + ucc_tl_cuda_shm_barrier_t *barrier) +{ + if (rank == root) { + barrier->size = size; + barrier->count = 0; + barrier->sense = 0; + } + barrier->state[rank] = UCC_OK; + barrier->local_sense[rank] = 1; + return UCC_OK; +} + ucc_status_t ucc_tl_cuda_shm_barrier_init(ucc_rank_t size, ucc_rank_t rank, ucc_tl_cuda_shm_barrier_t *barrier) { diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h index 55b86e2cee..01e41af1f7 100644 --- a/src/components/tl/cuda/tl_cuda_coll.h +++ b/src/components/tl/cuda/tl_cuda_coll.h @@ -7,6 +7,8 @@ #ifndef UCC_TL_CUDA_COLL_H_ #define UCC_TL_CUDA_COLL_H_ +#include + #include "tl_cuda.h" #include "components/mc/ucc_mc.h" @@ -50,6 +52,19 @@ static inline void ucc_tl_cuda_task_reset(ucc_tl_cuda_task_t *task) task->super.status = UCC_INPROGRESS; } +ucc_status_t ucc_tl_cuda_shm_barrier_init_root(ucc_rank_t size, ucc_rank_t rank, ucc_rank_t root, + ucc_tl_cuda_shm_barrier_t *barrier); + +ucc_status_t ucc_tl_cuda_shm_barrier_init(ucc_rank_t size, ucc_rank_t rank, + ucc_tl_cuda_shm_barrier_t *barrier); + +ucc_status_t ucc_tl_cuda_shm_barrier_start(ucc_rank_t rank, + ucc_tl_cuda_shm_barrier_t *barrier); + +ucc_status_t ucc_tl_cuda_shm_barrier_test(ucc_rank_t rank, + ucc_tl_cuda_shm_barrier_t *barrier); + + static inline ucc_tl_cuda_task_t *ucc_tl_cuda_task_get(ucc_tl_cuda_team_t *team) { ucc_tl_cuda_context_t *ctx = UCC_TL_CUDA_TEAM_CTX(team); @@ -79,11 +94,13 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args, ucc_tl_cuda_team_t *team, ucc_tl_cuda_task_t **task_h) { - ucc_rank_t trank = UCC_TL_TEAM_RANK(team); - ucc_tl_cuda_lib_t *lib = UCC_TL_CUDA_TEAM_LIB(team); - uint32_t max_concurrent = lib->cfg.max_concurrent; - ucc_tl_cuda_task_t *task; - ucc_status_t status; + ucc_rank_t trank = UCC_TL_TEAM_RANK(team); + ucc_tl_cuda_lib_t *lib = UCC_TL_CUDA_TEAM_LIB(team); + uint32_t max_concurrent = lib->cfg.max_concurrent; + ucc_tl_cuda_shm_barrier_t *curr_bar; + ucc_tl_cuda_task_t *task; + ucc_status_t status; + uint32_t i; if (!ucc_coll_args_is_predefined_dt(&coll_args->args, trank)) { return UCC_ERR_NOT_SUPPORTED; @@ -100,19 +117,74 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args, return status; } - task->seq_num = team->seq_num++; - task->coll_id = task->seq_num % 
max_concurrent; + /* active set */ + if (UCC_COLL_ARGS_ACTIVE_SET(&coll_args->args)) { + ucc_assert(coll_args->args.coll_type == UCC_COLL_TYPE_BCAST); + + task->subset.map = ucc_active_set_to_ep_map(&coll_args->args); + task->subset.myrank = UCC_TL_TEAM_RANK(team); + // root + if (task->subset.myrank == coll_args->args.root) { + bool found = false; + /* search first free barrier in active set pool */ + for (i = 0; i < max_concurrent; ++i) { + curr_bar = UCC_TL_CUDA_TEAM_BARRIER(team, max_concurrent + i); + if (curr_bar->tag == 0) { + // free + task->bar = curr_bar; + // set user specified tag to mark that this barrier is used by this task + task->bar->tag = coll_args->args.tag; + status = ucc_tl_cuda_shm_barrier_init_root(task->subset.map.ep_num, task->subset.myrank, coll_args->args.root, task->bar); + if (ucc_unlikely(status != UCC_OK)) { + ucc_error("failed to init root barrier"); + } + + found = true; + break; + } + } + ucc_assert(found); + } else { + /* pool barrier while root mark any of it with tag */ + bool found = false; + // TODO: get rid of inf loop? + while (!found) + { + for (i = 0; i < max_concurrent; ++i) { + curr_bar = UCC_TL_CUDA_TEAM_BARRIER(team, max_concurrent + i); + if (curr_bar->tag == coll_args->args.tag) { + task->bar = curr_bar; + // TODO: pass root rank??? + status = ucc_tl_cuda_shm_barrier_init_root(task->subset.map.ep_num, task->subset.myrank, coll_args->args.root, task->bar); + if (ucc_unlikely(status != UCC_OK)) { + ucc_error("failed to init peer barrier"); + } + + found = true; + break; + } + } + } + } + task->seq_num = team->seq_num_active_set++; + task->coll_id = task->seq_num % max_concurrent + max_concurrent; + } else { + task->seq_num = team->seq_num++; + task->coll_id = task->seq_num % max_concurrent; + task->bar = TASK_BAR(task); + } *task_h = task; return UCC_OK; } -static inline ucc_status_t ucc_tl_cuda_get_sync(ucc_tl_cuda_task_t *task) +// check if segment for current task is available and barrier is available (completed from prev iteration) +static inline ucc_status_t ucc_tl_cuda_get_sync_root(ucc_tl_cuda_task_t *task, ucc_rank_t root) { ucc_tl_cuda_team_t *team = TASK_TEAM(task); volatile ucc_tl_cuda_sync_state_t *state = &team->sync_state[task->coll_id]; - if ((UCC_TL_TEAM_RANK(team) == 0) && (*state == 0)) { + if ((UCC_TL_TEAM_RANK(team) == root) && (*state == 0)) { *state = task->seq_num; } if ((*state != task->seq_num) || @@ -122,6 +194,31 @@ static inline ucc_status_t ucc_tl_cuda_get_sync(ucc_tl_cuda_task_t *task) return UCC_OK; } +static inline void ucc_tl_cuda_put_sync_root(ucc_tl_cuda_task_t *task, ucc_rank_t root) +{ + ucc_tl_cuda_team_t *team = TASK_TEAM(task); + ucc_tl_cuda_sync_state_t *state = &team->sync_state[task->coll_id]; + + if (UCC_TL_TEAM_RANK(team) == root) { + ucc_assert(*state == task->seq_num); + *state = 0; + } +} + +static inline ucc_status_t ucc_tl_cuda_get_sync(ucc_tl_cuda_task_t *task) +{ + ucc_tl_cuda_team_t *team = TASK_TEAM(task); + volatile ucc_tl_cuda_sync_state_t *state = &team->sync_state[task->coll_id]; + + if ((UCC_TL_TEAM_RANK(team) == 0) && (*state == 0)) { + *state = task->seq_num; + } + if ((*state != task->seq_num) || (task->bar->state[UCC_TL_TEAM_RANK(team)] != UCC_OK)) { + return UCC_INPROGRESS; + } + return UCC_OK; +} + static inline void ucc_tl_cuda_put_sync(ucc_tl_cuda_task_t *task) { ucc_tl_cuda_team_t *team = TASK_TEAM(task); @@ -142,14 +239,7 @@ ucc_status_t ucc_tl_cuda_coll_init(ucc_base_coll_args_t *coll_args, ucc_status_t ucc_tl_cuda_coll_finalize(ucc_coll_task_t *coll_task); -ucc_status_t 
ucc_tl_cuda_shm_barrier_init(ucc_rank_t size, ucc_rank_t rank, - ucc_tl_cuda_shm_barrier_t *barrier); -ucc_status_t ucc_tl_cuda_shm_barrier_start(ucc_rank_t rank, - ucc_tl_cuda_shm_barrier_t *barrier); - -ucc_status_t ucc_tl_cuda_shm_barrier_test(ucc_rank_t rank, - ucc_tl_cuda_shm_barrier_t *barrier); ucc_status_t ucc_tl_cuda_alg_id_to_init(int alg_id, const char *alg_id_str, ucc_coll_type_t coll_type, diff --git a/src/components/tl/cuda/tl_cuda_team.c b/src/components/tl/cuda/tl_cuda_team.c index 64123a8cea..e85537f861 100644 --- a/src/components/tl/cuda/tl_cuda_team.c +++ b/src/components/tl/cuda/tl_cuda_team.c @@ -45,7 +45,8 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_team_t, ucc_base_context_t *tl_context, return UCC_ERR_NO_MEMORY; } - scratch_size = lib->cfg.max_concurrent * lib->cfg.scratch_size; + // active set + scratch_size = 2 * lib->cfg.max_concurrent * lib->cfg.scratch_size; status = CUDA_FUNC(cudaMalloc(&self->scratch.loc, scratch_size)); if (status != UCC_OK) { tl_error(tl_context->lib, "failed to alloc scratch buffer"); @@ -64,6 +65,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_team_t, ucc_base_context_t *tl_context, lib->cfg.max_concurrent + sizeof(ucc_tl_cuda_shm_barrier_t) * lib->cfg.max_concurrent + sizeof(ucc_tl_cuda_sync_state_t) * lib->cfg.max_concurrent; + ctrl_size *= 2; // active sets shm_id = -1; self->sync = (void*)-1; @@ -79,8 +81,10 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_team_t, ucc_base_context_t *tl_context, memset(self->sync, 0, ctrl_size); self->bar = (ucc_tl_cuda_shm_barrier_t*)UCC_TL_CUDA_TEAM_SYNC(self, 0, lib->cfg.max_concurrent); - for (i = 0; i < lib->cfg.max_concurrent; i++) { + /* active set */ + for (i = 0; i < lib->cfg.max_concurrent * 2; i++) { bar = UCC_TL_CUDA_TEAM_BARRIER(self, i); + bar->tag = 0; // mark as free for (j = 0; j < UCC_TL_TEAM_SIZE(self); j++) { status = ucc_tl_cuda_shm_barrier_init(UCC_TL_TEAM_SIZE(self), j, bar); @@ -109,6 +113,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_team_t, ucc_base_context_t *tl_context, tl_debug(tl_context->lib, "posted tl team: %p", self); self->seq_num = 1; + self->seq_num_active_set = 1; return UCC_OK; free_devices: diff --git a/src/components/tl/ucp/bcast/bcast_knomial.c b/src/components/tl/ucp/bcast/bcast_knomial.c index 1ca08893e3..21f47bf293 100644 --- a/src/components/tl/ucp/bcast/bcast_knomial.c +++ b/src/components/tl/ucp/bcast/bcast_knomial.c @@ -22,7 +22,7 @@ void ucc_tl_ucp_bcast_knomial_progress(ucc_coll_task_t *coll_task) ucc_rank_t size = (ucc_rank_t)task->subset.map.ep_num; uint32_t radix = task->bcast_kn.radix; - ucc_rank_t root = (uint32_t)TASK_ARGS(task).root; + ucc_rank_t root = (uint32_t)TASK_ARGS(task).root; ucc_rank_t dist = task->bcast_kn.dist; void *buffer = TASK_ARGS(task).src.info.buffer; ucc_memory_type_t mtype = TASK_ARGS(task).src.info.mem_type; From 082c643d847214cb7373aebe1369c04cbcb62bfe Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Wed, 6 Nov 2024 15:29:57 +0100 Subject: [PATCH 29/34] TL/CUDA: fix build --- src/components/tl/cuda/tl_cuda_coll.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h index 01e41af1f7..d9c2ad2016 100644 --- a/src/components/tl/cuda/tl_cuda_coll.h +++ b/src/components/tl/cuda/tl_cuda_coll.h @@ -137,13 +137,18 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args, status = ucc_tl_cuda_shm_barrier_init_root(task->subset.map.ep_num, task->subset.myrank, coll_args->args.root, task->bar); if (ucc_unlikely(status != UCC_OK)) { ucc_error("failed to init root 
barrier"); + return UCC_ERR_NO_RESOURCE; } - found = true; break; } } ucc_assert(found); + if (!found) + { + ucc_error("Unable to find free barrier"); + return UCC_ERR_NO_RESOURCE; + } } else { /* pool barrier while root mark any of it with tag */ bool found = false; @@ -158,8 +163,8 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args, status = ucc_tl_cuda_shm_barrier_init_root(task->subset.map.ep_num, task->subset.myrank, coll_args->args.root, task->bar); if (ucc_unlikely(status != UCC_OK)) { ucc_error("failed to init peer barrier"); + return UCC_ERR_NO_RESOURCE; } - found = true; break; } From 168d2e154a44a8637b2877ada05704006e027877 Mon Sep 17 00:00:00 2001 From: Ilya Kryukov Date: Wed, 6 Nov 2024 15:46:24 +0100 Subject: [PATCH 30/34] TL/CUDA: fixed comments --- src/components/tl/cuda/tl_cuda.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h index 7931777318..e7d66eec8a 100644 --- a/src/components/tl/cuda/tl_cuda.h +++ b/src/components/tl/cuda/tl_cuda.h @@ -155,14 +155,15 @@ typedef struct ucc_tl_cuda_scratch { ucc_tl_cuda_mem_info_t rem_info[UCC_TL_CUDA_MAX_PEERS]; } ucc_tl_cuda_scratch_t; +// Team represents a communicator created within the CUDA context, typically using NVLink for inter-GPU communication typedef struct ucc_tl_cuda_team { ucc_tl_team_t super; - uint32_t seq_num; // counter for launched collectives (tasks) for this team - uint32_t seq_num_active_set; // active set + uint32_t seq_num; // Counter for the number of launched collective tasks for this team + uint32_t seq_num_active_set; // Counter for tasks in the active set (subset of tasks requiring special handling) ucc_tl_cuda_team_topo_t *topo; - ucc_tl_cuda_sync_t *sync; // pointer to shared mem - ucc_tl_cuda_sync_state_t *sync_state; // to track if sync segment of shared memory is used by what task? - ucc_tl_cuda_shm_barrier_t *bar; // pointer to first barrier in array of [0; 2 * max_concurrent) first max_concurrent for normal mode and second for active set + ucc_tl_cuda_sync_t *sync; // Pointer to shared memory segment for synchronization + ucc_tl_cuda_sync_state_t *sync_state; // Tracks the task currently using the sync segment of shared memory, if free - 0 + ucc_tl_cuda_shm_barrier_t *bar; // Pointer to the first barrier in an array of size [0; 2 * max_concurrent]. 
From 168d2e154a44a8637b2877ada05704006e027877 Mon Sep 17 00:00:00 2001
From: Ilya Kryukov
Date: Wed, 6 Nov 2024 15:46:24 +0100
Subject: [PATCH 30/34] TL/CUDA: fixed comments

---
 src/components/tl/cuda/tl_cuda.h | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h
index 7931777318..e7d66eec8a 100644
--- a/src/components/tl/cuda/tl_cuda.h
+++ b/src/components/tl/cuda/tl_cuda.h
@@ -155,14 +155,15 @@ typedef struct ucc_tl_cuda_scratch {
     ucc_tl_cuda_mem_info_t rem_info[UCC_TL_CUDA_MAX_PEERS];
 } ucc_tl_cuda_scratch_t;
 
+// Team represents a communicator created within the CUDA context, typically using NVLink for inter-GPU communication
 typedef struct ucc_tl_cuda_team {
     ucc_tl_team_t super;
-    uint32_t seq_num; // counter for launched collectives (tasks) for this team
-    uint32_t seq_num_active_set; // active set
+    uint32_t seq_num; // Counter for the number of launched collective tasks for this team
+    uint32_t seq_num_active_set; // Counter for collective tasks posted on an active set (a subset of the team's ranks)
     ucc_tl_cuda_team_topo_t *topo;
-    ucc_tl_cuda_sync_t *sync; // pointer to shared mem
-    ucc_tl_cuda_sync_state_t *sync_state; // to track if sync segment of shared memory is used by what task?
-    ucc_tl_cuda_shm_barrier_t *bar; // pointer to first barrier in array of [0; 2 * max_concurrent) first max_concurrent for normal mode and second for active set
+    ucc_tl_cuda_sync_t *sync; // Pointer to shared memory segment for synchronization
+    ucc_tl_cuda_sync_state_t *sync_state; // Tracks which task is currently using each sync segment of shared memory; 0 if the segment is free
+    ucc_tl_cuda_shm_barrier_t *bar; // Pointer to the first barrier in an array of 2 * max_concurrent barriers: the first max_concurrent are for normal mode, the second max_concurrent for active set mode
     ucc_tl_cuda_scratch_t scratch;
     cudaStream_t stream;
     ucc_tl_cuda_rank_id_t *ids;
@@ -173,13 +174,14 @@ typedef struct ucc_tl_cuda_team {
 UCC_CLASS_DECLARE(ucc_tl_cuda_team_t, ucc_base_context_t *,
                   const ucc_base_team_params_t *);
 
+// Task represents a collective operation that runs in the CUDA context, typically using NVLink for inter-GPU communication
 typedef struct ucc_tl_cuda_task ucc_tl_cuda_task_t;
 struct ucc_tl_cuda_task {
     ucc_coll_task_t super;
-    uint32_t seq_num; // sequential number of collective (started task) in team
-    uint32_t coll_id; // index of collective in flight [0; max_concurrent)
-    ucc_tl_cuda_shm_barrier_t *bar; // pointer to barrier that reserved for with task in cuda team
-    ucc_subset_t subset; // information about mapping of active set if it present
+    uint32_t seq_num; // Sequential identifier for each task started within the team
+    uint32_t coll_id; // Index of the collective task in flight, within the range [0; max_concurrent)
+    ucc_tl_cuda_shm_barrier_t *bar; // Pointer to the reserved barrier for this task in the CUDA team
+    ucc_subset_t subset; // Mapping information for the active set, if present
     union {
         struct {
            int stage;

From 8a7adc4dfbceef7647164005288e8cb55d09a4fe Mon Sep 17 00:00:00 2001
From: Ilya Kryukov
Date: Thu, 7 Nov 2024 16:33:09 +0100
Subject: [PATCH 31/34] TL/CUDA: select free bar using atomic

---
 src/components/tl/cuda/tl_cuda_coll.h | 13 +++++++++----
 src/utils/ucc_atomic.h                |  1 +
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h
index d9c2ad2016..5904ef3629 100644
--- a/src/components/tl/cuda/tl_cuda_coll.h
+++ b/src/components/tl/cuda/tl_cuda_coll.h
@@ -86,6 +86,12 @@ static inline ucc_tl_cuda_task_t *ucc_tl_cuda_task_get(ucc_tl_cuda_team_t *team)
 static inline void ucc_tl_cuda_task_put(ucc_tl_cuda_task_t *task)
 {
     UCC_TL_CUDA_PROFILE_REQUEST_FREE(task);
+
+    if (UCC_TL_TEAM_RANK(TASK_TEAM(task)) == task->bcast_linear.root) {
+        ucc_print("free bar!");
+        task->bar->tag = 0;
+    }
+
     ucc_mpool_put(task);
 }
 
@@ -129,7 +135,8 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args,
             /* search first free barrier in active set pool */
             for (i = 0; i < max_concurrent; ++i) {
                 curr_bar = UCC_TL_CUDA_TEAM_BARRIER(team, max_concurrent + i);
-                if (curr_bar->tag == 0) {
+                if (ucc_atomic_cswap32(&curr_bar->tag, 0, coll_args->args.tag) == 0) {
+                    ucc_print("found free barrier: %d", i);
                     // free
                     task->bar = curr_bar;
                     // set the user-specified tag to mark that this barrier is used by this task
@@ -143,10 +150,10 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args,
                     break;
                 }
             }
-            ucc_assert(found);
             if (!found)
             {
                 ucc_error("Unable to find free barrier");
+                ucc_assert(found);
                 return UCC_ERR_NO_RESOURCE;
             }
         } else {
@@ -244,8 +251,6 @@ ucc_status_t ucc_tl_cuda_coll_init(ucc_base_coll_args_t *coll_args,
 
 ucc_status_t ucc_tl_cuda_coll_finalize(ucc_coll_task_t *coll_task);
 
-
-
 ucc_status_t ucc_tl_cuda_alg_id_to_init(int alg_id, const char *alg_id_str,
                                         ucc_coll_type_t coll_type,
                                         ucc_memory_type_t mem_type,
diff --git a/src/utils/ucc_atomic.h b/src/utils/ucc_atomic.h
index d5aabd1829..71eb12e233 100644
--- a/src/utils/ucc_atomic.h
+++ b/src/utils/ucc_atomic.h
@@ -16,6 +16,7 @@
 #define ucc_atomic_add64 ucs_atomic_add64
 #define ucc_atomic_sub64 ucs_atomic_sub64
 #define ucc_atomic_cswap8 ucs_atomic_cswap8
+#define ucc_atomic_cswap32 ucs_atomic_cswap32
 #define ucc_atomic_cswap64 ucs_atomic_cswap64
 #define ucc_atomic_bool_cswap8 ucs_atomic_bool_cswap8
 #define ucc_atomic_bool_cswap64 ucs_atomic_bool_cswap64

From f2404db2abbe71d5388518ab0efcbf8f062705b3 Mon Sep 17 00:00:00 2001
From: Ilya Kryukov
Date: Fri, 8 Nov 2024 12:40:41 +0100
Subject: [PATCH 32/34] TL/CUDA: fix

---
 src/components/tl/cuda/tl_cuda_coll.h |  4 +++-
 src/components/tl/cuda/tl_cuda_team.c | 13 +++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h
index 5904ef3629..7e47deddc2 100644
--- a/src/components/tl/cuda/tl_cuda_coll.h
+++ b/src/components/tl/cuda/tl_cuda_coll.h
@@ -147,6 +147,7 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args,
                         return UCC_ERR_NO_RESOURCE;
                     }
                     found = true;
+                    task->coll_id = i + max_concurrent;
                     break;
                 }
             }
@@ -173,13 +174,14 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args,
                         return UCC_ERR_NO_RESOURCE;
                     }
                     found = true;
+                    task->coll_id = i + max_concurrent;
                     break;
                 }
             }
         }
     }
         task->seq_num = team->seq_num_active_set++;
-        task->coll_id = task->seq_num % max_concurrent + max_concurrent;
+        // task->coll_id = task->seq_num % max_concurrent + max_concurrent;
     } else {
         task->seq_num = team->seq_num++;
         task->coll_id = task->seq_num % max_concurrent;
diff --git a/src/components/tl/cuda/tl_cuda_team.c b/src/components/tl/cuda/tl_cuda_team.c
index e85537f861..be0286a7a0 100644
--- a/src/components/tl/cuda/tl_cuda_team.c
+++ b/src/components/tl/cuda/tl_cuda_team.c
@@ -80,7 +80,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_team_t, ucc_base_context_t *tl_context,
         }
         memset(self->sync, 0, ctrl_size);
         self->bar = (ucc_tl_cuda_shm_barrier_t*)UCC_TL_CUDA_TEAM_SYNC(self, 0,
-                                                      lib->cfg.max_concurrent);
+                                                      lib->cfg.max_concurrent * 2);
         /* active set */
         for (i = 0; i < lib->cfg.max_concurrent * 2; i++) {
             bar = UCC_TL_CUDA_TEAM_BARRIER(self, i);
@@ -142,7 +142,7 @@ UCC_CLASS_CLEANUP_FUNC(ucc_tl_cuda_team_t)
     }
     if (self->ids) {
         if (self->sync != (void*)-1) {
-            for (i = 0; i < lib->cfg.max_concurrent; i++) {
+            for (i = 0; i < lib->cfg.max_concurrent * 2; i++) {
                 for (j = 0; j < UCC_TL_TEAM_SIZE(self); j++) {
                     if (j == UCC_TL_TEAM_RANK(self)) {
                         continue;
@@ -273,14 +273,15 @@ ucc_status_t ucc_tl_cuda_team_create_test(ucc_base_team_t *tl_team)
             goto exit_err;
         }
         team->bar = (ucc_tl_cuda_shm_barrier_t*)UCC_TL_CUDA_TEAM_SYNC(team, 0,
-                                                      lib->cfg.max_concurrent);
+                                                      lib->cfg.max_concurrent * 2);
     }
     team->sync_state = (ucc_tl_cuda_sync_state_t*)PTR_OFFSET(team->bar,
                        sizeof(ucc_tl_cuda_shm_barrier_t) *
-                       lib->cfg.max_concurrent);
+                       lib->cfg.max_concurrent * 2);
     CUDA_CHECK_GOTO(cudaStreamCreateWithFlags(&team->stream, cudaStreamNonBlocking),
                     exit_err, status);
-    for (i = 0; i < lib->cfg.max_concurrent; i++) {
+    // second max_concurrent events are unused for bcast active set
+    for (i = 0; i < lib->cfg.max_concurrent * 2; i++) {
         sync = UCC_TL_CUDA_TEAM_SYNC(team, UCC_TL_TEAM_RANK(team), i);
         CUDA_CHECK_GOTO(cudaEventCreateWithFlags(&sync->ipc_event_local,
                                                  cudaEventDisableTiming |
@@ -308,7 +309,7 @@ ucc_status_t ucc_tl_cuda_team_create_test(ucc_base_team_t *tl_team)
         goto exit_err;
     }
 
-    for (i = 0; i < lib->cfg.max_concurrent; i++) {
+    for (i = 0; i < lib->cfg.max_concurrent * 2; i++) {
         sync = UCC_TL_CUDA_TEAM_SYNC(team, UCC_TL_TEAM_RANK(team), i);
         for (j = 0 ; j < UCC_TL_TEAM_SIZE(team); j++) {
             if (j == UCC_TL_TEAM_RANK(team)) {
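The tl_cuda_coll.h part of the patch above makes an active-set task take its coll_id from the barrier slot it actually claimed, so the barrier and the sync segment both land in the upper half of the doubled arrays, while normal-mode tasks keep rotating through the lower half; the seq_num-based assignment, which could disagree with the claimed slot, is commented out. A minimal sketch of that slot arithmetic, with illustrative names rather than UCC code:

    /* Illustrative slot arithmetic for the doubled barrier/sync arrays, not UCC code. */
    #include <assert.h>
    #include <stdint.h>

    /* Normal-mode collectives rotate through the lower half: [0, max_concurrent). */
    static inline uint32_t toy_slot_normal(uint32_t seq_num, uint32_t max_concurrent)
    {
        return seq_num % max_concurrent;
    }

    /* Active-set collectives use the slot of the barrier they claimed,
     * shifted into the upper half: [max_concurrent, 2 * max_concurrent). */
    static inline uint32_t toy_slot_active_set(uint32_t claimed_idx, uint32_t max_concurrent)
    {
        assert(claimed_idx < max_concurrent);
        return max_concurrent + claimed_idx;
    }

Deriving coll_id from the claimed index keeps the barrier and the sync segment used by one active-set collective consistent with each other.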
From f4f1e5a314d88eb3068198484401a86dd4f377e5 Mon Sep 17 00:00:00 2001
From: Ilya Kryukov
Date: Fri, 8 Nov 2024 16:00:57 +0100
Subject: [PATCH 33/34] TL/CUDA: replace free tag

---
 src/components/tl/cuda/tl_cuda.h      | 2 ++
 src/components/tl/cuda/tl_cuda_coll.h | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/components/tl/cuda/tl_cuda.h b/src/components/tl/cuda/tl_cuda.h
index e7d66eec8a..8a3d63ff41 100644
--- a/src/components/tl/cuda/tl_cuda.h
+++ b/src/components/tl/cuda/tl_cuda.h
@@ -106,6 +106,8 @@ UCC_CLASS_DECLARE(ucc_tl_cuda_context_t, const ucc_base_context_params_t *,
 
 typedef uint32_t ucc_tl_cuda_sync_state_t;
 
+#define UCC_TAG_FREE 0xFFFFFFFF
+
 typedef struct ucc_tl_cuda_shm_barrier {
     ucc_rank_t size;
     ucc_rank_t count;
diff --git a/src/components/tl/cuda/tl_cuda_coll.h b/src/components/tl/cuda/tl_cuda_coll.h
index 7e47deddc2..e584c7f0a0 100644
--- a/src/components/tl/cuda/tl_cuda_coll.h
+++ b/src/components/tl/cuda/tl_cuda_coll.h
@@ -89,7 +89,7 @@ static inline void ucc_tl_cuda_task_put(ucc_tl_cuda_task_t *task)
 
     if (UCC_TL_TEAM_RANK(TASK_TEAM(task)) == task->bcast_linear.root) {
         ucc_print("free bar!");
-        task->bar->tag = 0;
+        task->bar->tag = UCC_TAG_FREE;
     }
 
     ucc_mpool_put(task);
@@ -135,7 +135,7 @@ ucc_status_t ucc_tl_cuda_task_init(ucc_base_coll_args_t *coll_args,
             /* search first free barrier in active set pool */
             for (i = 0; i < max_concurrent; ++i) {
                 curr_bar = UCC_TL_CUDA_TEAM_BARRIER(team, max_concurrent + i);
-                if (ucc_atomic_cswap32(&curr_bar->tag, 0, coll_args->args.tag) == 0) {
+                if (ucc_atomic_cswap32(&curr_bar->tag, UCC_TAG_FREE, coll_args->args.tag) == UCC_TAG_FREE) {
                     ucc_print("found free barrier: %d", i);
                     // free
                     task->bar = curr_bar;

From e9c1abe2a6bb7925f27ffe9169168b8366cd7512 Mon Sep 17 00:00:00 2001
From: Ilya Kryukov
Date: Fri, 8 Nov 2024 16:12:57 +0100
Subject: [PATCH 34/34] TL/CUDA: fix bar tag init val

---
 src/components/tl/cuda/tl_cuda_team.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/components/tl/cuda/tl_cuda_team.c b/src/components/tl/cuda/tl_cuda_team.c
index be0286a7a0..8fe668249a 100644
--- a/src/components/tl/cuda/tl_cuda_team.c
+++ b/src/components/tl/cuda/tl_cuda_team.c
@@ -84,7 +84,7 @@ UCC_CLASS_INIT_FUNC(ucc_tl_cuda_team_t, ucc_base_context_t *tl_context,
         /* active set */
         for (i = 0; i < lib->cfg.max_concurrent * 2; i++) {
             bar = UCC_TL_CUDA_TEAM_BARRIER(self, i);
-            bar->tag = 0; // mark as free
+            bar->tag = UCC_TAG_FREE; // mark as free
             for (j = 0; j < UCC_TL_TEAM_SIZE(self); j++) {
                 status = ucc_tl_cuda_shm_barrier_init(UCC_TL_TEAM_SIZE(self),
                                                       j, bar);