From 9523ff15698107eb1c5c00462a3ffe2646a3f9ec Mon Sep 17 00:00:00 2001 From: Li Wei Date: Fri, 22 Mar 2024 10:28:25 +0900 Subject: [PATCH 01/10] DAOS-14261 engine: Add dss_chore for I/O forwarding (#13372) As requested by the Jira ticket, add a new I/O forwarding mechanism, dss_chore, to avoid creating a ULT for every forwarding task. - Forwarding of object I/O and DTX RPCs is converted to chores. - Cancelation is not implemented, because the I/O forwarding tasks themselves do not support cancelation yet. - In certain engine configurations, some xstreams do not need to initialize dx_chore_queue. This is left to future work. Required-githooks: true Skipped-githooks: clang Change-Id: I8d6f9889f5562a8bc3683d26cb830672a8aa40f3 Signed-off-by: Li Wei --- src/dtx/dtx_common.c | 88 +++++++----- src/dtx/dtx_rpc.c | 139 ++++++++++++------- src/engine/sched.c | 18 ++- src/engine/srv.c | 27 +++- src/engine/srv_internal.h | 16 +++ src/engine/ult.c | 209 +++++++++++++++++++++++++++++ src/include/daos_srv/daos_engine.h | 43 ++++++ 7 files changed, 453 insertions(+), 87 deletions(-) diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index 43ba3d64752..ee119d4b965 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -1914,10 +1914,16 @@ dtx_handle_resend(daos_handle_t coh, struct dtx_id *dti, */ #define DTX_EXEC_STEP_LENGTH DTX_THRESHOLD_COUNT -struct dtx_ult_arg { +struct dtx_chore { + struct dss_chore chore; dtx_sub_func_t func; void *func_arg; struct dtx_leader_handle *dlh; + + /* Chore-internal state variables */ + uint32_t i; + uint32_t j; + uint32_t k; }; static void @@ -1970,20 +1976,34 @@ dtx_sub_comp_cb(struct dtx_leader_handle *dlh, int idx, int rc) idx, tgt->st_rank, tgt->st_tgt_idx, tgt->st_flags, rc); } -static void -dtx_leader_exec_ops_ult(void *arg) +static enum dss_chore_status +dtx_leader_exec_ops_chore(struct dss_chore *chore, bool is_reentrance) { - struct dtx_ult_arg *ult_arg = arg; - struct dtx_leader_handle *dlh = ult_arg->dlh; + struct dtx_chore *dtx_chore = container_of(chore, struct dtx_chore, chore); + struct dtx_leader_handle *dlh = dtx_chore->dlh; struct dtx_sub_status *sub; struct daos_shard_tgt *tgt; - uint32_t i; - uint32_t j; - uint32_t k; int rc = 0; - for (i = dlh->dlh_forward_idx, j = 0, k = 0; j < dlh->dlh_forward_cnt; i++, j++) { - sub = &dlh->dlh_subs[i]; + /* + * If this is the first entrance, initialize the chore-internal state + * variables. 
+ */ + if (is_reentrance) { + D_DEBUG(DB_TRACE, "%p: resume: i=%u j=%u k=%u forward_cnt=%u\n", chore, + dtx_chore->i, dtx_chore->j, dtx_chore->k, dlh->dlh_forward_cnt); + dtx_chore->i++; + dtx_chore->j++; + } else { + D_DEBUG(DB_TRACE, "%p: initialize: forward_idx=%u forward_cnt=%u\n", chore, + dlh->dlh_forward_idx, dlh->dlh_forward_cnt); + dtx_chore->i = dlh->dlh_forward_idx; + dtx_chore->j = 0; + dtx_chore->k = 0; + } + + for (; dtx_chore->j < dlh->dlh_forward_cnt; dtx_chore->i++, dtx_chore->j++) { + sub = &dlh->dlh_subs[dtx_chore->i]; tgt = &sub->dss_tgt; if (dlh->dlh_normal_sub_done == 0) { @@ -1991,7 +2011,7 @@ dtx_leader_exec_ops_ult(void *arg) sub->dss_comp = 0; if (unlikely(tgt->st_flags & DTF_DELAY_FORWARD)) { - dtx_sub_comp_cb(dlh, i, 0); + dtx_sub_comp_cb(dlh, dtx_chore->i, 0); continue; } } else { @@ -2003,33 +2023,35 @@ dtx_leader_exec_ops_ult(void *arg) } if (tgt->st_rank == DAOS_TGT_IGNORE || - (i == daos_fail_value_get() && DAOS_FAIL_CHECK(DAOS_DTX_SKIP_PREPARE))) { + (dtx_chore->i == daos_fail_value_get() && + DAOS_FAIL_CHECK(DAOS_DTX_SKIP_PREPARE))) { if (dlh->dlh_normal_sub_done == 0 || tgt->st_flags & DTF_DELAY_FORWARD) - dtx_sub_comp_cb(dlh, i, 0); + dtx_sub_comp_cb(dlh, dtx_chore->i, 0); continue; } - rc = ult_arg->func(dlh, ult_arg->func_arg, i, dtx_sub_comp_cb); + rc = dtx_chore->func(dlh, dtx_chore->func_arg, dtx_chore->i, dtx_sub_comp_cb); if (rc != 0) { if (sub->dss_comp == 0) - dtx_sub_comp_cb(dlh, i, rc); + dtx_sub_comp_cb(dlh, dtx_chore->i, rc); break; } /* Yield to avoid holding CPU for too long time. */ - if ((++k) % DTX_RPC_YIELD_THD == 0) - ABT_thread_yield(); + if (++(dtx_chore->k) % DTX_RPC_YIELD_THD == 0) + return DSS_CHORE_YIELD; } if (rc != 0) { - for (i++, j++; j < dlh->dlh_forward_cnt; i++, j++) { - sub = &dlh->dlh_subs[i]; + for (dtx_chore->i++, dtx_chore->j++; dtx_chore->j < dlh->dlh_forward_cnt; + dtx_chore->i++, dtx_chore->j++) { + sub = &dlh->dlh_subs[dtx_chore->i]; tgt = &sub->dss_tgt; if (dlh->dlh_normal_sub_done == 0 || tgt->st_flags & DTF_DELAY_FORWARD) { sub->dss_result = 0; sub->dss_comp = 0; - dtx_sub_comp_cb(dlh, i, 0); + dtx_sub_comp_cb(dlh, dtx_chore->i, 0); } } } @@ -2039,6 +2061,8 @@ dtx_leader_exec_ops_ult(void *arg) D_ASSERTF(rc == ABT_SUCCESS, "ABT_future_set failed [%u, %u), for delay %s: %d\n", dlh->dlh_forward_idx, dlh->dlh_forward_idx + dlh->dlh_forward_cnt, dlh->dlh_normal_sub_done == 1 ? "yes" : "no", rc); + + return DSS_CHORE_DONE; } /** @@ -2048,15 +2072,15 @@ int dtx_leader_exec_ops(struct dtx_leader_handle *dlh, dtx_sub_func_t func, dtx_agg_cb_t agg_cb, int allow_failure, void *func_arg) { - struct dtx_ult_arg ult_arg; + struct dtx_chore dtx_chore; int sub_cnt = dlh->dlh_normal_sub_cnt + dlh->dlh_delay_sub_cnt; int rc = 0; int local_rc = 0; int remote_rc = 0; - ult_arg.func = func; - ult_arg.func_arg = func_arg; - ult_arg.dlh = dlh; + dtx_chore.func = func; + dtx_chore.func_arg = func_arg; + dtx_chore.dlh = dlh; dlh->dlh_result = 0; dlh->dlh_allow_failure = allow_failure; @@ -2092,15 +2116,10 @@ dtx_leader_exec_ops(struct dtx_leader_handle *dlh, dtx_sub_func_t func, D_GOTO(out, rc = dss_abterr2der(rc)); } - /* - * NOTE: Ideally, we probably should create ULT for each shard, but for performance - * reasons, let's only create one for all remote targets for now. 
- */ - rc = dss_ult_create(dtx_leader_exec_ops_ult, &ult_arg, DSS_XS_IOFW, - dss_get_module_info()->dmi_tgt_id, DSS_DEEP_STACK_SZ, NULL); + rc = dss_chore_delegate(&dtx_chore.chore, dtx_leader_exec_ops_chore); if (rc != 0) { - D_ERROR("ult create failed [%u, %u] (2): "DF_RC"\n", - dlh->dlh_forward_idx, dlh->dlh_forward_cnt, DP_RC(rc)); + DL_ERROR(rc, "chore create failed [%u, %u] (2)", dlh->dlh_forward_idx, + dlh->dlh_forward_cnt); ABT_future_free(&dlh->dlh_future); goto out; } @@ -2168,10 +2187,9 @@ dtx_leader_exec_ops(struct dtx_leader_handle *dlh, dtx_sub_func_t func, /* The ones without DELAY flag will be skipped when scan the targets array. */ dlh->dlh_forward_cnt = dlh->dlh_normal_sub_cnt + dlh->dlh_delay_sub_cnt; - rc = dss_ult_create(dtx_leader_exec_ops_ult, &ult_arg, DSS_XS_IOFW, - dss_get_module_info()->dmi_tgt_id, DSS_DEEP_STACK_SZ, NULL); + rc = dss_chore_delegate(&dtx_chore.chore, dtx_leader_exec_ops_chore); if (rc != 0) { - D_ERROR("ult create failed (4): "DF_RC"\n", DP_RC(rc)); + DL_ERROR(rc, "chore create failed (4)"); ABT_future_free(&dlh->dlh_future); goto out; } diff --git a/src/dtx/dtx_rpc.c b/src/dtx/dtx_rpc.c index 324f8bffd3a..03b0b542383 100644 --- a/src/dtx/dtx_rpc.c +++ b/src/dtx/dtx_rpc.c @@ -363,6 +363,8 @@ dtx_req_wait(struct dtx_req_args *dra) } struct dtx_common_args { + struct dss_chore dca_chore; + ABT_eventual dca_chore_eventual; struct dtx_req_args dca_dra; d_list_t dca_head; struct btr_root dca_tree_root; @@ -373,57 +375,76 @@ struct dtx_common_args { d_rank_t dca_rank; uint32_t dca_tgtid; struct ds_cont_child *dca_cont; - ABT_thread dca_helper; struct dtx_id dca_dti_inline; struct dtx_id *dca_dtis; struct dtx_entry **dca_dtes; + + /* Chore-internal state variables */ + struct dtx_req_rec *dca_drr; + int dca_i; }; +/* If is_reentrance, this function ignores len. */ static int -dtx_req_list_send(struct dtx_common_args *dca, daos_epoch_t epoch, int len) +dtx_req_list_send(struct dtx_common_args *dca, daos_epoch_t epoch, int len, bool is_reentrance) { struct dtx_req_args *dra = &dca->dca_dra; - struct dtx_req_rec *drr; int rc; - int i = 0; - dra->dra_length = len; + if (!is_reentrance) { + dra->dra_length = len; + + rc = ABT_future_create(len, dtx_req_list_cb, &dra->dra_future); + if (rc != ABT_SUCCESS) { + D_ERROR("ABT_future_create failed for opc %x, len = %d: " + "rc = %d.\n", dra->dra_opc, len, rc); + return dss_abterr2der(rc); + } - rc = ABT_future_create(len, dtx_req_list_cb, &dra->dra_future); - if (rc != ABT_SUCCESS) { - D_ERROR("ABT_future_create failed for opc %x, len = %d: " - "rc = %d.\n", dra->dra_opc, len, rc); - return dss_abterr2der(rc); + D_DEBUG(DB_TRACE, "%p: DTX req for opc %x, future %p (%d) start.\n", + &dca->dca_chore, dra->dra_opc, dra->dra_future, len); } - D_DEBUG(DB_TRACE, "DTX req for opc %x, future %p start.\n", dra->dra_opc, dra->dra_future); + /* + * Begin or continue an iteration over dca_head. When beginning the + * iteration, dca->dca_drr does not point to a real entry, and is only + * safe for d_list_for_each_entry_continue. + */ + if (!is_reentrance) { + dca->dca_drr = d_list_entry(&dca->dca_head, struct dtx_req_rec, drr_link); + dca->dca_i = 0; + } + /* DO NOT add any line here! See the comment on dca->dca_drr above. 
*/ + d_list_for_each_entry_continue(dca->dca_drr, &dca->dca_head, drr_link) + { + D_DEBUG(DB_TRACE, "chore=%p: drr=%p i=%d\n", &dca->dca_chore, dca->dca_drr, + dca->dca_i); - d_list_for_each_entry(drr, &dca->dca_head, drr_link) { - drr->drr_parent = dra; - drr->drr_result = 0; + dca->dca_drr->drr_parent = dra; + dca->dca_drr->drr_result = 0; - if (unlikely(dra->dra_opc == DTX_COMMIT && i == 0 && + if (unlikely(dra->dra_opc == DTX_COMMIT && dca->dca_i == 0 && DAOS_FAIL_CHECK(DAOS_DTX_FAIL_COMMIT))) - rc = dtx_req_send(drr, 1); + rc = dtx_req_send(dca->dca_drr, 1); else - rc = dtx_req_send(drr, epoch); + rc = dtx_req_send(dca->dca_drr, epoch); if (rc != 0) { /* If the first sub-RPC failed, then break, otherwise * other remote replicas may have already received the * RPC and executed it, so have to go ahead. */ - if (i == 0) { + if (dca->dca_i == 0) { ABT_future_free(&dra->dra_future); return rc; } } /* Yield to avoid holding CPU for too long time. */ - if (++i % DTX_RPC_YIELD_THD == 0) - ABT_thread_yield(); + if (++(dca->dca_i) % DTX_RPC_YIELD_THD == 0) + return DSS_CHORE_YIELD; } - return 0; + return DSS_CHORE_DONE; } static int @@ -599,16 +620,22 @@ dtx_classify_one(struct ds_pool *pool, daos_handle_t tree, d_list_t *head, int * return rc > 0 ? 0 : rc; } -static int -dtx_rpc_internal(struct dtx_common_args *dca) +static enum dss_chore_status +dtx_rpc_helper(struct dss_chore *chore, bool is_reentrance) { + struct dtx_common_args *dca = container_of(chore, struct dtx_common_args, dca_chore); struct ds_pool *pool = dca->dca_cont->sc_pool->spc_pool; struct umem_attr uma = { 0 }; int length = 0; int rc; int i; - if (dca->dca_dra.dra_opc != DTX_REFRESH) { + if (is_reentrance) { + D_DEBUG(DB_TRACE, "%p: skip to send\n", &dca->dca_chore); + goto send; + } + + if (dca->dca_dtes != NULL) { D_ASSERT(dca->dca_dtis != NULL); if (dca->dca_count > 1) { @@ -616,7 +643,7 @@ dtx_rpc_internal(struct dtx_common_args *dca) rc = dbtree_create_inplace(DBTREE_CLASS_DTX_CF, 0, DTX_CF_BTREE_ORDER, &uma, &dca->dca_tree_root, &dca->dca_tree_hdl); if (rc != 0) - return rc; + goto done; } ABT_rwlock_rdlock(pool->sp_lock); @@ -626,7 +653,7 @@ dtx_rpc_internal(struct dtx_common_args *dca) dca->dca_rank, dca->dca_tgtid); if (rc < 0) { ABT_rwlock_unlock(pool->sp_lock); - return rc; + goto done; } daos_dti_copy(&dca->dca_dtis[i], &dca->dca_dtes[i]->dte_xid); @@ -636,30 +663,33 @@ dtx_rpc_internal(struct dtx_common_args *dca) /* For DTX_CHECK, if no other available target(s), then current target is the * unique valid one (and also 'prepared'), then related DTX can be committed. */ - if (d_list_empty(&dca->dca_head)) - return dca->dca_dra.dra_opc == DTX_CHECK ? DTX_ST_PREPARED : 0; + if (d_list_empty(&dca->dca_head)) { + rc = (dca->dca_dra.dra_opc == DTX_CHECK ? 
DTX_ST_PREPARED : 0); + goto done; + } } else { length = dca->dca_count; } D_ASSERT(length > 0); - return dtx_req_list_send(dca, dca->dca_epoch, length); -} - -static void -dtx_rpc_helper(void *arg) -{ - struct dtx_common_args *dca = arg; - int rc; - - rc = dtx_rpc_internal(dca); +send: + rc = dtx_req_list_send(dca, dca->dca_epoch, length, is_reentrance); + if (rc == DSS_CHORE_YIELD) + return DSS_CHORE_YIELD; + if (rc == DSS_CHORE_DONE) + rc = 0; +done: if (rc != 0) dca->dca_dra.dra_result = rc; - - D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, - "DTX helper ULT for %u exit: %d\n", dca->dca_dra.dra_opc, rc); + D_CDEBUG(rc < 0, DLOG_ERR, DB_TRACE, "%p: DTX RPC chore for %u done: %d\n", chore, + dca->dca_dra.dra_opc, rc); + if (dca->dca_chore_eventual != ABT_EVENTUAL_NULL) { + rc = ABT_eventual_set(dca->dca_chore_eventual, NULL, 0); + D_ASSERTF(rc == ABT_SUCCESS, "ABT_eventual_set: %d\n", rc); + } + return DSS_CHORE_DONE; } static int @@ -672,6 +702,7 @@ dtx_rpc_prep(struct ds_cont_child *cont,d_list_t *dti_list, struct dtx_entry ** memset(dca, 0, sizeof(*dca)); + dca->dca_chore_eventual = ABT_EVENTUAL_NULL; D_INIT_LIST_HEAD(&dca->dca_head); dca->dca_tree_hdl = DAOS_HDL_INVAL; dca->dca_epoch = epoch; @@ -679,7 +710,6 @@ dtx_rpc_prep(struct ds_cont_child *cont,d_list_t *dti_list, struct dtx_entry ** crt_group_rank(NULL, &dca->dca_rank); dca->dca_tgtid = dss_get_module_info()->dmi_tgt_id; dca->dca_cont = cont; - dca->dca_helper = ABT_THREAD_NULL; dca->dca_dtes = dtes; dra = &dca->dca_dra; @@ -705,11 +735,18 @@ dtx_rpc_prep(struct ds_cont_child *cont,d_list_t *dti_list, struct dtx_entry ** } /* Use helper ULT to handle DTX RPC if there are enough helper XS. */ - if (dss_has_enough_helper()) - rc = dss_ult_create(dtx_rpc_helper, dca, DSS_XS_IOFW, dca->dca_tgtid, - DSS_DEEP_STACK_SZ, &dca->dca_helper); - else - rc = dtx_rpc_internal(dca); + if (dss_has_enough_helper()) { + rc = ABT_eventual_create(0, &dca->dca_chore_eventual); + if (rc != ABT_SUCCESS) { + D_ERROR("failed to create eventual: %d\n", rc); + rc = dss_abterr2der(rc); + goto out; + } + rc = dss_chore_delegate(&dca->dca_chore, dtx_rpc_helper); + } else { + dss_chore_diy(&dca->dca_chore, dtx_rpc_helper); + rc = dca->dca_dra.dra_result; + } out: return rc; @@ -721,8 +758,12 @@ dtx_rpc_post(struct dtx_common_args *dca, int ret, bool keep_head) struct dtx_req_rec *drr; int rc; - if (dca->dca_helper != ABT_THREAD_NULL) - ABT_thread_free(&dca->dca_helper); + if (dca->dca_chore_eventual != ABT_EVENTUAL_NULL) { + rc = ABT_eventual_wait(dca->dca_chore_eventual, NULL); + D_ASSERTF(rc == ABT_SUCCESS, "ABT_eventual_wait: %d\n", rc); + rc = ABT_eventual_free(&dca->dca_chore_eventual); + D_ASSERTF(rc == ABT_SUCCESS, "ABT_eventual_free: %d\n", rc); + } rc = dtx_req_wait(&dca->dca_dra); diff --git a/src/engine/sched.c b/src/engine/sched.c index aae2fb5554b..988b8f2b6aa 100644 --- a/src/engine/sched.c +++ b/src/engine/sched.c @@ -1441,8 +1441,8 @@ sched_stop(struct dss_xstream *dx) process_all(dx); } -void -sched_cond_wait(ABT_cond cond, ABT_mutex mutex) +static void +cond_wait(ABT_cond cond, ABT_mutex mutex, bool for_business) { struct dss_xstream *dx = dss_current_xstream(); struct sched_info *info = &dx->dx_sched_info; @@ -1451,6 +1451,20 @@ sched_cond_wait(ABT_cond cond, ABT_mutex mutex) ABT_cond_wait(cond, mutex); D_ASSERT(info->si_wait_cnt > 0); info->si_wait_cnt -= 1; + if (for_business) + info->si_stats.ss_busy_ts = info->si_cur_ts; +} + +void +sched_cond_wait(ABT_cond cond, ABT_mutex mutex) +{ + cond_wait(cond, mutex, false /* for_business */); +} + 
+void +sched_cond_wait_for_business(ABT_cond cond, ABT_mutex mutex) +{ + cond_wait(cond, mutex, true /* for_business */); } uint64_t diff --git a/src/engine/srv.c b/src/engine/srv.c index 246ee975c64..986d8ed04c4 100644 --- a/src/engine/srv.c +++ b/src/engine/srv.c @@ -370,6 +370,7 @@ dss_srv_handler(void *arg) int rc; bool track_mem = false; bool signal_caller = true; + bool with_chore_queue = dx->dx_iofw && !dx->dx_main_xs; rc = dss_xstream_set_affinity(dx); if (rc) @@ -500,6 +501,16 @@ dss_srv_handler(void *arg) } } + if (with_chore_queue) { + rc = dss_chore_queue_start(dx); + if (rc != 0) { + DL_ERROR(rc, "failed to start chore queue"); + ABT_future_set(dx->dx_shutdown, dx); + wait_all_exited(dx, dmi); + goto nvme_fini; + } + } + dmi->dmi_xstream = dx; ABT_mutex_lock(xstream_data.xd_mutex); /* initialized everything for the ULT, notify the creator */ @@ -546,6 +557,9 @@ dss_srv_handler(void *arg) if (dx->dx_comm) dx->dx_progress_started = false; + if (with_chore_queue) + dss_chore_queue_stop(dx); + wait_all_exited(dx, dmi); if (dmi->dmi_dp) { daos_profile_destroy(dmi->dmi_dp); @@ -755,6 +769,8 @@ dss_start_one_xstream(hwloc_cpuset_t cpus, int tag, int xs_id) } else { dx->dx_main_xs = (xs_id >= dss_sys_xs_nr) && (xs_offset == 0); } + /* See the DSS_XS_IOFW case in sched_ult2xs. */ + dx->dx_iofw = xs_id >= dss_sys_xs_nr && (!dx->dx_main_xs || dss_tgt_offload_xs_nr == 0); dx->dx_dsc_started = false; /** @@ -783,6 +799,12 @@ dss_start_one_xstream(hwloc_cpuset_t cpus, int tag, int xs_id) D_GOTO(out_dx, rc); } + rc = dss_chore_queue_init(dx); + if (rc != 0) { + DL_ERROR(rc, "initialize chore queue fails"); + goto out_sched; + } + dss_mem_stats_init(&dx->dx_mem_stats, xs_id); /** start XS, ABT rank 0 is reserved for the primary xstream */ @@ -790,7 +812,7 @@ dss_start_one_xstream(hwloc_cpuset_t cpus, int tag, int xs_id) &dx->dx_xstream); if (rc != ABT_SUCCESS) { D_ERROR("create xstream fails %d\n", rc); - D_GOTO(out_sched, rc = dss_abterr2der(rc)); + D_GOTO(out_chore_queue, rc = dss_abterr2der(rc)); } rc = ABT_thread_attr_create(&attr); @@ -839,6 +861,8 @@ dss_start_one_xstream(hwloc_cpuset_t cpus, int tag, int xs_id) ABT_thread_attr_free(&attr); ABT_xstream_join(dx->dx_xstream); ABT_xstream_free(&dx->dx_xstream); +out_chore_queue: + dss_chore_queue_fini(dx); out_sched: dss_sched_fini(dx); out_dx: @@ -898,6 +922,7 @@ dss_xstreams_fini(bool force) dx = xstream_data.xd_xs_ptrs[i]; if (dx == NULL) continue; + dss_chore_queue_fini(dx); dss_sched_fini(dx); dss_xstream_free(dx); xstream_data.xd_xs_ptrs[i] = NULL; diff --git a/src/engine/srv_internal.h b/src/engine/srv_internal.h index 892e6ae3dc4..8621175b44f 100644 --- a/src/engine/srv_internal.h +++ b/src/engine/srv_internal.h @@ -60,6 +60,15 @@ struct mem_stats { uint64_t ms_current; }; +/* See dss_chore. 
*/ +struct dss_chore_queue { + d_list_t chq_list; + bool chq_stop; + ABT_mutex chq_mutex; + ABT_cond chq_cond; + ABT_thread chq_ult; +}; + /** Per-xstream configuration data */ struct dss_xstream { char dx_name[DSS_XS_NAME_LEN]; @@ -85,6 +94,7 @@ struct dss_xstream { unsigned int dx_timeout; bool dx_main_xs; /* true for main XS */ bool dx_comm; /* true with cart context */ + bool dx_iofw; /* true for DSS_XS_IOFW XS */ bool dx_dsc_started; /* DSC progress ULT started */ struct mem_stats dx_mem_stats; /* memory usages stats on this xstream */ #ifdef ULT_MMAP_STACK @@ -93,6 +103,7 @@ struct dss_xstream { #endif bool dx_progress_started; /* Network poll started */ int dx_tag; /** tag for xstream */ + struct dss_chore_queue dx_chore_queue; }; /** Engine module's metrics */ @@ -370,4 +381,9 @@ dss_xstream_has_nvme(struct dss_xstream *dx) return false; } +int dss_chore_queue_init(struct dss_xstream *dx); +int dss_chore_queue_start(struct dss_xstream *dx); +void dss_chore_queue_stop(struct dss_xstream *dx); +void dss_chore_queue_fini(struct dss_xstream *dx); + #endif /* __DAOS_SRV_INTERNAL__ */ diff --git a/src/engine/ult.c b/src/engine/ult.c index 204381755fb..1e8743fcd89 100644 --- a/src/engine/ult.c +++ b/src/engine/ult.c @@ -610,3 +610,212 @@ dss_main_exec(void (*func)(void *), void *arg) return dss_ult_create(func, arg, DSS_XS_SELF, info->dmi_tgt_id, 0, NULL); } + +static void +dss_chore_diy_internal(struct dss_chore *chore) +{ +reenter: + D_DEBUG(DB_TRACE, "%p: status=%d\n", chore, chore->cho_status); + chore->cho_status = chore->cho_func(chore, chore->cho_status == DSS_CHORE_YIELD); + D_ASSERT(chore->cho_status != DSS_CHORE_NEW); + if (chore->cho_status == DSS_CHORE_YIELD) { + ABT_thread_yield(); + goto reenter; + } +} + +static void +dss_chore_ult(void *arg) +{ + struct dss_chore *chore = arg; + + dss_chore_diy_internal(chore); +} + +/** + * Add \a chore for \a func to the chore queue of some other xstream. + * + * \param[in] chore address of the embedded chore object + * \param[in] func function to be executed via \a chore + * + * \retval -DER_CANCEL chore queue stopping + */ +int +dss_chore_delegate(struct dss_chore *chore, dss_chore_func_t func) +{ + struct dss_module_info *info = dss_get_module_info(); + int xs_id; + struct dss_xstream *dx; + struct dss_chore_queue *queue; + + chore->cho_status = DSS_CHORE_NEW; + chore->cho_func = func; + + /* + * The dss_chore_queue_ult approach may get insufficient scheduling on + * a "main" xstream when the chore queue is long. So we fall back to + * the one-ULT-per-chore approach if there's no helper xstream. + */ + if (dss_tgt_offload_xs_nr == 0) { + D_INIT_LIST_HEAD(&chore->cho_link); + return dss_ult_create(dss_chore_ult, chore, DSS_XS_IOFW, info->dmi_tgt_id, + 0 /* stack_size */, NULL /* ult */); + } + + /* Find the chore queue. */ + xs_id = sched_ult2xs(DSS_XS_IOFW, info->dmi_tgt_id); + D_ASSERT(xs_id != -DER_INVAL); + dx = dss_get_xstream(xs_id); + D_ASSERT(dx != NULL); + queue = &dx->dx_chore_queue; + D_ASSERT(queue != NULL); + + ABT_mutex_lock(queue->chq_mutex); + if (queue->chq_stop) { + ABT_mutex_unlock(queue->chq_mutex); + return -DER_CANCELED; + } + d_list_add_tail(&chore->cho_link, &queue->chq_list); + ABT_cond_broadcast(queue->chq_cond); + ABT_mutex_unlock(queue->chq_mutex); + + D_DEBUG(DB_TRACE, "%p: tgt_id=%d -> xs_id=%d dx.tgt_id=%d\n", chore, info->dmi_tgt_id, + xs_id, dx->dx_tgt_id); + return 0; +} + +/** + * Do \a chore for \a func synchronously in the current ULT. 
+ * + * \param[in] chore embedded chore object + * \param[in] func function to be executed via \a chore + */ +void +dss_chore_diy(struct dss_chore *chore, dss_chore_func_t func) +{ + D_INIT_LIST_HEAD(&chore->cho_link); + chore->cho_status = DSS_CHORE_NEW; + chore->cho_func = func; + + dss_chore_diy_internal(chore); +} + +static void +dss_chore_queue_ult(void *arg) +{ + struct dss_chore_queue *queue = arg; + d_list_t list = D_LIST_HEAD_INIT(list); + + D_ASSERT(queue != NULL); + D_DEBUG(DB_TRACE, "begin\n"); + + for (;;) { + struct dss_chore *chore; + struct dss_chore *chore_tmp; + bool stop = false; + + /* + * The scheduling order shall be + * + * [queue->chq_list] [list], + * + * where list contains chores that have returned + * DSS_CHORE_YIELD in the previous iteration. + */ + ABT_mutex_lock(queue->chq_mutex); + for (;;) { + if (!d_list_empty(&queue->chq_list)) { + d_list_splice_init(&queue->chq_list, &list); + break; + } + if (!d_list_empty(&list)) + break; + if (queue->chq_stop) { + stop = true; + break; + } + sched_cond_wait_for_business(queue->chq_cond, queue->chq_mutex); + } + ABT_mutex_unlock(queue->chq_mutex); + + if (stop) + break; + + d_list_for_each_entry_safe(chore, chore_tmp, &list, cho_link) { + bool is_reentrance = (chore->cho_status == DSS_CHORE_YIELD); + + D_DEBUG(DB_TRACE, "%p: before: status=%d\n", chore, chore->cho_status); + chore->cho_status = chore->cho_func(chore, is_reentrance); + D_ASSERT(chore->cho_status != DSS_CHORE_NEW); + D_DEBUG(DB_TRACE, "%p: after: status=%d\n", chore, chore->cho_status); + if (chore->cho_status == DSS_CHORE_DONE) + d_list_del_init(&chore->cho_link); + ABT_thread_yield(); + } + } + + D_DEBUG(DB_TRACE, "end\n"); +} + +int +dss_chore_queue_init(struct dss_xstream *dx) +{ + struct dss_chore_queue *queue = &dx->dx_chore_queue; + int rc; + + D_INIT_LIST_HEAD(&queue->chq_list); + queue->chq_stop = false; + + rc = ABT_mutex_create(&queue->chq_mutex); + if (rc != ABT_SUCCESS) { + D_ERROR("failed to create chore queue mutex: %d\n", rc); + return dss_abterr2der(rc); + } + + rc = ABT_cond_create(&queue->chq_cond); + if (rc != ABT_SUCCESS) { + D_ERROR("failed to create chore queue condition variable: %d\n", rc); + ABT_mutex_free(&queue->chq_mutex); + return dss_abterr2der(rc); + } + + return 0; +} + +int +dss_chore_queue_start(struct dss_xstream *dx) +{ + struct dss_chore_queue *queue = &dx->dx_chore_queue; + int rc; + + rc = daos_abt_thread_create(dx->dx_sp, dss_free_stack_cb, dx->dx_pools[DSS_POOL_GENERIC], + dss_chore_queue_ult, queue, ABT_THREAD_ATTR_NULL, + &queue->chq_ult); + if (rc != 0) { + D_ERROR("failed to create chore queue ULT: %d\n", rc); + return dss_abterr2der(rc); + } + + return 0; +} + +void +dss_chore_queue_stop(struct dss_xstream *dx) +{ + struct dss_chore_queue *queue = &dx->dx_chore_queue; + + ABT_mutex_lock(queue->chq_mutex); + queue->chq_stop = true; + ABT_cond_broadcast(queue->chq_cond); + ABT_mutex_unlock(queue->chq_mutex); + ABT_thread_free(&queue->chq_ult); +} + +void +dss_chore_queue_fini(struct dss_xstream *dx) +{ + struct dss_chore_queue *queue = &dx->dx_chore_queue; + + ABT_cond_free(&queue->chq_cond); + ABT_mutex_free(&queue->chq_mutex); +} diff --git a/src/include/daos_srv/daos_engine.h b/src/include/daos_srv/daos_engine.h index afdf267cd60..06a927b8d3f 100644 --- a/src/include/daos_srv/daos_engine.h +++ b/src/include/daos_srv/daos_engine.h @@ -338,6 +338,13 @@ int sched_req_space_check(struct sched_request *req); */ void sched_cond_wait(ABT_cond cond, ABT_mutex mutex); +/** + * Wrapper of ABT_cond_wait(), inform 
scheduler that it's going + * to be blocked for a relative long time. Unlike sched_cond_wait, + * after waking up, this function will prevent relaxing for a while. + */ +void sched_cond_wait_for_business(ABT_cond cond, ABT_mutex mutex); + /** * Get current monotonic time in milli-seconds. */ @@ -812,4 +819,40 @@ enum dss_drpc_call_flag { int dss_drpc_call(int32_t module, int32_t method, void *req, size_t req_size, unsigned int flags, Drpc__Response **resp); +/** Status of a chore */ +enum dss_chore_status { + DSS_CHORE_NEW, /**< ready to be scheduled for the first time (private) */ + DSS_CHORE_YIELD, /**< ready to be scheduled again */ + DSS_CHORE_DONE /**< no more scheduling required */ +}; + +struct dss_chore; + +/** + * Must return either DSS_CHORE_YIELD (if yielding to other chores) or + * DSS_CHORE_DONE (if terminating). If \a is_reentrance is true, this is not + * the first time \a chore is scheduled. A typical implementation shall + * initialize its internal state variables if \a is_reentrance is false. See + * dtx_leader_exec_ops_chore for an example. + */ +typedef enum dss_chore_status (*dss_chore_func_t)(struct dss_chore *chore, bool is_reentrance); + +/** + * Chore (opaque) + * + * A simple task (e.g., an I/O forwarding task) that yields by returning + * DSS_CHORE_YIELD instead of calling ABT_thread_yield. This data structure + * shall be embedded in the user's own task data structure, which typically + * also includes arguments and internal state variables for \a cho_func. All + * fields are private. See dtx_chore for an example. + */ +struct dss_chore { + d_list_t cho_link; + enum dss_chore_status cho_status; + dss_chore_func_t cho_func; +}; + +int dss_chore_delegate(struct dss_chore *chore, dss_chore_func_t func); +void dss_chore_diy(struct dss_chore *chore, dss_chore_func_t func); + #endif /* __DSS_API_H__ */ From 5f615732f6c6e722b04f94177e6d1dbbd806980a Mon Sep 17 00:00:00 2001 From: Jeff Olivier Date: Mon, 15 Apr 2024 12:22:22 -0600 Subject: [PATCH 02/10] b/334920724 - Fix google/2.4 pylint issues disable CODEOWNERS for google branch disable upstream hardware tests on branch by default remove bad merge block fix ordering of imports Rename google-changeId.py set option for dynamic fuse Required-githooks: true Change-Id: I8b0edfdd18a48c2f132102a49ddc44ee04b02586 Signed-off-by: Jeff Olivier --- .github/CODEOWNERS | 62 +++++++++---------- Jenkinsfile | 8 +-- debian/rules | 4 +- site_scons/prereq_tools/base.py | 2 +- .../ftest/control/config_generate_output.py | 10 +-- src/tests/ftest/util/server_utils.py | 8 +-- ...google-changeId.py => google_change_id.py} | 5 +- utils/node_local_test.py | 1 - utils/rpms/daos.spec | 2 + utils/run_utest.py | 2 +- 10 files changed, 49 insertions(+), 55 deletions(-) rename utils/githooks/commit-msg.d/{google-changeId.py => google_change_id.py} (93%) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index e37f56a2ad8..26ea943729d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,58 +2,58 @@ # a component sha1 to ensure that corresponding package build is done #utils/build.config @daos-stack/release-engineering # or updates packaging in any way -utils/rpms @daos-stack/build-and-release-watchers +#utils/rpms @daos-stack/build-and-release-watchers -src/gurt @daos-stack/common-watchers -src/common @daos-stack/common-watchers +#src/gurt @daos-stack/common-watchers +#src/common @daos-stack/common-watchers # any PR that touches Go files should get a review from go-owners -*.go @daos-stack/go-owners @daos-stack/go-watchers +#*.go 
@daos-stack/go-owners @daos-stack/go-watchers # Notify vos-watcher of files touched affecting VOS -src/vos/ @daos-stack/vos-owners @daos-stack/vos-watchers -src/common/btree*.* @daos-stack/vos-owners @daos-stack/vos-watchers -src/include/daos/btree*.* @daos-stack/vos-owners @daos-stack/vos-watchers -src/include/daos_srv/vos*.* @daos-stack/vos-owners @daos-stack/vos-watchers -src/include/daos_srv/evtree.h @daos-stack/vos-owners @daos-stack/vos-watchers +#src/vos/ @daos-stack/vos-owners @daos-stack/vos-watchers +#src/common/btree*.* @daos-stack/vos-owners @daos-stack/vos-watchers +#src/include/daos/btree*.* @daos-stack/vos-owners @daos-stack/vos-watchers +#src/include/daos_srv/vos*.* @daos-stack/vos-owners @daos-stack/vos-watchers +#src/include/daos_srv/evtree.h @daos-stack/vos-owners @daos-stack/vos-watchers # Jenkinsfile changes should be reviewed by Release Engineering -Jenkinsfile @daos-stack/build-and-release-watchers +#Jenkinsfile @daos-stack/build-and-release-watchers # any PR that touches client API or high level client code -src/client @daos-stack/client-api-owners @daos-stack/client-api-watchers -src/include/daos_*.* @daos-stack/client-api-owners @daos-stack/client-api-watchers +#src/client @daos-stack/client-api-owners @daos-stack/client-api-watchers +#src/include/daos_*.* @daos-stack/client-api-owners @daos-stack/client-api-watchers # doc-watchers: files affecting documentation (docs, doxygen, etc.) -mkdocs.yml @daos-stack/doc-watchers -Doxyfile @daos-stack/doc-watchers -docs/ @daos-stack/doc-watchers +#mkdocs.yml @daos-stack/doc-watchers +#Doxyfile @daos-stack/doc-watchers +#docs/ @daos-stack/doc-watchers #src/include/*.h @daos-stack/doc-watchers -*.md @daos-stack/doc-watchers +#*.md @daos-stack/doc-watchers # dev-build-watchers: Files affecting local builds (e.g. 
SCons) -SConstruct @daos-stack/dev-build-owners @daos-stack/dev-build-watchers -SConscript @daos-stack/dev-build-owners @daos-stack/dev-build-watchers -site_scons/ @daos-stack/dev-build-owners @daos-stack/dev-build-watchers -utils/sl @daos-stack/dev-build-owners @daos-stack/dev-build-watchers +#SConstruct @daos-stack/dev-build-owners @daos-stack/dev-build-watchers +#SConscript @daos-stack/dev-build-owners @daos-stack/dev-build-watchers +#site_scons/ @daos-stack/dev-build-owners @daos-stack/dev-build-watchers +#utils/sl @daos-stack/dev-build-owners @daos-stack/dev-build-watchers # ftest-watchers: files affecting functional tests -src/tests/ftest @daos-stack/ftest-owners @daos-stack/ftest-watchers +#src/tests/ftest @daos-stack/ftest-owners @daos-stack/ftest-watchers # telem-watchers: Changes related to the telemetry library -src/utils/daos_metrics @daos-stack/telem-watchers -src/gurt/telemetry.c @daos-stack/telem-watchers +#src/utils/daos_metrics @daos-stack/telem-watchers +#src/gurt/telemetry.c @daos-stack/telem-watchers # PRs that touch the object layer -src/object/ @daos-stack/object-owners @daos-stack/object-watchers -src/rebuild/ @daos-stack/object-owners @daos-stack/object-watchers -src/dtx/ @daos-stack/object-owners @daos-stack/object-watchers +#src/object/ @daos-stack/object-owners @daos-stack/object-watchers +#src/rebuild/ @daos-stack/object-owners @daos-stack/object-watchers +#src/dtx/ @daos-stack/object-owners @daos-stack/object-watchers # PRs that touch the CaRT layer -src/cart/ @daos-stack/cart-owners @daos-stack/cart-watchers +#src/cart/ @daos-stack/cart-owners @daos-stack/cart-watchers # PRs that touch the metadata -src/pool/ @daos-stack/metadata-owners @daos-stack/metadata-watchers -src/container/ @daos-stack/metadata-owners @daos-stack/metadata-watchers -src/rdb/ @daos-stack/metadata-owners @daos-stack/metadata-watchers -src/rsvc/ @daos-stack/metadata-owners @daos-stack/metadata-watchers +#src/pool/ @daos-stack/metadata-owners @daos-stack/metadata-watchers +#src/container/ @daos-stack/metadata-owners @daos-stack/metadata-watchers +#src/rdb/ @daos-stack/metadata-owners @daos-stack/metadata-watchers +#src/rsvc/ @daos-stack/metadata-owners @daos-stack/metadata-watchers diff --git a/Jenkinsfile b/Jenkinsfile index 58a77c1710f..e60b149c158 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -289,16 +289,16 @@ pipeline { description: 'Run the Functional on Ubuntu 20.04 test stage' + ' Requires CI_MORE_FUNCTIONAL_PR_TESTS') booleanParam(name: 'CI_medium_TEST', - defaultValue: true, + defaultValue: false, description: 'Run the Functional Hardware Medium test stage') booleanParam(name: 'CI_medium-verbs-provider_TEST', - defaultValue: true, + defaultValue: false, description: 'Run the Functional Hardware Medium Verbs Provider test stage') booleanParam(name: 'CI_medium-ucx-provider_TEST', - defaultValue: true, + defaultValue: false, description: 'Run the Functional Hardware Medium UCX Provider test stage') booleanParam(name: 'CI_large_TEST', - defaultValue: true, + defaultValue: false, description: 'Run the Functional Hardware Large test stage') string(name: 'CI_UNIT_VM1_LABEL', defaultValue: 'ci_vm1', diff --git a/debian/rules b/debian/rules index b1ecfbb69a7..5b4bb7f6b85 100755 --- a/debian/rules +++ b/debian/rules @@ -18,7 +18,7 @@ endif SCONS = scons -j $(NUMJOBS) DEB_SCONS_OPTIONS := --config=force --no-rpath USE_INSTALLED=all \ CONF_DIR=$(sysconfdir)/daos PREFIX=$(prefix) \ - $(SCONS_ARGS) + $(SCONS_ARGS) STATIC_FUSE=0 export GOCACHE := $(CURDIR)/.gocache @@ -34,7 +34,7 @@ 
override_dh_auto_build: override_dh_auto_clean: echo $(DEB_BUILD_OPTIONS) - $(SCONS) --clean + $(SCONS) --clean STATIC_FUSE=0 rm -rf build install find . -name '*.pyc' -delete rm -rf _build.external-Linux diff --git a/site_scons/prereq_tools/base.py b/site_scons/prereq_tools/base.py index 847faf5b7c6..e1d86d54899 100644 --- a/site_scons/prereq_tools/base.py +++ b/site_scons/prereq_tools/base.py @@ -520,7 +520,7 @@ def run_build(self, opts): # argobots is not really needed by client but it's difficult to separate common_reqs = ['argobots', 'ucx', 'ofi', 'hwloc', 'mercury', 'boost', 'uuid', 'crypto', 'protobufc', 'lz4', 'isal', 'isal_crypto'] - client_reqs = ['fuse', 'json-c', 'capstone', 'archive'] + client_reqs = ['fuse', 'json-c', 'capstone'] server_reqs = ['pmdk', 'spdk', 'ipmctl'] test_reqs = ['cmocka'] diff --git a/src/tests/ftest/control/config_generate_output.py b/src/tests/ftest/control/config_generate_output.py index 6ff16a9d5e5..d4052ccd275 100644 --- a/src/tests/ftest/control/config_generate_output.py +++ b/src/tests/ftest/control/config_generate_output.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2018-2023 Intel Corporation. + (C) Copyright 2018-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -194,14 +194,6 @@ def test_basic_config(self): self.prepare_expected_data() # 1. Call dmg config generate. - if storage["class"] == "ram": - # Verify scm_list value is not set: - if "scm_list" in storage: - errors.append("unexpected scm_list field exists in ram tier") - # Verify scm_size value is set: - if "scm_size" not in storage: - errors.append("Expected scm_size field does not exist in ram tier") - scm_found = True result = self.get_dmg_command().config_generate( access_points="wolf-a", net_provider=self.def_provider) generated_yaml = yaml.safe_load(result.stdout) diff --git a/src/tests/ftest/util/server_utils.py b/src/tests/ftest/util/server_utils.py index 6491f4fc808..82981206ed5 100644 --- a/src/tests/ftest/util/server_utils.py +++ b/src/tests/ftest/util/server_utils.py @@ -658,10 +658,10 @@ def stop(self): if self.manager.job.using_nvme: # Reset the storage - #try: - # self.reset_storage() - #except ServerFailed as error: - # messages.append(str(error)) + # try: + # self.reset_storage() + # except ServerFailed as error: + # messages.append(str(error)) # Make sure the mount directory belongs to non-root user self.set_scm_mount_ownership() diff --git a/utils/githooks/commit-msg.d/google-changeId.py b/utils/githooks/commit-msg.d/google_change_id.py similarity index 93% rename from utils/githooks/commit-msg.d/google-changeId.py rename to utils/githooks/commit-msg.d/google_change_id.py index b6752f6aac7..6d1403157b7 100755 --- a/utils/githooks/commit-msg.d/google-changeId.py +++ b/utils/githooks/commit-msg.d/google_change_id.py @@ -2,7 +2,8 @@ """Wrapper for changeId hook""" import os import sys -import subprocess +from subprocess import call as subcall # nosec B404 + def run_changeid_hook(): """Execute the changeId.sh hook from user area""" @@ -35,7 +36,7 @@ def run_changeid_hook(): args = sys.argv args[0] = hookpath - return subprocess.call(args) + return subcall(args) if __name__ == "__main__": diff --git a/utils/node_local_test.py b/utils/node_local_test.py index c7062afdcdc..2af41345b04 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -20,7 +20,6 @@ import pickle # nosec import pprint import pwd -import random import re import shutil import signal diff --git a/utils/rpms/daos.spec b/utils/rpms/daos.spec index 4445187313e..abaaf03ef52 
100644 --- a/utils/rpms/daos.spec +++ b/utils/rpms/daos.spec @@ -298,6 +298,7 @@ This is the package that bridges the difference between the MOFED openmpi --config=force \ --no-rpath \ USE_INSTALLED=all \ + STATIC_FUSE=0 \ FIRMWARE_MGMT=yes \ CONF_DIR=%{conf_dir} \ %{?scons_args} \ @@ -315,6 +316,7 @@ mv test.cov{,-build} %{buildroot}%{_prefix} \ %{buildroot}%{conf_dir} \ USE_INSTALLED=all \ + STATIC_FUSE=0 \ FIRMWARE_MGMT=yes \ CONF_DIR=%{conf_dir} \ PREFIX=%{_prefix} \ diff --git a/utils/run_utest.py b/utils/run_utest.py index c3f33b4738c..c5fe7cfc6ba 100755 --- a/utils/run_utest.py +++ b/utils/run_utest.py @@ -7,6 +7,7 @@ Test script for running all DAOS unit tests """ import argparse +import getpass import json # pylint: disable=broad-except import os @@ -19,7 +20,6 @@ import yaml from junit_xml import TestCase, TestSuite -import getpass def check_version(): From fade6d667286baeacef049dad22392531875c010 Mon Sep 17 00:00:00 2001 From: Michael MacDonald Date: Fri, 15 Mar 2024 11:30:31 -0400 Subject: [PATCH 03/10] DAOS-15429 test: Fix Go unit tests (#13981) Fixes a couple of Go unit test failures: * Adjust gRPC client tests to behave correctly with newer gRPC versions * Don't run the syslogger test if syslogd is not running * Adjust some system tests to use paths that are more likely to exist on most systems Change-Id: Ifb3198435113f3dc251c6c0822b66c157aa1a369 Signed-off-by: Michael MacDonald --- src/control/lib/control/rpc_test.go | 10 ++++++++-- src/control/logging/syslog_test.go | 4 ++++ src/control/provider/system/system_linux_test.go | 4 ++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/control/lib/control/rpc_test.go b/src/control/lib/control/rpc_test.go index 92fbd7e836b..249216a5152 100644 --- a/src/control/lib/control/rpc_test.go +++ b/src/control/lib/control/rpc_test.go @@ -99,8 +99,11 @@ func TestControl_InvokeUnaryRPCAsync(t *testing.T) { "request timeout": { timeout: 1 * time.Nanosecond, req: &testRequest{ - rpcFn: func(_ context.Context, _ *grpc.ClientConn) (proto.Message, error) { + rpcFn: func(ctx context.Context, _ *grpc.ClientConn) (proto.Message, error) { time.Sleep(1 * time.Microsecond) + if ctx.Err() != nil { + return nil, ctx.Err() + } return defaultMessage, nil }, }, @@ -120,7 +123,10 @@ func TestControl_InvokeUnaryRPCAsync(t *testing.T) { } }(), req: &testRequest{ - rpcFn: func(_ context.Context, _ *grpc.ClientConn) (proto.Message, error) { + rpcFn: func(ctx context.Context, _ *grpc.ClientConn) (proto.Message, error) { + if ctx.Err() != nil { + return nil, ctx.Err() + } time.Sleep(10 * time.Second) // shouldn't be allowed to run this long return defaultMessage, nil }, diff --git a/src/control/logging/syslog_test.go b/src/control/logging/syslog_test.go index b0710bb475f..d57ad1a8a8b 100644 --- a/src/control/logging/syslog_test.go +++ b/src/control/logging/syslog_test.go @@ -32,6 +32,10 @@ func TestSyslogOutput(t *testing.T) { t.Log("current user does not have permissions to view system log") return } + if _, err := syslog.New(syslog.LOG_ALERT, "test"); err != nil { + t.Logf("unable to connect to syslog: %s -- not running this test", err) + return + } rand.Seed(time.Now().UnixNano()) runes := []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ") diff --git a/src/control/provider/system/system_linux_test.go b/src/control/provider/system/system_linux_test.go index 1e0da4ac0ad..4a1b0c0d3eb 100644 --- a/src/control/provider/system/system_linux_test.go +++ b/src/control/provider/system/system_linux_test.go @@ -97,7 +97,7 @@ func 
TestIsMounted(t *testing.T) { expErr: errors.New("no such file or directory"), }, "neither dir nor device": { - target: "/dev/log", + target: "/dev/stderr", expErr: errors.New("not a valid mount target"), }, } { @@ -173,7 +173,7 @@ func TestSystemLinux_GetFsType(t *testing.T) { expErr: syscall.ENOENT, }, "temp dir": { - path: "/run", + path: "/dev", expResult: &FsType{ Name: "tmpfs", NoSUID: true, From dd900475c15b466e76f90fa756f67b907ac5963c Mon Sep 17 00:00:00 2001 From: Ashley Pittman Date: Tue, 17 Oct 2023 13:46:42 +0100 Subject: [PATCH 04/10] DAOS-13490 test: Update valgrind suppressions. (#13142) A go change has introduced some new failure traces, suppress them. Signed-off-by: Ashley Pittman --- src/cart/utils/memcheck-cart.supp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/cart/utils/memcheck-cart.supp b/src/cart/utils/memcheck-cart.supp index 7ba54007813..e0feb17fef2 100644 --- a/src/cart/utils/memcheck-cart.supp +++ b/src/cart/utils/memcheck-cart.supp @@ -444,6 +444,24 @@ ... fun:indexbytebody } +{ + go-cond-racecall + Memcheck:Cond + ... + fun:racecall +} +{ + go-value8-write_racecall + Memcheck:Value8 + fun:__tsan_write + fun:racecall +} +{ + go-value8-racecall + Memcheck:Value8 + fun:_ZN6__tsan9ShadowSetEPNS_9RawShadowES1_S0_ + fun:racecall +} { FI leak 8 Memcheck:Leak From 14061590a498793deadad9a88e5e356bea8306a8 Mon Sep 17 00:00:00 2001 From: Mohamad Chaarawi Date: Fri, 9 Feb 2024 15:00:16 -0600 Subject: [PATCH 05/10] DAOS-15159 test: add a supression for new valgrind warning in NLT (#13782) Signed-off-by: Mohamad Chaarawi --- src/cart/utils/memcheck-cart.supp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/cart/utils/memcheck-cart.supp b/src/cart/utils/memcheck-cart.supp index e0feb17fef2..05916304279 100644 --- a/src/cart/utils/memcheck-cart.supp +++ b/src/cart/utils/memcheck-cart.supp @@ -557,3 +557,9 @@ Memcheck:Value8 fun:aeshashbody } +{ + DAOS-15159 + Memcheck:Param + write(buf) + fun:runtime/internal/syscall.Syscall6 +} From 7f9d8c89b603b1df81a6dde5531a15a109c94e56 Mon Sep 17 00:00:00 2001 From: Jerome Soumagne Date: Thu, 21 Mar 2024 14:04:14 -0500 Subject: [PATCH 06/10] DAOS-14669 test: switch tcp;ofi_rxm testing to tcp (#13365) Signed-off-by: Jerome Soumagne Signed-off-by: Jeff Olivier --- src/cart/utils/memcheck-cart.supp | 4 ++++ src/tests/ftest/cart/corpc/corpc_five_node.yaml | 2 +- src/tests/ftest/cart/corpc/corpc_two_node.yaml | 2 +- src/tests/ftest/cart/ctl/ctl_five_node.yaml | 2 +- .../ghost_rank_rpc/ghost_rank_rpc_one_node.yaml | 2 +- src/tests/ftest/cart/group_test/group_test.yaml | 2 +- src/tests/ftest/cart/iv/iv_one_node.yaml | 2 +- src/tests/ftest/cart/iv/iv_two_node.yaml | 2 +- .../ftest/cart/no_pmix/multictx_one_node.yaml | 2 ++ src/tests/ftest/cart/no_pmix_group_test.c | 2 +- .../cart/nopmix_launcher/launcher_one_node.yaml | 2 +- .../ftest/cart/rpc/multisend_one_node.yaml | 2 +- src/tests/ftest/cart/rpc/rpc_one_node.yaml | 2 +- src/tests/ftest/cart/rpc/rpc_two_node.yaml | 2 +- src/tests/ftest/cart/rpc/swim_notification.yaml | 2 +- .../cart/selftest/selftest_three_node.yaml | 2 +- src/tests/ftest/cart/utest/utest_portnumber.c | 17 ++++------------- .../ftest/control/config_generate_output.py | 2 +- .../ftest/control/config_generate_run.yaml | 14 +++++++------- .../ftest/server/multiengine_persocket.yaml | 2 +- src/tests/ftest/util/dmg_utils.py | 5 ++--- src/tests/ftest/util/network_utils.py | 4 ++-- src/tests/ftest/util/server_utils_params.py | 7 ++++--- .../vcluster/daos-server/el8/daos_server.yml.in | 2 +- 
utils/test_memcheck.supp | 15 ++++++++++++++- 25 files changed, 56 insertions(+), 46 deletions(-) diff --git a/src/cart/utils/memcheck-cart.supp b/src/cart/utils/memcheck-cart.supp index 05916304279..1a616b99ce4 100644 --- a/src/cart/utils/memcheck-cart.supp +++ b/src/cart/utils/memcheck-cart.supp @@ -501,9 +501,12 @@ Memcheck:Param sendmsg(msg.msg_iov[1]) ... + fun:sendmsg + fun:ofi_sockapi_sendv_socket fun:ofi_bsock_sendv ... fun:fi_senddata + ... } { Tcp provider with ofi rxm @@ -513,6 +516,7 @@ fun:ofi_bsock_sendv ... fun:fi_tsend + ... } { Tcp provider with ofi rxm 2 diff --git a/src/tests/ftest/cart/corpc/corpc_five_node.yaml b/src/tests/ftest/cart/corpc/corpc_five_node.yaml index 0fdd2890a25..d26ade1928b 100644 --- a/src/tests/ftest/cart/corpc/corpc_five_node.yaml +++ b/src/tests/ftest/cart/corpc/corpc_five_node.yaml @@ -10,7 +10,7 @@ ENV: - test_servers_CRT_CTX_NUM: "0" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" env_CRT_CTX_SHARE_ADDR: !mux no_sep: env: no_sep diff --git a/src/tests/ftest/cart/corpc/corpc_two_node.yaml b/src/tests/ftest/cart/corpc/corpc_two_node.yaml index 7f1c4eb488a..04f568b6447 100644 --- a/src/tests/ftest/cart/corpc/corpc_two_node.yaml +++ b/src/tests/ftest/cart/corpc/corpc_two_node.yaml @@ -10,7 +10,7 @@ ENV: - test_servers_CRT_CTX_NUM: "16" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" env_CRT_CTX_SHARE_ADDR: !mux no_sep: env: no_sep diff --git a/src/tests/ftest/cart/ctl/ctl_five_node.yaml b/src/tests/ftest/cart/ctl/ctl_five_node.yaml index d69f13874cf..52408336aab 100644 --- a/src/tests/ftest/cart/ctl/ctl_five_node.yaml +++ b/src/tests/ftest/cart/ctl/ctl_five_node.yaml @@ -11,7 +11,7 @@ ENV: - test_clients_CRT_CTX_NUM: "0" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" env_CRT_CTX_SHARE_ADDR: !mux no_sep: env: no_sep diff --git a/src/tests/ftest/cart/ghost_rank_rpc/ghost_rank_rpc_one_node.yaml b/src/tests/ftest/cart/ghost_rank_rpc/ghost_rank_rpc_one_node.yaml index 9c8c50b0012..d347330674a 100644 --- a/src/tests/ftest/cart/ghost_rank_rpc/ghost_rank_rpc_one_node.yaml +++ b/src/tests/ftest/cart/ghost_rank_rpc/ghost_rank_rpc_one_node.yaml @@ -10,7 +10,7 @@ ENV: - test_servers_CRT_CTX_NUM: "16" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" env_CRT_CTX_SHARE_ADDR: !mux no_sep: env: no_sep diff --git a/src/tests/ftest/cart/group_test/group_test.yaml b/src/tests/ftest/cart/group_test/group_test.yaml index 829406b7136..977df306ff7 100644 --- a/src/tests/ftest/cart/group_test/group_test.yaml +++ b/src/tests/ftest/cart/group_test/group_test.yaml @@ -8,7 +8,7 @@ ENV: - OFI_INTERFACE: "eth0" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" hosts: !mux hosts_1: config: one_node diff --git a/src/tests/ftest/cart/iv/iv_one_node.yaml b/src/tests/ftest/cart/iv/iv_one_node.yaml index 0eab8a3027c..5c9a7926338 100644 --- a/src/tests/ftest/cart/iv/iv_one_node.yaml +++ b/src/tests/ftest/cart/iv/iv_one_node.yaml @@ -18,7 +18,7 @@ env_CRT_PHY_ADDR_STR: !mux sm: CRT_PHY_ADDR_STR: "sm" ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" hosts: !mux hosts_1: config: one_node diff --git a/src/tests/ftest/cart/iv/iv_two_node.yaml b/src/tests/ftest/cart/iv/iv_two_node.yaml index 3d0e3aece49..f0236bbb385 100644 --- a/src/tests/ftest/cart/iv/iv_two_node.yaml +++ 
b/src/tests/ftest/cart/iv/iv_two_node.yaml @@ -12,7 +12,7 @@ ENV: - CRT_TEST_CONT: "1" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" env_CRT_CTX_SHARE_ADDR: !mux no_sep: env: no_sep diff --git a/src/tests/ftest/cart/no_pmix/multictx_one_node.yaml b/src/tests/ftest/cart/no_pmix/multictx_one_node.yaml index 137a4f1c7ee..3e31a85de14 100644 --- a/src/tests/ftest/cart/no_pmix/multictx_one_node.yaml +++ b/src/tests/ftest/cart/no_pmix/multictx_one_node.yaml @@ -13,6 +13,8 @@ env_CRT_PHY_ADDR_STR: !mux sm: CRT_PHY_ADDR_STR: "sm" ofi_tcp: + CRT_PHY_ADDR_STR: "ofi+tcp" + ofi_tcp_rxm: CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" tests: !mux no_pmix_multi_ctx: diff --git a/src/tests/ftest/cart/no_pmix_group_test.c b/src/tests/ftest/cart/no_pmix_group_test.c index 7290b478aa5..d5db7f230f1 100644 --- a/src/tests/ftest/cart/no_pmix_group_test.c +++ b/src/tests/ftest/cart/no_pmix_group_test.c @@ -686,7 +686,7 @@ int main(int argc, char **argv) } for (i = 0; i < 10; i++) { - rc = asprintf(&uris[i], "ofi+tcp;ofi_rxm://127.0.0.1:%d", + rc = asprintf(&uris[i], "ofi+tcp://127.0.0.1:%d", 10000 + i); if (rc == -1) { D_ERROR("asprintf() failed\n"); diff --git a/src/tests/ftest/cart/nopmix_launcher/launcher_one_node.yaml b/src/tests/ftest/cart/nopmix_launcher/launcher_one_node.yaml index 845e962f46b..97dabc26fe2 100644 --- a/src/tests/ftest/cart/nopmix_launcher/launcher_one_node.yaml +++ b/src/tests/ftest/cart/nopmix_launcher/launcher_one_node.yaml @@ -8,7 +8,7 @@ ENV: - OFI_INTERFACE: "eth0" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" hosts: !mux hosts_1: config: one_node diff --git a/src/tests/ftest/cart/rpc/multisend_one_node.yaml b/src/tests/ftest/cart/rpc/multisend_one_node.yaml index 42700082b77..96bf23c1224 100644 --- a/src/tests/ftest/cart/rpc/multisend_one_node.yaml +++ b/src/tests/ftest/cart/rpc/multisend_one_node.yaml @@ -17,7 +17,7 @@ env_CRT_CTX_SHARE_ADDR: !mux CRT_CTX_SHARE_ADDR: "0" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" hosts: !mux hosts_1: config: one_node diff --git a/src/tests/ftest/cart/rpc/rpc_one_node.yaml b/src/tests/ftest/cart/rpc/rpc_one_node.yaml index 0e2210ef1bf..13adbb7b568 100644 --- a/src/tests/ftest/cart/rpc/rpc_one_node.yaml +++ b/src/tests/ftest/cart/rpc/rpc_one_node.yaml @@ -19,7 +19,7 @@ env_CRT_PHY_ADDR_STR: !mux sm: CRT_PHY_ADDR_STR: "sm" ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" hosts: !mux hosts_1: config: one_node diff --git a/src/tests/ftest/cart/rpc/rpc_two_node.yaml b/src/tests/ftest/cart/rpc/rpc_two_node.yaml index 8b3ae57880c..6c6f477530b 100644 --- a/src/tests/ftest/cart/rpc/rpc_two_node.yaml +++ b/src/tests/ftest/cart/rpc/rpc_two_node.yaml @@ -11,7 +11,7 @@ ENV: - test_clients_CRT_CTX_NUM: "16" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" env_CRT_CTX_SHARE_ADDR: !mux no_sep: env: no_sep diff --git a/src/tests/ftest/cart/rpc/swim_notification.yaml b/src/tests/ftest/cart/rpc/swim_notification.yaml index 796915d49c3..36ac59bbfc6 100644 --- a/src/tests/ftest/cart/rpc/swim_notification.yaml +++ b/src/tests/ftest/cart/rpc/swim_notification.yaml @@ -12,7 +12,7 @@ ENV: - test_clients_CRT_CTX_NUM: "16" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" env_CRT_CTX_SHARE_ADDR: !mux no_sep: env: no_sep diff --git 
From 6dafd4a7a157c38e07749be4301638bcb7702e45 Mon Sep 17 00:00:00 2001
From: Mohamad Chaarawi
Date: Fri, 9 Feb 2024 15:00:16 -0600
Subject: [PATCH 08/10] DAOS-15159 test: add a supression for new valgrind warning in NLT (#13782)

Signed-off-by: Mohamad Chaarawi
---
 src/cart/utils/memcheck-cart.supp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/cart/utils/memcheck-cart.supp b/src/cart/utils/memcheck-cart.supp
index e0feb17fef2..05916304279 100644
--- a/src/cart/utils/memcheck-cart.supp
+++ b/src/cart/utils/memcheck-cart.supp
@@ -557,3 +557,9 @@
 Memcheck:Value8
 fun:aeshashbody
 }
+{
+ DAOS-15159
+ Memcheck:Param
+ write(buf)
+ fun:runtime/internal/syscall.Syscall6
+}

From 0a668b320ea50685da2e2e8c0a63fee59bd6d527 Mon Sep 17 00:00:00 2001
From: Jerome Soumagne
Date: Thu, 21 Mar 2024 14:04:14 -0500
Subject: [PATCH 09/10] DAOS-14669 test: switch tcp;ofi_rxm testing to tcp (#13365)

Signed-off-by: Jerome Soumagne
---
 src/cart/utils/memcheck-cart.supp               |  4 ++++
 src/tests/ftest/cart/corpc/corpc_five_node.yaml |  2 +-
 src/tests/ftest/cart/corpc/corpc_two_node.yaml  |  2 +-
 src/tests/ftest/cart/ctl/ctl_five_node.yaml     |  2 +-
 .../ghost_rank_rpc/ghost_rank_rpc_one_node.yaml |  2 +-
 src/tests/ftest/cart/group_test/group_test.yaml |  2 +-
 src/tests/ftest/cart/iv/iv_one_node.yaml        |  2 +-
 src/tests/ftest/cart/iv/iv_two_node.yaml        |  2 +-
 .../ftest/cart/no_pmix/multictx_one_node.yaml   |  2 ++
 src/tests/ftest/cart/no_pmix_group_test.c       |  2 +-
 .../cart/nopmix_launcher/launcher_one_node.yaml |  2 +-
 .../ftest/cart/rpc/multisend_one_node.yaml      |  2 +-
 src/tests/ftest/cart/rpc/rpc_one_node.yaml      |  2 +-
 src/tests/ftest/cart/rpc/rpc_two_node.yaml      |  2 +-
 src/tests/ftest/cart/rpc/swim_notification.yaml |  2 +-
 .../cart/selftest/selftest_three_node.yaml      |  2 +-
 src/tests/ftest/cart/utest/utest_portnumber.c   | 17 ++++-------------
 .../ftest/control/config_generate_output.py     |  2 +-
 .../ftest/control/config_generate_run.yaml      | 14 +++++++-------
 .../ftest/server/multiengine_persocket.yaml     |  2 +-
 src/tests/ftest/util/dmg_utils.py               |  5 ++---
 src/tests/ftest/util/network_utils.py           |  4 ++--
 src/tests/ftest/util/server_utils_params.py     |  7 ++++---
 .../vcluster/daos-server/el8/daos_server.yml.in |  2 +-
 utils/test_memcheck.supp                        | 15 ++++++++++++++-
 25 files changed, 56 insertions(+), 46 deletions(-)

diff --git a/src/cart/utils/memcheck-cart.supp b/src/cart/utils/memcheck-cart.supp
index 05916304279..1a616b99ce4 100644
--- a/src/cart/utils/memcheck-cart.supp
+++ b/src/cart/utils/memcheck-cart.supp
@@ -501,9 +501,12 @@
 Memcheck:Param
 sendmsg(msg.msg_iov[1])
 ...
+ fun:sendmsg
+ fun:ofi_sockapi_sendv_socket
 fun:ofi_bsock_sendv
 ...
 fun:fi_senddata
+ ...
 }
 {
 Tcp provider with ofi rxm
@@ -513,6 +516,7 @@
 fun:ofi_bsock_sendv
 ...
 fun:fi_tsend
+ ...
} { Tcp provider with ofi rxm 2 diff --git a/src/tests/ftest/cart/corpc/corpc_five_node.yaml b/src/tests/ftest/cart/corpc/corpc_five_node.yaml index 0fdd2890a25..d26ade1928b 100644 --- a/src/tests/ftest/cart/corpc/corpc_five_node.yaml +++ b/src/tests/ftest/cart/corpc/corpc_five_node.yaml @@ -10,7 +10,7 @@ ENV: - test_servers_CRT_CTX_NUM: "0" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" env_CRT_CTX_SHARE_ADDR: !mux no_sep: env: no_sep diff --git a/src/tests/ftest/cart/corpc/corpc_two_node.yaml b/src/tests/ftest/cart/corpc/corpc_two_node.yaml index 7f1c4eb488a..04f568b6447 100644 --- a/src/tests/ftest/cart/corpc/corpc_two_node.yaml +++ b/src/tests/ftest/cart/corpc/corpc_two_node.yaml @@ -10,7 +10,7 @@ ENV: - test_servers_CRT_CTX_NUM: "16" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" env_CRT_CTX_SHARE_ADDR: !mux no_sep: env: no_sep diff --git a/src/tests/ftest/cart/ctl/ctl_five_node.yaml b/src/tests/ftest/cart/ctl/ctl_five_node.yaml index d69f13874cf..52408336aab 100644 --- a/src/tests/ftest/cart/ctl/ctl_five_node.yaml +++ b/src/tests/ftest/cart/ctl/ctl_five_node.yaml @@ -11,7 +11,7 @@ ENV: - test_clients_CRT_CTX_NUM: "0" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" env_CRT_CTX_SHARE_ADDR: !mux no_sep: env: no_sep diff --git a/src/tests/ftest/cart/ghost_rank_rpc/ghost_rank_rpc_one_node.yaml b/src/tests/ftest/cart/ghost_rank_rpc/ghost_rank_rpc_one_node.yaml index 9c8c50b0012..d347330674a 100644 --- a/src/tests/ftest/cart/ghost_rank_rpc/ghost_rank_rpc_one_node.yaml +++ b/src/tests/ftest/cart/ghost_rank_rpc/ghost_rank_rpc_one_node.yaml @@ -10,7 +10,7 @@ ENV: - test_servers_CRT_CTX_NUM: "16" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" env_CRT_CTX_SHARE_ADDR: !mux no_sep: env: no_sep diff --git a/src/tests/ftest/cart/group_test/group_test.yaml b/src/tests/ftest/cart/group_test/group_test.yaml index 829406b7136..977df306ff7 100644 --- a/src/tests/ftest/cart/group_test/group_test.yaml +++ b/src/tests/ftest/cart/group_test/group_test.yaml @@ -8,7 +8,7 @@ ENV: - OFI_INTERFACE: "eth0" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" hosts: !mux hosts_1: config: one_node diff --git a/src/tests/ftest/cart/iv/iv_one_node.yaml b/src/tests/ftest/cart/iv/iv_one_node.yaml index 0eab8a3027c..5c9a7926338 100644 --- a/src/tests/ftest/cart/iv/iv_one_node.yaml +++ b/src/tests/ftest/cart/iv/iv_one_node.yaml @@ -18,7 +18,7 @@ env_CRT_PHY_ADDR_STR: !mux sm: CRT_PHY_ADDR_STR: "sm" ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" hosts: !mux hosts_1: config: one_node diff --git a/src/tests/ftest/cart/iv/iv_two_node.yaml b/src/tests/ftest/cart/iv/iv_two_node.yaml index 3d0e3aece49..f0236bbb385 100644 --- a/src/tests/ftest/cart/iv/iv_two_node.yaml +++ b/src/tests/ftest/cart/iv/iv_two_node.yaml @@ -12,7 +12,7 @@ ENV: - CRT_TEST_CONT: "1" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" env_CRT_CTX_SHARE_ADDR: !mux no_sep: env: no_sep diff --git a/src/tests/ftest/cart/no_pmix/multictx_one_node.yaml b/src/tests/ftest/cart/no_pmix/multictx_one_node.yaml index 137a4f1c7ee..3e31a85de14 100644 --- a/src/tests/ftest/cart/no_pmix/multictx_one_node.yaml +++ b/src/tests/ftest/cart/no_pmix/multictx_one_node.yaml @@ -13,6 +13,8 @@ env_CRT_PHY_ADDR_STR: !mux sm: 
CRT_PHY_ADDR_STR: "sm" ofi_tcp: + CRT_PHY_ADDR_STR: "ofi+tcp" + ofi_tcp_rxm: CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" tests: !mux no_pmix_multi_ctx: diff --git a/src/tests/ftest/cart/no_pmix_group_test.c b/src/tests/ftest/cart/no_pmix_group_test.c index 7290b478aa5..d5db7f230f1 100644 --- a/src/tests/ftest/cart/no_pmix_group_test.c +++ b/src/tests/ftest/cart/no_pmix_group_test.c @@ -686,7 +686,7 @@ int main(int argc, char **argv) } for (i = 0; i < 10; i++) { - rc = asprintf(&uris[i], "ofi+tcp;ofi_rxm://127.0.0.1:%d", + rc = asprintf(&uris[i], "ofi+tcp://127.0.0.1:%d", 10000 + i); if (rc == -1) { D_ERROR("asprintf() failed\n"); diff --git a/src/tests/ftest/cart/nopmix_launcher/launcher_one_node.yaml b/src/tests/ftest/cart/nopmix_launcher/launcher_one_node.yaml index 845e962f46b..97dabc26fe2 100644 --- a/src/tests/ftest/cart/nopmix_launcher/launcher_one_node.yaml +++ b/src/tests/ftest/cart/nopmix_launcher/launcher_one_node.yaml @@ -8,7 +8,7 @@ ENV: - OFI_INTERFACE: "eth0" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" hosts: !mux hosts_1: config: one_node diff --git a/src/tests/ftest/cart/rpc/multisend_one_node.yaml b/src/tests/ftest/cart/rpc/multisend_one_node.yaml index 42700082b77..96bf23c1224 100644 --- a/src/tests/ftest/cart/rpc/multisend_one_node.yaml +++ b/src/tests/ftest/cart/rpc/multisend_one_node.yaml @@ -17,7 +17,7 @@ env_CRT_CTX_SHARE_ADDR: !mux CRT_CTX_SHARE_ADDR: "0" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" hosts: !mux hosts_1: config: one_node diff --git a/src/tests/ftest/cart/rpc/rpc_one_node.yaml b/src/tests/ftest/cart/rpc/rpc_one_node.yaml index 0e2210ef1bf..13adbb7b568 100644 --- a/src/tests/ftest/cart/rpc/rpc_one_node.yaml +++ b/src/tests/ftest/cart/rpc/rpc_one_node.yaml @@ -19,7 +19,7 @@ env_CRT_PHY_ADDR_STR: !mux sm: CRT_PHY_ADDR_STR: "sm" ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" hosts: !mux hosts_1: config: one_node diff --git a/src/tests/ftest/cart/rpc/rpc_two_node.yaml b/src/tests/ftest/cart/rpc/rpc_two_node.yaml index 8b3ae57880c..6c6f477530b 100644 --- a/src/tests/ftest/cart/rpc/rpc_two_node.yaml +++ b/src/tests/ftest/cart/rpc/rpc_two_node.yaml @@ -11,7 +11,7 @@ ENV: - test_clients_CRT_CTX_NUM: "16" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" env_CRT_CTX_SHARE_ADDR: !mux no_sep: env: no_sep diff --git a/src/tests/ftest/cart/rpc/swim_notification.yaml b/src/tests/ftest/cart/rpc/swim_notification.yaml index 796915d49c3..36ac59bbfc6 100644 --- a/src/tests/ftest/cart/rpc/swim_notification.yaml +++ b/src/tests/ftest/cart/rpc/swim_notification.yaml @@ -12,7 +12,7 @@ ENV: - test_clients_CRT_CTX_NUM: "16" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" env_CRT_CTX_SHARE_ADDR: !mux no_sep: env: no_sep diff --git a/src/tests/ftest/cart/selftest/selftest_three_node.yaml b/src/tests/ftest/cart/selftest/selftest_three_node.yaml index 3bbca841e7c..0ed3174dfa3 100644 --- a/src/tests/ftest/cart/selftest/selftest_three_node.yaml +++ b/src/tests/ftest/cart/selftest/selftest_three_node.yaml @@ -11,7 +11,7 @@ ENV: - test_clients_CRT_CTX_NUM: "16" env_CRT_PHY_ADDR_STR: !mux ofi_tcp: - CRT_PHY_ADDR_STR: "ofi+tcp;ofi_rxm" + CRT_PHY_ADDR_STR: "ofi+tcp" env_CRT_CTX_SHARE_ADDR: !mux no_sep: env: no_sep diff --git a/src/tests/ftest/cart/utest/utest_portnumber.c b/src/tests/ftest/cart/utest/utest_portnumber.c index 
435182e85af..05b796d3dab 100644 --- a/src/tests/ftest/cart/utest/utest_portnumber.c +++ b/src/tests/ftest/cart/utest/utest_portnumber.c @@ -1,5 +1,5 @@ /* - * (C) Copyright 2020-2022 Intel Corporation. + * (C) Copyright 2020-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -193,19 +193,11 @@ static void test_port_tcp(void **state) { d_setenv("OFI_INTERFACE", "lo", 1); - d_setenv("CRT_PHY_ADDR_STR", "ofi+tcp;ofi_rxm", 1); + d_setenv("CRT_PHY_ADDR_STR", "ofi+tcp", 1); run_test_fork(state); } #ifndef MY_TESTS_NOT_INCLUDED -static void -test_port_sockets(void **state) -{ - d_setenv("OFI_INTERFACE", "eth0", 1); - d_setenv("CRT_PHY_ADDR_STR", "ofi+tcp", 1); - run_test_fork(state); -}; - static void test_port_verb(void **state) { @@ -289,10 +281,9 @@ fini_tests(void **state) int main(int argc, char **argv) { const struct CMUnitTest tests[] = { - cmocka_unit_test(test_port_tcp), + cmocka_unit_test(test_port_tcp), #ifndef MY_TESTS_NOT_INCLUDED - cmocka_unit_test(test_port_sockets), - cmocka_unit_test(test_port_verb), + cmocka_unit_test(test_port_verb), #endif }; diff --git a/src/tests/ftest/control/config_generate_output.py b/src/tests/ftest/control/config_generate_output.py index d4052ccd275..b526e3d93b5 100644 --- a/src/tests/ftest/control/config_generate_output.py +++ b/src/tests/ftest/control/config_generate_output.py @@ -23,7 +23,7 @@ def __init__(self, *args, **kwargs): """Initialize a ConfigGenerateOutput object.""" super().__init__(*args, **kwargs) - self.def_provider = "ofi+tcp;ofi_rxm" + self.def_provider = "ofi+tcp" # Data structure that store expected values. self.numa_node_to_pci_addrs = defaultdict(set) diff --git a/src/tests/ftest/control/config_generate_run.yaml b/src/tests/ftest/control/config_generate_run.yaml index 6a2e9ea2519..c37ded50912 100644 --- a/src/tests/ftest/control/config_generate_run.yaml +++ b/src/tests/ftest/control/config_generate_run.yaml @@ -27,34 +27,34 @@ setup: config_generate_params: !mux # 1. Access points only. Use default for others. all_default: - net_provider: ofi+tcp;ofi_rxm + net_provider: ofi+tcp # 2. Use one engine. single_engine: num_engines: 1 - net_provider: ofi+tcp;ofi_rxm + net_provider: ofi+tcp # 3. Use scm_only=false. This will use the maximum number of SSDs, so the # generated config file should be identical to all_default. scm_only_false: scm_only: false - net_provider: ofi+tcp;ofi_rxm + net_provider: ofi+tcp # 4. Use scm_only=true. No NVMe entry. SCM only. scm_only_true: scm_only: true - net_provider: ofi+tcp;ofi_rxm + net_provider: ofi+tcp # 5. Use infiniband. This is usually the default behavior, so the generated # config file would be identical to all_default if the feature is working # correctly. infiniband: net_class: infiniband - net_provider: ofi+tcp;ofi_rxm + net_provider: ofi+tcp # 6. Use ethernet. There's usually only one ethernet interface available, so # use one engine. Each engine would need different interface. We could come up # with the maximum usable count, but that's out of scope. ethernet: net_class: ethernet num_engines: 1 - net_provider: ofi+tcp;ofi_rxm + net_provider: ofi+tcp # 7. Use tmpfs for scm instead of pmem. 
tmpfs_scm_true: use_tmpfs_scm: true - net_provider: ofi+tcp;ofi_rxm + net_provider: ofi+tcp diff --git a/src/tests/ftest/server/multiengine_persocket.yaml b/src/tests/ftest/server/multiengine_persocket.yaml index 8f8efb6702c..d183b2cb0dc 100644 --- a/src/tests/ftest/server/multiengine_persocket.yaml +++ b/src/tests/ftest/server/multiengine_persocket.yaml @@ -85,7 +85,7 @@ agent_config: dmg: transport_config: allow_insecure: false -provider: ofi+tcp;ofi_rxm +provider: ofi+tcp pool: control_method: dmg scm_size: 1G diff --git a/src/tests/ftest/util/dmg_utils.py b/src/tests/ftest/util/dmg_utils.py index b1680d17e46..35e0d6de2b7 100644 --- a/src/tests/ftest/util/dmg_utils.py +++ b/src/tests/ftest/util/dmg_utils.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2018-2023 Intel Corporation. + (C) Copyright 2018-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -120,7 +120,6 @@ def network_scan(self, provider=None): # ], # "Providers": [ # "ofi+verbs;ofi_rxm", - # "ofi+tcp;ofi_rxm", # "ofi+verbs", # "ofi+tcp", # "ofi+sockets" @@ -1153,7 +1152,7 @@ def config_generate(self, access_points, num_engines=None, scm_only=False, net_class (str): Network class preferred. Defaults to None. i.e. "ethernet"|"infiniband" net_provider (str): Network provider preferred. Defaults to None. - i.e. "ofi+tcp;ofi_rxm" etc. + i.e. "ofi+tcp" etc. use_tmpfs_scm (bool, optional): Whether to use a ramdisk instead of PMem as SCM. Defaults to False. control_metadata_path (str): External directory provided to store control diff --git a/src/tests/ftest/util/network_utils.py b/src/tests/ftest/util/network_utils.py index de940ed835c..ccad437f3a6 100644 --- a/src/tests/ftest/util/network_utils.py +++ b/src/tests/ftest/util/network_utils.py @@ -17,12 +17,12 @@ "ofi+verbs;ofi_rxm", "ucx+dc_x", "ucx+ud_x", + "ofi+tcp", "ofi+tcp;ofi_rxm", "ofi+opx" ) PROVIDER_ALIAS = { - "ofi+verbs": "ofi+verbs;ofi_rxm", - "ofi+tcp": "ofi+tcp;ofi_rxm" + "ofi+verbs": "ofi+verbs;ofi_rxm" } diff --git a/src/tests/ftest/util/server_utils_params.py b/src/tests/ftest/util/server_utils_params.py index a79c750b54d..6617b2521a2 100644 --- a/src/tests/ftest/util/server_utils_params.py +++ b/src/tests/ftest/util/server_utils_params.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -107,7 +107,7 @@ def __init__(self, filename, common_yaml): # is set for the running process. If group look up fails or user # is not member, use uid return from user lookup. 
# - default_provider = os.environ.get("CRT_PHY_ADDR_STR", "ofi+tcp;ofi_rxm") + default_provider = os.environ.get("CRT_PHY_ADDR_STR", "ofi+tcp") # All log files should be placed in the same directory on each host to # enable easy log file archiving by launch.py @@ -436,6 +436,7 @@ class EngineYamlParameters(YamlParameters): "common": [ "D_LOG_FILE_APPEND_PID=1", "COVFILE=/tmp/test.cov"], + "ofi+tcp": [], "ofi+tcp;ofi_rxm": [], "ofi+verbs": [ "FI_OFI_RXM_USE_SRX=1"], @@ -458,7 +459,7 @@ def __init__(self, base_namespace, index, provider=None, max_storage_tiers=MAX_S namespace = [os.sep] + base_namespace.split(os.sep)[1:-1] + ["engines", str(index), "*"] self._base_namespace = base_namespace self._index = index - self._provider = provider or os.environ.get("CRT_PHY_ADDR_STR", "ofi+tcp;ofi_rxm") + self._provider = provider or os.environ.get("CRT_PHY_ADDR_STR", "ofi+tcp") self._max_storage_tiers = max_storage_tiers super().__init__(os.path.join(*namespace)) diff --git a/utils/docker/vcluster/daos-server/el8/daos_server.yml.in b/utils/docker/vcluster/daos-server/el8/daos_server.yml.in index 3d6f9e3db58..982deac8670 100644 --- a/utils/docker/vcluster/daos-server/el8/daos_server.yml.in +++ b/utils/docker/vcluster/daos-server/el8/daos_server.yml.in @@ -4,7 +4,7 @@ name: daos_server access_points: ['daos-server'] port: 10001 -provider: ofi+tcp;ofi_rxm +provider: ofi+tcp socket_dir: /var/run/daos_server nr_hugepages: @DAOS_HUGEPAGES_NBR@ diff --git a/utils/test_memcheck.supp b/utils/test_memcheck.supp index d1260ecf572..e4671fef8ae 100644 --- a/utils/test_memcheck.supp +++ b/utils/test_memcheck.supp @@ -246,11 +246,23 @@ fun:hg_dlog_mkcount32 ... } +{ + Tcp provider + Memcheck:Param + sendmsg(msg.msg_iov[1]) + ... + fun:sendmsg + fun:ofi_sockapi_sendv_socket + fun:ofi_bsock_sendv + ... + fun:fi_senddata + ... +} { Tcp provider with ofi rxm Memcheck:Param sendmsg(msg.msg_iov[1]) - obj:* + ... fun:ofi_bsock_sendv ... fun:fi_tsend @@ -260,6 +272,7 @@ Tcp provider with ofi rxm 2 Memcheck:Param sendmsg(msg.msg_iov[2]) + ... fun:sendmsg fun:ofi_sockapi_sendv_socket fun:ofi_bsock_sendv From b8ce6c4c77fd311a557a25a080d61ac4a0937dc6 Mon Sep 17 00:00:00 2001 From: Mohamad Chaarawi Date: Thu, 28 Mar 2024 08:44:26 -0500 Subject: [PATCH 10/10] DAOS-15548 test: add new valgrind suppression for daos tool (#14081) Signed-off-by: Mohamad Chaarawi --- src/cart/utils/memcheck-cart.supp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/cart/utils/memcheck-cart.supp b/src/cart/utils/memcheck-cart.supp index 1a616b99ce4..82a5251490c 100644 --- a/src/cart/utils/memcheck-cart.supp +++ b/src/cart/utils/memcheck-cart.supp @@ -567,3 +567,8 @@ write(buf) fun:runtime/internal/syscall.Syscall6 } +{ + DAOS-15548 + Memcheck:Addr1 + fun:racecallatomic +}
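
A minimal sketch of the provider lookup behaviour after this series, assuming a hypothetical resolve_provider() helper (not a function from the DAOS tree): src/tests/ftest/util/network_utils.py now lists "ofi+tcp" as a supported provider in its own right and only keeps an alias for "ofi+verbs", so plain "ofi+tcp" is no longer rewritten to "ofi+tcp;ofi_rxm".

# Minimal sketch: SUPPORTED_PROVIDERS and PROVIDER_ALIAS mirror the values in
# src/tests/ftest/util/network_utils.py after PATCH 09; resolve_provider() is
# a hypothetical helper used only to illustrate the lookup.
SUPPORTED_PROVIDERS = (
    "ofi+verbs;ofi_rxm",
    "ucx+dc_x",
    "ucx+ud_x",
    "ofi+tcp",
    "ofi+tcp;ofi_rxm",
    "ofi+opx",
)

PROVIDER_ALIAS = {"ofi+verbs": "ofi+verbs;ofi_rxm"}


def resolve_provider(name):
    """Map a requested provider name to the provider string actually used."""
    provider = PROVIDER_ALIAS.get(name, name)
    if provider not in SUPPORTED_PROVIDERS:
        raise ValueError("unsupported provider: {}".format(name))
    return provider


assert resolve_provider("ofi+tcp") == "ofi+tcp"              # no rxm rewrite any more
assert resolve_provider("ofi+verbs") == "ofi+verbs;ofi_rxm"  # alias still applied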