From 6087823aa9a95ef4121940c968b9ae3644852c60 Mon Sep 17 00:00:00 2001 From: Liu Xuezhao Date: Fri, 28 Jun 2024 15:26:37 +0800 Subject: [PATCH] b/349170185 Backport fixes DAOS-16039 object: fix EC aggregation wrong peer address (#14593) DAOS-16009 rebuild: fix O_TRUNC file size related handling DAOS-15056 rebuild: add rpt to the rgt list properly (#13862) DAOS-15517 rebuild: refine lock handling for rpt list (#14064) DAOS-13812 container: fix destroy vs lookup (#12757) DAOS-15627 dtx: redunce stack usage for DTX resync to avoid overflow (#14189) DAOS-14845 rebuild: do not wait for EC agg for reclaim (#13610) Signed-off-by: Xuezhao Liu Signed-off-by: Mohamad Chaarawi Signed-off-by: Jeff Olivier Signed-off-by: Wang, Di Signed-off-by: Di Wang Signed-off-by: Wang Shilong Signed-off-by: Fan Yong --- src/client/array/dc_array.c | 99 +++++++++++++++-------- src/common/pool_map.c | 7 +- src/container/srv_target.c | 65 ++++++++++----- src/dtx/dtx_common.c | 26 ++++-- src/dtx/dtx_internal.h | 1 - src/dtx/dtx_resync.c | 29 ++++--- src/include/daos/common.h | 1 + src/include/daos_srv/container.h | 7 +- src/include/daos_srv/dtx_srv.h | 4 +- src/object/srv_ec_aggregate.c | 132 ++++++++++++++++++++----------- src/object/srv_obj.c | 3 + src/object/srv_obj_migrate.c | 8 +- src/rebuild/rebuild_internal.h | 11 ++- src/rebuild/scan.c | 68 +++++++++++----- src/rebuild/srv.c | 99 +++++++++++++++++++---- src/tests/suite/daos_obj_ec.c | 108 +++++++++++++++++++++---- 16 files changed, 493 insertions(+), 175 deletions(-) diff --git a/src/client/array/dc_array.c b/src/client/array/dc_array.c index 45b8947b04a..17f2daa80ef 100644 --- a/src/client/array/dc_array.c +++ b/src/client/array/dc_array.c @@ -2069,18 +2069,24 @@ free_set_size_cb(tse_task_t *task, void *data) } static int -punch_extent(daos_handle_t oh, daos_handle_t th, daos_size_t dkey_val, daos_off_t record_i, - daos_size_t num_records, tse_task_t *task, d_list_t *task_list) +punch_dkey_or_extent(daos_handle_t oh, daos_handle_t th, daos_size_t dkey_val, daos_off_t start, + daos_size_t num_records, bool punch_dkey, tse_task_t *task, + d_list_t *task_list) { daos_obj_update_t *io_arg; + daos_obj_punch_t *dkey_punch_arg; daos_iod_t *iod; d_sg_list_t *sgl; daos_key_t *dkey; struct io_params *params = NULL; tse_task_t *io_task = NULL; - int rc; + int rc; - D_DEBUG(DB_IO, "Punching (%zu, %zu) in Key %zu\n", record_i + 1, num_records, dkey_val); + if (punch_dkey) + D_DEBUG(DB_IO, "Punching dkey %zu\n", dkey_val); + else + D_DEBUG(DB_IO, "Punching (%zu, %zu) in Key %zu\n", + start, num_records, dkey_val); D_ALLOC_PTR(params); if (params == NULL) @@ -2094,28 +2100,42 @@ punch_extent(daos_handle_t oh, daos_handle_t th, daos_size_t dkey_val, daos_off_ dkey = ¶ms->dkey; d_iov_set(dkey, ¶ms->dkey_val, sizeof(uint64_t)); - /* set descriptor for KV object */ - d_iov_set(&iod->iod_name, ¶ms->akey_val, 1); - iod->iod_nr = 1; - iod->iod_size = 0; /* 0 to punch */ - iod->iod_type = DAOS_IOD_ARRAY; - D_ALLOC_PTR(iod->iod_recxs); - if (iod->iod_recxs == NULL) - D_GOTO(free, rc = -DER_NOMEM); - iod->iod_recxs[0].rx_idx = record_i + 1; - iod->iod_recxs[0].rx_nr = num_records; - - rc = daos_task_create(DAOS_OPC_OBJ_UPDATE, tse_task2sched(task), 0, NULL, &io_task); - if (rc) - D_GOTO(free_reqs, rc); + if (punch_dkey) { + rc = daos_task_create(DAOS_OPC_OBJ_PUNCH_DKEYS, tse_task2sched(task), 0, NULL, + &io_task); + if (rc) + D_GOTO(free_reqs, rc); + + dkey_punch_arg = daos_task_get_args(io_task); + dkey_punch_arg->oh = oh; + dkey_punch_arg->th = th; + dkey_punch_arg->dkey = dkey; + 
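/*
 * The truncate path above works in terms of (dkey, record index) pairs. A
 * minimal sketch of the offset-to-location mapping it relies on, assuming the
 * simple chunked layout where dkey 0 holds array metadata and data chunks
 * start at dkey 1 (hypothetical helper, not the dc_array internals):
 */
#include <stdint.h>
#include <stdio.h>

struct array_loc {
	uint64_t dkey_val; /* chunk number, starting at 1 */
	uint64_t record_i; /* record offset inside that chunk */
};

static struct array_loc
array_offset_to_loc(uint64_t offset, uint64_t chunk_size)
{
	struct array_loc loc;

	loc.dkey_val = offset / chunk_size + 1; /* dkey 0 is reserved for metadata */
	loc.record_i = offset % chunk_size;
	return loc;
}

int main(void)
{
	struct array_loc loc = array_offset_to_loc(10000, 4096);

	/* truncate to 10000 records: punch everything above record_i in this
	 * dkey, then punch every dkey greater than loc.dkey_val */
	printf("dkey %llu record %llu\n", (unsigned long long)loc.dkey_val,
	       (unsigned long long)loc.record_i);
	return 0;
}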
dkey_punch_arg->akeys = NULL; + dkey_punch_arg->akey_nr = 0; + } else { + /* set descriptor for KV object */ + d_iov_set(&iod->iod_name, ¶ms->akey_val, 1); + iod->iod_nr = 1; + iod->iod_size = 0; /* 0 to punch */ + iod->iod_type = DAOS_IOD_ARRAY; + D_ALLOC_PTR(iod->iod_recxs); + if (iod->iod_recxs == NULL) + D_GOTO(free, rc = -DER_NOMEM); + iod->iod_recxs[0].rx_idx = start; + iod->iod_recxs[0].rx_nr = num_records; + + rc = daos_task_create(DAOS_OPC_OBJ_UPDATE, tse_task2sched(task), 0, NULL, &io_task); + if (rc) + D_GOTO(free_reqs, rc); - io_arg = daos_task_get_args(io_task); - io_arg->oh = oh; - io_arg->th = th; - io_arg->dkey = dkey; - io_arg->nr = 1; - io_arg->iods = iod; - io_arg->sgls = sgl; + io_arg = daos_task_get_args(io_task); + io_arg->oh = oh; + io_arg->th = th; + io_arg->dkey = dkey; + io_arg->nr = 1; + io_arg->iods = iod; + io_arg->sgls = sgl; + } rc = tse_task_register_comp_cb(io_task, free_io_params_cb, ¶ms, sizeof(params)); if (rc) @@ -2422,18 +2442,26 @@ adjust_array_size_cb(tse_task_t *task, void *data) memcpy(&dkey_val, ptr, args->kds[i].kd_key_len); ptr += args->kds[i].kd_key_len; + /* + * Either punch the entire dkey or an extent in that dkey depending on the offset + * where we are truncating to. The first dkey of the array (dkey 1) will always be + * an extent punch to maintain an epoch there. + */ if (props->size == 0 || dkey_val > props->dkey_val) { /** Do nothing for DKEY 0 (metadata) */ if (dkey_val == 0) continue; - /* - * The dkey is higher than the adjustded size so we could punch it here. - * But it's better to punch the extent so that the max_write for the object - * doesn't get lost by aggregation. - */ - D_DEBUG(DB_IO, "Punch full extent in key "DF_U64"\n", dkey_val); - rc = punch_extent(args->oh, args->th, dkey_val, (daos_off_t)-1, - props->chunk_size, props->ptask, &task_list); + if (dkey_val == 1) { + D_DEBUG(DB_IO, "Punch full extent in key " DF_U64 "\n", dkey_val); + rc = punch_dkey_or_extent(args->oh, args->th, dkey_val, + 0, props->chunk_size, false, + props->ptask, &task_list); + } else { + D_DEBUG(DB_IO, "Punch dkey " DF_U64 "\n", dkey_val); + rc = punch_dkey_or_extent(args->oh, args->th, dkey_val, + 0, props->chunk_size, true, + props->ptask, &task_list); + } if (rc) goto out; } else if (dkey_val == props->dkey_val && props->record_i) { @@ -2444,8 +2472,9 @@ adjust_array_size_cb(tse_task_t *task, void *data) props->chunk_size); /** Punch all records above record_i */ D_DEBUG(DB_IO, "Punch extent in key "DF_U64"\n", dkey_val); - rc = punch_extent(args->oh, args->th, dkey_val, props->record_i, - props->num_records, props->ptask, &task_list); + rc = punch_dkey_or_extent(args->oh, args->th, dkey_val, + props->record_i + 1, props->num_records, + false, props->ptask, &task_list); if (rc) goto out; } diff --git a/src/common/pool_map.c b/src/common/pool_map.c index 7dd83b9603f..2d478778d4a 100644 --- a/src/common/pool_map.c +++ b/src/common/pool_map.c @@ -1650,11 +1650,12 @@ gen_pool_buf(struct pool_map *map, struct pool_buf **map_buf_out, int map_versio map_comp.co_flags = PO_COMPF_NONE; map_comp.co_nr = 1; - D_DEBUG(DB_TRACE, "adding target: type=0x%hhx, status=%hhu, idx=%d, " + D_DEBUG(DB_TRACE, "adding target: type=0x%hhx, status=%hhu, idx=%d, id=%d, " "rank=%d, ver=%d, in_ver=%d, fseq=%u, flags=0x%x, nr=%u\n", map_comp.co_type, map_comp.co_status, map_comp.co_index, - map_comp.co_rank, map_comp.co_ver, map_comp.co_in_ver, - map_comp.co_fseq, map_comp.co_flags, map_comp.co_nr); + map_comp.co_id, map_comp.co_rank, map_comp.co_ver, + map_comp.co_in_ver, 
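/*
 * adjust_array_size_cb() above picks between punching a whole dkey and
 * punching an extent inside it. A condensed sketch of just that decision,
 * with hypothetical names; the real callback also builds and schedules the
 * punch tasks:
 */
#include <stdint.h>

enum punch_action { PUNCH_NONE, PUNCH_EXTENT, PUNCH_DKEY };

static enum punch_action
truncate_action(uint64_t dkey_val, uint64_t trunc_dkey, uint64_t trunc_record_i,
		uint64_t new_size)
{
	if (dkey_val == 0) /* metadata dkey, leave it alone */
		return PUNCH_NONE;
	if (new_size == 0 || dkey_val > trunc_dkey)
		/* dkey 1 keeps an extent punch so an epoch (and max_write)
		 * survives aggregation; higher dkeys are punched wholesale */
		return dkey_val == 1 ? PUNCH_EXTENT : PUNCH_DKEY;
	if (dkey_val == trunc_dkey && trunc_record_i != 0)
		return PUNCH_EXTENT; /* punch the records above record_i */
	return PUNCH_NONE;
}

int main(void)
{
	/* truncating into dkey 3: dkey 5 is dropped wholesale */
	return truncate_action(5, 3, 100, 4096) == PUNCH_DKEY ? 0 : 1;
}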
map_comp.co_fseq, map_comp.co_flags, + map_comp.co_nr); rc = pool_buf_attach(map_buf, &map_comp, 1); if (rc != 0) diff --git a/src/container/srv_target.c b/src/container/srv_target.c index 3961f41dcaf..d2c902a2265 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -319,6 +319,7 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb, if (unlikely(DAOS_FAIL_CHECK(DAOS_FORCE_EC_AGG) || DAOS_FAIL_CHECK(DAOS_FORCE_EC_AGG_FAIL) || + DAOS_FAIL_CHECK(DAOS_OBJ_EC_AGG_LEADER_DIFF) || DAOS_FAIL_CHECK(DAOS_FORCE_EC_AGG_PEER_FAIL))) interval = 0; else @@ -627,13 +628,18 @@ cont_child_alloc_ref(void *co_uuid, unsigned int ksize, void *po_uuid, rc = ABT_cond_create(&cont->sc_scrub_cond); if (rc != ABT_SUCCESS) { rc = dss_abterr2der(rc); - goto out_mutex; + goto out_resync_cond; + } + rc = ABT_cond_create(&cont->sc_rebuild_cond); + if (rc != ABT_SUCCESS) { + rc = dss_abterr2der(rc); + goto out_scrub_cond; } cont->sc_pool = ds_pool_child_lookup(po_uuid); if (cont->sc_pool == NULL) { rc = -DER_NO_HDL; - goto out_cond; + goto out_rebuild_cond; } rc = vos_cont_open(cont->sc_pool->spc_hdl, co_uuid, &cont->sc_hdl); @@ -659,7 +665,11 @@ cont_child_alloc_ref(void *co_uuid, unsigned int ksize, void *po_uuid, out_pool: ds_pool_child_put(cont->sc_pool); -out_cond: +out_rebuild_cond: + ABT_cond_free(&cont->sc_rebuild_cond); +out_scrub_cond: + ABT_cond_free(&cont->sc_scrub_cond); +out_resync_cond: ABT_cond_free(&cont->sc_dtx_resync_cond); out_mutex: ABT_mutex_free(&cont->sc_mutex); @@ -686,6 +696,7 @@ cont_child_free_ref(struct daos_llink *llink) D_FREE(cont->sc_snapshots); ABT_cond_free(&cont->sc_dtx_resync_cond); ABT_cond_free(&cont->sc_scrub_cond); + ABT_cond_free(&cont->sc_rebuild_cond); ABT_mutex_free(&cont->sc_mutex); D_FREE(cont); } @@ -740,6 +751,12 @@ ds_cont_child_cache_destroy(struct daos_lru_cache *cache) daos_lru_cache_destroy(cache); } +static void +cont_child_put(struct daos_lru_cache *cache, struct ds_cont_child *cont) +{ + daos_lru_ref_release(cache, &cont->sc_list); +} + /* * If create == false, then this is assumed to be a pure lookup. In this case, * -DER_NONEXIST is returned if the ds_cont_child object does not exist. @@ -774,12 +791,6 @@ cont_child_lookup(struct daos_lru_cache *cache, const uuid_t co_uuid, return 0; } -static void -cont_child_put(struct daos_lru_cache *cache, struct ds_cont_child *cont) -{ - daos_lru_ref_release(cache, &cont->sc_list); -} - static inline bool cont_child_started(struct ds_cont_child *cont_child) { @@ -805,13 +816,17 @@ cont_child_stop(struct ds_cont_child *cont_child) /* Some ds_cont_child will only created by ds_cont_child_lookup(). * never be started at all */ + cont_child->sc_stopping = 1; + + /* Stop DTX reindex by force. */ + stop_dtx_reindex_ult(cont_child, true); + if (cont_child_started(cont_child)) { D_DEBUG(DB_MD, DF_CONT"[%d]: Stopping container\n", DP_CONT(cont_child->sc_pool->spc_uuid, cont_child->sc_uuid), dss_get_module_info()->dmi_tgt_id); - cont_child->sc_stopping = 1; d_list_del_init(&cont_child->sc_link); dtx_cont_deregister(cont_child); @@ -1164,6 +1179,7 @@ cont_child_destroy_one(void *vin) &cont); if (rc == -DER_NONEXIST) break; + if (rc != 0) D_GOTO(out_pool, rc); @@ -1187,10 +1203,6 @@ cont_child_destroy_one(void *vin) ABT_cond_wait(cont->sc_dtx_resync_cond, cont->sc_mutex); ABT_mutex_unlock(cont->sc_mutex); - /* Give chance to DTX reindex ULT for exit. 
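/*
 * cont_child_alloc_ref() above adds a third condition variable and grows the
 * goto-unwind ladder to match. A generic sketch of that error-handling
 * pattern, with pthreads standing in for the Argobots primitives:
 */
#include <pthread.h>
#include <stdio.h>

struct child {
	pthread_mutex_t mutex;
	pthread_cond_t  resync_cond;
	pthread_cond_t  scrub_cond;
	pthread_cond_t  rebuild_cond;
};

static int
child_init(struct child *c)
{
	int rc;

	rc = pthread_mutex_init(&c->mutex, NULL);
	if (rc != 0)
		return rc;
	rc = pthread_cond_init(&c->resync_cond, NULL);
	if (rc != 0)
		goto out_mutex;
	rc = pthread_cond_init(&c->scrub_cond, NULL);
	if (rc != 0)
		goto out_resync_cond;
	rc = pthread_cond_init(&c->rebuild_cond, NULL);
	if (rc != 0)
		goto out_scrub_cond;
	return 0; /* every later failure must unwind in reverse order */

out_scrub_cond:
	pthread_cond_destroy(&c->scrub_cond);
out_resync_cond:
	pthread_cond_destroy(&c->resync_cond);
out_mutex:
	pthread_mutex_destroy(&c->mutex);
	return rc;
}

int main(void)
{
	struct child c;

	printf("init rc=%d\n", child_init(&c));
	return 0;
}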
*/ - if (unlikely(cont->sc_dtx_reindex)) - ABT_thread_yield(); - /* Make sure checksum scrubbing has stopped */ ABT_mutex_lock(cont->sc_mutex); if (cont->sc_scrubbing) { @@ -1199,6 +1211,12 @@ cont_child_destroy_one(void *vin) } ABT_mutex_unlock(cont->sc_mutex); + /* Make sure rebuild has stopped */ + ABT_mutex_lock(cont->sc_mutex); + if (cont->sc_rebuilding) + ABT_cond_wait(cont->sc_rebuild_cond, cont->sc_mutex); + ABT_mutex_unlock(cont->sc_mutex); + retry_cnt++; if (retry_cnt > 1) { D_ERROR("container is still in-use: open %u, resync %s, reindex %s\n", @@ -1300,9 +1318,20 @@ ds_cont_child_lookup(uuid_t pool_uuid, uuid_t cont_uuid, struct ds_cont_child **ds_cont) { struct dsm_tls *tls = dsm_tls_get(); + int rc; + + rc = cont_child_lookup(tls->dt_cont_cache, cont_uuid, pool_uuid, + true /* create */, ds_cont); + if (rc != 0) + return rc; - return cont_child_lookup(tls->dt_cont_cache, cont_uuid, pool_uuid, - true /* create */, ds_cont); + if ((*ds_cont)->sc_stopping) { + cont_child_put(tls->dt_cont_cache, *ds_cont); + *ds_cont = NULL; + return -DER_SHUTDOWN; + } + + return 0; } /** @@ -1572,7 +1601,7 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid, DF_UUID": %d\n", DP_UUID(cont_uuid), hdl->sch_cont->sc_open); hdl->sch_cont->sc_open--; - dtx_cont_close(hdl->sch_cont); + dtx_cont_close(hdl->sch_cont, true); err_cont: if (daos_handle_is_valid(poh)) { @@ -1694,7 +1723,7 @@ cont_close_hdl(uuid_t cont_hdl_uuid) D_ASSERT(cont_child->sc_open > 0); cont_child->sc_open--; if (cont_child->sc_open == 0) - dtx_cont_close(cont_child); + dtx_cont_close(cont_child, false); D_DEBUG(DB_MD, DF_CONT": closed (%d): hdl="DF_UUID"\n", DP_CONT(cont_child->sc_pool->spc_uuid, cont_child->sc_uuid), diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index ee119d4b965..c6ce2f38206 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -1635,6 +1635,10 @@ start_dtx_reindex_ult(struct ds_cont_child *cont) while (cont->sc_dtx_reindex_abort) ABT_thread_yield(); + if (cont->sc_stopping) + return -DER_SHUTDOWN; + + cont->sc_dtx_delay_reset = 0; if (cont->sc_dtx_reindex) return 0; @@ -1652,7 +1656,7 @@ start_dtx_reindex_ult(struct ds_cont_child *cont) } void -stop_dtx_reindex_ult(struct ds_cont_child *cont) +stop_dtx_reindex_ult(struct ds_cont_child *cont, bool force) { /* DTX reindex has been done or not has not been started. */ if (!cont->sc_dtx_reindex) @@ -1662,9 +1666,15 @@ stop_dtx_reindex_ult(struct ds_cont_child *cont) if (dtx_cont_opened(cont)) return; - /* Do not stop DTX reindex if DTX resync is still in-progress. */ - if (cont->sc_dtx_resyncing) + /* + * For non-force case, do not stop DTX re-index if DTX resync + * is in-progress. Related DTX resource will be released after + * DTX resync globally done (via rebuild scanning). 
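/*
 * Container destroy above now also waits for rebuild to leave the container:
 * the waiter sleeps on a condition variable while a busy flag is set, and the
 * rebuild scanner clears the flag and broadcasts when it is done. A minimal
 * pthread analogue of that handshake (hypothetical names):
 */
#include <pthread.h>
#include <stdbool.h>

struct busy_gate {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	bool            busy;
};

static struct busy_gate gate = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.cond = PTHREAD_COND_INITIALIZER,
	.busy = true,
};

static void
gate_wait_idle(struct busy_gate *g) /* destroy side */
{
	pthread_mutex_lock(&g->lock);
	while (g->busy) /* re-check after every wakeup */
		pthread_cond_wait(&g->cond, &g->lock);
	pthread_mutex_unlock(&g->lock);
}

static void
gate_set_idle(struct busy_gate *g) /* rebuild side, once the scan is done */
{
	pthread_mutex_lock(&g->lock);
	g->busy = false;
	pthread_cond_broadcast(&g->cond); /* wake every waiting destroyer */
	pthread_mutex_unlock(&g->lock);
}

int main(void)
{
	gate_set_idle(&gate);
	gate_wait_idle(&gate); /* returns immediately: no longer busy */
	return 0;
}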
+ */ + if (unlikely(cont->sc_dtx_resyncing && !force)) { + cont->sc_dtx_delay_reset = 1; return; + } cont->sc_dtx_reindex_abort = 1; @@ -1822,7 +1832,7 @@ dtx_cont_open(struct ds_cont_child *cont) } void -dtx_cont_close(struct ds_cont_child *cont) +dtx_cont_close(struct ds_cont_child *cont, bool force) { struct dss_module_info *dmi = dss_get_module_info(); struct dtx_batched_pool_args *dbpa; @@ -1837,7 +1847,7 @@ dtx_cont_close(struct ds_cont_child *cont) d_list_for_each_entry(dbca, &dbpa->dbpa_cont_list, dbca_pool_link) { if (dbca->dbca_cont == cont) { - stop_dtx_reindex_ult(cont); + stop_dtx_reindex_ult(cont, force); d_list_del(&dbca->dbca_sys_link); d_list_add_tail(&dbca->dbca_sys_link, &dmi->dmi_dtx_batched_cont_close_list); @@ -1845,8 +1855,12 @@ dtx_cont_close(struct ds_cont_child *cont) /* If nobody reopen the container during dtx_flush_on_close, * then reset DTX table in VOS to release related resources. + * + * For non-force case, do not reset DTX table if DTX resync + * is in-progress to avoid redoing DTX re-index. We will do + * that after DTX resync done globally. */ - if (!dtx_cont_opened(cont)) + if (likely(!dtx_cont_opened(cont) && cont->sc_dtx_delay_reset == 0)) vos_dtx_cache_reset(cont->sc_hdl, false); return; } diff --git a/src/dtx/dtx_internal.h b/src/dtx/dtx_internal.h index 70509e25a5c..04f7c3f617e 100644 --- a/src/dtx/dtx_internal.h +++ b/src/dtx/dtx_internal.h @@ -195,7 +195,6 @@ int dtx_handle_reinit(struct dtx_handle *dth); void dtx_batched_commit(void *arg); void dtx_aggregation_main(void *arg); int start_dtx_reindex_ult(struct ds_cont_child *cont); -void stop_dtx_reindex_ult(struct ds_cont_child *cont); /* dtx_cos.c */ int dtx_fetch_committable(struct ds_cont_child *cont, uint32_t max_cnt, diff --git a/src/dtx/dtx_resync.c b/src/dtx/dtx_resync.c index 3a598a7da2e..82e72c81586 100644 --- a/src/dtx/dtx_resync.c +++ b/src/dtx/dtx_resync.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2019-2023 Intel Corporation. + * (C) Copyright 2019-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -704,9 +704,6 @@ dtx_resync(daos_handle_t po_hdl, uuid_t po_uuid, uuid_t co_uuid, uint32_t ver, b ABT_mutex_unlock(cont->sc_mutex); out: - if (!dtx_cont_opened(cont)) - stop_dtx_reindex_ult(cont); - D_DEBUG(DB_MD, "Exit DTX resync (%s) for "DF_UUID"/"DF_UUID" with ver %u, rc = %d\n", block ? 
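/*
 * stop_dtx_reindex_ult()/dtx_cont_close() above defer the DTX table reset
 * while resync is still running by latching a delay flag that is honoured
 * later, once resync has finished globally. A tiny sketch of that deferred
 * cleanup pattern (hypothetical names, prints standing in for the real work):
 */
#include <stdbool.h>
#include <stdio.h>

struct cont_state {
	bool resyncing;   /* blocker: DTX resync still in progress       */
	bool delay_reset; /* cleanup owed, to run once resync has ended  */
};

static void
cont_close(struct cont_state *c, bool force)
{
	if (c->resyncing && !force) {
		c->delay_reset = true; /* remember the work instead of doing it */
		return;
	}
	c->delay_reset = false;
	printf("reset DTX cache now\n");
}

static void
rebuild_scan(struct cont_state *c)
{
	if (c->delay_reset) { /* resync is globally done by the time we scan */
		c->delay_reset = false;
		printf("reset DTX cache (deferred)\n");
	}
}

int main(void)
{
	struct cont_state c = { .resyncing = true };

	cont_close(&c, false);
	c.resyncing = false;
	rebuild_scan(&c);
	return 0;
}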
"block" : "non-block", DP_UUID(po_uuid), DP_UUID(co_uuid), ver, rc); @@ -751,8 +748,8 @@ dtx_resync_one(void *data) { struct dtx_scan_args *arg = data; struct ds_pool_child *child; - vos_iter_param_t param = { 0 }; - struct vos_iter_anchors anchor = { 0 }; + vos_iter_param_t *param = NULL; + struct vos_iter_anchors *anchor = NULL; struct dtx_container_scan_arg cb_arg = { 0 }; int rc; @@ -760,14 +757,26 @@ dtx_resync_one(void *data) if (child == NULL) D_GOTO(out, rc = -DER_NONEXIST); + D_ALLOC_PTR(param); + if (param == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + D_ALLOC_PTR(anchor); + if (anchor == NULL) + D_GOTO(out, rc = -DER_NOMEM); + cb_arg.arg = *arg; - param.ip_hdl = child->spc_hdl; - param.ip_flags = VOS_IT_FOR_MIGRATION; - rc = vos_iterate(¶m, VOS_ITER_COUUID, false, &anchor, + param->ip_hdl = child->spc_hdl; + param->ip_flags = VOS_IT_FOR_MIGRATION; + rc = vos_iterate(param, VOS_ITER_COUUID, false, anchor, container_scan_cb, NULL, &cb_arg, NULL); - ds_pool_child_put(child); out: + D_FREE(param); + D_FREE(anchor); + if (child != NULL) + ds_pool_child_put(child); + D_DEBUG(DB_TRACE, DF_UUID" iterate pool done: rc %d\n", DP_UUID(arg->pool_uuid), rc); diff --git a/src/include/daos/common.h b/src/include/daos/common.h index c46ce5680b8..6a68642d7fc 100644 --- a/src/include/daos/common.h +++ b/src/include/daos/common.h @@ -829,6 +829,7 @@ enum { #define DAOS_SHARD_OBJ_RW_DROP_REPLY (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x80) #define DAOS_OBJ_FETCH_DATA_LOST (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x81) #define DAOS_OBJ_TRY_SPECIAL_SHARD (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x82) +#define DAOS_OBJ_EC_AGG_LEADER_DIFF (DAOS_FAIL_SYS_TEST_GROUP_LOC | 0x83) #define DAOS_VOS_AGG_RANDOM_YIELD (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x90) #define DAOS_VOS_AGG_MW_THRESH (DAOS_FAIL_UNIT_TEST_GROUP_LOC | 0x91) diff --git a/src/include/daos_srv/container.h b/src/include/daos_srv/container.h index 10feb7e07e3..86310f0bbd7 100644 --- a/src/include/daos_srv/container.h +++ b/src/include/daos_srv/container.h @@ -1,5 +1,5 @@ /* - * (C) Copyright 2015-2023 Intel Corporation. + * (C) Copyright 2015-2024 Intel Corporation. 
* * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -64,15 +64,18 @@ struct ds_cont_child { ABT_mutex sc_mutex; ABT_cond sc_dtx_resync_cond; ABT_cond sc_scrub_cond; + ABT_cond sc_rebuild_cond; uint32_t sc_dtx_resyncing:1, sc_dtx_reindex:1, sc_dtx_reindex_abort:1, + sc_dtx_delay_reset:1, sc_dtx_registered:1, sc_props_fetched:1, sc_stopping:1, sc_vos_agg_active:1, sc_ec_agg_active:1, - sc_scrubbing:1; + sc_scrubbing:1, + sc_rebuilding:1; uint32_t sc_dtx_batched_gen; /* Tracks the schedule request for aggregation ULT */ struct sched_request *sc_agg_req; diff --git a/src/include/daos_srv/dtx_srv.h b/src/include/daos_srv/dtx_srv.h index 7e3b1a67946..cee99e4fbd6 100644 --- a/src/include/daos_srv/dtx_srv.h +++ b/src/include/daos_srv/dtx_srv.h @@ -252,12 +252,14 @@ dtx_leader_exec_ops(struct dtx_leader_handle *dlh, dtx_sub_func_t func, int dtx_cont_open(struct ds_cont_child *cont); -void dtx_cont_close(struct ds_cont_child *cont); +void dtx_cont_close(struct ds_cont_child *cont, bool force); int dtx_cont_register(struct ds_cont_child *cont); void dtx_cont_deregister(struct ds_cont_child *cont); +void stop_dtx_reindex_ult(struct ds_cont_child *cont, bool force); + int dtx_obj_sync(struct ds_cont_child *cont, daos_unit_oid_t *oid, daos_epoch_t epoch); diff --git a/src/object/srv_ec_aggregate.c b/src/object/srv_ec_aggregate.c index c73def48a8d..38fd8c0a496 100644 --- a/src/object/srv_ec_aggregate.c +++ b/src/object/srv_ec_aggregate.c @@ -115,7 +115,8 @@ struct ec_agg_entry { struct pl_obj_layout *ae_obj_layout; struct daos_shard_loc ae_peer_pshards[OBJ_EC_MAX_P]; uint32_t ae_grp_idx; - uint32_t ae_is_leader:1; + uint32_t ae_is_leader:1, + ae_process_partial:1; }; /* Parameters used to drive iterate all. @@ -148,6 +149,7 @@ struct ec_agg_stripe_ud { d_iov_t asu_csum_iov; struct dcs_iod_csums *asu_iod_csums; /* iod csums */ ABT_eventual asu_eventual; /* Eventual for offload */ + bool asu_valid_hole; /* with hole epoch >= parity epoch */ }; /* Represents an replicated data extent. @@ -500,26 +502,28 @@ agg_count_cells(uint8_t *fcbit_map, uint8_t *tbit_map, uint64_t estart, * initialized and share to other servers at higher(pool/container) layer. 
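/*
 * The dtx_resync_one() hunk above (DAOS-15627) moves two sizeable VOS
 * iterator structures off the ULT stack onto the heap. A generic sketch of
 * that refactor, with libc calloc/free standing in for D_ALLOC_PTR/D_FREE and
 * made-up structure sizes:
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct iter_param   { char opaque[512]; };  /* stand-ins for large structs */
struct iter_anchors { char opaque[1536]; };

static int
scan_one(void)
{
	struct iter_param   *param = NULL;
	struct iter_anchors *anchor = NULL;
	int                  rc = 0;

	/* ~2 KiB that used to live on a small ULT stack now comes from the heap */
	param = calloc(1, sizeof(*param));
	anchor = calloc(1, sizeof(*anchor));
	if (param == NULL || anchor == NULL) {
		rc = -ENOMEM;
		goto out;
	}

	/* ... run the container iteration with param/anchor ... */

out:
	free(param);
	free(anchor);
	return rc;
}

int main(void)
{
	printf("rc=%d\n", scan_one());
	return 0;
}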
*/ static int -agg_get_obj_handle(struct ec_agg_entry *entry) +agg_get_obj_handle(struct ec_agg_entry *entry, bool reset_peer) { struct ec_agg_param *agg_param; uint32_t grp_start; uint32_t tgt_ec_idx; struct dc_object *obj; int i; - int rc; + int rc = 0; - if (daos_handle_is_valid(entry->ae_obj_hdl)) + if (daos_handle_is_valid(entry->ae_obj_hdl) && !reset_peer) return 0; agg_param = container_of(entry, struct ec_agg_param, ap_agg_entry); - rc = dsc_obj_open(agg_param->ap_pool_info.api_cont_hdl, - entry->ae_oid.id_pub, DAOS_OO_RW, - &entry->ae_obj_hdl); - if (rc) - goto out; + if (!daos_handle_is_valid(entry->ae_obj_hdl)) { + rc = dsc_obj_open(agg_param->ap_pool_info.api_cont_hdl, + entry->ae_oid.id_pub, DAOS_OO_RW, + &entry->ae_obj_hdl); + if (rc) + goto out; + } - if (entry->ae_peer_pshards[0].sd_rank != DAOS_TGT_IGNORE) + if (!reset_peer && entry->ae_peer_pshards[0].sd_rank != DAOS_TGT_IGNORE) D_GOTO(out, rc = 0); grp_start = entry->ae_grp_idx * entry->ae_obj_layout->ol_grp_size; @@ -599,7 +603,7 @@ agg_fetch_odata_cells(struct ec_agg_entry *entry, uint8_t *bit_map, for (i = 0; i < cell_cnt; i++) d_iov_set(&sgl.sg_iovs[i], &buf[i * cell_b], cell_b); - rc = agg_get_obj_handle(entry); + rc = agg_get_obj_handle(entry, false); if (rc) { D_ERROR("Failed to open object: "DF_RC"\n", DP_RC(rc)); goto out; @@ -1180,7 +1184,13 @@ agg_process_partial_stripe(struct ec_agg_entry *entry) estart = entry->ae_cur_stripe.as_offset; d_list_for_each_entry(extent, &entry->ae_cur_stripe.as_dextents, ae_link) { - D_ASSERT(!extent->ae_hole); + if (extent->ae_hole) { + /* valid hole processed by agg_process_holes_ult() */ + D_ASSERTF(extent->ae_epoch < entry->ae_par_extent.ape_epoch, + "hole ext epoch "DF_X64", parity epoch "DF_X64"\n", + extent->ae_epoch, entry->ae_par_extent.ape_epoch); + continue; + } if (extent->ae_epoch <= entry->ae_par_extent.ape_epoch) { has_old_replicas = true; continue; @@ -1311,9 +1321,9 @@ agg_peer_update_ult(void *arg) rc = obj_req_create(dss_get_module_info()->dmi_ctx, &tgt_ep, DAOS_OBJ_RPC_EC_AGGREGATE, &rpc); if (rc) { - D_ERROR(DF_UOID" pidx %d to peer %d, obj_req_create " + D_ERROR(DF_UOID" pidx %d to peer %d, rank %d tag %d obj_req_create " DF_RC"\n", DP_UOID(entry->ae_oid), pidx, peer, - DP_RC(rc)); + tgt_ep.ep_rank, tgt_ep.ep_tag, DP_RC(rc)); goto out; } ec_agg_in = crt_req_get(rpc); @@ -1448,7 +1458,7 @@ agg_peer_update(struct ec_agg_entry *entry, bool write_parity) return -1; } - rc = agg_get_obj_handle(entry); + rc = agg_get_obj_handle(entry, false); if (rc) { D_ERROR("Failed to open object: "DF_RC"\n", DP_RC(rc)); return rc; @@ -1536,8 +1546,8 @@ agg_process_holes_ult(void *arg) uint32_t pidx = ec_age2pidx(entry); uint32_t peer; int i, rc = 0; - bool valid_hole = false; + stripe_ud->asu_valid_hole = false; /* Process extent list to find what to re-replicate -- build recx array */ d_list_for_each_entry(agg_extent, @@ -1545,7 +1555,7 @@ agg_process_holes_ult(void *arg) if (agg_extent->ae_epoch < entry->ae_par_extent.ape_epoch) continue; if (agg_extent->ae_hole) - valid_hole = true; + stripe_ud->asu_valid_hole = true; if (agg_extent->ae_recx.rx_idx - ss > last_ext_end) { stripe_ud->asu_recxs[ext_cnt].rx_idx = ss + last_ext_end; @@ -1561,7 +1571,7 @@ agg_process_holes_ult(void *arg) break; } - if (!valid_hole) + if (!stripe_ud->asu_valid_hole) goto out; obj = obj_hdl2ptr(entry->ae_obj_hdl); @@ -1731,7 +1741,7 @@ agg_process_holes(struct ec_agg_entry *entry) } stripe_ud.asu_agg_entry = entry; - rc = agg_get_obj_handle(entry); + rc = agg_get_obj_handle(entry, false); if (rc) { 
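/*
 * agg_get_obj_handle() above gains a reset_peer flag: the object handle stays
 * cached across dkeys, but the cached peer parity shard addresses are
 * re-resolved when the aggregation leader moves (DAOS-16039). A small sketch
 * of that cache-with-selective-invalidation shape, with hypothetical types
 * and a stubbed open:
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct agg_entry {
	int      handle;      /* stays cached once opened             */
	bool     handle_open;
	uint32_t peer_rank;   /* must follow the current dkey's group */
	bool     peer_valid;
};

static int
get_handle(struct agg_entry *e, bool reset_peer, uint32_t current_peer)
{
	if (e->handle_open && !reset_peer)
		return 0; /* fast path: everything is still cached */

	if (!e->handle_open) {
		e->handle = 42; /* pretend dsc_obj_open() succeeded */
		e->handle_open = true;
	}
	if (reset_peer || !e->peer_valid) {
		e->peer_rank = current_peer; /* re-resolve from the placement layout */
		e->peer_valid = true;
	}
	return 0;
}

int main(void)
{
	struct agg_entry e = { 0 };

	get_handle(&e, false, 1);
	get_handle(&e, true, 3); /* new dkey, new parity leader */
	printf("peer=%u\n", (unsigned)e.peer_rank);
	return 0;
}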
D_ERROR("Failed to open object: "DF_RC"\n", DP_RC(rc)); goto out; @@ -1758,21 +1768,29 @@ agg_process_holes(struct ec_agg_entry *entry) if (*status != 0) D_GOTO(ev_out, rc = *status); - /* Update local vos with replicate */ - entry->ae_sgl.sg_nr = 1; - if (iod->iod_nr) { - /* write the reps to vos */ - rc = vos_obj_update(agg_param->ap_cont_handle, entry->ae_oid, - entry->ae_cur_stripe.as_hi_epoch, 0, 0, - &entry->ae_dkey, 1, iod, - stripe_ud.asu_iod_csums, - &entry->ae_sgl); - if (rc) { - D_ERROR("vos_update_begin failed: "DF_RC"\n", - DP_RC(rc)); - goto ev_out; + /* If with valid hole (hole epoch >= parity epoch), update local vos with replicas + * and delete parity. + * For the case without valid hole, the only possibility is with partial replica epoch > + * parity epoch, and all hole epoch < parity epoch, this case set ae_process_partial + * flag and will call agg_process_partial_stripe() later. + */ + if (stripe_ud.asu_valid_hole) { + if (iod->iod_nr) { + /* write the reps to vos */ + entry->ae_sgl.sg_nr = 1; + rc = vos_obj_update(agg_param->ap_cont_handle, entry->ae_oid, + entry->ae_cur_stripe.as_hi_epoch, 0, 0, + &entry->ae_dkey, 1, iod, + stripe_ud.asu_iod_csums, + &entry->ae_sgl); + if (rc) { + D_ERROR("vos_update_begin failed: "DF_RC"\n", + DP_RC(rc)); + goto ev_out; + } } - /* Delete parity */ + + /* Delete parity even when nothing to replicate */ epoch_range.epr_lo = agg_param->ap_epr.epr_lo; epoch_range.epr_hi = entry->ae_cur_stripe.as_hi_epoch; recx.rx_nr = ec_age2cs(entry); @@ -1782,7 +1800,11 @@ agg_process_holes(struct ec_agg_entry *entry) entry->ae_oid, &epoch_range, &entry->ae_dkey, &entry->ae_akey, &recx); + } else { + D_DEBUG(DB_EPC, "no valid hole, set ae_process_partial flag\n"); + entry->ae_process_partial = 1; } + ev_out: entry->ae_sgl.sg_nr = AGG_IOV_CNT; ABT_eventual_free(&stripe_ud.asu_eventual); @@ -1792,6 +1814,7 @@ agg_process_holes(struct ec_agg_entry *entry) daos_csummer_free_ic(stripe_ud.asu_csummer, &stripe_ud.asu_iod_csums); D_FREE(stripe_ud.asu_csum_iov.iov_buf); daos_csummer_destroy(&stripe_ud.asu_csummer); + return rc; } @@ -1805,7 +1828,6 @@ agg_process_stripe(struct ec_agg_param *agg_param, struct ec_agg_entry *entry) struct vos_iter_anchors anchors = { 0 }; bool update_vos = true; bool write_parity = true; - bool process_holes = false; int rc = 0; if (DAOS_FAIL_CHECK(DAOS_FORCE_EC_AGG_FAIL)) @@ -1876,16 +1898,23 @@ agg_process_stripe(struct ec_agg_param *agg_param, struct ec_agg_entry *entry) goto out; } - /* With parity and some newer partial replicas, possibly holes */ - if (ec_age_with_hole(entry)) - process_holes = true; - else - rc = agg_process_partial_stripe(entry); + /* With parity and some newer partial replicas, possibly holes. + * Case 1: with valid hole (hole epoch >= parity epoch), can be handled by + * agg_process_holes(). + * Case 2: without valid hole, must with partial replica, should be handled by + * agg_process_partial_stripe(). 
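/*
 * The hole handling above distinguishes a "valid" hole (epoch at or above the
 * parity epoch), which leads to re-replication plus parity removal, from
 * stripes where only newer partial replicas remain, which fall through to the
 * partial-stripe path. A condensed sketch of that classification over a
 * hypothetical extent list:
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct ext { uint64_t epoch; bool hole; };

enum stripe_path { PATH_PROCESS_HOLES, PATH_PROCESS_PARTIAL };

static enum stripe_path
classify_stripe(const struct ext *exts, size_t nr, uint64_t parity_epoch)
{
	for (size_t i = 0; i < nr; i++)
		if (exts[i].hole && exts[i].epoch >= parity_epoch)
			return PATH_PROCESS_HOLES; /* hole newer than the parity */
	/* holes older than the parity are already covered by it; only newer
	 * partial replicas are left to fold into the stripe */
	return PATH_PROCESS_PARTIAL;
}

int main(void)
{
	struct ext exts[] = { { 5, true }, { 9, false } };

	/* the only hole predates the parity (epoch 7): take the partial path */
	printf("%d\n", (int)classify_stripe(exts, 2, 7));
	return 0;
}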
+ */ + if (ec_age_with_hole(entry)) { + entry->ae_process_partial = 0; + rc = agg_process_holes(entry); + if (rc != 0 || !entry->ae_process_partial) + goto clear_exts; + } + + rc = agg_process_partial_stripe(entry); out: - if (process_holes && rc == 0) { - rc = agg_process_holes(entry); - } else if (update_vos && rc == 0) { + if (update_vos && rc == 0) { if (ec_age2p(entry) > 1) { /* offload of ds_obj_update to push remote parity */ rc = agg_peer_update(entry, write_parity); @@ -1902,6 +1931,7 @@ agg_process_stripe(struct ec_agg_param *agg_param, struct ec_agg_entry *entry) } } +clear_exts: agg_clear_extents(entry); return rc; } @@ -2113,9 +2143,16 @@ agg_shard_is_parity(struct ds_pool *pool, struct ec_agg_entry *agg_entry) uint32_t shard_idx; struct pl_obj_shard *shard; - ec_tgt_idx = obj_ec_shard_idx_by_layout_ver(agg_entry->ae_oid.id_layout_ver, - agg_entry->ae_dkey_hash, oca, - daos_oclass_grp_size(oca) - i - 1); + if (unlikely(DAOS_FAIL_CHECK(DAOS_OBJ_EC_AGG_LEADER_DIFF) && + agg_entry->ae_dkey_hash % obj_ec_parity_tgt_nr(oca) == 0)) + ec_tgt_idx = obj_ec_shard_idx_by_layout_ver(agg_entry->ae_oid.id_layout_ver, + agg_entry->ae_dkey_hash, oca, + obj_ec_data_tgt_nr(oca) + i); + else + ec_tgt_idx = obj_ec_shard_idx_by_layout_ver(agg_entry->ae_oid.id_layout_ver, + agg_entry->ae_dkey_hash, oca, + daos_oclass_grp_size(oca) + - i - 1); shard_idx = grp_start + ec_tgt_idx; shard = pl_obj_get_shard(agg_entry->ae_obj_layout, shard_idx); @@ -2162,6 +2199,8 @@ agg_dkey(daos_handle_t ih, vos_iter_entry_t *entry, struct ec_agg_param *agg_param, struct ec_agg_entry *agg_entry, unsigned int *acts) { + int rc = 0; + if (!agg_key_compare(agg_entry->ae_dkey, entry->ie_key)) { D_DEBUG(DB_EPC, "Skip dkey: "DF_KEY" ec agg on re-probe\n", DP_KEY(&entry->ie_key)); @@ -2180,11 +2219,12 @@ agg_dkey(daos_handle_t ih, vos_iter_entry_t *entry, DP_UOID(agg_entry->ae_oid), DP_KEY(&agg_entry->ae_dkey), agg_entry->ae_is_leader ? "yes" : "no"); agg_reset_dkey_entry(&agg_param->ap_agg_entry, entry); + rc = agg_get_obj_handle(agg_entry, true); } else { *acts |= VOS_ITER_CB_SKIP; } - return 0; + return rc; } /* Handles akeys returned by the iterator. 
*/ diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 11a26749695..deadc5f0474 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -2478,6 +2478,8 @@ ds_obj_ec_rep_handler(crt_rpc_t *rpc) D_ASSERT(ioc.ioc_coc != NULL); dkey = (daos_key_t *)&oer->er_dkey; iod = (daos_iod_t *)&oer->er_iod; + if (iod->iod_nr == 0) /* nothing to replicate, directly remove parity */ + goto remove_parity; iod_csums = oer->er_iod_csums.ca_arrays; rc = vos_update_begin(ioc.ioc_coc->sc_hdl, oer->er_oid, oer->er_epoch_range.epr_hi, VOS_OF_REBUILD, dkey, 1, iod, iod_csums, 0, &ioh, NULL); @@ -2510,6 +2512,7 @@ ds_obj_ec_rep_handler(crt_rpc_t *rpc) DP_RC(rc)); goto out; } +remove_parity: recx.rx_nr = obj_ioc2ec_cs(&ioc); recx.rx_idx = (oer->er_stripenum * recx.rx_nr) | PARITY_INDICATOR; rc = vos_obj_array_remove(ioc.ioc_coc->sc_hdl, oer->er_oid, &oer->er_epoch_range, dkey, diff --git a/src/object/srv_obj_migrate.c b/src/object/srv_obj_migrate.c index 7725e60fd1f..3785ae3daee 100644 --- a/src/object/srv_obj_migrate.c +++ b/src/object/srv_obj_migrate.c @@ -1657,6 +1657,12 @@ migrate_punch(struct migrate_pool_tls *tls, struct migrate_one *mrone, /* punch records */ if (mrone->mo_punch_iod_num > 0 && mrone->mo_rec_punch_eph <= tls->mpt_max_eph) { + if (daos_oclass_is_ec(&mrone->mo_oca) && + !is_ec_parity_shard_by_layout_ver(mrone->mo_oid.id_layout_ver, + mrone->mo_dkey_hash, &mrone->mo_oca, + mrone->mo_oid.id_shard)) + mrone_recx_daos2_vos(mrone, mrone->mo_punch_iods, mrone->mo_punch_iod_num); + rc = vos_obj_update(cont->sc_hdl, mrone->mo_oid, mrone->mo_rec_punch_eph, mrone->mo_version, 0, &mrone->mo_dkey, @@ -2351,7 +2357,7 @@ punch_iod_pack(struct migrate_one *mrone, struct dc_object *obj, daos_iod_t *iod D_DEBUG(DB_TRACE, "idx %d akey "DF_KEY" nr %d size "DF_U64" type %d\n", - idx, DP_KEY(&iod->iod_name), iod->iod_nr, iod->iod_size, + idx, DP_KEY(&iod->iod_name), mrone->mo_punch_iods->iod_nr, iod->iod_size, iod->iod_type); if (mrone->mo_rec_punch_eph < eph) diff --git a/src/rebuild/rebuild_internal.h b/src/rebuild/rebuild_internal.h index 7615a86d777..42c40f81430 100644 --- a/src/rebuild/rebuild_internal.h +++ b/src/rebuild/rebuild_internal.h @@ -1,5 +1,5 @@ /** - * (C) Copyright 2017-2023 Intel Corporation. + * (C) Copyright 2017-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -175,7 +175,10 @@ struct rebuild_status_completed { /* Structure on all targets to track all pool rebuilding */ struct rebuild_global { /* Link rebuild_tgt_pool_tracker on all targets. - * Only operated by stream 0, no need lock. + * Can be inserted/deleted by system XS, be lookup by system XS or VOS target main XS. + * Be protected by rg_ttl_rwlock - + * system XS takes wrlock to insert/delete, VOS TGT XS takes rdlock to lookup, + * no lock needed for system XS to lookup. 
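/*
 * ds_obj_ec_rep_handler() above addresses parity extents by or-ing a parity
 * indicator bit into the record index of a cell-sized recx. A small sketch of
 * that addressing scheme; the indicator is assumed here to be the top bit of
 * the 64-bit index, as an illustration rather than the exact DAOS constant:
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PARITY_BIT (1ULL << 63)

struct recx { uint64_t rx_idx; uint64_t rx_nr; };

static struct recx
parity_recx(uint64_t stripenum, uint64_t cell_size)
{
	struct recx recx;

	recx.rx_nr  = cell_size;                                   /* one parity cell  */
	recx.rx_idx = (stripenum * cell_size) | SKETCH_PARITY_BIT; /* tag it as parity */
	return recx;
}

int main(void)
{
	struct recx r = parity_recx(3, 1 << 20);

	printf("idx=%" PRIx64 " nr=%" PRIu64 "\n", r.rx_idx, r.rx_nr);
	return 0;
}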
*/ d_list_t rg_tgt_tracker_list; @@ -200,6 +203,9 @@ struct rebuild_global { */ d_list_t rg_queue_list; + /* rwlock to protect rg_tgt_tracker_list */ + ABT_rwlock rg_ttl_rwlock; + ABT_mutex rg_lock; ABT_cond rg_stop_cond; /* how many pools is being rebuilt */ @@ -306,6 +312,7 @@ rebuild_tls_get() void rpt_get(struct rebuild_tgt_pool_tracker *rpt); void rpt_put(struct rebuild_tgt_pool_tracker *rpt); +void rpt_delete(struct rebuild_tgt_pool_tracker *rpt); struct rebuild_pool_tls * rebuild_pool_tls_lookup(uuid_t pool_uuid, unsigned int ver, uint32_t gen); diff --git a/src/rebuild/scan.c b/src/rebuild/scan.c index 8778292e346..39f79f29b33 100644 --- a/src/rebuild/scan.c +++ b/src/rebuild/scan.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2017-2023 Intel Corporation. + * (C) Copyright 2017-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -393,6 +393,7 @@ struct rebuild_scan_arg { int snapshot_cnt; uint32_t yield_freq; int32_t obj_yield_cnt; + struct ds_cont_child *cont_child; }; /** @@ -668,7 +669,7 @@ rebuild_obj_scan_cb(daos_handle_t ch, vos_iter_entry_t *ent, int i; int rc = 0; - if (rpt->rt_abort) { + if (rpt->rt_abort || arg->cont_child->sc_stopping) { D_DEBUG(DB_REBUILD, "rebuild is aborted\n"); return 1; } @@ -806,44 +807,66 @@ rebuild_container_scan_cb(daos_handle_t ih, vos_iter_entry_t *entry, } rc = vos_cont_open(iter_param->ip_hdl, entry->ie_couuid, &coh); + if (rc == -DER_NONEXIST) { + D_DEBUG(DB_REBUILD, DF_UUID" already destroyed\n", DP_UUID(arg->co_uuid)); + return 0; + } + if (rc != 0) { D_ERROR("Open container "DF_UUID" failed: "DF_RC"\n", DP_UUID(entry->ie_couuid), DP_RC(rc)); return rc; } + rc = ds_cont_child_lookup(rpt->rt_pool_uuid, entry->ie_couuid, &cont_child); + if (rc == -DER_NONEXIST || rc == -DER_SHUTDOWN) { + D_DEBUG(DB_REBUILD, DF_UUID" already destroyed or destroying\n", + DP_UUID(arg->co_uuid)); + rc = 0; + D_GOTO(close, rc); + } + + if (rc != 0) { + D_ERROR("Container "DF_UUID", ds_cont_child_lookup failed: "DF_RC"\n", + DP_UUID(entry->ie_couuid), DP_RC(rc)); + D_GOTO(close, rc); + } + + /* + * The container may has been closed by the application, but some resource (DRAM) occupied + * by DTX may be not released because DTX resync was in-progress at that time. When arrive + * here, DTX resync must has completed globally. Let's release related resource. + */ + if (unlikely(cont_child->sc_dtx_delay_reset == 1)) { + stop_dtx_reindex_ult(cont_child, true); + vos_dtx_cache_reset(cont_child->sc_hdl, false); + } + + cont_child->sc_rebuilding = 1; + rc = ds_cont_fetch_snaps(rpt->rt_pool->sp_iv_ns, entry->ie_couuid, NULL, &snapshot_cnt); if (rc) { D_ERROR("Container "DF_UUID", ds_cont_fetch_snaps failed: "DF_RC"\n", DP_UUID(entry->ie_couuid), DP_RC(rc)); - vos_cont_close(coh); - return rc; + D_GOTO(close, rc); } rc = ds_cont_get_props(&arg->co_props, rpt->rt_pool->sp_uuid, entry->ie_couuid); if (rc) { D_ERROR("Container "DF_UUID", ds_cont_get_props failed: "DF_RC"\n", DP_UUID(entry->ie_couuid), DP_RC(rc)); - vos_cont_close(coh); - return rc; - } - - rc = ds_cont_child_lookup(rpt->rt_pool_uuid, entry->ie_couuid, &cont_child); - if (rc != 0) { - D_ERROR("Container "DF_UUID", ds_cont_child_lookup failed: "DF_RC"\n", - DP_UUID(entry->ie_couuid), DP_RC(rc)); - vos_cont_close(coh); - return rc; + D_GOTO(close, rc); } /* Wait for EC aggregation to finish. 
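/*
 * The tracker list above moves from "stream 0 only" to a reader/writer
 * scheme: the system xstream takes the write lock to insert or delete, target
 * xstreams take the read lock to look up. A compact pthread_rwlock analogue
 * (hypothetical tracker type, singly linked for brevity):
 */
#include <pthread.h>
#include <stddef.h>
#include <stdint.h>

struct tracker {
	uint64_t        pool_id;
	struct tracker *next;
};

static pthread_rwlock_t ttl_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct tracker  *ttl_head;

static void
tracker_insert(struct tracker *t) /* writer side: system xstream */
{
	pthread_rwlock_wrlock(&ttl_lock);
	t->next = ttl_head;
	ttl_head = t;
	pthread_rwlock_unlock(&ttl_lock);
}

static struct tracker *
tracker_lookup(uint64_t pool_id) /* reader side: target xstreams */
{
	struct tracker *t;

	pthread_rwlock_rdlock(&ttl_lock);
	for (t = ttl_head; t != NULL; t = t->next)
		if (t->pool_id == pool_id)
			break;
	/* the real rpt_lookup() takes a reference before dropping the lock */
	pthread_rwlock_unlock(&ttl_lock);
	return t;
}

int main(void)
{
	struct tracker t = { .pool_id = 7 };

	tracker_insert(&t);
	return tracker_lookup(7) == &t ? 0 : 1;
}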
NB: migrate needs to wait for EC aggregation to finish */ - while (cont_child->sc_ec_agg_active) { + while (cont_child->sc_ec_agg_active && + rpt->rt_rebuild_op != RB_OP_RECLAIM && + rpt->rt_rebuild_op != RB_OP_FAIL_RECLAIM) { D_ASSERTF(rpt->rt_pool->sp_rebuilding >= 0, DF_UUID" rebuilding %d\n", DP_UUID(rpt->rt_pool_uuid), rpt->rt_pool->sp_rebuilding); /* Wait for EC aggregation to abort before discard the object */ - D_DEBUG(DB_REBUILD, DF_UUID" wait for ec agg abort.\n", - DP_UUID(entry->ie_couuid)); + D_INFO(DF_UUID" wait for ec agg abort.\n", DP_UUID(entry->ie_couuid)); dss_sleep(1000); if (rpt->rt_abort || rpt->rt_finishing) { D_DEBUG(DB_REBUILD, DF_CONT" rebuild op %s ver %u abort %u/%u.\n", @@ -866,6 +889,7 @@ rebuild_container_scan_cb(daos_handle_t ih, vos_iter_entry_t *entry, param.ip_flags = VOS_IT_FOR_MIGRATION; uuid_copy(arg->co_uuid, entry->ie_couuid); arg->snapshot_cnt = snapshot_cnt; + arg->cont_child = cont_child; /* If there is no snapshots, then rebuild does not need to migrate * punched objects at all. Ideally, it should ignore any objects @@ -881,8 +905,11 @@ rebuild_container_scan_cb(daos_handle_t ih, vos_iter_entry_t *entry, close: vos_cont_close(coh); - if (cont_child != NULL) + if (cont_child != NULL) { + cont_child->sc_rebuilding = 0; + ABT_cond_broadcast(cont_child->sc_rebuild_cond); ds_cont_child_put(cont_child); + } D_DEBUG(DB_REBUILD, DF_UUID"/"DF_UUID" iterate cont done: "DF_RC"\n", DP_UUID(rpt->rt_pool_uuid), DP_UUID(entry->ie_couuid), @@ -1206,8 +1233,11 @@ rebuild_tgt_scan_handler(crt_rpc_t *rpc) if (tls && tls->rebuild_pool_status == 0 && rc != 0) tls->rebuild_pool_status = rc; - if (rpt) + if (rpt) { + if (rc) + rpt_delete(rpt); rpt_put(rpt); + } ro = crt_reply_get(rpc); ro->rso_status = rc; ro->rso_stable_epoch = d_hlc_get(); diff --git a/src/rebuild/srv.c b/src/rebuild/srv.c index 90e1bf0fc22..daac6ac3912 100644 --- a/src/rebuild/srv.c +++ b/src/rebuild/srv.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -207,13 +207,38 @@ rebuild_get_global_dtx_resync_ver(struct rebuild_global_pool_tracker *rgt) return min; } +static void +rpt_insert(struct rebuild_tgt_pool_tracker *rpt) +{ + D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); + ABT_rwlock_wrlock(rebuild_gst.rg_ttl_rwlock); + d_list_add(&rpt->rt_list, &rebuild_gst.rg_tgt_tracker_list); + ABT_rwlock_unlock(rebuild_gst.rg_ttl_rwlock); +} + +void +rpt_delete(struct rebuild_tgt_pool_tracker *rpt) +{ + D_ASSERT(dss_get_module_info()->dmi_xs_id == 0); + ABT_rwlock_wrlock(rebuild_gst.rg_ttl_rwlock); + d_list_del_init(&rpt->rt_list); + ABT_rwlock_unlock(rebuild_gst.rg_ttl_rwlock); +} + struct rebuild_tgt_pool_tracker * rpt_lookup(uuid_t pool_uuid, uint32_t opc, unsigned int ver, unsigned int gen) { struct rebuild_tgt_pool_tracker *rpt; struct rebuild_tgt_pool_tracker *found = NULL; + bool locked = false; - /* Only stream 0 will access the list */ + /* System XS or VOS target XS (obj_inflight_io_check() -> ds_rebuild_running_query()) + * possibly access the list, need to hold rdlock only for VOS XS. 
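/*
 * The scan loop above keeps waiting for EC aggregation to drain, but reclaim
 * operations are now exempt (DAOS-14845) and the wait still honours abort. A
 * stripped-down sketch of that polling loop, with made-up op codes and
 * sleep() standing in for dss_sleep():
 */
#include <stdbool.h>
#include <unistd.h>

enum rb_op { RB_OP_REBUILD, RB_OP_RECLAIM, RB_OP_FAIL_RECLAIM };

struct scan_state {
	volatile bool ec_agg_active;
	volatile bool abort_rebuild;
	enum rb_op    op;
};

static int
wait_for_ec_agg(struct scan_state *s)
{
	while (s->ec_agg_active &&
	       s->op != RB_OP_RECLAIM && s->op != RB_OP_FAIL_RECLAIM) {
		if (s->abort_rebuild)
			return 1; /* give up, the rebuild is being cancelled */
		sleep(1);         /* back off, like dss_sleep(1000) */
	}
	return 0;
}

int main(void)
{
	struct scan_state s = { .ec_agg_active = false, .op = RB_OP_REBUILD };

	return wait_for_ec_agg(&s);
}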
+ */ + if (dss_get_module_info()->dmi_xs_id != 0) { + ABT_rwlock_rdlock(rebuild_gst.rg_ttl_rwlock); + locked = true; + } d_list_for_each_entry(rpt, &rebuild_gst.rg_tgt_tracker_list, rt_list) { if (uuid_compare(rpt->rt_pool_uuid, pool_uuid) == 0 && rpt->rt_finishing == 0 && @@ -225,6 +250,8 @@ rpt_lookup(uuid_t pool_uuid, uint32_t opc, unsigned int ver, unsigned int gen) break; } } + if (locked) + ABT_rwlock_unlock(rebuild_gst.rg_ttl_rwlock); return found; } @@ -1037,16 +1064,45 @@ rpt_get(struct rebuild_tgt_pool_tracker *rpt) ABT_mutex_unlock(rpt->rt_lock); } +static int +rpt_put_destroy(void *data) +{ + struct rebuild_tgt_pool_tracker *rpt = data; + + rpt_destroy(rpt); + return 0; +} + void -rpt_put(struct rebuild_tgt_pool_tracker *rpt) +rpt_put(struct rebuild_tgt_pool_tracker *rpt) { + bool zombie; + int rc; + ABT_mutex_lock(rpt->rt_lock); rpt->rt_refcount--; D_ASSERT(rpt->rt_refcount >= 0); D_DEBUG(DB_REBUILD, "rpt %p ref %d\n", rpt, rpt->rt_refcount); if (rpt->rt_refcount == 1 && rpt->rt_finishing) ABT_cond_signal(rpt->rt_fini_cond); + zombie = (rpt->rt_refcount == 0); ABT_mutex_unlock(rpt->rt_lock); + if (!zombie) + return; + + if (dss_get_module_info()->dmi_xs_id == 0) { + rpt_destroy(rpt); + } else { + /* Possibly triggered by VOS target XS by obj_inflight_io_check() -> + * ds_rebuild_running_query(), but rpt_destroy() -> ds_pool_put() can only + * be called in system XS. + * If dss_ult_execute failed that due to fatal system error (no memory + * or ABT failure), throw an ERR log. + */ + rc = dss_ult_execute(rpt_put_destroy, rpt, NULL, NULL, DSS_XS_SYS, 0, 0); + if (rc) + DL_ERROR(rc, "failed to destroy rpt %p", rpt); + } } static void @@ -1706,7 +1762,6 @@ void ds_rebuild_abort(uuid_t pool_uuid, unsigned int ver, unsigned int gen, uint64_t term) { struct rebuild_tgt_pool_tracker *rpt; - struct rebuild_tgt_pool_tracker *tmp; rebuild_leader_stop(pool_uuid, ver, gen, term); @@ -1714,11 +1769,14 @@ ds_rebuild_abort(uuid_t pool_uuid, unsigned int ver, unsigned int gen, uint64_t while(1) { bool aborted = true; - d_list_for_each_entry_safe(rpt, tmp, &rebuild_gst.rg_tgt_tracker_list, rt_list) { + d_list_for_each_entry(rpt, &rebuild_gst.rg_tgt_tracker_list, rt_list) { if (uuid_compare(rpt->rt_pool_uuid, pool_uuid) == 0 && (ver == (unsigned int)(-1) || rpt->rt_rebuild_ver == ver) && (gen == (unsigned int)(-1) || rpt->rt_rebuild_gen == gen) && (term == (uint64_t)(-1) || rpt->rt_leader_term == term)) { + D_INFO(DF_UUID" try abort the rpt %p op %s ver %u gen %u\n", + DP_UUID(rpt->rt_pool_uuid), rpt, RB_OP_STR(rpt->rt_rebuild_op), + rpt->rt_rebuild_ver, rpt->rt_rebuild_gen); rpt->rt_abort = 1; aborted = false; } @@ -2139,16 +2197,11 @@ rebuild_tgt_fini(struct rebuild_tgt_pool_tracker *rpt) DP_UUID(rpt->rt_pool_uuid), DP_RC(rc)); /* destroy the migrate_tls of 0-xstream */ ds_migrate_stop(rpt->rt_pool, rpt->rt_rebuild_ver, rpt->rt_rebuild_gen); - d_list_del_init(&rpt->rt_list); - rpt_put(rpt); - /* No one should access rpt after rebuild_fini_one. - */ - D_ASSERT(rpt->rt_refcount == 0); - + /* No one should access rpt after rebuild_fini_one. */ D_INFO("Finalized rebuild for "DF_UUID", map_ver=%u.\n", DP_UUID(rpt->rt_pool_uuid), rpt->rt_rebuild_ver); - - rpt_destroy(rpt); + rpt_delete(rpt); + rpt_put(rpt); } void @@ -2447,6 +2500,11 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) rpt->rt_rebuild_op = rsi->rsi_rebuild_op; + /* Let's add the rpt to the tracker list before IV fetch, which might yield, + * to make sure the new coming request can find the rpt in the list. 
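/*
 * rpt_put() above now frees the tracker itself once the last reference drops,
 * bouncing the actual destroy to the system xstream when called from a target
 * xstream. A minimal refcount-put sketch with pthreads; the cross-xstream
 * offload is elided:
 */
#include <assert.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct rpt {
	pthread_mutex_t lock;
	int             refcount;
};

static void
rpt_put_sketch(struct rpt *rpt)
{
	bool zombie;

	pthread_mutex_lock(&rpt->lock);
	assert(rpt->refcount > 0);
	rpt->refcount--;
	zombie = (rpt->refcount == 0); /* decide under the lock ... */
	pthread_mutex_unlock(&rpt->lock);

	if (!zombie)
		return;
	/* ... free outside of it: nobody can take a new reference any more */
	pthread_mutex_destroy(&rpt->lock);
	free(rpt);
}

int main(void)
{
	struct rpt *rpt = calloc(1, sizeof(*rpt));

	if (rpt == NULL)
		return 1;
	pthread_mutex_init(&rpt->lock, NULL);
	rpt->refcount = 1;
	rpt_put_sketch(rpt); /* last put frees the tracker */
	return 0;
}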
+ */ + rpt_get(rpt); + rpt_insert(rpt); rc = ds_pool_iv_srv_hdl_fetch(pool, &rpt->rt_poh_uuid, &rpt->rt_coh_uuid); if (rc) @@ -2486,14 +2544,16 @@ rebuild_tgt_prepare(crt_rpc_t *rpc, struct rebuild_tgt_pool_tracker **p_rpt) rpt->rt_pool = pool; /* pin it */ ABT_mutex_unlock(rpt->rt_lock); - rpt_get(rpt); - d_list_add(&rpt->rt_list, &rebuild_gst.rg_tgt_tracker_list); *p_rpt = rpt; out: if (rc) { - if (rpt) + if (rpt) { + if (!d_list_empty(&rpt->rt_list)) { + rpt_delete(rpt); + rpt_put(rpt); + } rpt_put(rpt); - + } ds_pool_put(pool); } daos_prop_fini(&prop); @@ -2539,6 +2599,10 @@ init(void) D_INIT_LIST_HEAD(&rebuild_gst.rg_queue_list); D_INIT_LIST_HEAD(&rebuild_gst.rg_running_list); + rc = ABT_rwlock_create(&rebuild_gst.rg_ttl_rwlock); + if (rc != ABT_SUCCESS) + return dss_abterr2der(rc); + rc = ABT_mutex_create(&rebuild_gst.rg_lock); if (rc != ABT_SUCCESS) return dss_abterr2der(rc); @@ -2556,6 +2620,7 @@ fini(void) ABT_cond_free(&rebuild_gst.rg_stop_cond); ABT_mutex_free(&rebuild_gst.rg_lock); + ABT_rwlock_free(&rebuild_gst.rg_ttl_rwlock); rebuild_iv_fini(); return 0; diff --git a/src/tests/suite/daos_obj_ec.c b/src/tests/suite/daos_obj_ec.c index 2eef576d096..7fa47fc406b 100644 --- a/src/tests/suite/daos_obj_ec.c +++ b/src/tests/suite/daos_obj_ec.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2023 Intel Corporation. + * (C) Copyright 2016-2024 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -450,6 +450,27 @@ trigger_and_wait_ec_aggreation(test_arg_t *arg, daos_obj_id_t *oids, daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, 0, 0, NULL); } +void +trigger_and_wait_ec_aggreation_2dkeys(test_arg_t *arg, daos_obj_id_t *oids, + int oids_nr, char *dkey, char *dkey2, char *akey, + daos_off_t offset, daos_size_t size) +{ + int i; + + daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, + DAOS_OBJ_EC_AGG_LEADER_DIFF | DAOS_FAIL_ALWAYS, 0, NULL); + + print_message("wait for 35 seconds for EC aggregation.\n"); + sleep(35); + + for (i = 0; i < oids_nr; i++) { + ec_agg_check_replica_on_parity(arg, oids[i], dkey, akey, offset, size, false); + ec_agg_check_replica_on_parity(arg, oids[i], dkey2, akey, offset, size, false); + } + + daos_debug_set_params(arg->group, -1, DMG_KEY_FAIL_LOC, 0, 0, NULL); +} + void ec_verify_parity_data(struct ioreq *req, char *dkey, char *akey, daos_off_t offset, daos_size_t size, @@ -474,6 +495,37 @@ ec_verify_parity_data(struct ioreq *req, char *dkey, char *akey, free(data); } +void +ec_verify_parity_data_fail2shards(struct ioreq *req, char *dkey, char *akey, + daos_off_t offset, daos_size_t size, + char *verify_data, daos_handle_t th, + uint16_t shard1, uint16_t shard2) +{ + daos_recx_t recx; + char *data; + uint16_t fail_shards[2]; + uint64_t fail_val; + + data = (char *)malloc(size); + assert_true(data != NULL); + memset(data, 0, size); + + req->iod_type = DAOS_IOD_ARRAY; + recx.rx_nr = size; + recx.rx_idx = offset; + + fail_shards[0] = shard1; + fail_shards[1] = shard2; + fail_val = daos_shard_fail_value(fail_shards, 2); + daos_fail_value_set(fail_val); + daos_fail_loc_set(DAOS_FAIL_SHARD_OPEN | DAOS_FAIL_ALWAYS); + + lookup_recxs(dkey, akey, 1, th, &recx, 1, data, size, req); + assert_memory_equal(data, verify_data, size); + daos_fail_loc_set(0); + free(data); +} + static void ec_partial_update_agg(void **state) { @@ -503,20 +555,34 @@ ec_partial_update_agg(void **state) recx.rx_nr = EC_CELL_SIZE; recx.rx_idx = i * EC_CELL_SIZE; memset(data, 'a' + i, EC_CELL_SIZE); - insert_recxs("d_key", "a_key", 1, DAOS_TX_NONE, &recx, 1, + 
insert_recxs("d_key1", "a_key", 1, DAOS_TX_NONE, &recx, 1, + data, EC_CELL_SIZE, &req); + insert_recxs("d_key2", "a_key", 1, DAOS_TX_NONE, &recx, 1, data, EC_CELL_SIZE, &req); } - trigger_and_wait_ec_aggreation(arg, &oid, 1, "d_key", "a_key", 0, - EC_CELL_SIZE * 8, DAOS_FORCE_EC_AGG); + trigger_and_wait_ec_aggreation_2dkeys(arg, &oid, 1, "d_key1", "d_key2", "a_key", 0, + EC_CELL_SIZE * 8); for (i = 0; i < 10; i++) { daos_off_t offset = i * EC_CELL_SIZE; memset(verify_data, 'a' + i, EC_CELL_SIZE); - ec_verify_parity_data(&req, "d_key", "a_key", offset, - (daos_size_t)EC_CELL_SIZE, verify_data, - DAOS_TX_NONE, true); + ec_verify_parity_data_fail2shards(&req, "d_key1", "a_key", offset, + (daos_size_t)EC_CELL_SIZE, verify_data, + DAOS_TX_NONE, 0, 3); + ec_verify_parity_data_fail2shards(&req, "d_key1", "a_key", offset, + (daos_size_t)EC_CELL_SIZE, verify_data, + DAOS_TX_NONE, 0, 2); + ec_verify_parity_data_fail2shards(&req, "d_key2", "a_key", offset, + (daos_size_t)EC_CELL_SIZE, verify_data, + DAOS_TX_NONE, 0, 1); + ec_verify_parity_data_fail2shards(&req, "d_key2", "a_key", offset, + (daos_size_t)EC_CELL_SIZE, verify_data, + DAOS_TX_NONE, 2, 3); + ec_verify_parity_data_fail2shards(&req, "d_key2", "a_key", offset, + (daos_size_t)EC_CELL_SIZE, verify_data, + DAOS_TX_NONE, 4, 5); } ioreq_fini(&req); free(data); @@ -612,7 +678,9 @@ ec_full_partial_update_agg(void **state) recx.rx_idx = 0; memset(data, 'a', full_update_size); memcpy(verify_data, data, full_update_size); - insert_recxs("d_key", "a_key", 1, DAOS_TX_NONE, &recx, 1, + insert_recxs("d_key1", "a_key", 1, DAOS_TX_NONE, &recx, 1, + data, full_update_size, &req); + insert_recxs("d_key2", "a_key", 1, DAOS_TX_NONE, &recx, 1, data, full_update_size, &req); /* then partial stripe update */ @@ -627,15 +695,27 @@ ec_full_partial_update_agg(void **state) memset(buffer, 'a' + i, partial_update_size); memcpy(verify_buffer, buffer, partial_update_size); - insert_recxs("d_key", "a_key", 1, DAOS_TX_NONE, &recx, 1, + insert_recxs("d_key1", "a_key", 1, DAOS_TX_NONE, &recx, 1, + buffer, partial_update_size, &req); + insert_recxs("d_key2", "a_key", 1, DAOS_TX_NONE, &recx, 1, buffer, partial_update_size, &req); } - trigger_and_wait_ec_aggreation(arg, &oid, 1, "d_key", "a_key", 0, - full_update_size, DAOS_FORCE_EC_AGG); - - ec_verify_parity_data(&req, "d_key", "a_key", (daos_size_t)0, - full_update_size, verify_data, DAOS_TX_NONE, true); + trigger_and_wait_ec_aggreation_2dkeys(arg, &oid, 1, "d_key1", "d_key2", "a_key", 0, + full_update_size); + + ec_verify_parity_data_fail2shards(&req, "d_key1", "a_key", (daos_size_t)0, full_update_size, + verify_data, DAOS_TX_NONE, 0, 2); + ec_verify_parity_data_fail2shards(&req, "d_key1", "a_key", (daos_size_t)0, full_update_size, + verify_data, DAOS_TX_NONE, 3, 4); + ec_verify_parity_data_fail2shards(&req, "d_key1", "a_key", (daos_size_t)0, full_update_size, + verify_data, DAOS_TX_NONE, 4, 5); + ec_verify_parity_data_fail2shards(&req, "d_key2", "a_key", (daos_size_t)0, full_update_size, + verify_data, DAOS_TX_NONE, 0, 3); + ec_verify_parity_data_fail2shards(&req, "d_key2", "a_key", (daos_size_t)0, full_update_size, + verify_data, DAOS_TX_NONE, 3, 4); + ec_verify_parity_data_fail2shards(&req, "d_key2", "a_key", (daos_size_t)0, full_update_size, + verify_data, DAOS_TX_NONE, 4, 5); free(data); free(verify_data); }
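/*
 * The tests above read data back while forcing two shards to fail, which only
 * verifies because the EC class keeps two parity cells per stripe (the real
 * encoding is Reed-Solomon). As a deliberately simplified single-failure
 * analogue, XOR parity reconstructs one missing cell from the survivors:
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>

#define CELLS     4
#define CELL_SIZE 8

static void
xor_parity(const uint8_t data[CELLS][CELL_SIZE], uint8_t parity[CELL_SIZE])
{
	memset(parity, 0, CELL_SIZE);
	for (int c = 0; c < CELLS; c++)
		for (int i = 0; i < CELL_SIZE; i++)
			parity[i] ^= data[c][i];
}

int main(void)
{
	uint8_t data[CELLS][CELL_SIZE], parity[CELL_SIZE], rebuilt[CELL_SIZE];

	for (int c = 0; c < CELLS; c++)
		memset(data[c], 'a' + c, CELL_SIZE);
	xor_parity(data, parity);

	/* pretend cell 2 failed: XOR the surviving cells with the parity */
	memcpy(rebuilt, parity, CELL_SIZE);
	for (int c = 0; c < CELLS; c++)
		if (c != 2)
			for (int i = 0; i < CELL_SIZE; i++)
				rebuilt[i] ^= data[c][i];
	assert(memcmp(rebuilt, data[2], CELL_SIZE) == 0);
	return 0;
}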