Skip to content

Commit

Permalink
b/349170185 Backport fixes (#14680)
Browse files Browse the repository at this point in the history
DAOS-16039 object: fix EC aggregation wrong peer address (#14593)
DAOS-16009 rebuild: fix O_TRUNC file size related handling
DAOS-15056 rebuild: add rpt to the rgt list properly (#13862)
DAOS-15517 rebuild: refine lock handling for rpt list (#14064)
DAOS-13812 container: fix destroy vs lookup (#12757)
DAOS-15627 dtx: reduce stack usage for DTX resync to avoid overflow (#14189)
DAOS-14845 rebuild: do not wait for EC agg for reclaim (#13610)

Signed-off-by: Xuezhao Liu <xuezhao.liu@intel.com>
Signed-off-by: Mohamad Chaarawi <mohamad.chaarawi@intel.com>
Signed-off-by: Jeff Olivier <jeffolivier@google.com>
Signed-off-by: Wang, Di <wddi218@gmail.com>
Signed-off-by: Di Wang <di.wang@intel.com>
Signed-off-by: Wang Shilong <shilong.wang@intel.com>
Signed-off-by: Fan Yong <fan.yong@intel.com>
  • Loading branch information
jolivier23 authored Jul 3, 2024
1 parent 833d393 commit 5b9557b
Show file tree
Hide file tree
Showing 16 changed files with 493 additions and 175 deletions.
99 changes: 64 additions & 35 deletions src/client/array/dc_array.c
Original file line number Diff line number Diff line change
Expand Up @@ -2069,18 +2069,24 @@ free_set_size_cb(tse_task_t *task, void *data)
}

static int
punch_extent(daos_handle_t oh, daos_handle_t th, daos_size_t dkey_val, daos_off_t record_i,
daos_size_t num_records, tse_task_t *task, d_list_t *task_list)
punch_dkey_or_extent(daos_handle_t oh, daos_handle_t th, daos_size_t dkey_val, daos_off_t start,
daos_size_t num_records, bool punch_dkey, tse_task_t *task,
d_list_t *task_list)
{
daos_obj_update_t *io_arg;
daos_obj_punch_t *dkey_punch_arg;
daos_iod_t *iod;
d_sg_list_t *sgl;
daos_key_t *dkey;
struct io_params *params = NULL;
tse_task_t *io_task = NULL;
int rc;
int rc;

D_DEBUG(DB_IO, "Punching (%zu, %zu) in Key %zu\n", record_i + 1, num_records, dkey_val);
if (punch_dkey)
D_DEBUG(DB_IO, "Punching dkey %zu\n", dkey_val);
else
D_DEBUG(DB_IO, "Punching (%zu, %zu) in Key %zu\n",
start, num_records, dkey_val);

D_ALLOC_PTR(params);
if (params == NULL)
Expand All @@ -2094,28 +2100,42 @@ punch_extent(daos_handle_t oh, daos_handle_t th, daos_size_t dkey_val, daos_off_
dkey = &params->dkey;
d_iov_set(dkey, &params->dkey_val, sizeof(uint64_t));

/* set descriptor for KV object */
d_iov_set(&iod->iod_name, &params->akey_val, 1);
iod->iod_nr = 1;
iod->iod_size = 0; /* 0 to punch */
iod->iod_type = DAOS_IOD_ARRAY;
D_ALLOC_PTR(iod->iod_recxs);
if (iod->iod_recxs == NULL)
D_GOTO(free, rc = -DER_NOMEM);
iod->iod_recxs[0].rx_idx = record_i + 1;
iod->iod_recxs[0].rx_nr = num_records;

rc = daos_task_create(DAOS_OPC_OBJ_UPDATE, tse_task2sched(task), 0, NULL, &io_task);
if (rc)
D_GOTO(free_reqs, rc);
if (punch_dkey) {
rc = daos_task_create(DAOS_OPC_OBJ_PUNCH_DKEYS, tse_task2sched(task), 0, NULL,
&io_task);
if (rc)
D_GOTO(free_reqs, rc);

dkey_punch_arg = daos_task_get_args(io_task);
dkey_punch_arg->oh = oh;
dkey_punch_arg->th = th;
dkey_punch_arg->dkey = dkey;
dkey_punch_arg->akeys = NULL;
dkey_punch_arg->akey_nr = 0;
} else {
/* set descriptor for KV object */
d_iov_set(&iod->iod_name, &params->akey_val, 1);
iod->iod_nr = 1;
iod->iod_size = 0; /* 0 to punch */
iod->iod_type = DAOS_IOD_ARRAY;
D_ALLOC_PTR(iod->iod_recxs);
if (iod->iod_recxs == NULL)
D_GOTO(free, rc = -DER_NOMEM);
iod->iod_recxs[0].rx_idx = start;
iod->iod_recxs[0].rx_nr = num_records;

rc = daos_task_create(DAOS_OPC_OBJ_UPDATE, tse_task2sched(task), 0, NULL, &io_task);
if (rc)
D_GOTO(free_reqs, rc);

io_arg = daos_task_get_args(io_task);
io_arg->oh = oh;
io_arg->th = th;
io_arg->dkey = dkey;
io_arg->nr = 1;
io_arg->iods = iod;
io_arg->sgls = sgl;
io_arg = daos_task_get_args(io_task);
io_arg->oh = oh;
io_arg->th = th;
io_arg->dkey = dkey;
io_arg->nr = 1;
io_arg->iods = iod;
io_arg->sgls = sgl;
}

rc = tse_task_register_comp_cb(io_task, free_io_params_cb, &params, sizeof(params));
if (rc)
Expand Down Expand Up @@ -2422,18 +2442,26 @@ adjust_array_size_cb(tse_task_t *task, void *data)
memcpy(&dkey_val, ptr, args->kds[i].kd_key_len);
ptr += args->kds[i].kd_key_len;

/*
* Either punch the entire dkey or an extent in that dkey depending on the offset
* where we are truncating to. The first dkey of the array (dkey 1) will always be
* an extent punch to maintain an epoch there.
*/
if (props->size == 0 || dkey_val > props->dkey_val) {
/** Do nothing for DKEY 0 (metadata) */
if (dkey_val == 0)
continue;
/*
* The dkey is higher than the adjusted size so we could punch it here.
* But it's better to punch the extent so that the max_write for the object
* doesn't get lost by aggregation.
*/
D_DEBUG(DB_IO, "Punch full extent in key "DF_U64"\n", dkey_val);
rc = punch_extent(args->oh, args->th, dkey_val, (daos_off_t)-1,
props->chunk_size, props->ptask, &task_list);
if (dkey_val == 1) {
D_DEBUG(DB_IO, "Punch full extent in key " DF_U64 "\n", dkey_val);
rc = punch_dkey_or_extent(args->oh, args->th, dkey_val,
0, props->chunk_size, false,
props->ptask, &task_list);
} else {
D_DEBUG(DB_IO, "Punch dkey " DF_U64 "\n", dkey_val);
rc = punch_dkey_or_extent(args->oh, args->th, dkey_val,
0, props->chunk_size, true,
props->ptask, &task_list);
}
if (rc)
goto out;
} else if (dkey_val == props->dkey_val && props->record_i) {
Expand All @@ -2444,8 +2472,9 @@ adjust_array_size_cb(tse_task_t *task, void *data)
props->chunk_size);
/** Punch all records above record_i */
D_DEBUG(DB_IO, "Punch extent in key "DF_U64"\n", dkey_val);
rc = punch_extent(args->oh, args->th, dkey_val, props->record_i,
props->num_records, props->ptask, &task_list);
rc = punch_dkey_or_extent(args->oh, args->th, dkey_val,
props->record_i + 1, props->num_records,
false, props->ptask, &task_list);
if (rc)
goto out;
}
Expand Down
7 changes: 4 additions & 3 deletions src/common/pool_map.c
Original file line number Diff line number Diff line change
Expand Up @@ -1650,11 +1650,12 @@ gen_pool_buf(struct pool_map *map, struct pool_buf **map_buf_out, int map_versio
map_comp.co_flags = PO_COMPF_NONE;
map_comp.co_nr = 1;

D_DEBUG(DB_TRACE, "adding target: type=0x%hhx, status=%hhu, idx=%d, "
D_DEBUG(DB_TRACE, "adding target: type=0x%hhx, status=%hhu, idx=%d, id=%d, "
"rank=%d, ver=%d, in_ver=%d, fseq=%u, flags=0x%x, nr=%u\n",
map_comp.co_type, map_comp.co_status, map_comp.co_index,
map_comp.co_rank, map_comp.co_ver, map_comp.co_in_ver,
map_comp.co_fseq, map_comp.co_flags, map_comp.co_nr);
map_comp.co_id, map_comp.co_rank, map_comp.co_ver,
map_comp.co_in_ver, map_comp.co_fseq, map_comp.co_flags,
map_comp.co_nr);

rc = pool_buf_attach(map_buf, &map_comp, 1);
if (rc != 0)
Expand Down
65 changes: 47 additions & 18 deletions src/container/srv_target.c
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,7 @@ cont_child_aggregate(struct ds_cont_child *cont, cont_aggregate_cb_t agg_cb,

if (unlikely(DAOS_FAIL_CHECK(DAOS_FORCE_EC_AGG) ||
DAOS_FAIL_CHECK(DAOS_FORCE_EC_AGG_FAIL) ||
DAOS_FAIL_CHECK(DAOS_OBJ_EC_AGG_LEADER_DIFF) ||
DAOS_FAIL_CHECK(DAOS_FORCE_EC_AGG_PEER_FAIL)))
interval = 0;
else
Expand Down Expand Up @@ -627,13 +628,18 @@ cont_child_alloc_ref(void *co_uuid, unsigned int ksize, void *po_uuid,
rc = ABT_cond_create(&cont->sc_scrub_cond);
if (rc != ABT_SUCCESS) {
rc = dss_abterr2der(rc);
goto out_mutex;
goto out_resync_cond;
}
rc = ABT_cond_create(&cont->sc_rebuild_cond);
if (rc != ABT_SUCCESS) {
rc = dss_abterr2der(rc);
goto out_scrub_cond;
}

cont->sc_pool = ds_pool_child_lookup(po_uuid);
if (cont->sc_pool == NULL) {
rc = -DER_NO_HDL;
goto out_cond;
goto out_rebuild_cond;
}

rc = vos_cont_open(cont->sc_pool->spc_hdl, co_uuid, &cont->sc_hdl);
Expand All @@ -659,7 +665,11 @@ cont_child_alloc_ref(void *co_uuid, unsigned int ksize, void *po_uuid,

out_pool:
ds_pool_child_put(cont->sc_pool);
out_cond:
out_rebuild_cond:
ABT_cond_free(&cont->sc_rebuild_cond);
out_scrub_cond:
ABT_cond_free(&cont->sc_scrub_cond);
out_resync_cond:
ABT_cond_free(&cont->sc_dtx_resync_cond);
out_mutex:
ABT_mutex_free(&cont->sc_mutex);
Expand All @@ -686,6 +696,7 @@ cont_child_free_ref(struct daos_llink *llink)
D_FREE(cont->sc_snapshots);
ABT_cond_free(&cont->sc_dtx_resync_cond);
ABT_cond_free(&cont->sc_scrub_cond);
ABT_cond_free(&cont->sc_rebuild_cond);
ABT_mutex_free(&cont->sc_mutex);
D_FREE(cont);
}
Expand Down Expand Up @@ -740,6 +751,12 @@ ds_cont_child_cache_destroy(struct daos_lru_cache *cache)
daos_lru_cache_destroy(cache);
}

/*
 * Thin wrapper: passes &cont->sc_list to daos_lru_ref_release() for the
 * given cache.
 *
 * NOTE(review): presumably this drops the reference obtained through
 * cont_child_lookup() -- confirm against the daos_lru_cache API.
 */
static void
cont_child_put(struct daos_lru_cache *cache, struct ds_cont_child *cont)
{
daos_lru_ref_release(cache, &cont->sc_list);
}

/*
* If create == false, then this is assumed to be a pure lookup. In this case,
* -DER_NONEXIST is returned if the ds_cont_child object does not exist.
Expand Down Expand Up @@ -774,12 +791,6 @@ cont_child_lookup(struct daos_lru_cache *cache, const uuid_t co_uuid,
return 0;
}

/*
 * Thin wrapper: passes &cont->sc_list to daos_lru_ref_release() for the
 * given cache.
 *
 * NOTE(review): presumably this drops the reference obtained through
 * cont_child_lookup() -- confirm against the daos_lru_cache API.
 */
static void
cont_child_put(struct daos_lru_cache *cache, struct ds_cont_child *cont)
{
daos_lru_ref_release(cache, &cont->sc_list);
}

static inline bool
cont_child_started(struct ds_cont_child *cont_child)
{
Expand All @@ -805,13 +816,17 @@ cont_child_stop(struct ds_cont_child *cont_child)
/* Some ds_cont_child objects are only created by ds_cont_child_lookup()
* and are never started at all.
*/
cont_child->sc_stopping = 1;

/* Stop DTX reindex by force. */
stop_dtx_reindex_ult(cont_child, true);

if (cont_child_started(cont_child)) {
D_DEBUG(DB_MD, DF_CONT"[%d]: Stopping container\n",
DP_CONT(cont_child->sc_pool->spc_uuid,
cont_child->sc_uuid),
dss_get_module_info()->dmi_tgt_id);

cont_child->sc_stopping = 1;
d_list_del_init(&cont_child->sc_link);

dtx_cont_deregister(cont_child);
Expand Down Expand Up @@ -1164,6 +1179,7 @@ cont_child_destroy_one(void *vin)
&cont);
if (rc == -DER_NONEXIST)
break;

if (rc != 0)
D_GOTO(out_pool, rc);

Expand All @@ -1187,10 +1203,6 @@ cont_child_destroy_one(void *vin)
ABT_cond_wait(cont->sc_dtx_resync_cond, cont->sc_mutex);
ABT_mutex_unlock(cont->sc_mutex);

/* Give chance to DTX reindex ULT for exit. */
if (unlikely(cont->sc_dtx_reindex))
ABT_thread_yield();

/* Make sure checksum scrubbing has stopped */
ABT_mutex_lock(cont->sc_mutex);
if (cont->sc_scrubbing) {
Expand All @@ -1199,6 +1211,12 @@ cont_child_destroy_one(void *vin)
}
ABT_mutex_unlock(cont->sc_mutex);

/* Make sure rebuild has stopped */
ABT_mutex_lock(cont->sc_mutex);
if (cont->sc_rebuilding)
ABT_cond_wait(cont->sc_rebuild_cond, cont->sc_mutex);
ABT_mutex_unlock(cont->sc_mutex);

retry_cnt++;
if (retry_cnt > 1) {
D_ERROR("container is still in-use: open %u, resync %s, reindex %s\n",
Expand Down Expand Up @@ -1300,9 +1318,20 @@ ds_cont_child_lookup(uuid_t pool_uuid, uuid_t cont_uuid,
struct ds_cont_child **ds_cont)
{
struct dsm_tls *tls = dsm_tls_get();
int rc;

rc = cont_child_lookup(tls->dt_cont_cache, cont_uuid, pool_uuid,
true /* create */, ds_cont);
if (rc != 0)
return rc;

return cont_child_lookup(tls->dt_cont_cache, cont_uuid, pool_uuid,
true /* create */, ds_cont);
if ((*ds_cont)->sc_stopping) {
cont_child_put(tls->dt_cont_cache, *ds_cont);
*ds_cont = NULL;
return -DER_SHUTDOWN;
}

return 0;
}

/**
Expand Down Expand Up @@ -1572,7 +1601,7 @@ ds_cont_local_open(uuid_t pool_uuid, uuid_t cont_hdl_uuid, uuid_t cont_uuid,
DF_UUID": %d\n", DP_UUID(cont_uuid), hdl->sch_cont->sc_open);

hdl->sch_cont->sc_open--;
dtx_cont_close(hdl->sch_cont);
dtx_cont_close(hdl->sch_cont, true);

err_cont:
if (daos_handle_is_valid(poh)) {
Expand Down Expand Up @@ -1694,7 +1723,7 @@ cont_close_hdl(uuid_t cont_hdl_uuid)
D_ASSERT(cont_child->sc_open > 0);
cont_child->sc_open--;
if (cont_child->sc_open == 0)
dtx_cont_close(cont_child);
dtx_cont_close(cont_child, false);

D_DEBUG(DB_MD, DF_CONT": closed (%d): hdl="DF_UUID"\n",
DP_CONT(cont_child->sc_pool->spc_uuid, cont_child->sc_uuid),
Expand Down
26 changes: 20 additions & 6 deletions src/dtx/dtx_common.c
Original file line number Diff line number Diff line change
Expand Up @@ -1635,6 +1635,10 @@ start_dtx_reindex_ult(struct ds_cont_child *cont)
while (cont->sc_dtx_reindex_abort)
ABT_thread_yield();

if (cont->sc_stopping)
return -DER_SHUTDOWN;

cont->sc_dtx_delay_reset = 0;
if (cont->sc_dtx_reindex)
return 0;

Expand All @@ -1652,7 +1656,7 @@ start_dtx_reindex_ult(struct ds_cont_child *cont)
}

void
stop_dtx_reindex_ult(struct ds_cont_child *cont)
stop_dtx_reindex_ult(struct ds_cont_child *cont, bool force)
{
/* DTX reindex has either finished or has not been started. */
if (!cont->sc_dtx_reindex)
Expand All @@ -1662,9 +1666,15 @@ stop_dtx_reindex_ult(struct ds_cont_child *cont)
if (dtx_cont_opened(cont))
return;

/* Do not stop DTX reindex if DTX resync is still in-progress. */
if (cont->sc_dtx_resyncing)
/*
* For the non-force case, do not stop DTX re-index if DTX resync
* is in progress. Related DTX resources will be released after
* DTX resync is done globally (via rebuild scanning).
*/
if (unlikely(cont->sc_dtx_resyncing && !force)) {
cont->sc_dtx_delay_reset = 1;
return;
}

cont->sc_dtx_reindex_abort = 1;

Expand Down Expand Up @@ -1822,7 +1832,7 @@ dtx_cont_open(struct ds_cont_child *cont)
}

void
dtx_cont_close(struct ds_cont_child *cont)
dtx_cont_close(struct ds_cont_child *cont, bool force)
{
struct dss_module_info *dmi = dss_get_module_info();
struct dtx_batched_pool_args *dbpa;
Expand All @@ -1837,16 +1847,20 @@ dtx_cont_close(struct ds_cont_child *cont)

d_list_for_each_entry(dbca, &dbpa->dbpa_cont_list, dbca_pool_link) {
if (dbca->dbca_cont == cont) {
stop_dtx_reindex_ult(cont);
stop_dtx_reindex_ult(cont, force);
d_list_del(&dbca->dbca_sys_link);
d_list_add_tail(&dbca->dbca_sys_link,
&dmi->dmi_dtx_batched_cont_close_list);
dtx_flush_on_close(dmi, dbca);

/* If nobody reopens the container during dtx_flush_on_close,
* then reset the DTX table in VOS to release related resources.
*
* For the non-force case, do not reset the DTX table if DTX resync
* is in progress, to avoid redoing DTX re-index. That will be done
* after DTX resync is done globally.
*/
if (!dtx_cont_opened(cont))
if (likely(!dtx_cont_opened(cont) && cont->sc_dtx_delay_reset == 0))
vos_dtx_cache_reset(cont->sc_hdl, false);
return;
}
Expand Down
Loading

0 comments on commit 5b9557b

Please sign in to comment.