Skip to content

Commit

Permalink
Merge branch 'master' into liw/dsc_pool_svc
Browse files Browse the repository at this point in the history
Features: pool
Required-githooks: true
  • Loading branch information
liw committed May 17, 2024
2 parents c702299 + a2d0e91 commit 30ce6cb
Show file tree
Hide file tree
Showing 68 changed files with 1,035 additions and 670 deletions.
21 changes: 18 additions & 3 deletions .github/workflows/linting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ jobs:
- name: Checkout code
uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
- name: Check DAOS ftest tags.
run: \[ ! -x src/tests/ftest/tags.py \] || ./src/tests/ftest/tags.py lint
run: \[ ! -x src/tests/ftest/tags.py \] || ./src/tests/ftest/tags.py lint --verbose

flake8-lint:
runs-on: ubuntu-22.04
Expand Down Expand Up @@ -105,7 +105,7 @@ jobs:
max-line-length: '100'
args: '--filename */SConscript, SConstruct'

Doxygen:
doxygen:
name: Doxygen
runs-on: ubuntu-22.04
steps:
Expand Down Expand Up @@ -146,7 +146,7 @@ jobs:
- name: Run pylint check.
run: ./utils/cq/daos_pylint.py --git --output-format github

Codespell:
codespell:
name: Codespell
runs-on: ubuntu-22.04
steps:
Expand All @@ -160,3 +160,18 @@ jobs:
skip: ./src/control/vendor,./src/control/go.sum,./.git
ignore_words_file: ci/codespell.ignores
builtin: clear,rare,informal,names,en-GB_to_en-US

# Aggregate job: reduces the results of every lint job listed in `needs` to one pass/fail status.
linting-summary:
name: Linting Summary
runs-on: ubuntu-22.04
needs: [isort, shell-check, log-check, ftest-tags, flake8-lint, doxygen, pylint, codespell]
# Replaces the implicit success() gate, so this job is still evaluated when a needed job did not
# succeed; it is skipped only when the run is cancelled (see GitHub Actions status check functions).
if: (!cancelled())
steps:
# The results of all needed jobs are joined into one string and every "success" is stripped from
# it. An empty remainder means every job succeeded; anything left over makes the step exit 1.
- name: Check if any job failed
run: |
if [[ -z "$(echo "${{ join(needs.*.result, '') }}" | sed -e 's/success//g')" ]]; then
echo "All jobs succeeded"
else
echo "One or more jobs did not succeed"
exit 1
fi
2 changes: 1 addition & 1 deletion ci/jira_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
# valid. We've never checked/enforced these before so there have been a lot of values used in the
# past.
VALID_COMPONENTS = ('agent', 'build', 'ci', 'csum', 'doc', 'gha', 'il', 'md', 'mercury',
'packaging', 'pil4dfs', 'swim', 'test', 'tools')
'packaging', 'pil4dfs', 'swim', 'test', 'tools', 'ddb')

# Expected ticket prefix.
VALID_TICKET_PREFIX = ('DAOS', 'CORCI', 'SRE')
Expand Down
58 changes: 22 additions & 36 deletions src/chk/chk_common.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2022-2023 Intel Corporation.
* (C) Copyright 2022-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -391,39 +391,30 @@ chk_pool_remove_nowait(struct chk_pool_rec *cpr)
D_WARN("Failed to delete pool record: "DF_RC"\n", DP_RC(rc));
}

void
chk_pool_start_svc(struct chk_pool_rec *cpr, int *ret)
int
chk_pool_restart_svc(struct chk_pool_rec *cpr)
{
int rc = 0;

ABT_mutex_lock(cpr->cpr_mutex);
/* Stop the pool, then restart it with full pool service. */

if (!cpr->cpr_started) {
rc = ds_pool_start_with_svc(cpr->cpr_uuid);
if (rc == 0)
cpr->cpr_started = 1;
else
D_WARN("Cannot start (1) the pool for "DF_UUIDF" after check: "DF_RC"\n",
DP_UUID(cpr->cpr_uuid), DP_RC(rc));
}
ABT_mutex_lock(cpr->cpr_mutex);
if (!cpr->cpr_start_post) {
if (cpr->cpr_started)
chk_pool_shutdown(cpr, true);

if (cpr->cpr_started && !cpr->cpr_start_post) {
rc = ds_pool_chk_post(cpr->cpr_uuid);
rc = ds_pool_start_after_check(cpr->cpr_uuid);
if (rc != 0) {
D_WARN("Cannot post handle (1) pool start for "
DF_UUIDF" after check: "DF_RC"\n",
D_WARN("Cannot start full PS for "DF_UUIDF" after CR check: "DF_RC"\n",
DP_UUID(cpr->cpr_uuid), DP_RC(rc));
/* Failed to post handle pool start, have to stop it. */
chk_pool_shutdown(cpr, true);
} else {
cpr->cpr_started = 1;
cpr->cpr_start_post = 1;
}
}

ABT_mutex_unlock(cpr->cpr_mutex);

if (ret != NULL)
*ret = rc;
return rc;
}

static void
Expand Down Expand Up @@ -460,7 +451,7 @@ chk_pool_wait(struct chk_pool_rec *cpr)
}

void
chk_pool_stop_one(struct chk_instance *ins, uuid_t uuid, int status, uint32_t phase, int *ret)
chk_pool_stop_one(struct chk_instance *ins, uuid_t uuid, uint32_t status, uint32_t phase, int *ret)
{
struct chk_bookmark *cbk;
struct chk_pool_rec *cpr;
Expand Down Expand Up @@ -805,10 +796,6 @@ chk_pool_handle_notify(struct chk_instance *ins, struct chk_iv *iv)

if (iv->ci_pool_status == CHK__CHECK_POOL_STATUS__CPS_CHECKED) {
cpr->cpr_done = 1;
if (iv->ci_pool_destroyed) {
cpr->cpr_destroyed = 1;
cpr->cpr_not_export_ps = 1;
}
} else if (iv->ci_pool_status == CHK__CHECK_POOL_STATUS__CPS_FAILED ||
iv->ci_pool_status == CHK__CHECK_POOL_STATUS__CPS_IMPLICATED) {
cpr->cpr_skip = 1;
Expand All @@ -818,24 +805,23 @@ chk_pool_handle_notify(struct chk_instance *ins, struct chk_iv *iv)
D_GOTO(out, rc = -DER_NOTAPPLICABLE);
}

if (!ins->ci_is_leader && !cpr->cpr_destroyed && cpr->cpr_done) {
if (iv->ci_phase != cbk->cb_phase || iv->ci_pool_status != cbk->cb_pool_status) {
cbk->cb_phase = iv->ci_phase;
cbk->cb_pool_status = iv->ci_pool_status;
uuid_unparse_lower(cpr->cpr_uuid, uuid_str);
rc = chk_bk_update_pool(cbk, uuid_str);
}

if (rc == 0 && !ins->ci_is_leader && cpr->cpr_done) {
if (iv->ci_pool_status == CHK__CHECK_POOL_STATUS__CPS_CHECKED &&
!cpr->cpr_not_export_ps) {
chk_pool_start_svc(cpr, NULL);
rc = chk_pool_restart_svc(cpr);
} else if (ins->ci_sched_running && !ins->ci_sched_exiting) {
chk_pool_get(cpr);
d_list_add_tail(&cpr->cpr_shutdown_link, &ins->ci_pool_shutdown_list);
}
}

if (iv->ci_phase != cbk->cb_phase || iv->ci_pool_status != cbk->cb_pool_status ||
cpr->cpr_destroyed) {
cbk->cb_phase = iv->ci_phase;
cbk->cb_pool_status = iv->ci_pool_status;
uuid_unparse_lower(cpr->cpr_uuid, uuid_str);
rc = chk_bk_update_pool(cbk, uuid_str);
}

out:
if (cpr != NULL)
chk_pool_put(cpr);
Expand Down
69 changes: 57 additions & 12 deletions src/chk/chk_engine.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2022-2023 Intel Corporation.
* (C) Copyright 2022-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -827,7 +827,13 @@ chk_engine_find_dangling_pm(struct chk_pool_rec *cpr, struct pool_map *map)
t_comp->co_flags |= PO_COMPF_CHK_DONE;
}

/* dangling parent domain. */
/*
* dangling parent domain.
*
* NOTE: When we arrive here, all the pool membership information has already been
* scanned via chk_engine_pool_mbs_one(). No service for this pool is started
* on the rank (dangling parent domain). So it is safe to "DOWN{OUT}" it.
*/
rc = chk_engine_pm_dangling(cpr, map, r_comp,
down ? PO_COMP_ST_DOWN : PO_COMP_ST_DOWNOUT);
if (rc != 0)
Expand Down Expand Up @@ -1672,6 +1678,7 @@ chk_engine_pool_ult(void *args)
int rc1 = 0;
int rc2 = 0;
bool update = true;
bool svc_ref = true;

D_ASSERT(svc != NULL);
D_ASSERT(cpr != NULL);
Expand Down Expand Up @@ -1821,27 +1828,42 @@ chk_engine_pool_ult(void *args)
}
chk_engine_pool_notify(cpr);
cbk->cb_time.ct_stop_time = time(NULL);
if (likely(update))
if (likely(update)) {
rc1 = chk_bk_update_pool(cbk, uuid_str);
if (unlikely(rc1 != 0))
goto log;
}

/*
* The pool may have been marked as non-connectable before corruption, re-enable
* it to allow new connections.
*
* NOTE: After chk_pool_restart_svc(), the current rank may no longer be the PS leader.
* To simplify the logic, we set this flag on the current PS leader before
* chk_pool_restart_svc(). If some client tries to connect to the pool after
* that (mark connectable) but before chk_pool_restart_svc(), it will get
* -DER_BUSY temporarily until the rank is ready for full pool service.
*/
if (cbk->cb_pool_status == CHK__CHECK_POOL_STATUS__CPS_CHECKED &&
!cpr->cpr_not_export_ps) {
chk_pool_start_svc(cpr, &rc2);
if (cpr->cpr_started && cpr->cpr_start_post)
/*
* The pool may has been marked as non-connectable before
* corruption, re-enable it to allow new connection.
*/
rc2 = ds_pool_mark_connectable(svc);
rc1 = ds_pool_mark_connectable(svc);
if (rc1 == 0) {
svc_ref = false;
ds_pool_svc_put_leader(svc);
rc2 = chk_pool_restart_svc(cpr);
}
}
}

log:
D_CDEBUG(rc != 0 || rc1 != 0 || rc2 != 0, DLOG_ERR, DLOG_INFO,
DF_ENGINE" on rank %u exit pool ULT for "DF_UUIDF" with %s stop: %d/%d/%d\n",
DP_ENGINE(ins), dss_self_rank(), DP_UUID(cpr->cpr_uuid),
cpr->cpr_stop ? "external" : "self", rc, rc1, rc2);

ds_pool_svc_put_leader(svc);
if (svc_ref)
ds_pool_svc_put_leader(svc);

cpr->cpr_done = 1;
if (ins->ci_sched_running && !ins->ci_sched_exiting &&
(cbk->cb_pool_status != CHK__CHECK_POOL_STATUS__CPS_CHECKED || cpr->cpr_not_export_ps))
Expand Down Expand Up @@ -2928,7 +2950,7 @@ chk_engine_pool_start(uint64_t gen, uuid_t uuid, uint32_t phase, uint32_t flags)
cbk = &cpr->cpr_bk;
chk_pool_get(cpr);

rc = ds_pool_start(uuid);
rc = ds_pool_start(uuid, false);
if (rc != 0)
D_GOTO(put, rc = (rc == -DER_NONEXIST ? 1 : rc));

Expand Down Expand Up @@ -3514,3 +3536,26 @@ chk_engine_fini(void)
{
chk_ins_fini(&chk_engine);
}

/*
 * Stop the check of a single pool on the local check engine instance.
 *
 * \param[in] pool_uuid	UUID of the pool to stop.
 * \param[in] destroy	Selects the status/phase pair handed to chk_pool_stop_one():
 *			true  -> CPS_CHECKED with CSP_DONE,
 *			false -> CPS_PAUSED with CHK_INVAL_PHASE.
 *
 * \return		The value chk_pool_stop_one() stores through its result
 *			pointer; the local result starts at zero.
 */
int
chk_engine_pool_stop(uuid_t pool_uuid, bool destroy)
{
	const uint32_t	stop_status = destroy ? CHK__CHECK_POOL_STATUS__CPS_CHECKED
					      : CHK__CHECK_POOL_STATUS__CPS_PAUSED;
	const uint32_t	stop_phase  = destroy ? CHK__CHECK_SCAN_PHASE__CSP_DONE
					      : CHK_INVAL_PHASE;
	const char	*mode = destroy ? "destroy" : "non-destroy";
	int		 result = 0;

	chk_pool_stop_one(chk_engine, pool_uuid, stop_status, stop_phase, &result);

	D_INFO(DF_ENGINE" stop pool "DF_UUIDF" with %s: "DF_RC"\n", DP_ENGINE(chk_engine),
	       DP_UUID(pool_uuid), mode, DP_RC(result));

	return result;
}
10 changes: 5 additions & 5 deletions src/chk/chk_internal.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2022-2023 Intel Corporation.
* (C) Copyright 2022-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -568,7 +568,6 @@ struct chk_iv {
uint32_t ci_ins_status;
uint32_t ci_pool_status;
uint32_t ci_to_leader:1, /* To check leader. */
ci_pool_destroyed:1, /* Pool has been destroyed. */
ci_from_psl:1; /* From pool service leader. */
};

Expand Down Expand Up @@ -696,9 +695,10 @@ void chk_pools_dump(d_list_t *head, int pool_nr, uuid_t pools[]);

void chk_pool_remove_nowait(struct chk_pool_rec *cpr);

void chk_pool_start_svc(struct chk_pool_rec *cpr, int *ret);
int chk_pool_restart_svc(struct chk_pool_rec *cpr);

void chk_pool_stop_one(struct chk_instance *ins, uuid_t uuid, int status, uint32_t phase, int *ret);
void chk_pool_stop_one(struct chk_instance *ins, uuid_t uuid, uint32_t status, uint32_t phase,
int *ret);

void chk_pool_stop_all(struct chk_instance *ins, uint32_t status, int *ret);

Expand Down Expand Up @@ -834,7 +834,7 @@ int chk_pool_start_remote(d_rank_list_t *rank_list, uint64_t gen, uuid_t uuid, u

int chk_pool_mbs_remote(d_rank_t rank, uint32_t phase, uint64_t gen, uuid_t uuid, char *label,
uint64_t seq, uint32_t flags, uint32_t mbs_nr,
struct chk_pool_mbs *mbs_array, struct rsvc_hint *hint);
struct chk_pool_mbs *mbs_array, int *svc_rc, struct rsvc_hint *svc_hint);

int chk_report_remote(d_rank_t leader, uint64_t gen, uint32_t cla, uint32_t act, int result,
d_rank_t rank, uint32_t target, uuid_t *pool, char *pool_label,
Expand Down
6 changes: 3 additions & 3 deletions src/chk/chk_iv.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2022-2023 Intel Corporation.
* (C) Copyright 2022-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -213,10 +213,10 @@ chk_iv_update(void *ns, struct chk_iv *iv, uint32_t shortcut, uint32_t sync_mode

D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO,
"CHK iv "DF_X64"/"DF_X64" on rank %u, phase %u, ins_status %u, "
"pool_status %u, to_leader %s, from_psl %s, destroyed %s: rc = %d\n",
"pool_status %u, to_leader %s, from_psl %s: rc = %d\n",
iv->ci_gen, iv->ci_seq, iv->ci_rank, iv->ci_phase, iv->ci_ins_status,
iv->ci_pool_status, iv->ci_to_leader ? "yes" : "no",
iv->ci_from_psl ? "yes" : "no", iv->ci_pool_destroyed ? "yes" : "no", rc);
iv->ci_from_psl ? "yes" : "no", rc);

return rc;
}
Expand Down
14 changes: 7 additions & 7 deletions src/chk/chk_leader.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2022-2023 Intel Corporation.
* (C) Copyright 2022-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -315,8 +315,6 @@ chk_leader_post_repair(struct chk_instance *ins, struct chk_pool_rec *cpr,
iv.ci_ins_status = ins->ci_bk.cb_ins_status;
iv.ci_phase = cbk->cb_phase;
iv.ci_pool_status = cbk->cb_pool_status;
if (cpr->cpr_destroyed)
iv.ci_pool_destroyed = 1;

/* Synchronously notify the engines that check on the pool got failure. */
rc = chk_iv_update(ins->ci_iv_ns, &iv, CRT_IV_SHORTCUT_NONE,
Expand Down Expand Up @@ -945,7 +943,8 @@ chk_leader_orphan_pool(struct chk_pool_rec *cpr)
* to fix related inconsistency), then notify check engines to remove related
* pool record and bookmark.
*/
chk_leader_post_repair(ins, cpr, &result, rc <= 0, cpr->cpr_skip ? true : false);
chk_leader_post_repair(ins, cpr, &result, rc <= 0,
cpr->cpr_skip && !cpr->cpr_destroyed ? true : false);

return result;
}
Expand Down Expand Up @@ -1867,13 +1866,14 @@ chk_leader_pool_mbs_one(struct chk_pool_rec *cpr)
{
struct rsvc_client client = { 0 };
crt_endpoint_t ep = { 0 };
struct rsvc_hint hint = { 0 };
struct rsvc_hint svc_hint = { 0 };
struct chk_instance *ins = cpr->cpr_ins;
struct chk_bookmark *cbk = &ins->ci_bk;
d_rank_list_t *ps_ranks = NULL;
struct chk_pool_shard *cps;
struct ds_pool_clue *clue;
uint32_t interval;
int svc_rc = 0;
int rc = 0;
int rc1;
int i = 0;
Expand Down Expand Up @@ -1925,9 +1925,9 @@ chk_leader_pool_mbs_one(struct chk_pool_rec *cpr)
rc = chk_pool_mbs_remote(ep.ep_rank, CHK__CHECK_SCAN_PHASE__CSP_POOL_MBS, cbk->cb_gen,
cpr->cpr_uuid, cpr->cpr_label, cpr->cpr_label_seq,
cpr->cpr_delay_label ? CMF_REPAIR_LABEL : 0,
cpr->cpr_shard_nr, cpr->cpr_mbs, &hint);
cpr->cpr_shard_nr, cpr->cpr_mbs, &svc_rc, &svc_hint);

rc1 = rsvc_client_complete_rpc(&client, &ep, rc, rc, &hint);
rc1 = rsvc_client_complete_rpc(&client, &ep, rc, svc_rc, &svc_hint);
if (rc1 == RSVC_CLIENT_RECHOOSE ||
(rc1 == RSVC_CLIENT_PROCEED && daos_rpc_retryable_rc(rc))) {
dss_sleep(interval);
Expand Down
Loading

0 comments on commit 30ce6cb

Please sign in to comment.