From bb8a0c2d4f2ebd5d5def174bb478be4dfb0968c0 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Sat, 9 Nov 2024 00:38:57 +0800 Subject: [PATCH] DAOS-16170 cart: do not release completed RPC reference repeatedly - b26 For collective RPC, when handle failure cases during crt_req_send(), its reference may has been released via crt_rpc_complete_and_unlock() that is triggered by crt_corpc_complete(). Under such case, we should check whether the RPC is completed or not before calling RPC_DECREF() to avoid releasing the RPC reference repeatedly. The patch also initializes some local variable for CHK RPC to avoid accessing invalid DRAM when handle failed collective CHK RPC. Some enhancement for CR test logic. Test-tag: test_daos_cat_recov_core Allow-unstable-test: true Signed-off-by: Fan Yong --- src/cart/crt_rpc.c | 2 +- src/chk/chk_rpc.c | 12 +++--- src/tests/suite/daos_cr.c | 86 +++++++++++++++++++++++++++++++++++---- 3 files changed, 85 insertions(+), 15 deletions(-) diff --git a/src/cart/crt_rpc.c b/src/cart/crt_rpc.c index ead03d1bf29..c1011bfbd3a 100644 --- a/src/cart/crt_rpc.c +++ b/src/cart/crt_rpc.c @@ -1537,7 +1537,7 @@ crt_req_send(crt_rpc_t *req, crt_cb_t complete_cb, void *arg) /* failure already reported through complete cb */ if (complete_cb != NULL) rc = 0; - } else { + } else if (!crt_rpc_completed(rpc_priv)) { RPC_DECREF(rpc_priv); } } diff --git a/src/chk/chk_rpc.c b/src/chk/chk_rpc.c index 84e6ae33857..e340d482cec 100644 --- a/src/chk/chk_rpc.c +++ b/src/chk/chk_rpc.c @@ -694,7 +694,7 @@ chk_query_remote(d_rank_list_t *rank_list, uint64_t gen, int pool_nr, uuid_t poo int chk_mark_remote(d_rank_list_t *rank_list, uint64_t gen, d_rank_t rank, uint32_t version) { - crt_rpc_t *req; + crt_rpc_t *req = NULL; struct chk_mark_in *cmi; struct chk_mark_out *cmo; int rc; @@ -730,7 +730,7 @@ int chk_act_remote(d_rank_list_t *rank_list, uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, d_rank_t rank, bool for_all) { - crt_rpc_t *req; + crt_rpc_t *req = NULL; struct chk_act_in *cai; struct chk_act_out *cao; int rc; @@ -825,7 +825,7 @@ int chk_pool_start_remote(d_rank_list_t *rank_list, uint64_t gen, uuid_t uuid, uint32_t phase, uint32_t flags) { - crt_rpc_t *req; + crt_rpc_t *req = NULL; struct chk_pool_start_in *cpsi; struct chk_pool_start_out *cpso; int rc; @@ -863,7 +863,7 @@ chk_pool_mbs_remote(d_rank_t rank, uint32_t phase, uint64_t gen, uuid_t uuid, ch uint64_t seq, uint32_t flags, uint32_t mbs_nr, struct chk_pool_mbs *mbs_array, int *svc_rc, struct rsvc_hint *svc_hint) { - crt_rpc_t *req; + crt_rpc_t *req = NULL; struct chk_pool_mbs_in *cpmi; struct chk_pool_mbs_out *cpmo; int rc; @@ -908,7 +908,7 @@ int chk_report_remote(d_rank_t leader, uint64_t gen, uint32_t cla, uint32_t act, char *msg, uint32_t option_nr, uint32_t *options, uint32_t detail_nr, d_sg_list_t *details, uint64_t seq) { - crt_rpc_t *req; + crt_rpc_t *req = NULL; struct chk_report_in *cri; struct chk_report_out *cro; int rc; @@ -984,7 +984,7 @@ int chk_rejoin_remote(d_rank_t leader, uint64_t gen, d_rank_t rank, uuid_t iv_uuid, uint32_t *flags, uint32_t *pool_nr, uuid_t **pools) { - crt_rpc_t *req; + crt_rpc_t *req = NULL; struct chk_rejoin_in *cri; struct chk_rejoin_out *cro; uuid_t *tmp; diff --git a/src/tests/suite/daos_cr.c b/src/tests/suite/daos_cr.c index 4c981f29645..85ade4754de 100644 --- a/src/tests/suite/daos_cr.c +++ b/src/tests/suite/daos_cr.c @@ -382,6 +382,78 @@ cr_locate_dcri(struct daos_check_info *dci, struct daos_check_report_info *base, return dcri; } +static int +cr_pool_fail_result(struct daos_check_info *dci, uuid_t uuid, uint32_t cla, uint32_t act, int *ret) +{ + struct daos_check_pool_info *dcpi; + struct daos_check_report_info *dcri; + int i; + + if (dci->dci_pool_nr != 1) { + print_message("CR pool count %d (pool " DF_UUID ") is not 1\n", + dci->dci_pool_nr, DP_UUID(uuid)); + return -DER_INVAL; + } + + dcpi = &dci->dci_pools[0]; + D_ASSERTF(uuid_compare(dcpi->dcpi_uuid, uuid) == 0, + "Unmatched pool UUID (1): " DF_UUID " vs " DF_UUID "\n", + DP_UUID(dcpi->dcpi_uuid), DP_UUID(uuid)); + + if (!cr_pool_status_failed(dcpi->dcpi_status)) { + print_message("CR pool " DF_UUID " status %s is not failed\n", + DP_UUID(uuid), dcpi->dcpi_status); + return -DER_INVAL; + } + + if (cr_pool_phase_is_done(dcpi->dcpi_phase)) { + print_message("CR pool " DF_UUID " phase should not be done\n", + DP_UUID(uuid)); + return -DER_INVAL; + } + +#ifdef CR_ACCURATE_QUERY_RESULT + if (dci->dci_report_nr != 1) { + print_message("CR (failed) pool " DF_UUID " has unexpected reports: %d\n", + DP_UUID(uuid), dci->dci_report_nr); + return -DER_INVAL; + } +#endif + + for (i = 0; i < dci->dci_report_nr; i++) { + dcri = &dci->dci_reports[i]; + if (uuid_compare(dcri->dcri_uuid, uuid) != 0) { +#ifdef CR_ACCURATE_QUERY_RESULT + print_message("Detect unrelated inconsistency report: " + DF_UUID " vs " DF_UUID "\n", + DP_UUID(dcpi->dcpi_uuid), DP_UUID(uuid)); + return -DER_INVAL; +#else + continue; +#endif + } + + if (dcri->dcri_class != cla) { + print_message("CR (failed) pool " DF_UUID " reports unexpected " + "inconsistency at %d: %u vs %u\n", + DP_UUID(uuid), i, dcri->dcri_class, cla); + return -DER_INVAL; + } + + if (dcri->dcri_act != act) { + print_message("CR (failed) pool " DF_UUID " reports unexpected " + "solution at %d: %u vs %u\n", + DP_UUID(uuid), i, dcri->dcri_act, act); + return -DER_INVAL; + } + + *ret = dcri->dcri_result; + return 0; + } + + return -DER_INVAL; +} + static void cr_dci_fini(struct daos_check_info *dci) { @@ -2953,7 +3025,7 @@ cr_engine_rejoin_fail(void **state) uint32_t class = TCC_POOL_NONEXIST_ON_MS; uint32_t action; int rank = -1; - int result; + int result = 0; int rc; int i; @@ -3006,15 +3078,13 @@ cr_engine_rejoin_fail(void **state) rc = cr_ins_verify(&dci, TCIS_COMPLETED); assert_rc_equal(rc, 0); - /* The check on the pool will fail as -DER_HG or -DER_TIMEDOUT. */ - result = -DER_HG; - rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_FAILED, 1, &class, &action, &result); - if (rc == -DER_INVAL) { - result = -DER_TIMEDOUT; - rc = cr_pool_verify(&dci, pool.pool_uuid, TCPS_FAILED, 1, &class, &action, &result); - } + rc = cr_pool_fail_result(&dci, pool.pool_uuid, class, action, &result); assert_rc_equal(rc, 0); + /* The check on the pool will fail because of -DER_HG or -DER_TIMEDOUT or -DER_OOG. */ + D_ASSERTF(result == -DER_HG || result == -DER_TIMEDOUT || result == -DER_OOG, + "Unexpected pool " DF_UUID " fail result: %d\n", DP_UUID(pool.pool_uuid), result); + /* Reint the rank, rejoin will fail but not affect the rank start. */ rc = cr_rank_reint(rank, true); assert_rc_equal(rc, 0);