diff --git a/src/include/daos/object.h b/src/include/daos/object.h index 71cc6c5ca8c..e23196e3e70 100644 --- a/src/include/daos/object.h +++ b/src/include/daos/object.h @@ -240,8 +240,9 @@ int daos_obj_set_oid_by_class(daos_obj_id_t *oid, enum daos_otype_t type, unsigned int daos_oclass_grp_size(struct daos_oclass_attr *oc_attr); unsigned int daos_oclass_grp_nr(struct daos_oclass_attr *oc_attr, struct daos_obj_md *md); -int daos_oclass_fit_max(daos_oclass_id_t oc_id, int domain_nr, int target_nr, - enum daos_obj_redun *ord, uint32_t *nr); +int +daos_oclass_fit_max(daos_oclass_id_t oc_id, int domain_nr, int target_nr, enum daos_obj_redun *ord, + uint32_t *nr, uint32_t rf_factor); bool daos_oclass_is_valid(daos_oclass_id_t oc_id); int daos_obj_get_oclass(daos_handle_t coh, enum daos_otype_t type, daos_oclass_hints_t hints, uint32_t args, daos_oclass_id_t *cid); diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index c36306ba8c8..4e8b9634c94 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -7174,6 +7174,7 @@ daos_obj_generate_oid(daos_handle_t coh, daos_obj_id_t *oid, uint32_t nr_grp; struct cont_props props; int rc; + uint32_t rf; struct dc_cont *dc; if (!daos_otype_t_is_valid(type)) @@ -7199,18 +7200,17 @@ daos_obj_generate_oid(daos_handle_t coh, daos_obj_id_t *oid, rc = pl_map_query(pool->dp_pool, &attr); D_ASSERT(rc == 0); dc_pool_put(pool); + rf = dc->dc_props.dcp_redun_fac; - D_DEBUG(DB_TRACE, "available domain=%d, targets=%d\n", - attr.pa_domain_nr, attr.pa_target_nr); + D_DEBUG(DB_TRACE, "available domain=%d, targets=%d rf:%u\n", attr.pa_domain_nr, + attr.pa_target_nr, rf); if (cid == OC_UNKNOWN) { - uint32_t rf; - - rf = dc->dc_props.dcp_redun_fac; rc = dc_set_oclass(rf, attr.pa_domain_nr, attr.pa_target_nr, type, hints, &ord, &nr_grp); } else { - rc = daos_oclass_fit_max(cid, attr.pa_domain_nr, attr.pa_target_nr, &ord, &nr_grp); + rc = daos_oclass_fit_max(cid, attr.pa_domain_nr, attr.pa_target_nr, &ord, &nr_grp, + rf); } 
dc_cont_put(dc); @@ -7263,8 +7263,8 @@ daos_obj_generate_oid_by_rf(daos_handle_t poh, uint64_t rf_factor, attr.pa_target_nr, type, hints, &ord, &nr_grp); else - rc = daos_oclass_fit_max(cid, attr.pa_domain_nr, - attr.pa_target_nr, &ord, &nr_grp); + rc = daos_oclass_fit_max(cid, attr.pa_domain_nr, attr.pa_target_nr, &ord, &nr_grp, + rf_factor); if (rc) return rc; diff --git a/src/object/obj_class.c b/src/object/obj_class.c index cbb2931dfe1..f533ccd7417 100644 --- a/src/object/obj_class.c +++ b/src/object/obj_class.c @@ -1,5 +1,5 @@ /** - * (C) Copyright 2016-2022 Intel Corporation. + * (C) Copyright 2016-2023 Intel Corporation. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -234,9 +234,24 @@ daos_oclass_grp_nr(struct daos_oclass_attr *oc_attr, struct daos_obj_md *md) return oc_attr->ca_grp_nr; } +/** + * To honor the RF setting when targets fail, reserve up to RF groups + * so that enough replacement targets remain available for rebuild. + * Otherwise rebuild may have to place multiple shards in the same + * domain, which would break the RF setting. + * + * The reserved targets are capped at 30% of the total number of + * targets. 
+ */ +static uint32_t +reserve_grp_by_rf(uint32_t target_nr, uint32_t grp_size, uint32_t rf) +{ + return min(((target_nr * 3) / 10) / grp_size, rf); +} + int -daos_oclass_fit_max(daos_oclass_id_t oc_id, int domain_nr, int target_nr, - enum daos_obj_redun *ord, uint32_t *nr) +daos_oclass_fit_max(daos_oclass_id_t oc_id, int domain_nr, int target_nr, enum daos_obj_redun *ord, + uint32_t *nr, uint32_t rf_factor) { struct daos_obj_class *oc; struct daos_oclass_attr ca; @@ -270,9 +285,14 @@ daos_oclass_fit_max(daos_oclass_id_t oc_id, int domain_nr, int target_nr, } grp_size = daos_oclass_grp_size(&ca); - if (ca.ca_grp_nr == DAOS_OBJ_GRP_MAX) + if (ca.ca_grp_nr == DAOS_OBJ_GRP_MAX) { + uint32_t reserve_grp = reserve_grp_by_rf(target_nr, grp_size, rf_factor); + ca.ca_grp_nr = max(1, (target_nr / grp_size)); + if (ca.ca_grp_nr > reserve_grp) + ca.ca_grp_nr -= reserve_grp; + } if (grp_size > domain_nr) { D_ERROR("grp size (%u) (%u) is larger than domain nr (%u)\n", grp_size, DAOS_OBJ_REPL_MAX, domain_nr); @@ -823,8 +843,13 @@ dc_set_oclass(uint32_t rf, int domain_nr, int target_nr, enum daos_otype_t otype } if (grp_nr == DAOS_OBJ_GRP_MAX || grp_nr * grp_size > target_nr) { + uint32_t max_grp = target_nr / grp_size; + uint32_t reserve_grp = reserve_grp_by_rf(target_nr, grp_size, rf); + /* search for the highest scalability in the allowed range */ - *nr = max(1, (target_nr / grp_size)); + if (max_grp > reserve_grp) + max_grp = max_grp - reserve_grp; + *nr = max(1, max_grp); } else { *nr = grp_nr; }