diff --git a/src/common/dav_v2/dav_iface.c b/src/common/dav_v2/dav_iface.c
index 65f1665a1ea..63b8e86d556 100644
--- a/src/common/dav_v2/dav_iface.c
+++ b/src/common/dav_v2/dav_iface.c
@@ -470,3 +470,9 @@ dav_class_register_v2(dav_obj_t *pop, struct dav_alloc_class_desc *p)
 
 	return 0;
 }
+
+DAV_FUNC_EXPORT size_t
+dav_obj_pgsz_v2(void)
+{
+	return ZONE_MAX_SIZE;
+}
diff --git a/src/common/dav_v2/dav_v2.h b/src/common/dav_v2/dav_v2.h
index 4b88339d1f0..6147d33ba4e 100644
--- a/src/common/dav_v2/dav_v2.h
+++ b/src/common/dav_v2/dav_v2.h
@@ -313,4 +313,10 @@ dav_get_heap_mb_stats_v2(dav_obj_t *pop, uint32_t mb_id, struct dav_heap_mb_stat
 uint32_t
 dav_allot_mb_evictable_v2(dav_obj_t *pop, int flags);
 
+/*
+ * Return the page size for dav_v2.
+ */
+size_t
+dav_obj_pgsz_v2(void);
+
 #endif /* __DAOS_COMMON_DAV_V2_H */
diff --git a/src/common/mem.c b/src/common/mem.c
index a40b12d854a..ceefe467803 100644
--- a/src/common/mem.c
+++ b/src/common/mem.c
@@ -36,7 +36,8 @@ struct umem_tx_stage_item {
 
 #ifdef DAOS_PMEM_BUILD
 
-static int daos_md_backend = DAOS_MD_PMEM;
+static int  daos_md_backend      = DAOS_MD_PMEM;
+static bool daos_disable_bmem_v2 = false;
 #define UMM_SLABS_CNT 16
 
 /** Initializes global settings for the pmem objects.
@@ -51,6 +52,7 @@ umempobj_settings_init(bool md_on_ssd)
 	int					rc;
 	enum pobj_arenas_assignment_type	atype;
 	unsigned int				md_mode = DAOS_MD_BMEM;
+	unsigned int                            md_disable_bmem_v2 = 0;
 
 	if (!md_on_ssd) {
 		daos_md_backend = DAOS_MD_PMEM;
@@ -81,16 +83,30 @@ umempobj_settings_init(bool md_on_ssd)
 		return -DER_INVAL;
 	};
 
+	d_getenv_uint("DAOS_MD_DISABLE_BMEM_V2", &md_disable_bmem_v2);
+	if (md_disable_bmem_v2 && (md_mode != DAOS_MD_BMEM))
+		D_INFO("Ignoring DAOS_MD_DISABLE_BMEM_V2 tunable\n");
+	else
+		daos_disable_bmem_v2 = md_disable_bmem_v2;
+
 	daos_md_backend = md_mode;
 	return 0;
 }
 
-int umempobj_get_backend_type(void)
+int
+umempobj_get_backend_type(void)
 {
 	return daos_md_backend;
 }
 
-int umempobj_backend_type2class_id(int backend)
+bool
+umempobj_allow_md_bmem_v2(void)
+{
+	return !daos_disable_bmem_v2;
+}
+
+int
+umempobj_backend_type2class_id(int backend)
 {
 	switch (backend) {
 	case DAOS_MD_PMEM:
@@ -108,6 +124,15 @@ int umempobj_backend_type2class_id(int backend)
 	}
 }
 
+size_t
+umempobj_pgsz(int backend)
+{
+	if (backend == DAOS_MD_BMEM_V2)
+		return dav_obj_pgsz_v2();
+	else
+		return (1UL << 12); /* 4 KiB */
+}
+
 /** Define common slabs.  We can refine this for 2.4 pools but that is for next patch */
 static const int        slab_map[] = {
     0,          /* 32 bytes */
diff --git a/src/include/daos/mem.h b/src/include/daos/mem.h
index 2b03ac82e02..8cbc9b77906 100644
--- a/src/include/daos/mem.h
+++ b/src/include/daos/mem.h
@@ -30,6 +30,10 @@ int umempobj_settings_init(bool md_on_ssd);
 /* convert backend type to umem class id */
 int umempobj_backend_type2class_id(int backend);
 
+/* get page size for the backend */
+size_t
+umempobj_pgsz(int backend);
+
 /* umem persistent object property flags */
 #define UMEMPOBJ_ENABLE_STATS	0x1
 
@@ -46,6 +50,10 @@ enum {
 /* return umem backend type */
 int umempobj_get_backend_type(void);
 
+/* returns whether bmem_v2 pools are allowed */
+bool
+umempobj_allow_md_bmem_v2(void);
+
 #endif
 
 struct umem_wal_tx;
diff --git a/src/include/daos_srv/vos.h b/src/include/daos_srv/vos.h
index 3486c850574..3d94065b64a 100644
--- a/src/include/daos_srv/vos.h
+++ b/src/include/daos_srv/vos.h
@@ -519,6 +519,16 @@ int
 vos_aggregate(daos_handle_t coh, daos_epoch_range_t *epr,
 	      int (*yield_func)(void *arg), void *yield_arg, uint32_t flags);
 
+/**
+ * Round up the scm and meta sizes to match the backend requirement.
+ * \param[in/out] scm_sz	SCM size that needs to be aligned up
+ * \param[in/out] meta_sz	META size that needs to be aligned up
+ *
+ * \return 0 on success, error otherwise.
+ */
+int
+vos_pool_roundup_size(daos_size_t *scm_sz, daos_size_t *meta_sz);
+
 /**
  * Discards changes in all epochs with the epoch range \a epr
  *
diff --git a/src/mgmt/srv_drpc.c b/src/mgmt/srv_drpc.c
index 9a2f013d32c..cd5fcdcb999 100644
--- a/src/mgmt/srv_drpc.c
+++ b/src/mgmt/srv_drpc.c
@@ -503,7 +503,7 @@ ds_mgmt_drpc_pool_create(Drpc__Call *drpc_req, Drpc__Response *drpc_resp)
 
 	scm_size = req->tier_bytes[DAOS_MEDIA_SCM];
 	if (req->mem_ratio)
-		scm_size *= req->mem_ratio;
+		scm_size *= (double)req->mem_ratio;
 
 	rc = ds_mgmt_create_pool(pool_uuid, req->sys, "pmem", targets, scm_size,
 				 req->tier_bytes[DAOS_MEDIA_NVME], prop, &svc, req->n_fault_domains,
diff --git a/src/mgmt/srv_target.c b/src/mgmt/srv_target.c
index 1cec76c769f..219c5564d60 100644
--- a/src/mgmt/srv_target.c
+++ b/src/mgmt/srv_target.c
@@ -1048,15 +1048,14 @@ tgt_create_preallocate(void *arg)
 		 * 16MB minimum per pmemobj file (SCM partition)
 		 */
 		D_ASSERT(dss_tgt_nr > 0);
+		D_ASSERT((tca->tca_scm_size / dss_tgt_nr) >= (1 << 24));
 		if (!bio_nvme_configured(SMD_DEV_TYPE_META)) {
-			rc = tgt_vos_preallocate_sequential(tca->tca_ptrec->dptr_uuid,
-							    max(tca->tca_scm_size / dss_tgt_nr,
-								1 << 24), dss_tgt_nr);
+			rc = tgt_vos_preallocate_sequential(
+			    tca->tca_ptrec->dptr_uuid, tca->tca_scm_size / dss_tgt_nr, dss_tgt_nr);
 		} else {
-			rc = tgt_vos_preallocate_parallel(tca->tca_ptrec->dptr_uuid,
-							  max(tca->tca_scm_size / dss_tgt_nr,
-							      1 << 24), dss_tgt_nr,
-							  &tca->tca_ptrec->cancel_create);
+			rc = tgt_vos_preallocate_parallel(
+			    tca->tca_ptrec->dptr_uuid, tca->tca_scm_size / dss_tgt_nr, dss_tgt_nr,
+			    &tca->tca_ptrec->cancel_create);
 		}
 		if (rc)
 			goto out;
@@ -1083,6 +1082,8 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req)
 	pthread_t			 thread;
 	bool				 canceled_thread = false;
 	int				 rc = 0;
+	daos_size_t                      tgt_scm_sz;
+	daos_size_t                      tgt_meta_sz;
 
 	/** incoming request buffer */
 	tc_in = crt_req_get(tc_req);
@@ -1119,6 +1120,17 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req)
 	D_DEBUG(DB_MGMT, DF_UUID": record inserted to dpt_creates_ht\n",
 		DP_UUID(tca.tca_ptrec->dptr_uuid));
 
+	tgt_scm_sz  = tc_in->tc_scm_size / dss_tgt_nr;
+	tgt_meta_sz = tc_in->tc_meta_size / dss_tgt_nr;
+	rc          = vos_pool_roundup_size(&tgt_scm_sz, &tgt_meta_sz);
+	if (rc) {
+		D_ERROR(DF_UUID": failed to roundup the vos size: "DF_RC"\n",
+			DP_UUID(tc_in->tc_pool_uuid), DP_RC(rc));
+		goto out_rec;
+	}
+	tc_in->tc_scm_size  = tgt_scm_sz * dss_tgt_nr;
+	tc_in->tc_meta_size = tgt_meta_sz * dss_tgt_nr;
+
 	tca.tca_scm_size  = tc_in->tc_scm_size;
 	tca.tca_nvme_size = tc_in->tc_nvme_size;
 	tca.tca_dx = dss_current_xstream();
diff --git a/src/vos/vos_pool.c b/src/vos/vos_pool.c
index e206ecbb479..ac84f7db5d2 100644
--- a/src/vos/vos_pool.c
+++ b/src/vos/vos_pool.c
@@ -753,6 +753,65 @@ init_umem_store(struct umem_store *store, struct bio_meta_context *mc)
 	store->store_type = DAOS_MD_BMEM;
 }
 
+/**
+ * Select the umem store type for a pool from its SCM and META sizes.
+ *
+ * Starts from umempobj_get_backend_type(). When \a scm_sz is smaller than
+ * \a meta_sz, DAOS_MD_BMEM is promoted to DAOS_MD_BMEM_V2 if bmem_v2 is allowed.
+ *
+ * \param[in] scm_sz	SCM size, asserted to be non-zero
+ * \param[in] meta_sz	META size, asserted to be non-zero
+ *
+ * \return		backend type on success, -DER_INVAL if scm_sz > meta_sz or
+ *			if scm_sz < meta_sz and the backend cannot be bmem_v2.
+ */
+static int
+vos_pool_store_type(daos_size_t scm_sz, daos_size_t meta_sz)
+{
+	int backend;
+
+	backend = umempobj_get_backend_type();
+	D_ASSERT((meta_sz != 0) && (scm_sz != 0));
+
+	if (scm_sz > meta_sz) {
+		D_ERROR("memsize %lu is greater than metasize %lu\n", scm_sz, meta_sz);
+		return -DER_INVAL;
+	}
+
+	if (scm_sz < meta_sz) {
+		if ((backend == DAOS_MD_BMEM) && umempobj_allow_md_bmem_v2())
+			backend = DAOS_MD_BMEM_V2;
+		else if (backend != DAOS_MD_BMEM_V2) {
+			D_ERROR("scm_sz %lu is less than meta_sz %lu\n", scm_sz, meta_sz);
+			return -DER_INVAL;
+		}
+	}
+
+	return backend;
+}
+
+int
+vos_pool_roundup_size(daos_size_t *scm_sz, daos_size_t *meta_sz)
+{
+	size_t alignsz;
+	int    rc;
+
+	/* Callers derive the size from an incoming request: reject zero, don't assert */
+	if (*scm_sz == 0)
+		return -DER_INVAL;
+	rc = vos_pool_store_type(*scm_sz, *meta_sz ? *meta_sz : *scm_sz);
+	if (rc < 0)
+		return rc;
+
+	/* Round up to the backend page size, with a floor of 16MiB (1 << 24) */
+	alignsz = umempobj_pgsz(rc);
+	*scm_sz = max(D_ALIGNUP(*scm_sz, alignsz), 1 << 24);
+	if (*meta_sz)
+		*meta_sz = max(D_ALIGNUP(*meta_sz, alignsz), 1 << 24);
+
+	return 0;
+}
+
 static int
 vos_pmemobj_create(const char *path, uuid_t pool_id, const char *layout,
 		   size_t scm_sz, size_t nvme_sz, size_t wal_sz, size_t meta_sz,
@@ -794,9 +853,13 @@ vos_pmemobj_create(const char *path, uuid_t pool_id, const char *layout,
 	if (!meta_sz)
 		meta_sz = scm_sz_actual;
 
-	store.store_type = umempobj_get_backend_type();
-	if (store.store_type == DAOS_MD_BMEM && meta_sz > scm_sz_actual)
-		store.store_type = DAOS_MD_BMEM_V2;
+	rc = vos_pool_store_type(scm_sz_actual, meta_sz);
+	if (rc < 0) {
+		D_ERROR("Failed to determine the store type for xs:%p pool:"DF_UUID". "DF_RC"\n",
+			xs_ctxt, DP_UUID(pool_id), DP_RC(rc));
+		return rc;
+	}
+	store.store_type = rc;
 
 	D_DEBUG(DB_MGMT, "Create BIO meta context for xs:%p pool:"DF_UUID" "
 		"scm_sz: %zu meta_sz: %zu, nvme_sz: %zu wal_sz:%zu backend:%d\n",