daos-stack · NiuYawei · Sep 24, 2024 · Sep 18, 2024 · Sep 19, 2024 · Sep 23, 2024
@@ -36,7 +36,8 @@
 
 #ifdef DAOS_PMEM_BUILD
 
-static int daos_md_backend = DAOS_MD_PMEM;
+static int  daos_md_backend      = DAOS_MD_PMEM; /* MD backend in use, PMEM by default */
+static bool daos_disable_bmem_v2 = false; /* DAOS_MD_DISABLE_BMEM_V2 env, BMEM mode only */
 #define UMM_SLABS_CNT 16
 
 /** Initializes global settings for the pmem objects.
@@ -51,6 +52,7 @@
 	int					rc;
 	enum pobj_arenas_assignment_type	atype;
 	unsigned int				md_mode = DAOS_MD_BMEM;
+	unsigned int                            md_disable_bmem_v2 = 0;
 
 	if (!md_on_ssd) {
 		daos_md_backend = DAOS_MD_PMEM;
@@ -81,6 +83,12 @@
 		return -DER_INVAL;
 	};
 
+	d_getenv_uint("DAOS_MD_DISABLE_BMEM_V2", &md_disable_bmem_v2);
+	if (md_disable_bmem_v2 && (md_mode != DAOS_MD_BMEM))
+		D_INFO("Ignoring DAOS_MD_DISABLE_BMEM_V2 tunable");
+	else
+		daos_disable_bmem_v2 = md_disable_bmem_v2;
+
 	daos_md_backend = md_mode;
 	return 0;
 }
@@ -91,6 +99,12 @@
 	return daos_md_backend;
 }
 
+/**
+ * Report whether BMEM_V2 metadata pools may be used.
+ *
+ * \return	true unless the file-scope daos_disable_bmem_v2 flag is set.
+ */
+bool
+umempobj_allow_md_bmem_v2(void)
+{
+	return !daos_disable_bmem_v2;
+}
+
 int
 umempobj_backend_type2class_id(int backend)
 {
@@ -3046,7 +3060,7 @@
 		VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE((char *)pinfo->pi_addr, len);
 	pinfo->pi_io = 0;
 	if (rc) {
 		DL_ERROR(rc, "Read MD blob failed.\n");
 		page_wakeup_io(cache, pinfo);
 		return rc;
 	} else if (cache->ca_evtcb_fn) {
@@ -3188,7 +3202,7 @@
 	if (is_page_dirty(pinfo)) {
 		rc = cache_flush_page(cache, pinfo);
 		if (rc) {
 			DL_ERROR(rc, "Flush page failed.\n");
 			return rc;
 		}

@@ -3248,7 +3262,7 @@
 	while (need_evict(cache)) {
 		rc = cache_evict_page(cache, for_sys);
 		if (rc && rc != -DER_AGAIN && rc != -DER_BUSY) {
 			DL_ERROR(rc, "Evict page failed.\n");
 			return rc;
 		}

@@ -3299,7 +3313,7 @@
 		if (is_id_evictable(cache, pg_id)) {
 			rc = cache_get_free_page(cache, &pinfo, 0, false);
 			if (rc) {
 				DL_ERROR(rc, "Failed to get free page.\n");
 				break;
 			}
 		} else {
@@ -3482,7 +3496,7 @@

 	rc = cache_map_pages(cache, out_pages, page_nr);
 	if (rc)
 		DL_ERROR(rc, "Map page failed.\n");

 	if (out_pages != &in_pages[0])
 		D_FREE(out_pages);
@@ -3505,7 +3519,7 @@

 	rc = cache_pin_pages(cache, out_pages, page_nr, for_sys);
 	if (rc) {
 		DL_ERROR(rc, "Load page failed.\n");
 	} else {
 		for (i = 0; i < page_nr; i++) {
 			uint32_t	pg_id = out_pages[i];
@@ -3542,7 +3556,7 @@

 	rc = cache_pin_pages(cache, out_pages, page_nr, for_sys);
 	if (rc) {
 		DL_ERROR(rc, "Load page failed.\n");
 		goto out;
 	}

@@ -3605,7 +3619,7 @@
 	while (need_reserve(cache, 0)) {
 		rc = cache_evict_page(cache, false);
 		if (rc && rc != -DER_AGAIN && rc != -DER_BUSY) {
 			DL_ERROR(rc, "Evict page failed.\n");
 			break;
 		}


diff --git a/src/include/daos/mem.h b/src/include/daos/mem.h
@@ -50,6 +50,10 @@ enum {
 /* return umem backend type */
 int umempobj_get_backend_type(void);
 
+/* returns true when bmem_v2 pools are allowed, false when they are disabled */
+bool
+umempobj_allow_md_bmem_v2(void);
+
 #endif
 
 struct umem_wal_tx;

diff --git a/src/mgmt/srv_target.c b/src/mgmt/srv_target.c
@@ -1048,15 +1048,14 @@ tgt_create_preallocate(void *arg)
 		 * 16MB minimum per pmemobj file (SCM partition)
 		 */
 		D_ASSERT(dss_tgt_nr > 0);
+		D_ASSERT((tca->tca_scm_size / dss_tgt_nr) >= (1 << 24));
 		if (!bio_nvme_configured(SMD_DEV_TYPE_META)) {
-			rc = tgt_vos_preallocate_sequential(tca->tca_ptrec->dptr_uuid,
-							    max(tca->tca_scm_size / dss_tgt_nr,
-								1 << 24), dss_tgt_nr);
+			rc = tgt_vos_preallocate_sequential(
+			    tca->tca_ptrec->dptr_uuid, tca->tca_scm_size / dss_tgt_nr, dss_tgt_nr);
 		} else {
-			rc = tgt_vos_preallocate_parallel(tca->tca_ptrec->dptr_uuid,
-							  max(tca->tca_scm_size / dss_tgt_nr,
-							      1 << 24), dss_tgt_nr,
-							  &tca->tca_ptrec->cancel_create);
+			rc = tgt_vos_preallocate_parallel(
+			    tca->tca_ptrec->dptr_uuid, tca->tca_scm_size / dss_tgt_nr, dss_tgt_nr,
+			    &tca->tca_ptrec->cancel_create);
 		}
 		if (rc)
 			goto out;
@@ -1123,7 +1122,12 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req)
 
 	tgt_scm_sz  = tc_in->tc_scm_size / dss_tgt_nr;
 	tgt_meta_sz = tc_in->tc_meta_size / dss_tgt_nr;
-	vos_pool_roundup_size(&tgt_scm_sz, &tgt_meta_sz);
+	rc          = vos_pool_roundup_size(&tgt_scm_sz, &tgt_meta_sz);
+	if (rc) {
+		D_ERROR(DF_UUID": failed to roundup the vos size: "DF_RC"\n",
+			DP_UUID(tc_in->tc_pool_uuid), DP_RC(rc));
+		goto out_rec;
+	}
 	tc_in->tc_scm_size  = tgt_scm_sz * dss_tgt_nr;
 	tc_in->tc_meta_size = tgt_meta_sz * dss_tgt_nr;
 

@@ -753,6 +753,50 @@ init_umem_store(struct umem_store *store, struct bio_meta_context *mc)
 		store->store_type = DAOS_MD_BMEM;
 }
 
+/**
+ * Choose the umem store type for a pool from its memory (SCM) and meta sizes.
+ *
+ * Starts from the backend reported by umempobj_get_backend_type(). When the
+ * memory size is smaller than the meta size, a DAOS_MD_BMEM backend is promoted
+ * to DAOS_MD_BMEM_V2 if umempobj_allow_md_bmem_v2() permits it; any backend
+ * other than DAOS_MD_BMEM_V2 is otherwise rejected for that size combination.
+ *
+ * \param[in] scm_sz	Memory (SCM) size, must be non-zero.
+ * \param[in] meta_sz	Meta size, must be non-zero.
+ *
+ * \return		The selected backend type (non-negative) on success,
+ *			-DER_INVAL when scm_sz exceeds meta_sz or when the
+ *			backend cannot handle scm_sz smaller than meta_sz.
+ */
+static int
+vos_pool_store_type(daos_size_t scm_sz, daos_size_t meta_sz)
+{
+	int backend;
+
+	backend = umempobj_get_backend_type();
+	D_ASSERT((meta_sz != 0) && (scm_sz != 0));
+
+	/* The memory size may never be larger than the meta size */
+	if (scm_sz > meta_sz) {
+		D_ERROR("memsize %lu is greater than metasize %lu\n", scm_sz, meta_sz);
+		return -DER_INVAL;
+	}
+
+	if (scm_sz < meta_sz) {
+		if ((backend == DAOS_MD_BMEM) && umempobj_allow_md_bmem_v2()) {
+			backend = DAOS_MD_BMEM_V2;
+		} else if (backend != DAOS_MD_BMEM_V2) {
+			D_ERROR("scm_sz %lu is less than meta_sz %lu\n", scm_sz, meta_sz);
+			return -DER_INVAL;
+		}
+	}
+
+	return backend;
+}
+
+/**
+ * Round the memory and meta sizes up to the page size of the store type chosen
+ * for them, then raise each to at least 1 << 24 bytes.
+ *
+ * \param[in,out] scm_sz	Memory (SCM) size, must be non-zero; updated in place.
+ * \param[in,out] meta_sz	Meta size, must be non-zero; updated in place.
+ *
+ * \return			0 on success, the negative value returned by
+ *				vos_pool_store_type() on failure (sizes untouched).
+ */
+int
+vos_pool_roundup_size(daos_size_t *scm_sz, daos_size_t *meta_sz)
+{
+	size_t pgsz;
+	int    store_type;
+
+	D_ASSERT((*scm_sz != 0) && (*meta_sz != 0));
+
+	store_type = vos_pool_store_type(*scm_sz, *meta_sz);
+	if (store_type < 0)
+		return store_type;
+
+	/* Page size depends on the backend selected for this size pair */
+	pgsz = umempobj_pgsz(store_type);
+
+	*scm_sz = D_ALIGNUP(*scm_sz, pgsz);
+	if (*scm_sz < (1 << 24))
+		*scm_sz = 1 << 24;
+
+	*meta_sz = D_ALIGNUP(*meta_sz, pgsz);
+	if (*meta_sz < (1 << 24))
+		*meta_sz = 1 << 24;
+
+	return 0;
+}
+
 static int
 vos_pmemobj_create(const char *path, uuid_t pool_id, const char *layout,
 		   size_t scm_sz, size_t nvme_sz, size_t wal_sz, size_t meta_sz,
@@ -794,9 +838,13 @@ vos_pmemobj_create(const char *path, uuid_t pool_id, const char *layout,
 	if (!meta_sz)
 		meta_sz = scm_sz_actual;
 
-	store.store_type = umempobj_get_backend_type();
-	if (store.store_type == DAOS_MD_BMEM && meta_sz > scm_sz_actual)
-		store.store_type = DAOS_MD_BMEM_V2;
+	rc = vos_pool_store_type(scm_sz_actual, meta_sz);
+	if (rc < 0) {
+		D_ERROR("Failed to determine the store type for xs:%p pool:"DF_UUID". "DF_RC,
+			xs_ctxt, DP_UUID(pool_id), DP_RC(rc));
+		return rc;
+	}
+	store.store_type = rc;
 
 	D_DEBUG(DB_MGMT, "Create BIO meta context for xs:%p pool:"DF_UUID" "
 		"scm_sz: %zu meta_sz: %zu, nvme_sz: %zu wal_sz:%zu backend:%d\n",
@@ -1272,26 +1320,6 @@ vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_
 	return rc;
 }
 
-int
-vos_pool_roundup_size(daos_size_t *scm_sz, daos_size_t *meta_sz)
-{
-	int    backend;
-	size_t alignsz;
-
-	backend = umempobj_get_backend_type();
-	if ((*scm_sz != *meta_sz) && (backend == DAOS_MD_BMEM))
-		backend = DAOS_MD_BMEM_V2;
-
-	/* Round up the size such that it is compatible with backend */
-	alignsz = umempobj_pgsz(backend);
-
-	*scm_sz = D_ALIGNUP(*scm_sz, alignsz);
-	if (*meta_sz)
-		*meta_sz = D_ALIGNUP(*meta_sz, alignsz);
-
-	return 0;
-}
-
 int
 vos_pool_create(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_t data_sz,
 		daos_size_t meta_sz, unsigned int flags, uint32_t version, daos_handle_t *poh)