Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-16591 mgmt, vos, common: Align scm/meta size #15146

Merged
merged 3 commits into from
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/common/dav_v2/dav_iface.c
Original file line number Diff line number Diff line change
Expand Up @@ -470,3 +470,9 @@ dav_class_register_v2(dav_obj_t *pop, struct dav_alloc_class_desc *p)

return 0;
}

/**
 * Return the page size of a dav_v2 heap.
 *
 * A dav_v2 heap is carved into zones, so the zone size is the natural
 * allocation/alignment granularity exposed to callers (used by
 * umempobj_pgsz() to round pool sizes).
 *
 * \return	page size in bytes (ZONE_MAX_SIZE).
 */
DAV_FUNC_EXPORT size_t
dav_obj_pgsz_v2(void)
{
	return ZONE_MAX_SIZE;
}
6 changes: 6 additions & 0 deletions src/common/dav_v2/dav_v2.h
Original file line number Diff line number Diff line change
Expand Up @@ -313,4 +313,10 @@ dav_get_heap_mb_stats_v2(dav_obj_t *pop, uint32_t mb_id, struct dav_heap_mb_stat
uint32_t
dav_allot_mb_evictable_v2(dav_obj_t *pop, int flags);

/*
* Return the page size for dav_v2.
*/
size_t
dav_obj_pgsz_v2();

#endif /* __DAOS_COMMON_DAV_V2_H */
31 changes: 28 additions & 3 deletions src/common/mem.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@

#ifdef DAOS_PMEM_BUILD

static int daos_md_backend = DAOS_MD_PMEM;
static int daos_md_backend = DAOS_MD_PMEM;
static bool daos_disable_bmem_v2 = false;
#define UMM_SLABS_CNT 16

/** Initializes global settings for the pmem objects.
Expand All @@ -51,6 +52,7 @@
int rc;
enum pobj_arenas_assignment_type atype;
unsigned int md_mode = DAOS_MD_BMEM;
unsigned int md_disable_bmem_v2 = 0;

if (!md_on_ssd) {
daos_md_backend = DAOS_MD_PMEM;
Expand Down Expand Up @@ -81,16 +83,30 @@
return -DER_INVAL;
};

d_getenv_uint("DAOS_MD_DISABLE_BMEM_V2", &md_disable_bmem_v2);
if (md_disable_bmem_v2 && (md_mode != DAOS_MD_BMEM))
D_INFO("Ignoring DAOS_MD_DISABLE_BMEM_V2 tunable");
else
daos_disable_bmem_v2 = md_disable_bmem_v2;

daos_md_backend = md_mode;
return 0;
}

int umempobj_get_backend_type(void)
/* Return the metadata backend type (DAOS_MD_*) selected by
 * umempobj_settings_init(); defaults to DAOS_MD_PMEM.
 */
int
umempobj_get_backend_type(void)
{
	return daos_md_backend;
}

int umempobj_backend_type2class_id(int backend)
/* Return whether creation of BMEM_V2 (md-on-ssd phase-2) pools is allowed.
 * Disabled via the DAOS_MD_DISABLE_BMEM_V2 tunable read in
 * umempobj_settings_init().
 */
bool
umempobj_allow_md_bmem_v2(void)
{
	/* (void) parameter list: an empty () in a C definition declares
	 * unspecified parameters and defeats prototype checking.
	 */
	return !daos_disable_bmem_v2;
}

int
umempobj_backend_type2class_id(int backend)
{
switch (backend) {
case DAOS_MD_PMEM:
Expand All @@ -108,6 +124,15 @@
}
}

/* Return the page size of the given metadata backend.
 *
 * BMEM_V2 heaps use the dav_v2 zone size as their page granularity;
 * every other backend uses the conventional 4 KiB page.
 */
size_t
umempobj_pgsz(int backend)
{
	switch (backend) {
	case DAOS_MD_BMEM_V2:
		return dav_obj_pgsz_v2();
	default:
		return (1UL << 12);
	}
}

/** Define common slabs. We can refine this for 2.4 pools but that is for next patch */
static const int slab_map[] = {
0, /* 32 bytes */
Expand Down Expand Up @@ -3035,7 +3060,7 @@
VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE((char *)pinfo->pi_addr, len);
pinfo->pi_io = 0;
if (rc) {
DL_ERROR(rc, "Read MD blob failed.\n");

Check warning on line 3063 in src/common/mem.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines
page_wakeup_io(cache, pinfo);
return rc;
} else if (cache->ca_evtcb_fn) {
Expand Down Expand Up @@ -3177,7 +3202,7 @@
if (is_page_dirty(pinfo)) {
rc = cache_flush_page(cache, pinfo);
if (rc) {
DL_ERROR(rc, "Flush page failed.\n");

Check warning on line 3205 in src/common/mem.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines
return rc;
}

Expand Down Expand Up @@ -3237,7 +3262,7 @@
while (need_evict(cache)) {
rc = cache_evict_page(cache, for_sys);
if (rc && rc != -DER_AGAIN && rc != -DER_BUSY) {
DL_ERROR(rc, "Evict page failed.\n");

Check warning on line 3265 in src/common/mem.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines
return rc;
}

Expand Down Expand Up @@ -3288,7 +3313,7 @@
if (is_id_evictable(cache, pg_id)) {
rc = cache_get_free_page(cache, &pinfo, 0, false);
if (rc) {
DL_ERROR(rc, "Failed to get free page.\n");

Check warning on line 3316 in src/common/mem.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines
break;
}
} else {
Expand Down Expand Up @@ -3471,7 +3496,7 @@

rc = cache_map_pages(cache, out_pages, page_nr);
if (rc)
DL_ERROR(rc, "Map page failed.\n");

Check warning on line 3499 in src/common/mem.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines

if (out_pages != &in_pages[0])
D_FREE(out_pages);
Expand All @@ -3494,7 +3519,7 @@

rc = cache_pin_pages(cache, out_pages, page_nr, for_sys);
if (rc) {
DL_ERROR(rc, "Load page failed.\n");

Check warning on line 3522 in src/common/mem.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines
} else {
for (i = 0; i < page_nr; i++) {
uint32_t pg_id = out_pages[i];
Expand Down Expand Up @@ -3531,7 +3556,7 @@

rc = cache_pin_pages(cache, out_pages, page_nr, for_sys);
if (rc) {
DL_ERROR(rc, "Load page failed.\n");

Check warning on line 3559 in src/common/mem.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines
goto out;
}

Expand Down Expand Up @@ -3594,7 +3619,7 @@
while (need_reserve(cache, 0)) {
rc = cache_evict_page(cache, false);
if (rc && rc != -DER_AGAIN && rc != -DER_BUSY) {
DL_ERROR(rc, "Evict page failed.\n");

Check warning on line 3622 in src/common/mem.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines
break;
}

Expand Down
8 changes: 8 additions & 0 deletions src/include/daos/mem.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ int umempobj_settings_init(bool md_on_ssd);
/* convert backend type to umem class id */
int umempobj_backend_type2class_id(int backend);

/* get page size for the backend */
size_t
umempobj_pgsz(int backend);

/* umem persistent object property flags */
#define UMEMPOBJ_ENABLE_STATS 0x1

Expand All @@ -46,6 +50,10 @@ enum {
/* return umem backend type */
int umempobj_get_backend_type(void);

/* returns whether bmem_v2 pools are allowed */
bool
umempobj_allow_md_bmem_v2();

#endif

struct umem_wal_tx;
Expand Down
10 changes: 10 additions & 0 deletions src/include/daos_srv/vos.h
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,16 @@ int
vos_aggregate(daos_handle_t coh, daos_epoch_range_t *epr,
int (*yield_func)(void *arg), void *yield_arg, uint32_t flags);

/**
* Round up the scm and meta sizes to match the backend requirement.
 * \param[in,out] scm_sz SCM size that needs to be aligned up
 * \param[in,out] meta_sz META size that needs to be aligned up
*
* \return 0 on success, error otherwise.
*/
int
vos_pool_roundup_size(size_t *scm_sz, size_t *meta_sz);

/**
* Discards changes in all epochs with the epoch range \a epr
*
Expand Down
2 changes: 1 addition & 1 deletion src/mgmt/srv_drpc.c
Original file line number Diff line number Diff line change
Expand Up @@ -503,7 +503,7 @@ ds_mgmt_drpc_pool_create(Drpc__Call *drpc_req, Drpc__Response *drpc_resp)

scm_size = req->tier_bytes[DAOS_MEDIA_SCM];
if (req->mem_ratio)
scm_size *= req->mem_ratio;
scm_size *= (double)req->mem_ratio;

rc = ds_mgmt_create_pool(pool_uuid, req->sys, "pmem", targets, scm_size,
req->tier_bytes[DAOS_MEDIA_NVME], prop, &svc, req->n_fault_domains,
Expand Down
26 changes: 19 additions & 7 deletions src/mgmt/srv_target.c
Original file line number Diff line number Diff line change
Expand Up @@ -1048,15 +1048,14 @@ tgt_create_preallocate(void *arg)
* 16MB minimum per pmemobj file (SCM partition)
*/
D_ASSERT(dss_tgt_nr > 0);
D_ASSERT((tca->tca_scm_size / dss_tgt_nr) >= (1 << 24));
if (!bio_nvme_configured(SMD_DEV_TYPE_META)) {
rc = tgt_vos_preallocate_sequential(tca->tca_ptrec->dptr_uuid,
max(tca->tca_scm_size / dss_tgt_nr,
1 << 24), dss_tgt_nr);
rc = tgt_vos_preallocate_sequential(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we have to allocate sequentially in MD mode rather than parallel like in PMem?

Copy link
Collaborator Author

@sherintg sherintg Sep 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's the other way round: it's sequential in PMEM and parallel in MD. This change was done in phase 1 to overcome the overhead of fallocate on tmpfs.

tca->tca_ptrec->dptr_uuid, tca->tca_scm_size / dss_tgt_nr, dss_tgt_nr);
} else {
rc = tgt_vos_preallocate_parallel(tca->tca_ptrec->dptr_uuid,
max(tca->tca_scm_size / dss_tgt_nr,
1 << 24), dss_tgt_nr,
&tca->tca_ptrec->cancel_create);
rc = tgt_vos_preallocate_parallel(
tca->tca_ptrec->dptr_uuid, tca->tca_scm_size / dss_tgt_nr, dss_tgt_nr,
&tca->tca_ptrec->cancel_create);
}
if (rc)
goto out;
Expand All @@ -1083,6 +1082,8 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req)
pthread_t thread;
bool canceled_thread = false;
int rc = 0;
size_t tgt_scm_sz;
size_t tgt_meta_sz;

/** incoming request buffer */
tc_in = crt_req_get(tc_req);
Expand Down Expand Up @@ -1119,6 +1120,17 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req)
D_DEBUG(DB_MGMT, DF_UUID": record inserted to dpt_creates_ht\n",
DP_UUID(tca.tca_ptrec->dptr_uuid));

tgt_scm_sz = tc_in->tc_scm_size / dss_tgt_nr;
tgt_meta_sz = tc_in->tc_meta_size / dss_tgt_nr;
rc = vos_pool_roundup_size(&tgt_scm_sz, &tgt_meta_sz);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this strange formatting enforced by the clang linter?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes

if (rc) {
D_ERROR(DF_UUID": failed to roundup the vos size: "DF_RC"\n",
DP_UUID(tc_in->tc_pool_uuid), DP_RC(rc));
goto out_rec;
}
tc_in->tc_scm_size = tgt_scm_sz * dss_tgt_nr;
tc_in->tc_meta_size = tgt_meta_sz * dss_tgt_nr;

tca.tca_scm_size = tc_in->tc_scm_size;
tca.tca_nvme_size = tc_in->tc_nvme_size;
tca.tca_dx = dss_current_xstream();
Expand Down
54 changes: 51 additions & 3 deletions src/vos/vos_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -753,6 +753,50 @@ init_umem_store(struct umem_store *store, struct bio_meta_context *mc)
store->store_type = DAOS_MD_BMEM;
}

/* Determine the backend store type for a pool with the given sizes.
 *
 * \param[in]	scm_sz	requested SCM (memory) size, must be non-zero
 * \param[in]	meta_sz	requested META (blob) size, must be non-zero
 *
 * \return	DAOS_MD_* backend type on success, negative DER error otherwise.
 */
static int
vos_pool_store_type(daos_size_t scm_sz, daos_size_t meta_sz)
{
	int	backend;

	backend = umempobj_get_backend_type();
	D_ASSERT((meta_sz != 0) && (scm_sz != 0));

	/* The in-memory footprint can never exceed the backing meta blob */
	if (scm_sz > meta_sz) {
		D_ERROR("scm_sz %lu is greater than meta_sz %lu\n", scm_sz, meta_sz);
		return -DER_INVAL;
	}

	/* scm_sz < meta_sz implies memory-constrained (phase-2) mode, which
	 * only BMEM_V2 supports; upgrade BMEM to BMEM_V2 when allowed.
	 */
	if (scm_sz < meta_sz) {
		if ((backend == DAOS_MD_BMEM) && umempobj_allow_md_bmem_v2())
			backend = DAOS_MD_BMEM_V2;
		else if (backend != DAOS_MD_BMEM_V2) {
			D_ERROR("scm_sz %lu is less than meta_sz %lu\n", scm_sz, meta_sz);
			return -DER_INVAL;
		}
	}

	return backend;
}

/* Round up the scm and meta sizes to match the backend alignment requirement.
 *
 * \param[in,out]	scm_sz	SCM size to be aligned up, must be non-zero
 * \param[in,out]	meta_sz	META size to be aligned up; may be zero for
 *				pmem / phase-1 callers with no meta blob
 *
 * \return	0 on success, negative DER error otherwise.
 */
int
vos_pool_roundup_size(daos_size_t *scm_sz, daos_size_t *meta_sz)
{
	size_t	alignsz;
	int	rc;

	D_ASSERT(*scm_sz != 0);

	/* Engine-internal callers (pmem or phase-1 mode) pass meta_sz == 0
	 * before any pool create; there is no backend page to align to, so
	 * leave the sizes untouched instead of asserting.
	 */
	if (*meta_sz == 0)
		return 0;

	rc = vos_pool_store_type(*scm_sz, *meta_sz);
	if (rc < 0)
		return rc;

	/* Round up the size such that it is compatible with backend, while
	 * honoring the 16MB minimum per pmemobj file.
	 */
	alignsz = umempobj_pgsz(rc);
	*scm_sz  = max(D_ALIGNUP(*scm_sz, alignsz), 1 << 24);
	*meta_sz = max(D_ALIGNUP(*meta_sz, alignsz), 1 << 24);

	return 0;
}

static int
vos_pmemobj_create(const char *path, uuid_t pool_id, const char *layout,
size_t scm_sz, size_t nvme_sz, size_t wal_sz, size_t meta_sz,
Expand Down Expand Up @@ -794,9 +838,13 @@ vos_pmemobj_create(const char *path, uuid_t pool_id, const char *layout,
if (!meta_sz)
meta_sz = scm_sz_actual;

store.store_type = umempobj_get_backend_type();
if (store.store_type == DAOS_MD_BMEM && meta_sz > scm_sz_actual)
store.store_type = DAOS_MD_BMEM_V2;
rc = vos_pool_store_type(scm_sz_actual, meta_sz);
if (rc < 0) {
D_ERROR("Failed to determine the store type for xs:%p pool:"DF_UUID". "DF_RC,
xs_ctxt, DP_UUID(pool_id), DP_RC(rc));
return rc;
}
store.store_type = rc;

D_DEBUG(DB_MGMT, "Create BIO meta context for xs:%p pool:"DF_UUID" "
"scm_sz: %zu meta_sz: %zu, nvme_sz: %zu wal_sz:%zu backend:%d\n",
Expand Down
Loading