Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-16591 mgmt, vos, common: Align scm/meta size #15146

Merged
merged 3 commits into from
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion src/common/mem.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@

#ifdef DAOS_PMEM_BUILD

static int daos_md_backend = DAOS_MD_PMEM;
static int daos_md_backend = DAOS_MD_PMEM;
static bool daos_disable_bmem_v2 = false;
#define UMM_SLABS_CNT 16

/** Initializes global settings for the pmem objects.
Expand All @@ -51,6 +52,7 @@
int rc;
enum pobj_arenas_assignment_type atype;
unsigned int md_mode = DAOS_MD_BMEM;
unsigned int md_disable_bmem_v2 = 0;

if (!md_on_ssd) {
daos_md_backend = DAOS_MD_PMEM;
Expand Down Expand Up @@ -81,6 +83,12 @@
return -DER_INVAL;
};

d_getenv_uint("DAOS_MD_DISABLE_BMEM_V2", &md_disable_bmem_v2);
if (md_disable_bmem_v2 && (md_mode != DAOS_MD_BMEM))
D_INFO("Ignoring DAOS_MD_DISABLE_BMEM_V2 tunable");
else
daos_disable_bmem_v2 = md_disable_bmem_v2;

daos_md_backend = md_mode;
return 0;
}
Expand All @@ -91,6 +99,12 @@
return daos_md_backend;
}

/** Return whether creation of md-on-ssd phase-2 (bmem_v2) pools is allowed.
 *
 * The ability is revoked at startup when the DAOS_MD_DISABLE_BMEM_V2
 * tunable is set (and the backend is DAOS_MD_BMEM).
 *
 * \return	true if bmem_v2 pools may be created, false otherwise
 */
bool
umempobj_allow_md_bmem_v2(void)
{
	/* daos_disable_bmem_v2 is only set from umempobj_settings_init() */
	return !daos_disable_bmem_v2;
}

int
umempobj_backend_type2class_id(int backend)
{
Expand Down Expand Up @@ -3046,7 +3060,7 @@
VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE((char *)pinfo->pi_addr, len);
pinfo->pi_io = 0;
if (rc) {
DL_ERROR(rc, "Read MD blob failed.\n");

Check warning on line 3063 in src/common/mem.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines
page_wakeup_io(cache, pinfo);
return rc;
} else if (cache->ca_evtcb_fn) {
Expand Down Expand Up @@ -3188,7 +3202,7 @@
if (is_page_dirty(pinfo)) {
rc = cache_flush_page(cache, pinfo);
if (rc) {
DL_ERROR(rc, "Flush page failed.\n");

Check warning on line 3205 in src/common/mem.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines
return rc;
}

Expand Down Expand Up @@ -3248,7 +3262,7 @@
while (need_evict(cache)) {
rc = cache_evict_page(cache, for_sys);
if (rc && rc != -DER_AGAIN && rc != -DER_BUSY) {
DL_ERROR(rc, "Evict page failed.\n");

Check warning on line 3265 in src/common/mem.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines
return rc;
}

Expand Down Expand Up @@ -3299,7 +3313,7 @@
if (is_id_evictable(cache, pg_id)) {
rc = cache_get_free_page(cache, &pinfo, 0, false);
if (rc) {
DL_ERROR(rc, "Failed to get free page.\n");

Check warning on line 3316 in src/common/mem.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines
break;
}
} else {
Expand Down Expand Up @@ -3482,7 +3496,7 @@

rc = cache_map_pages(cache, out_pages, page_nr);
if (rc)
DL_ERROR(rc, "Map page failed.\n");

Check warning on line 3499 in src/common/mem.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines

if (out_pages != &in_pages[0])
D_FREE(out_pages);
Expand All @@ -3505,7 +3519,7 @@

rc = cache_pin_pages(cache, out_pages, page_nr, for_sys);
if (rc) {
DL_ERROR(rc, "Load page failed.\n");

Check warning on line 3522 in src/common/mem.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines
} else {
for (i = 0; i < page_nr; i++) {
uint32_t pg_id = out_pages[i];
Expand Down Expand Up @@ -3542,7 +3556,7 @@

rc = cache_pin_pages(cache, out_pages, page_nr, for_sys);
if (rc) {
DL_ERROR(rc, "Load page failed.\n");

Check warning on line 3559 in src/common/mem.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines
goto out;
}

Expand Down Expand Up @@ -3605,7 +3619,7 @@
while (need_reserve(cache, 0)) {
rc = cache_evict_page(cache, false);
if (rc && rc != -DER_AGAIN && rc != -DER_BUSY) {
DL_ERROR(rc, "Evict page failed.\n");

Check warning on line 3622 in src/common/mem.c

View workflow job for this annotation

GitHub Actions / Logging macro checking

check-return, Line contains too many newlines
break;
}

Expand Down
4 changes: 4 additions & 0 deletions src/include/daos/mem.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ enum {
/* return umem backend type */
int umempobj_get_backend_type(void);

/* returns whether bmem_v2 pools are allowed */
bool
umempobj_allow_md_bmem_v2();

#endif

struct umem_wal_tx;
Expand Down
20 changes: 12 additions & 8 deletions src/mgmt/srv_target.c
Original file line number Diff line number Diff line change
Expand Up @@ -1048,15 +1048,14 @@ tgt_create_preallocate(void *arg)
* 16MB minimum per pmemobj file (SCM partition)
*/
D_ASSERT(dss_tgt_nr > 0);
D_ASSERT((tca->tca_scm_size / dss_tgt_nr) >= (1 << 24));
if (!bio_nvme_configured(SMD_DEV_TYPE_META)) {
rc = tgt_vos_preallocate_sequential(tca->tca_ptrec->dptr_uuid,
max(tca->tca_scm_size / dss_tgt_nr,
1 << 24), dss_tgt_nr);
rc = tgt_vos_preallocate_sequential(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we have to allocate sequentially in MD mode rather than parallel like in PMem?

Copy link
Collaborator Author

@sherintg sherintg Sep 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's the other way round: it's sequential in PMEM and parallel in MD. This change was done in phase 1 to overcome the overhead of fallocate on tmpfs.

tca->tca_ptrec->dptr_uuid, tca->tca_scm_size / dss_tgt_nr, dss_tgt_nr);
} else {
rc = tgt_vos_preallocate_parallel(tca->tca_ptrec->dptr_uuid,
max(tca->tca_scm_size / dss_tgt_nr,
1 << 24), dss_tgt_nr,
&tca->tca_ptrec->cancel_create);
rc = tgt_vos_preallocate_parallel(
tca->tca_ptrec->dptr_uuid, tca->tca_scm_size / dss_tgt_nr, dss_tgt_nr,
&tca->tca_ptrec->cancel_create);
}
if (rc)
goto out;
Expand Down Expand Up @@ -1123,7 +1122,12 @@ ds_mgmt_hdlr_tgt_create(crt_rpc_t *tc_req)

tgt_scm_sz = tc_in->tc_scm_size / dss_tgt_nr;
tgt_meta_sz = tc_in->tc_meta_size / dss_tgt_nr;
vos_pool_roundup_size(&tgt_scm_sz, &tgt_meta_sz);
rc = vos_pool_roundup_size(&tgt_scm_sz, &tgt_meta_sz);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this strange formatting enforced by the clang linter?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes

if (rc) {
D_ERROR(DF_UUID": failed to roundup the vos size: "DF_RC"\n",
DP_UUID(tc_in->tc_pool_uuid), DP_RC(rc));
goto out_rec;
}
tc_in->tc_scm_size = tgt_scm_sz * dss_tgt_nr;
tc_in->tc_meta_size = tgt_meta_sz * dss_tgt_nr;

Expand Down
74 changes: 51 additions & 23 deletions src/vos/vos_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -753,6 +753,50 @@ init_umem_store(struct umem_store *store, struct bio_meta_context *mc)
store->store_type = DAOS_MD_BMEM;
}

/** Pick the umem store backend matching the requested pool geometry.
 *
 * \param[in] scm_sz	requested in-memory (SCM/tmpfs) size, must be non-zero
 * \param[in] meta_sz	requested metadata blob size, must be non-zero
 *
 * \return	backend type (>= 0) on success, -DER_INVAL on an
 *		unsupported scm/meta size combination
 */
static int
vos_pool_store_type(daos_size_t scm_sz, daos_size_t meta_sz)
{
	int backend = umempobj_get_backend_type();

	D_ASSERT((meta_sz != 0) && (scm_sz != 0));

	/* Memory size may never exceed the backing metadata blob size */
	if (scm_sz > meta_sz) {
		D_ERROR("memsize %lu is greater than metasize %lu", scm_sz, meta_sz);
		return -DER_INVAL;
	}

	/* Equal sizes: every backend can handle a fully-resident pool */
	if (scm_sz == meta_sz)
		return backend;

	/* scm_sz < meta_sz requires paged access, i.e. the v2 format */
	if ((backend == DAOS_MD_BMEM) && umempobj_allow_md_bmem_v2())
		return DAOS_MD_BMEM_V2;
	if (backend == DAOS_MD_BMEM_V2)
		return backend;

	D_ERROR("scm_sz %lu is less than meta_sz %lu", scm_sz, meta_sz);
	return -DER_INVAL;
}

/** Round pool sizes up so they are compatible with the chosen backend.
 *
 * Both sizes are aligned to the backend page size and clamped to the
 * 16MB-per-pmemobj-file minimum.
 *
 * \param[in,out] scm_sz	in-memory (SCM/tmpfs) size, must be non-zero
 * \param[in,out] meta_sz	metadata blob size; may be zero for PMem /
 *				phase-1 mode (then left untouched)
 *
 * \return	0 on success, negative DER error otherwise
 */
int
vos_pool_roundup_size(daos_size_t *scm_sz, daos_size_t *meta_sz)
{
	size_t	alignsz;
	int	rc;

	D_ASSERT(*scm_sz != 0);

	/* Engine-internal callers (PMem or phase-1 mode) pass meta_sz == 0;
	 * treat the pool as fully-resident (meta == scm) when selecting the
	 * backend instead of asserting.
	 */
	rc = vos_pool_store_type(*scm_sz, *meta_sz != 0 ? *meta_sz : *scm_sz);
	if (rc < 0)
		return rc;

	/* Round up the size such that it is compatible with backend */
	alignsz = umempobj_pgsz(rc);
	*scm_sz = max(D_ALIGNUP(*scm_sz, alignsz), 1 << 24);
	if (*meta_sz != 0)
		*meta_sz = max(D_ALIGNUP(*meta_sz, alignsz), 1 << 24);

	return 0;
}

static int
vos_pmemobj_create(const char *path, uuid_t pool_id, const char *layout,
size_t scm_sz, size_t nvme_sz, size_t wal_sz, size_t meta_sz,
Expand Down Expand Up @@ -794,9 +838,13 @@ vos_pmemobj_create(const char *path, uuid_t pool_id, const char *layout,
if (!meta_sz)
meta_sz = scm_sz_actual;

store.store_type = umempobj_get_backend_type();
if (store.store_type == DAOS_MD_BMEM && meta_sz > scm_sz_actual)
store.store_type = DAOS_MD_BMEM_V2;
rc = vos_pool_store_type(scm_sz_actual, meta_sz);
if (rc < 0) {
D_ERROR("Failed to determine the store type for xs:%p pool:"DF_UUID". "DF_RC,
xs_ctxt, DP_UUID(pool_id), DP_RC(rc));
return rc;
}
store.store_type = rc;

D_DEBUG(DB_MGMT, "Create BIO meta context for xs:%p pool:"DF_UUID" "
"scm_sz: %zu meta_sz: %zu, nvme_sz: %zu wal_sz:%zu backend:%d\n",
Expand Down Expand Up @@ -1272,26 +1320,6 @@ vos_pool_create_ex(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_
return rc;
}

int
vos_pool_roundup_size(daos_size_t *scm_sz, daos_size_t *meta_sz)
{
int backend;
size_t alignsz;

backend = umempobj_get_backend_type();
if ((*scm_sz != *meta_sz) && (backend == DAOS_MD_BMEM))
backend = DAOS_MD_BMEM_V2;

/* Round up the size such that it is compatible with backend */
alignsz = umempobj_pgsz(backend);

*scm_sz = D_ALIGNUP(*scm_sz, alignsz);
if (*meta_sz)
*meta_sz = D_ALIGNUP(*meta_sz, alignsz);

return 0;
}

int
vos_pool_create(const char *path, uuid_t uuid, daos_size_t scm_sz, daos_size_t data_sz,
daos_size_t meta_sz, unsigned int flags, uint32_t version, daos_handle_t *poh)
Expand Down
Loading