From b8c37adc7a44f4059cfe9f577ad434889a6f9dab Mon Sep 17 00:00:00 2001 From: Ashley Pittman Date: Sat, 7 Oct 2023 08:34:27 +0100 Subject: [PATCH] DAOS-13216 dfuse: Add a pre-read feature for non-cached files. (#12015) When the kernel cache is in use but a file is not cached then pre-read the file on open. This works for files up to the read buffer size (1Mb) and is enabled based on the I/O pattern of the last file closed in the same directory. Required-githooks: true Signed-off-by: Ashley Pittman Signed-off-by: Jeff Olivier --- .clang-format | 2 +- src/include/daos_obj.h | 4 +- src/include/daos_srv/vos_types.h | 18 +- src/tests/vos_perf.c | 32 +- src/vos/vos_gc.c | 8 +- src/vos/vos_ilog.h | 10 + src/vos/vos_internal.h | 78 ++- src/vos/vos_io.c | 322 +++++++----- src/vos/vos_iterator.c | 85 ++- src/vos/vos_layout.h | 12 +- src/vos/vos_obj.c | 855 ++++++++++++++++++++++--------- src/vos/vos_query.c | 9 +- src/vos/vos_tree.c | 59 ++- 13 files changed, 1009 insertions(+), 485 deletions(-) diff --git a/.clang-format b/.clang-format index 9cf1ce43ff05..91db8e9b5131 100644 --- a/.clang-format +++ b/.clang-format @@ -12,7 +12,7 @@ BreakBeforeBraces: Linux AllowShortIfStatementsOnASingleLine: false IndentCaseLabels: false ForEachMacros: ['d_list_for_each_entry', - 'd_list_for_each_safe', + 'd_list_for_each_safe', 'd_list_for_each_entry_safe', 'evt_ent_array_for_each'] PointerAlignment: Right diff --git a/src/include/daos_obj.h b/src/include/daos_obj.h index 6801ca71ed64..5ff269240ab6 100644 --- a/src/include/daos_obj.h +++ b/src/include/daos_obj.h @@ -178,12 +178,12 @@ static inline bool daos_is_dkey_uint64_type(enum daos_otype_t type) { switch (type) { + case DAOS_OT_ARRAY_BYTE: case DAOS_OT_MULTI_UINT64: case DAOS_OT_DKEY_UINT64: case DAOS_OT_KV_UINT64: case DAOS_OT_ARRAY: case DAOS_OT_ARRAY_ATTR: - case DAOS_OT_ARRAY_BYTE: return true; default: return false; @@ -220,9 +220,9 @@ static inline bool daos_is_array_type(enum daos_otype_t type) { switch (type) { + case DAOS_OT_ARRAY_BYTE: case DAOS_OT_ARRAY: case DAOS_OT_ARRAY_ATTR: - case DAOS_OT_ARRAY_BYTE: return true; default: return false; diff --git a/src/include/daos_srv/vos_types.h b/src/include/daos_srv/vos_types.h index d621a3251f64..f0b433de874d 100644 --- a/src/include/daos_srv/vos_types.h +++ b/src/include/daos_srv/vos_types.h @@ -290,6 +290,8 @@ enum { VOS_POOL_FEAT_DYN_ROOT = (1ULL << 2), /** Embedded value in tree root supported */ VOS_POOL_FEAT_EMB_VALUE = (1ULL << 3), + /** Flat DKEY support enabled */ + VOS_POOL_FEAT_FLAT_DKEY = (1ULL << 4), }; /** Mask for any conditionals passed to to the fetch */ @@ -318,23 +320,25 @@ D_CASSERT((VOS_OF_PUNCH_PROPAGATE & DAOS_COND_MASK) == 0); /** vos definitions that match daos_obj_key_query flags */ enum { /** retrieve the max of dkey, akey, and/or idx of array value */ - VOS_GET_MAX = DAOS_GET_MAX, + VOS_GET_MAX = DAOS_GET_MAX, /** retrieve the min of dkey, akey, and/or idx of array value */ - VOS_GET_MIN = DAOS_GET_MIN, + VOS_GET_MIN = DAOS_GET_MIN, /** retrieve the dkey */ - VOS_GET_DKEY = DAOS_GET_DKEY, + VOS_GET_DKEY = DAOS_GET_DKEY, /** retrieve the akey */ - VOS_GET_AKEY = DAOS_GET_AKEY, + VOS_GET_AKEY = DAOS_GET_AKEY, /** retrieve the idx of array value */ - VOS_GET_RECX = DAOS_GET_RECX, + VOS_GET_RECX = DAOS_GET_RECX, /** * Internal flag to indicate retrieve the idx of EC array value, * in that case need to retrieve both normal space and parity space * (parity space with DAOS_EC_PARITY_BIT in the recx index). */ - VOS_GET_RECX_EC = (1 << 5), + VOS_GET_RECX_EC = (1 << 5), /** Internal flag to indicate timestamps are used */ - VOS_USE_TIMESTAMPS = (1 << 6), + VOS_USE_TIMESTAMPS = (1 << 6), + /** Internal flag to indicate dkey is flat */ + VOS_FLAT_DKEY = (1 << 7), }; D_CASSERT((VOS_USE_TIMESTAMPS & (VOS_GET_MAX | VOS_GET_MIN | VOS_GET_DKEY | diff --git a/src/tests/vos_perf.c b/src/tests/vos_perf.c index ad8694183f88..c64ebd2393db 100644 --- a/src/tests/vos_perf.c +++ b/src/tests/vos_perf.c @@ -27,6 +27,7 @@ #include uint64_t ts_flags; +bool ts_flat = false; char ts_pmem_path[PATH_MAX - 32]; char ts_pmem_file[PATH_MAX]; @@ -553,7 +554,7 @@ pf_query(struct pf_test *ts, struct pf_param *param) { int rc; - if (ts_flags != DAOS_OT_DKEY_UINT64) { + if (ts_flags != DAOS_OT_DKEY_UINT64 && ts_flags != DAOS_OT_ARRAY_BYTE) { fprintf(stderr, "Integer dkeys required for query test (-i)\n"); return -1; } @@ -743,15 +744,16 @@ ts_print_usage(void) } const struct option perf_vos_opts[] = { - { "dir", required_argument, NULL, 'D' }, - { "zcopy", no_argument, NULL, 'z' }, - { "int_dkey", no_argument, NULL, 'i' }, - { "const_akey", no_argument, NULL, 'I' }, - { "abt_ult", no_argument, NULL, 'x' }, - { NULL, 0, NULL, 0 }, + {"dir", required_argument, NULL, 'D'}, + {"zcopy", no_argument, NULL, 'z'}, + {"int_dkey", no_argument, NULL, 'i'}, + {"flat_dkey", no_argument, NULL, 'f'}, + {"const_akey", no_argument, NULL, 'I'}, + {"abt_ult", no_argument, NULL, 'x'}, + {NULL, 0, NULL, 0}, }; -const char perf_vos_optstr[] = "D:ziIx"; +const char perf_vos_optstr[] = "D:zifIx"; int main(int argc, char **argv) @@ -805,6 +807,12 @@ main(int argc, char **argv) ts_flags = DAOS_OT_DKEY_UINT64; ts_dkey_prefix = NULL; break; + case 'f': + ts_flat = true; + ts_dkey_prefix = NULL; + /** Flat dkey implies const_akey */ + ts_const_akey = true; + break; case 'I': ts_const_akey = true; break; @@ -818,6 +826,14 @@ main(int argc, char **argv) if (ts_const_akey) ts_akey_p_dkey = 1; + if (ts_flat) { + if (ts_single) + ts_flags = DAOS_OT_KV_HASHED; + else + ts_flags = DAOS_OT_ARRAY_BYTE; + ts_dkey_prefix = NULL; + } + if (!cmds) { D_PRINT("Please provide command string\n"); ts_print_usage(); diff --git a/src/vos/vos_gc.c b/src/vos/vos_gc.c index 6c6c9e44d129..64e43ee50a4f 100644 --- a/src/vos/vos_gc.c +++ b/src/vos/vos_gc.c @@ -150,13 +150,17 @@ gc_drain_key(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, { struct vos_krec_df *key = umem_off2ptr(&pool->vp_umm, item->it_addr); int creds = *credits; + bool flat_kv = false; int rc; + if (key->kr_bmap & KREC_BF_FLAT) + flat_kv = true; + if (key->kr_bmap & KREC_BF_BTR) { rc = gc_drain_btr(gc, pool, coh, &key->kr_btr, credits, empty); } else if (key->kr_bmap & KREC_BF_EVT) { - D_ASSERT(gc->gc_type == GC_AKEY); + D_ASSERT(gc->gc_type == GC_AKEY || flat_kv); rc = gc_drain_evt(gc, pool, coh, &key->kr_evt, credits, empty); } else { /* empty key generated by punch */ @@ -169,7 +173,7 @@ gc_drain_key(struct vos_gc *gc, struct vos_pool *pool, daos_handle_t coh, return rc; } - if (gc->gc_type == GC_DKEY) + if (gc->gc_type == GC_DKEY && !flat_kv) return 0; /* gather value stats for akey */ diff --git a/src/vos/vos_ilog.h b/src/vos/vos_ilog.h index b256889e2ede..898f65b202f7 100644 --- a/src/vos/vos_ilog.h +++ b/src/vos/vos_ilog.h @@ -68,6 +68,16 @@ struct vos_ilog_info { bool ii_full_scan; }; +/** Copies only the parsed information, ii_entries is not touched in + * destination. + */ +static inline void +vos_ilog_copy_info(struct vos_ilog_info *dest, const struct vos_ilog_info *src) +{ + memcpy(&dest->ii_uncommitted, &src->ii_uncommitted, + sizeof(*src) - offsetof(__typeof__(*src), ii_uncommitted)); +} + /** Initialize the incarnation log globals */ int vos_ilog_init(void); diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h index 2bee64673bf5..e03a5448d09f 100644 --- a/src/vos/vos_internal.h +++ b/src/vos/vos_internal.h @@ -564,7 +564,10 @@ vos_feats_agg_time_update(daos_epoch_t epoch, uint64_t *feats) /** Iterator ops for objects and OIDs */ extern struct vos_iter_ops vos_oi_iter_ops; -extern struct vos_iter_ops vos_obj_iter_ops; +extern struct vos_iter_ops vos_obj_dkey_iter_ops; +extern struct vos_iter_ops vos_obj_akey_iter_ops; +extern struct vos_iter_ops vos_obj_sv_iter_ops; +extern struct vos_iter_ops vos_obj_ev_iter_ops; extern struct vos_iter_ops vos_cont_iter_ops; extern struct vos_iter_ops vos_dtx_iter_ops; @@ -1036,13 +1039,9 @@ struct vos_iterator { vos_iter_type_t it_type; enum vos_iter_state it_state; uint32_t it_ref_cnt; - uint32_t it_from_parent:1, - it_for_purge:1, - it_for_discard:1, - it_for_migration:1, - it_show_uncommitted:1, - it_ignore_uncommitted:1, - it_for_sysdb:1; + uint32_t it_from_parent : 1, it_key_flat : 1, it_key_fake : 1, it_for_purge : 1, + it_for_discard : 1, it_for_migration : 1, it_show_uncommitted : 1, + it_ignore_uncommitted : 1, it_for_sysdb : 1; }; /* Auxiliary structure for passing information between parent and nested @@ -1056,6 +1055,8 @@ struct vos_iter_info { struct evt_root *ii_evt; /* Pointer to btree for nested iterator */ struct btr_root *ii_btr; + /** Open tree handle for nested iterator */ + daos_handle_t ii_tree_hdl; /* oid to hold */ daos_unit_oid_t ii_oid; }; @@ -1065,7 +1066,8 @@ struct vos_iter_info { struct vea_space_info *ii_vea_info; /* Reference to vos object, set in iop_tree_prepare. */ struct vos_object *ii_obj; - d_iov_t *ii_akey; /* conditional akey */ + /** for fake akey, pass the parent ilog info */ + struct vos_ilog_info *ii_ilog_info; /** address range (RECX); rx_nr == 0 means entire range (0:~0ULL) */ daos_recx_t ii_recx; daos_epoch_range_t ii_epr; @@ -1078,7 +1080,8 @@ struct vos_iter_info { vos_it_epc_expr_t ii_epc_expr; /** iterator flags */ uint32_t ii_flags; - + /** Indicate this is a fake akey and which type */ + uint32_t ii_fake_akey_flag; }; /** function table for vos iterator */ @@ -1137,14 +1140,28 @@ vos_hdl2iter(daos_handle_t hdl) return (struct vos_iterator *)hdl.cookie; } +/** Special internal marker for fake akey. If set, it_hdl will point + * at krec of the dkey. We just need a struct as a placeholder + * to keep iterator presenting an akey to the caller. This adds + * some small complication to VOS iterator but simplifies rebuild + * and other entities that use it. This flag must not conflict with + * other iterator flags. + */ +#define VOS_IT_DKEY_SV (1 << 31) +#define VOS_IT_DKEY_EV (1 << 30) +D_CASSERT((VOS_IT_DKEY_SV & VOS_IT_MASK) == 0); +D_CASSERT((VOS_IT_DKEY_EV & VOS_IT_MASK) == 0); + /** iterator for dkey/akey/recx */ struct vos_obj_iter { /* public part of the iterator */ struct vos_iterator it_iter; /** Incarnation log entries for current iterator */ struct vos_ilog_info it_ilog_info; - /** handle of iterator */ - daos_handle_t it_hdl; + /** For flat akey, this will open value tree handle and either + * VOS_IT_DKEY_SV or VOS_IT_DKEY_EV will be set. + */ + daos_handle_t it_hdl; /** condition of the iterator: epoch logic expression */ vos_it_epc_expr_t it_epc_expr; /** iterator flags */ @@ -1152,13 +1169,15 @@ struct vos_obj_iter { /** condition of the iterator: epoch range */ daos_epoch_range_t it_epr; /** highest epoch where parent obj/key was punched */ - struct vos_punch_record it_punched; - /** condition of the iterator: attribute key */ - daos_key_t it_akey; + struct vos_punch_record it_punched; /* reference on the object */ struct vos_object *it_obj; /** condition of the iterator: extent range */ daos_recx_t it_recx; + /** For fake akey, save the dkey krec as well */ + struct vos_krec_df *it_dkey_krec; + /** Store the fake akey */ + char it_fake_akey; }; static inline struct vos_obj_iter * @@ -1195,8 +1214,10 @@ tree_rec_bundle2iov(struct vos_rec_bundle *rbund, d_iov_t *iov) } enum { - SUBTR_CREATE = (1 << 0), /**< may create the subtree */ - SUBTR_EVT = (1 << 1), /**< subtree is evtree */ + SUBTR_CREATE = (1 << 0), /**< may create the subtree */ + SUBTR_EVT = (1 << 1), /**< subtree is evtree */ + SUBTR_FLAT = (1 << 2), /**< use flat kv on create */ + SUBTR_NO_OPEN = (1 << 3), /**< Don't initialize the subtree if the key is flat */ }; /* vos_common.c */ @@ -1728,4 +1749,27 @@ vos_oi_upgrade_layout_ver(struct vos_container *cont, daos_unit_oid_t oid, void vos_lru_free_track(void *arg, daos_size_t size); void vos_lru_alloc_track(void *arg, daos_size_t size); + +static inline bool +vos_obj_flat_kv_supported(struct vos_container *cont, daos_unit_oid_t oid) +{ + struct vos_pool *pool = vos_cont2pool(cont); + + if ((pool->vp_feats & VOS_POOL_FEAT_FLAT_DKEY) == 0) + return false; + + if (daos_is_array(oid.id_pub) || daos_is_kv(oid.id_pub)) + return true; + + return false; +} + +/** For flat trees, we sometimes need a fake akey anchor */ +static inline void +vos_fake_anchor_create(daos_anchor_t *anchor) +{ + memset(&anchor->da_buf[0], 0, sizeof(anchor->da_buf)); + anchor->da_type = DAOS_ANCHOR_TYPE_HKEY; +} + #endif /* __VOS_INTERNAL_H__ */ diff --git a/src/vos/vos_io.c b/src/vos/vos_io.c index ef7e858bbb1a..6fdd25891ce9 100644 --- a/src/vos/vos_io.c +++ b/src/vos/vos_io.c @@ -1188,79 +1188,19 @@ has_uncertainty(const struct vos_io_context *ioc, } static int -akey_fetch(struct vos_io_context *ioc, daos_handle_t ak_toh) +fetch_value(struct vos_io_context *ioc, daos_iod_t *iod, daos_handle_t toh, + const daos_epoch_range_t *epr, bool standalone) { - daos_iod_t *iod = &ioc->ic_iods[ioc->ic_sgl_at]; - struct vos_krec_df *krec = NULL; - daos_epoch_range_t val_epr = {0}; - daos_handle_t toh = DAOS_HDL_INVAL; - int i, rc; - int flags = 0; - bool is_array = (iod->iod_type == DAOS_IOD_ARRAY); - bool has_cond = false; struct daos_recx_ep_list *shadow; - bool standalone = ioc->ic_cont->vc_pool->vp_sysdb; - - D_DEBUG(DB_IO, "akey "DF_KEY" fetch %s epr "DF_X64"-"DF_X64"\n", - DP_KEY(&iod->iod_name), - iod->iod_type == DAOS_IOD_ARRAY ? "array" : "single", - ioc->ic_epr.epr_lo, ioc->ic_epr.epr_hi); - - if (is_array) { - if (iod->iod_nr == 0 || iod->iod_recxs == NULL) { - D_ASSERT(iod->iod_nr == 0 && iod->iod_recxs == NULL); - D_DEBUG(DB_TRACE, "akey "DF_KEY" fetch array bypassed - NULL iod_recxs.\n", - DP_KEY(&iod->iod_name)); - return 0; - } - flags |= SUBTR_EVT; - } - - rc = key_tree_prepare(ioc->ic_obj, ak_toh, - VOS_BTR_AKEY, &iod->iod_name, flags, - DAOS_INTENT_DEFAULT, &krec, - (ioc->ic_check_existence || ioc->ic_read_ts_only) ? NULL : &toh, - ioc->ic_ts_set); - - if (stop_check(ioc, VOS_OF_COND_AKEY_FETCH, iod, &rc, true)) { - if (rc == 0 && !ioc->ic_read_ts_only) - iod_empty_sgl(ioc, ioc->ic_sgl_at); - VOS_TX_LOG_FAIL(rc, "Failed to get akey "DF_KEY" "DF_RC"\n", - DP_KEY(&iod->iod_name), DP_RC(rc)); - goto out; - } - - if (ioc->ic_ts_set != NULL) { - if (ioc->ic_ts_set->ts_flags & VOS_OF_COND_PER_AKEY && - iod->iod_flags & VOS_OF_COND_AKEY_FETCH) { - has_cond = true; - } else if (!(ioc->ic_ts_set->ts_flags & VOS_OF_COND_PER_AKEY) && - ioc->ic_ts_set->ts_flags & VOS_OF_COND_AKEY_FETCH) { - has_cond = true; - } - } + int rc = 0; + int i; - rc = key_ilog_check(ioc, krec, &ioc->ic_dkey_info, &val_epr, - &ioc->ic_akey_info, has_cond); - - if (stop_check(ioc, VOS_OF_COND_AKEY_FETCH, iod, &rc, false)) { - if (rc == 0 && !ioc->ic_read_ts_only) { - if (has_uncertainty(ioc, &ioc->ic_akey_info)) - goto fetch_value; - iod_empty_sgl(ioc, ioc->ic_sgl_at); - } - VOS_TX_LOG_FAIL(rc, "Fetch akey failed: rc="DF_RC"\n", - DP_RC(rc)); - goto out; - } - -fetch_value: if (ioc->ic_read_ts_only || ioc->ic_check_existence) - goto out; /* skip value fetch */ + return rc; if (iod->iod_type == DAOS_IOD_SINGLE) { - rc = akey_fetch_single(toh, &val_epr, &iod->iod_size, ioc); - goto out; + rc = akey_fetch_single(toh, epr, &iod->iod_size, ioc); + return rc; } iod->iod_size = 0; @@ -1284,8 +1224,7 @@ akey_fetch(struct vos_io_context *ioc, daos_handle_t ak_toh) while (iod_recx.rx_nr > 0) { akey_fetch_recx_get(&iod_recx, shadow, &fetch_recx, &shadow_ep); - rc = akey_fetch_recx(toh, &val_epr, &fetch_recx, - shadow_ep, &rsize, ioc); + rc = akey_fetch_recx(toh, epr, &fetch_recx, shadow_ep, &rsize, ioc); if (vos_dtx_continue_detect(rc, standalone)) continue; @@ -1293,7 +1232,7 @@ akey_fetch(struct vos_io_context *ioc, daos_handle_t ak_toh) if (rc != 0) { VOS_TX_LOG_FAIL(rc, "Failed to fetch index %d: " DF_RC"\n", i, DP_RC(rc)); - goto out; + return rc; } } @@ -1322,15 +1261,82 @@ akey_fetch(struct vos_io_context *ioc, daos_handle_t ak_toh) if (iod->iod_size != rsize) { D_ERROR("Cannot support mixed record size " DF_U64"/"DF_U64"\n", iod->iod_size, rsize); - rc = -DER_INVAL; - goto out; + return -DER_INVAL; } } if (vos_dtx_hit_inprogress(standalone)) - goto out; + return 0; ioc_trim_tail_holes(ioc); + + return rc; +} + +static int +akey_fetch(struct vos_io_context *ioc, daos_handle_t ak_toh) +{ + daos_iod_t *iod = &ioc->ic_iods[ioc->ic_sgl_at]; + struct vos_krec_df *krec = NULL; + daos_epoch_range_t val_epr = {0}; + daos_handle_t toh = DAOS_HDL_INVAL; + int rc; + int flags = 0; + bool is_array = (iod->iod_type == DAOS_IOD_ARRAY); + bool has_cond = false; + bool standalone = ioc->ic_cont->vc_pool->vp_sysdb; + + D_DEBUG(DB_IO, "akey " DF_KEY " fetch %s epr " DF_X64 "-" DF_X64 "\n", + DP_KEY(&iod->iod_name), iod->iod_type == DAOS_IOD_ARRAY ? "array" : "single", + ioc->ic_epr.epr_lo, ioc->ic_epr.epr_hi); + + if (is_array) { + if (iod->iod_nr == 0 || iod->iod_recxs == NULL) { + D_ASSERT(iod->iod_nr == 0 && iod->iod_recxs == NULL); + D_DEBUG(DB_TRACE, + "akey " DF_KEY " fetch array bypassed - NULL iod_recxs.\n", + DP_KEY(&iod->iod_name)); + return 0; + } + flags |= SUBTR_EVT; + } + + rc = key_tree_prepare( + ioc->ic_obj, ak_toh, VOS_BTR_AKEY, &iod->iod_name, flags, DAOS_INTENT_DEFAULT, &krec, + (ioc->ic_check_existence || ioc->ic_read_ts_only) ? NULL : &toh, ioc->ic_ts_set); + + if (stop_check(ioc, VOS_OF_COND_AKEY_FETCH, iod, &rc, true)) { + if (rc == 0 && !ioc->ic_read_ts_only) + iod_empty_sgl(ioc, ioc->ic_sgl_at); + VOS_TX_LOG_FAIL(rc, "Failed to get akey " DF_KEY " " DF_RC "\n", + DP_KEY(&iod->iod_name), DP_RC(rc)); + goto out; + } + + if (ioc->ic_ts_set != NULL) { + if (ioc->ic_ts_set->ts_flags & VOS_OF_COND_PER_AKEY && + iod->iod_flags & VOS_OF_COND_AKEY_FETCH) { + has_cond = true; + } else if (!(ioc->ic_ts_set->ts_flags & VOS_OF_COND_PER_AKEY) && + ioc->ic_ts_set->ts_flags & VOS_OF_COND_AKEY_FETCH) { + has_cond = true; + } + } + + rc = key_ilog_check(ioc, krec, &ioc->ic_dkey_info, &val_epr, &ioc->ic_akey_info, has_cond); + + if (stop_check(ioc, VOS_OF_COND_AKEY_FETCH, iod, &rc, false)) { + if (rc == 0 && !ioc->ic_read_ts_only) { + if (has_uncertainty(ioc, &ioc->ic_akey_info)) + goto fetch_value; + iod_empty_sgl(ioc, ioc->ic_sgl_at); + } + VOS_TX_LOG_FAIL(rc, "Fetch akey failed: rc=" DF_RC "\n", DP_RC(rc)); + goto out; + } + +fetch_value: + rc = fetch_value(ioc, iod, toh, &val_epr, standalone); out: if (daos_handle_is_valid(toh)) key_tree_release(toh, is_array); @@ -1355,6 +1361,7 @@ dkey_fetch(struct vos_io_context *ioc, daos_key_t *dkey) struct vos_krec_df *krec; daos_handle_t toh = DAOS_HDL_INVAL; int i, rc; + int flags = 0; bool has_cond; bool standalone = ioc->ic_cont->vc_pool->vp_sysdb; @@ -1362,9 +1369,16 @@ dkey_fetch(struct vos_io_context *ioc, daos_key_t *dkey) if (rc != 0) return rc; - rc = key_tree_prepare(obj, obj->obj_toh, VOS_BTR_DKEY, - dkey, 0, DAOS_INTENT_DEFAULT, &krec, - &toh, ioc->ic_ts_set); + if (vos_obj_flat_kv_supported(ioc->ic_cont, obj->obj_id)) { + if (ioc->ic_iod_nr == 1) { + flags |= SUBTR_FLAT; + if (ioc->ic_iods[0].iod_type == DAOS_IOD_ARRAY) + flags |= SUBTR_EVT; + } + } + + rc = key_tree_prepare(obj, obj->obj_toh, VOS_BTR_DKEY, dkey, flags, DAOS_INTENT_DEFAULT, + &krec, &toh, ioc->ic_ts_set); if (stop_check(ioc, VOS_COND_FETCH_MASK | VOS_OF_COND_PER_AKEY, NULL, &rc, true)) { D_DEBUG(DB_IO, "Stop fetch "DF_UOID": "DF_RC"\n", DP_UOID(obj->obj_id), @@ -1408,14 +1422,19 @@ dkey_fetch(struct vos_io_context *ioc, daos_key_t *dkey) } fetch_akey: - for (i = 0; i < ioc->ic_iod_nr; i++) { - iod_set_cursor(ioc, i); - rc = akey_fetch(ioc, toh); - if (vos_dtx_continue_detect(rc, standalone)) - continue; + if (krec->kr_bmap & KREC_BF_FLAT) { + iod_set_cursor(ioc, 0); + rc = fetch_value(ioc, &ioc->ic_iods[0], toh, &ioc->ic_epr, standalone); + } else { + for (i = 0; i < ioc->ic_iod_nr; i++) { + iod_set_cursor(ioc, i); + rc = akey_fetch(ioc, toh); + if (vos_dtx_continue_detect(rc, standalone)) + continue; - if (rc != 0) - break; + if (rc != 0) + break; + } } /* Add this check to prevent some new added logic after above for(). */ @@ -1424,7 +1443,7 @@ dkey_fetch(struct vos_io_context *ioc, daos_key_t *dkey) out: if (daos_handle_is_valid(toh)) - key_tree_release(toh, false); + key_tree_release(toh, (krec->kr_bmap & KREC_BF_EVT) != 0); return vos_dtx_hit_inprogress(standalone) ? -DER_INPROGRESS : rc; } @@ -1732,6 +1751,59 @@ vos_ioc_mark_agg(struct vos_io_context *ioc) &ioc->ic_cont->vc_cont_df->cd_obj_root, ioc->ic_epr.epr_hi); } +static int +update_value(struct vos_io_context *ioc, daos_iod_t *iod, struct dcs_csum_info *iod_csums, + int pm_ver, daos_handle_t toh, uint16_t minor_epc) +{ + struct dcs_csum_info *recx_csum; + struct vos_object *obj = ioc->ic_obj; + int rc = 0; + int i; + + if (iod->iod_type == DAOS_IOD_SINGLE) { + uint64_t gsize = iod->iod_size; + + /* See obj_singv_ec_rw_filter. */ + if (ioc->ic_ec && iod->iod_recxs != NULL) + gsize = (uintptr_t)iod->iod_recxs; + + rc = akey_update_single(toh, pm_ver, iod->iod_size, gsize, ioc, minor_epc); + if (rc) + D_ERROR("akey " DF_KEY " update, akey_update_single failed, " DF_RC "\n", + DP_KEY(&iod->iod_name), DP_RC(rc)); + return rc; + } + + for (i = 0; i < iod->iod_nr; i++) { + umem_off_t umoff = iod_update_umoff(ioc); + + if (iod->iod_recxs[i].rx_nr == 0) { + D_ASSERT(UMOFF_IS_NULL(umoff)); + D_DEBUG(DB_IO, "Skip empty write IOD at %d: idx %lu, nr %lu\n", i, + (unsigned long)iod->iod_recxs[i].rx_idx, + (unsigned long)iod->iod_recxs[i].rx_nr); + continue; + } + + recx_csum = recx_csum_at(iod_csums, i, iod); + rc = akey_update_recx(toh, pm_ver, &iod->iod_recxs[i], recx_csum, iod->iod_size, + ioc, minor_epc); + if (rc == 1) { + ioc->ic_agg_needed = 1; + rc = 0; + } + if (rc != 0) { + VOS_TX_LOG_FAIL(rc, + DF_UOID " akey " DF_KEY " update, akey_update_recx" + " failed, " DF_RC "\n", + DP_UOID(obj->obj_id), DP_KEY(&iod->iod_name), DP_RC(rc)); + break; + } + } + + return rc; +} + static int akey_update(struct vos_io_context *ioc, uint32_t pm_ver, daos_handle_t ak_toh, uint16_t minor_epc) @@ -1739,13 +1811,11 @@ akey_update(struct vos_io_context *ioc, uint32_t pm_ver, daos_handle_t ak_toh, struct vos_object *obj = ioc->ic_obj; struct vos_krec_df *krec = NULL; daos_iod_t *iod = &ioc->ic_iods[ioc->ic_sgl_at]; - struct dcs_csum_info *iod_csums = vos_csum_at(ioc->ic_iod_csums, ioc->ic_sgl_at); - struct dcs_csum_info *recx_csum; + struct dcs_csum_info *iod_csums = vos_csum_at(ioc->ic_iod_csums, ioc->ic_sgl_at); uint32_t update_cond = 0; bool is_array = (iod->iod_type == DAOS_IOD_ARRAY); int flags = SUBTR_CREATE; - daos_handle_t toh = DAOS_HDL_INVAL; - int i; + daos_handle_t toh = DAOS_HDL_INVAL; int rc = 0; D_DEBUG(DB_TRACE, "akey "DF_KEY" update %s value eph "DF_X64"\n", @@ -1807,49 +1877,7 @@ akey_update(struct vos_io_context *ioc, uint32_t pm_ver, daos_handle_t ak_toh, goto out; } - if (iod->iod_type == DAOS_IOD_SINGLE) { - uint64_t gsize = iod->iod_size; - - /* See obj_singv_ec_rw_filter. */ - if (ioc->ic_ec && iod->iod_recxs != NULL) - gsize = (uintptr_t)iod->iod_recxs; - - rc = akey_update_single(toh, pm_ver, iod->iod_size, gsize, ioc, - minor_epc); - if (rc) - D_ERROR("akey "DF_KEY" update, akey_update_single failed, "DF_RC"\n", - DP_KEY(&iod->iod_name), DP_RC(rc)); - goto out; - } /* else: array */ - - for (i = 0; i < iod->iod_nr; i++) { - umem_off_t umoff = iod_update_umoff(ioc); - - if (iod->iod_recxs[i].rx_nr == 0) { - D_ASSERT(UMOFF_IS_NULL(umoff)); - D_DEBUG(DB_IO, - "Skip empty write IOD at %d: idx %lu, nr %lu\n", - i, (unsigned long)iod->iod_recxs[i].rx_idx, - (unsigned long)iod->iod_recxs[i].rx_nr); - continue; - } - - recx_csum = recx_csum_at(iod_csums, i, iod); - rc = akey_update_recx(toh, pm_ver, &iod->iod_recxs[i], - recx_csum, iod->iod_size, ioc, - minor_epc); - if (rc == 1) { - ioc->ic_agg_needed = 1; - rc = 0; - } - if (rc != 0) { - VOS_TX_LOG_FAIL(rc, DF_UOID" akey "DF_KEY" update, akey_update_recx" - " failed, "DF_RC"\n", DP_UOID(obj->obj_id), - DP_KEY(&iod->iod_name), DP_RC(rc)); - goto out; - } - } - + rc = update_value(ioc, iod, iod_csums, pm_ver, toh, minor_epc); out: if (daos_handle_is_valid(toh)) key_tree_release(toh, is_array); @@ -1868,6 +1896,7 @@ dkey_update(struct vos_io_context *ioc, uint32_t pm_ver, daos_key_t *dkey, daos_handle_t ak_toh; struct vos_krec_df *krec; uint32_t update_cond = 0; + uint32_t flags = SUBTR_CREATE; bool subtr_created = false; int i, rc; @@ -1875,9 +1904,16 @@ dkey_update(struct vos_io_context *ioc, uint32_t pm_ver, daos_key_t *dkey, if (rc != 0) return rc; - rc = key_tree_prepare(obj, obj->obj_toh, VOS_BTR_DKEY, dkey, - SUBTR_CREATE, DAOS_INTENT_UPDATE, &krec, &ak_toh, - ioc->ic_ts_set); + if (vos_obj_flat_kv_supported(ioc->ic_cont, obj->obj_id)) { + if (ioc->ic_iod_nr == 1) { + flags |= SUBTR_FLAT; + if (ioc->ic_iods[0].iod_type == DAOS_IOD_ARRAY) + flags |= SUBTR_EVT; + } + } + + rc = key_tree_prepare(obj, obj->obj_toh, VOS_BTR_DKEY, dkey, flags, DAOS_INTENT_UPDATE, + &krec, &ak_toh, ioc->ic_ts_set); if (rc != 0) { D_ERROR("Error preparing dkey tree: rc="DF_RC"\n", DP_RC(rc)); goto out; @@ -1908,12 +1944,18 @@ dkey_update(struct vos_io_context *ioc, uint32_t pm_ver, daos_key_t *dkey, goto out; } - for (i = 0; i < ioc->ic_iod_nr; i++) { - iod_set_cursor(ioc, i); + if (krec->kr_bmap & KREC_BF_FLAT) { + struct dcs_csum_info *iod_csums = vos_csum_at(ioc->ic_iod_csums, 0); + iod_set_cursor(ioc, 0); + rc = update_value(ioc, &ioc->ic_iods[0], iod_csums, pm_ver, ak_toh, minor_epc); + } else { + for (i = 0; i < ioc->ic_iod_nr; i++) { + iod_set_cursor(ioc, i); - rc = akey_update(ioc, pm_ver, ak_toh, minor_epc); - if (rc != 0) - goto out; + rc = akey_update(ioc, pm_ver, ak_toh, minor_epc); + if (rc != 0) + goto out; + } } out: @@ -1924,7 +1966,7 @@ dkey_update(struct vos_io_context *ioc, uint32_t pm_ver, daos_key_t *dkey, goto release; release: - key_tree_release(ak_toh, false); + key_tree_release(ak_toh, (krec->kr_bmap & KREC_BF_EVT) != 0); if (rc == 0 && ioc->ic_agg_needed) rc = vos_key_mark_agg(ioc->ic_cont, krec, ioc->ic_epr.epr_hi); diff --git a/src/vos/vos_iterator.c b/src/vos/vos_iterator.c index a22af96de456..1537f4b4369a 100644 --- a/src/vos/vos_iterator.c +++ b/src/vos/vos_iterator.c @@ -23,46 +23,46 @@ struct vos_iter_dict { }; static struct vos_iter_dict vos_iterators[] = { - { - .id_type = VOS_ITER_COUUID, - .id_name = "co", - .id_ops = &vos_cont_iter_ops, - }, - { - .id_type = VOS_ITER_OBJ, - .id_name = "obj", - .id_ops = &vos_oi_iter_ops, - }, - { - .id_type = VOS_ITER_DKEY, - .id_name = "dkey", - .id_ops = &vos_obj_iter_ops, - }, - { - .id_type = VOS_ITER_AKEY, - .id_name = "akey", - .id_ops = &vos_obj_iter_ops, - }, - { - .id_type = VOS_ITER_SINGLE, - .id_name = "single", - .id_ops = &vos_obj_iter_ops, - }, - { - .id_type = VOS_ITER_RECX, - .id_name = "recx", - .id_ops = &vos_obj_iter_ops, - }, - { - .id_type = VOS_ITER_DTX, - .id_name = "dtx", - .id_ops = &vos_dtx_iter_ops, - }, - { - .id_type = VOS_ITER_NONE, - .id_name = "unknown", - .id_ops = NULL, - }, + { + .id_type = VOS_ITER_COUUID, + .id_name = "co", + .id_ops = &vos_cont_iter_ops, + }, + { + .id_type = VOS_ITER_OBJ, + .id_name = "obj", + .id_ops = &vos_oi_iter_ops, + }, + { + .id_type = VOS_ITER_DKEY, + .id_name = "dkey", + .id_ops = &vos_obj_dkey_iter_ops, + }, + { + .id_type = VOS_ITER_AKEY, + .id_name = "akey", + .id_ops = &vos_obj_akey_iter_ops, + }, + { + .id_type = VOS_ITER_SINGLE, + .id_name = "single", + .id_ops = &vos_obj_sv_iter_ops, + }, + { + .id_type = VOS_ITER_RECX, + .id_name = "recx", + .id_ops = &vos_obj_ev_iter_ops, + }, + { + .id_type = VOS_ITER_DTX, + .id_name = "dtx", + .id_ops = &vos_dtx_iter_ops, + }, + { + .id_type = VOS_ITER_NONE, + .id_name = "unknown", + .id_ops = NULL, + }, }; const char * @@ -84,7 +84,7 @@ nested_prepare(vos_iter_type_t type, struct vos_iter_dict *dict, struct vos_iterator *iter = vos_hdl2iter(param->ip_ih); struct vos_iterator *citer; struct dtx_handle *old; - struct vos_iter_info info; + struct vos_iter_info info = {}; int rc; D_ASSERT(iter->it_ops != NULL); @@ -117,8 +117,7 @@ nested_prepare(vos_iter_type_t type, struct vos_iter_dict *dict, info.ii_epc_expr = param->ip_epc_expr; info.ii_recx = param->ip_recx; - info.ii_flags = param->ip_flags; - info.ii_akey = ¶m->ip_akey; + info.ii_flags = param->ip_flags; rc = dict->id_ops->iop_nested_prepare(type, &info, &citer); if (rc != 0) { diff --git a/src/vos/vos_layout.h b/src/vos/vos_layout.h index 1746147800d1..ed3fe7688840 100644 --- a/src/vos/vos_layout.h +++ b/src/vos/vos_layout.h @@ -102,7 +102,7 @@ enum vos_gc_type { #define VOS_POOL_FEAT_2_4 (VOS_POOL_FEAT_CHK | VOS_POOL_FEAT_DYN_ROOT) /** 2.6 features */ -#define VOS_POOL_FEAT_2_6 (VOS_POOL_FEAT_EMB_VALUE) +#define VOS_POOL_FEAT_2_6 (VOS_POOL_FEAT_EMB_VALUE | VOS_POOL_FEAT_FLAT_DKEY) /** * Durable format for VOS pool @@ -288,11 +288,13 @@ D_CASSERT(offsetof(struct vos_cont_df, cd_dtx_committed_tail) == /** btree (d/a-key) record bit flags */ enum vos_krec_bf { /* Array value (evtree) */ - KREC_BF_EVT = (1 << 0), + KREC_BF_EVT = (1 << 0), /* Single Value or Key (btree) */ - KREC_BF_BTR = (1 << 1), - /* it's a dkey, otherwise is akey */ - KREC_BF_DKEY = (1 << 2), + KREC_BF_BTR = (1 << 1), + /* it's a dkey, otherwise is akey or single value if KREC_BF_FLAT is set */ + KREC_BF_DKEY = (1 << 2), + /* Value is stored in DKEY */ + KREC_BF_FLAT = (1 << 3), }; /** diff --git a/src/vos/vos_obj.c b/src/vos/vos_obj.c index b7ce02edd08e..ced0293872cd 100644 --- a/src/vos/vos_obj.c +++ b/src/vos/vos_obj.c @@ -24,6 +24,18 @@ D_CASSERT((uint32_t)VOS_VIS_FLAG_VISIBLE == (uint32_t)EVT_VISIBLE); D_CASSERT((uint32_t)VOS_VIS_FLAG_PARTIAL == (uint32_t)EVT_PARTIAL); D_CASSERT((uint32_t)VOS_VIS_FLAG_LAST == (uint32_t)EVT_LAST); +static inline bool +is_fake_iter(struct vos_obj_iter *oiter) +{ + return (oiter->it_flags & (VOS_IT_DKEY_EV | VOS_IT_DKEY_SV)) != 0; +} + +static inline bool +fake_iter_child_is_array(struct vos_obj_iter *oiter) +{ + return (oiter->it_flags & VOS_IT_DKEY_EV) != 0; +} + bool vos_dkey_punch_propagate; struct vos_key_info { @@ -343,7 +355,7 @@ key_punch(struct vos_object *obj, daos_epoch_t epoch, daos_epoch_t bound, vos_ilog_fetch_finish(&info->ki_akey); if (daos_handle_is_valid(toh)) - key_tree_release(toh, 0); + key_tree_release(toh, (krec->kr_bmap & KREC_BF_EVT) != 0); D_FREE(info); @@ -418,6 +430,15 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, if (oid.id_shard % 3 == 1 && DAOS_FAIL_CHECK(DAOS_DTX_FAIL_IO)) return -DER_IO; + cont = vos_hdl2cont(coh); + + if (vos_obj_flat_kv_supported(cont, oid) && dkey != NULL && akeys != NULL) { + D_ERROR("Akey punch is not supported with flat object types: " DF_UOID "\n", + DP_UOID(oid)); + + return -DER_INVAL; + } + if (dtx_is_valid_handle(dth)) { epr.epr_hi = dth->dth_epoch; bound = MAX(dth->dth_epoch_bound, dth->dth_epoch); @@ -429,8 +450,6 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, D_DEBUG(DB_IO, "Punch "DF_UOID", epoch "DF_X64"\n", DP_UOID(oid), epr.epr_hi); - cont = vos_hdl2cont(coh); - if (dtx_is_valid_handle(dth)) { if (akey_nr) { cflags = VOS_TS_WRITE_AKEY; @@ -562,8 +581,10 @@ vos_obj_key2anchor(daos_handle_t coh, daos_unit_oid_t oid, daos_key_t *dkey, dao daos_anchor_t *anchor) { struct vos_container *cont; + struct vos_krec_df *krec = NULL; struct daos_lru_cache *occ; int rc; + int flags = 0; struct vos_object *obj; daos_epoch_range_t epr = {0, DAOS_EPOCH_MAX}; daos_handle_t toh; @@ -598,9 +619,16 @@ vos_obj_key2anchor(daos_handle_t coh, daos_unit_oid_t oid, daos_key_t *dkey, dao goto out; } + /** If the dkey is flat, this will enable the operation to succeed */ + if (vos_obj_flat_kv_supported(obj->obj_cont, obj->obj_id)) { + flags |= SUBTR_FLAT; + if (daos_is_array(obj->obj_id.id_pub)) + flags |= SUBTR_EVT; + } + /** Otherwise, we need to find the dkey to convert the akey to the anchor */ - rc = key_tree_prepare(obj, obj->obj_toh, VOS_BTR_DKEY, dkey, 0, DAOS_INTENT_DEFAULT, NULL, - &toh, NULL); + rc = key_tree_prepare(obj, obj->obj_toh, VOS_BTR_DKEY, dkey, flags, DAOS_INTENT_DEFAULT, + &krec, &toh, NULL); if (rc) { if (rc == -DER_NONEXIST) { daos_anchor_set_eof(anchor); @@ -611,12 +639,19 @@ vos_obj_key2anchor(daos_handle_t coh, daos_unit_oid_t oid, daos_key_t *dkey, dao D_GOTO(out, rc); } - rc = dbtree_key2anchor(toh, akey, anchor); + if (krec->kr_bmap & KREC_BF_FLAT) { + /** There is no akey tree to query. In accordance with the design to fake it for + * iterators, let's create a fake anchor + */ + vos_fake_anchor_create(anchor); + } else { + rc = dbtree_key2anchor(toh, akey, anchor); + } D_DEBUG(DB_TRACE, "oid=" DF_UOID " dkey=" DF_KEY " akey=" DF_KEY " to anchor: rc=" DF_RC "\n", DP_UOID(oid), DP_KEY(dkey), DP_KEY(akey), DP_RC(rc)); - key_tree_release(toh, false); + key_tree_release(toh, (krec->kr_bmap & KREC_BF_EVT) != 0); out: vos_obj_release(occ, obj, false); @@ -769,8 +804,8 @@ key_iter_ilog_check(struct vos_krec_df *krec, struct vos_obj_iter *oiter, } static int -key_ilog_prepare(struct vos_obj_iter *oiter, daos_handle_t toh, int key_type, - daos_key_t *key, int flags, daos_handle_t *sub_toh, +key_ilog_prepare(struct vos_obj_iter *oiter, daos_handle_t toh, int key_type, daos_key_t *key, + int flags, daos_handle_t *sub_toh, struct vos_krec_df **krecp, daos_epoch_range_t *epr, struct vos_punch_record *punched, struct vos_ilog_info *info, struct vos_ts_set *ts_set) { @@ -778,6 +813,9 @@ key_ilog_prepare(struct vos_obj_iter *oiter, daos_handle_t toh, int key_type, struct vos_object *obj = oiter->it_obj; int rc; + if (krecp != NULL) + *krecp = NULL; + rc = key_tree_prepare(obj, toh, key_type, key, flags, vos_iter_intent(&oiter->it_iter), &krec, sub_toh, ts_set); @@ -799,6 +837,9 @@ key_ilog_prepare(struct vos_obj_iter *oiter, daos_handle_t toh, int key_type, &info->ii_prior_punch)) *punched = info->ii_prior_punch; + if (krecp != NULL) + *krecp = krec; + return 0; fail: if (sub_toh) @@ -806,6 +847,23 @@ key_ilog_prepare(struct vos_obj_iter *oiter, daos_handle_t toh, int key_type, return rc; } +static inline int +key_ilog_prepare_dkey(struct vos_obj_iter *oiter, daos_key_t *key, daos_handle_t *sub_toh, + struct vos_krec_df **krecp, struct vos_ts_set *ts_set) +{ + struct vos_object *obj = oiter->it_obj; + int flags = 0; + + if (vos_obj_flat_kv_supported(obj->obj_cont, obj->obj_id)) { + flags |= SUBTR_FLAT; + if (daos_is_array(obj->obj_id.id_pub)) + flags |= SUBTR_EVT; + } + + return key_ilog_prepare(oiter, obj->obj_toh, VOS_BTR_DKEY, key, flags, sub_toh, krecp, + &oiter->it_epr, &oiter->it_punched, &oiter->it_ilog_info, ts_set); +} + /** * @defgroup vos_obj_iters VOS object iterators * @{ @@ -934,6 +992,7 @@ key_iter_fetch_root(struct vos_obj_iter *oiter, vos_iter_type_t type, struct vos_iter_info *info) { struct vos_object *obj = oiter->it_obj; + struct evt_desc_cbs cbs; struct vos_krec_df *krec; struct vos_rec_bundle rbund; d_iov_t keybuf; @@ -968,10 +1027,42 @@ key_iter_fetch_root(struct vos_obj_iter *oiter, vos_iter_type_t type, if ((krec->kr_bmap & KREC_BF_EVT) == 0) return -DER_NONEXIST; info->ii_evt = &krec->kr_evt; - } else { + } else if (type == VOS_ITER_SINGLE || (krec->kr_bmap & KREC_BF_FLAT) == 0) { if ((krec->kr_bmap & KREC_BF_BTR) == 0) return -DER_NONEXIST; info->ii_btr = &krec->kr_btr; + } else { + D_ASSERTF(type == VOS_ITER_AKEY, "type = %d\n", type); + D_ASSERTF(krec->kr_bmap & KREC_BF_FLAT, "krec->kr_bmap = %x\n", krec->kr_bmap); + /** For fake akey, we open the subtree and store it in the + * iterator handle. For nested case, go ahead and open the + * subtree + */ + if (krec->kr_bmap & KREC_BF_EVT) { + vos_evt_desc_cbs_init(&cbs, vos_obj2pool(obj), vos_cont2hdl(obj->obj_cont)); + rc = evt_open(info->ii_evt, info->ii_uma, &cbs, &info->ii_tree_hdl); + if (rc) { + D_DEBUG(DB_TRACE, + "Failed to open tree for nested iterator:" + " rc = " DF_RC "\n", + DP_RC(rc)); + return rc; + } + info->ii_fake_akey_flag = VOS_IT_DKEY_EV; + } else { + rc = dbtree_open_inplace_ex(info->ii_btr, info->ii_uma, + vos_cont2hdl(obj->obj_cont), vos_obj2pool(obj), + &info->ii_tree_hdl); + if (rc) { + D_DEBUG(DB_TRACE, + "Failed to open tree for nested iterator:" + " rc = " DF_RC "\n", + DP_RC(rc)); + return rc; + } + info->ii_fake_akey_flag = VOS_IT_DKEY_SV; + info->ii_ilog_info = &oiter->it_ilog_info; + } } return 0; @@ -992,93 +1083,16 @@ key_iter_copy(struct vos_obj_iter *oiter, vos_iter_entry_t *ent, return 0; } -/** - * Check if the current entry can match the iterator condition, this function - * returns VOS_ITER_CB_NONE for true, returns VOS_ITER_CB_SKIP if further - * operation is required. - */ -static int -key_iter_match(struct vos_obj_iter *oiter, vos_iter_entry_t *ent, daos_anchor_t *anchor, - uint32_t flags) -{ - struct vos_object *obj = oiter->it_obj; - daos_epoch_range_t *epr = &oiter->it_epr; - struct vos_ilog_info info; - daos_handle_t toh; - int rc; - - rc = key_iter_fetch(oiter, ent, anchor, true, flags); - if (rc != 0) { - VOS_TX_TRACE_FAIL(rc, "Failed to fetch the entry: "DF_RC"\n", DP_RC(rc)); - return rc; - } - - if ((oiter->it_iter.it_type == VOS_ITER_AKEY) || - (oiter->it_akey.iov_buf == NULL)) /* dkey w/o akey as condition */ - return VOS_ITER_CB_NONE; - - /* else: has akey as condition */ - if (epr->epr_lo != epr->epr_hi || (oiter->it_flags & VOS_IT_PUNCHED)) { - D_ERROR("Cannot support epoch range for conditional iteration " - "because it is not clearly defined.\n"); - return -DER_INVAL; /* XXX simplify it for now */ - } - - rc = key_tree_prepare(obj, obj->obj_toh, VOS_BTR_DKEY, - &ent->ie_key, 0, vos_iter_intent(&oiter->it_iter), - NULL, &toh, NULL); - if (rc != 0) { - D_DEBUG(DB_IO, "can't load the akey tree: "DF_RC"\n", - DP_RC(rc)); - return rc; - } - - vos_ilog_fetch_init(&info); - rc = key_ilog_prepare(oiter, toh, VOS_BTR_AKEY, &oiter->it_akey, 0, - NULL, NULL, NULL, &info, NULL); - if (rc == 0) - rc = VOS_ITER_CB_NONE; - - if (rc == -DER_NONEXIST) - rc = VOS_ITER_CB_SKIP; - - vos_ilog_fetch_finish(&info); - key_tree_release(toh, false); - - return rc; -} - -/** - * Check if the current item can match the provided condition (with the - * giving a-key). If the item can't match the condition, this function - * traverses the tree until a matched item is found. - */ +/** Check the current key */ static int key_iter_match_probe(struct vos_obj_iter *oiter, daos_anchor_t *anchor, uint32_t flags) { static __thread vos_iter_entry_t entry; int rc; -retry: - rc = key_iter_match(oiter, &entry, anchor, flags); - switch (rc) { - default: - /** Either there is an error, we aborted the iterator, or - * the callback imposed a yield and we need to tell upper - * layer to re-probe - */ - break; - case VOS_ITER_CB_NONE: - /* already match the condition, no further operation */ + rc = key_iter_fetch(oiter, &entry, anchor, true, flags); + if (rc == VOS_ITER_CB_NONE) rc = 0; - break; - case VOS_ITER_CB_SKIP: - flags = 0; - /* move to the next tree record */ - rc = dbtree_iter_next(oiter->it_hdl); - if (rc == 0) - goto retry; - } D_ASSERT(rc <= 0 || (rc & (VOS_ITER_CB_EXIT | VOS_ITER_CB_DELETE | VOS_ITER_CB_YIELD | VOS_ITER_CB_ABORT)) != 0); VOS_TX_TRACE_FAIL(rc, "match failed, rc="DF_RC"\n", @@ -1093,6 +1107,7 @@ key_iter_probe(struct vos_obj_iter *oiter, daos_anchor_t *anchor, uint32_t flags int rc; next_opc = (flags & VOS_ITER_PROBE_NEXT) ? BTR_PROBE_GT : BTR_PROBE_GE; + rc = dbtree_iter_probe(oiter->it_hdl, vos_anchor_is_zero(anchor) ? BTR_PROBE_FIRST : next_opc, vos_iter_intent(&oiter->it_iter), @@ -1123,12 +1138,10 @@ key_iter_next(struct vos_obj_iter *oiter, daos_anchor_t *anchor) * Iterator for the d-key tree. */ static int -dkey_iter_prepare(struct vos_obj_iter *oiter, daos_key_t *akey) +dkey_iter_prepare(struct vos_obj_iter *oiter) { int rc; - oiter->it_akey = *akey; - rc = dbtree_iter_prepare(oiter->it_obj->obj_toh, 0, &oiter->it_hdl); return rc; @@ -1142,17 +1155,31 @@ akey_iter_prepare(struct vos_obj_iter *oiter, daos_key_t *dkey, struct vos_ts_set *ts_set) { daos_handle_t toh; - int rc; + struct vos_krec_df *krec = NULL; + int rc; - rc = key_ilog_prepare(oiter, oiter->it_obj->obj_toh, VOS_BTR_DKEY, dkey, - 0, &toh, &oiter->it_epr, &oiter->it_punched, - &oiter->it_ilog_info, ts_set); + rc = key_ilog_prepare_dkey(oiter, dkey, &toh, &krec, ts_set); if (rc != 0) goto failed; - /* see BTR_ITER_EMBEDDED for the details */ - rc = dbtree_iter_prepare(toh, BTR_ITER_EMBEDDED, &oiter->it_hdl); - key_tree_release(toh, false); + if (krec->kr_bmap & KREC_BF_FLAT) { + /** In such case, toh will refer to a child tree so we an + * initialze its iterator in such case as it is needed. + * We also set the type of the tree here so we know what + * type of nested iterator we need to use. + */ + oiter->it_hdl = toh; + if (krec->kr_bmap & KREC_BF_EVT) + oiter->it_flags |= VOS_IT_DKEY_EV; + else + oiter->it_flags |= VOS_IT_DKEY_SV; + oiter->it_fake_akey = 0; + oiter->it_dkey_krec = krec; + } else { + /* see BTR_ITER_EMBEDDED for the details */ + rc = dbtree_iter_prepare(toh, BTR_ITER_EMBEDDED, &oiter->it_hdl); + key_tree_release(toh, false); + } if (rc == 0) return 0; @@ -1186,22 +1213,25 @@ static int singv_iter_prepare(struct vos_obj_iter *oiter, daos_key_t *dkey, daos_key_t *akey) { - struct vos_object *obj = oiter->it_obj; + struct vos_krec_df *krec = NULL; daos_handle_t ak_toh; daos_handle_t sv_toh; int rc; - rc = key_ilog_prepare(oiter, obj->obj_toh, VOS_BTR_DKEY, dkey, 0, - &ak_toh, &oiter->it_epr, &oiter->it_punched, - &oiter->it_ilog_info, NULL); + rc = key_ilog_prepare_dkey(oiter, dkey, &ak_toh, &krec, NULL); if (rc != 0) return rc; - rc = key_ilog_prepare(oiter, ak_toh, VOS_BTR_AKEY, akey, 0, &sv_toh, - &oiter->it_epr, &oiter->it_punched, - &oiter->it_ilog_info, NULL); - if (rc != 0) - D_GOTO(failed_1, rc); + if (krec->kr_bmap & KREC_BF_FLAT) { + sv_toh = ak_toh; + ak_toh = DAOS_HDL_INVAL; + } else { + rc = key_ilog_prepare(oiter, ak_toh, VOS_BTR_AKEY, akey, 0, &sv_toh, NULL, + &oiter->it_epr, &oiter->it_punched, &oiter->it_ilog_info, + NULL); + if (rc != 0) + D_GOTO(failed_1, rc); + } /* see BTR_ITER_EMBEDDED for the details */ rc = dbtree_iter_prepare(sv_toh, BTR_ITER_EMBEDDED, &oiter->it_hdl); @@ -1210,7 +1240,8 @@ singv_iter_prepare(struct vos_obj_iter *oiter, daos_key_t *dkey, DP_RC(rc)); key_tree_release(sv_toh, false); failed_1: - key_tree_release(ak_toh, false); + if (daos_handle_is_valid(ak_toh)) + key_tree_release(ak_toh, false); return rc; } @@ -1486,24 +1517,27 @@ static int recx_iter_prepare(struct vos_obj_iter *oiter, daos_key_t *dkey, daos_key_t *akey, struct vos_ts_set *ts_set) { - struct vos_object *obj = oiter->it_obj; + struct vos_krec_df *krec = NULL; struct evt_filter filter = {0}; daos_handle_t ak_toh; daos_handle_t rx_toh; int rc; uint32_t options; - rc = key_ilog_prepare(oiter, obj->obj_toh, VOS_BTR_DKEY, dkey, 0, - &ak_toh, &oiter->it_epr, &oiter->it_punched, - &oiter->it_ilog_info, ts_set); + rc = key_ilog_prepare_dkey(oiter, dkey, &ak_toh, &krec, NULL); if (rc != 0) return rc; - rc = key_ilog_prepare(oiter, ak_toh, VOS_BTR_AKEY, akey, SUBTR_EVT, - &rx_toh, &oiter->it_epr, &oiter->it_punched, - &oiter->it_ilog_info, ts_set); - if (rc != 0) - D_GOTO(failed, rc); + if (krec->kr_bmap & KREC_BF_FLAT) { + rx_toh = ak_toh; + ak_toh = DAOS_HDL_INVAL; + } else { + rc = key_ilog_prepare(oiter, ak_toh, VOS_BTR_AKEY, akey, SUBTR_EVT, &rx_toh, NULL, + &oiter->it_epr, &oiter->it_punched, &oiter->it_ilog_info, + ts_set); + if (rc != 0) + D_GOTO(failed, rc); + } recx2filter(&filter, &oiter->it_recx); filter.fr_epr.epr_lo = oiter->it_epr.epr_lo; @@ -1520,7 +1554,8 @@ recx_iter_prepare(struct vos_obj_iter *oiter, daos_key_t *dkey, } key_tree_release(rx_toh, true); failed: - key_tree_release(ak_toh, false); + if (daos_handle_is_valid(ak_toh)) + key_tree_release(ak_toh, false); return rc; } static int @@ -1693,7 +1728,7 @@ vos_obj_iter_prep(vos_iter_type_t type, vos_iter_param_t *param, break; case VOS_ITER_DKEY: - rc = dkey_iter_prepare(oiter, ¶m->ip_akey); + rc = dkey_iter_prepare(oiter); break; case VOS_ITER_AKEY: @@ -1723,34 +1758,68 @@ vos_obj_iter_prep(vos_iter_type_t type, vos_iter_param_t *param, } int -vos_obj_iter_nested_tree_fetch(struct vos_iterator *iter, vos_iter_type_t type, - struct vos_iter_info *info) +vos_obj_iter_dkey_nested_tree_fetch(struct vos_iterator *iter, vos_iter_type_t type, + struct vos_iter_info *info) { struct vos_obj_iter *oiter = vos_iter2oiter(iter); int rc = 0; - switch (iter->it_type) { - default: - D_ASSERT(0); - case VOS_ITER_RECX: - case VOS_ITER_SINGLE: - D_ERROR("Iterator type has no subtree\n"); + if (unlikely(type != VOS_ITER_AKEY)) { + D_ERROR("Invalid nested iterator type for " + "VOS_ITER_DKEY: %d\n", + type); return -DER_INVAL; - case VOS_ITER_DKEY: - if (type != VOS_ITER_AKEY) { - D_ERROR("Invalid nested iterator type for " - "VOS_ITER_DKEY: %d\n", type); - return -DER_INVAL; - } - break; - case VOS_ITER_AKEY: - if (type != VOS_ITER_RECX && - type != VOS_ITER_SINGLE) { - D_ERROR("Invalid nested iterator type for " - "VOS_ITER_AKEY: %d\n", type); - return -DER_INVAL; - } - }; + } + + rc = key_iter_fetch_root(oiter, type, info); + + if (rc != 0) { + D_DEBUG(DB_TRACE, + "Failed to fetch and initialize cursor " + "subtree: rc=" DF_RC "\n", + DP_RC(rc)); + return rc; + } + + info->ii_obj = oiter->it_obj; + + return 0; +} + +int +vos_obj_iter_akey_nested_tree_fetch(struct vos_iterator *iter, vos_iter_type_t type, + struct vos_iter_info *info) +{ + struct vos_obj_iter *oiter = vos_iter2oiter(iter); + int rc = 0; + + if (unlikely(type != VOS_ITER_RECX && type != VOS_ITER_SINGLE)) { + D_ERROR("Invalid nested iterator type for " + "VOS_ITER_AKEY: %d\n", + type); + return -DER_INVAL; + } + + if (is_fake_iter(oiter)) { + info->ii_vea_info = oiter->it_obj->obj_cont->vc_pool->vp_vea_info; + info->ii_uma = vos_obj2uma(oiter->it_obj); + + info->ii_epr = oiter->it_epr; + info->ii_punched = oiter->it_punched; + info->ii_filter_cb = oiter->it_iter.it_filter_cb; + info->ii_filter_arg = oiter->it_iter.it_filter_arg; + + if (vos_epc_punched(info->ii_punched.pr_epc, info->ii_punched.pr_minor_epc, + &oiter->it_ilog_info.ii_prior_punch)) + info->ii_punched = oiter->it_ilog_info.ii_prior_punch; + + info->ii_tree_hdl = oiter->it_hdl; + /** Tells the prepare that it should use the handle to prepare + * the nested tree handle to prepare nested iterator + */ + info->ii_fake_akey_flag = oiter->it_flags & (VOS_IT_DKEY_SV | VOS_IT_DKEY_EV); + goto out; + } rc = key_iter_fetch_root(oiter, type, info); @@ -1760,13 +1829,22 @@ vos_obj_iter_nested_tree_fetch(struct vos_iterator *iter, vos_iter_type_t type, return rc; } +out: info->ii_obj = oiter->it_obj; return 0; } +int +vos_obj_iter_invalid_nested_tree_fetch(struct vos_iterator *iter, vos_iter_type_t type, + struct vos_iter_info *info) +{ + D_ERROR("Iterator type has no subtree\n"); + return -DER_INVAL; +} + static int -nested_dkey_iter_init(struct vos_obj_iter *oiter, struct vos_iter_info *info) +dkey_nested_iter_init(struct vos_obj_iter *oiter, struct vos_iter_info *info) { int rc; struct vos_container *cont = vos_hdl2cont(info->ii_hdl); @@ -1799,7 +1877,7 @@ nested_dkey_iter_init(struct vos_obj_iter *oiter, struct vos_iter_info *info) info->ii_punched = oiter->it_obj->obj_ilog_info.ii_prior_punch; - rc = dkey_iter_prepare(oiter, info->ii_akey); + rc = dkey_iter_prepare(oiter); if (rc != 0) goto failed; @@ -1811,31 +1889,23 @@ nested_dkey_iter_init(struct vos_obj_iter *oiter, struct vos_iter_info *info) return rc; } -int -vos_obj_iter_nested_prep(vos_iter_type_t type, struct vos_iter_info *info, - struct vos_iterator **iter_pp) +static inline int +nested_prep_common_init(struct vos_container *cont, struct vos_obj_iter **oiterp, + struct vos_iter_info *info) { - struct vos_object *obj = info->ii_obj; - struct vos_obj_iter *oiter; - struct vos_container *vos_cont; + struct vos_obj_iter *oiter; struct dtx_handle *dth; - daos_epoch_t bound; - struct evt_desc_cbs cbs; - struct evt_filter filter = {0}; - daos_handle_t toh; - int rc = 0; - uint32_t options; + daos_epoch_t bound; + + *oiterp = NULL; - if (type != VOS_ITER_DKEY) - vos_cont = obj->obj_cont; - else - vos_cont = vos_hdl2cont(info->ii_hdl); - dth = vos_dth_get(vos_cont->vc_pool->vp_sysdb); D_ALLOC_PTR(oiter); if (oiter == NULL) return -DER_NOMEM; + *oiterp = oiter; vos_ilog_fetch_init(&oiter->it_ilog_info); + dth = vos_dth_get(cont->vc_pool->vp_sysdb); bound = dtx_is_valid_handle(dth) ? dth->dth_epoch_bound : info->ii_epr.epr_hi; oiter->it_iter.it_bound = MAX(bound, info->ii_epr.epr_hi); @@ -1844,63 +1914,93 @@ vos_obj_iter_nested_prep(vos_iter_type_t type, struct vos_iter_info *info, oiter->it_iter.it_filter_arg = info->ii_filter_arg; oiter->it_punched = info->ii_punched; oiter->it_epc_expr = info->ii_epc_expr; - oiter->it_flags = info->ii_flags; - if (type != VOS_ITER_DKEY) - oiter->it_obj = obj; + oiter->it_flags = info->ii_flags; if (info->ii_flags & VOS_IT_FOR_PURGE) oiter->it_iter.it_for_purge = 1; if (info->ii_flags & VOS_IT_FOR_DISCARD) oiter->it_iter.it_for_discard = 1; if (info->ii_flags & VOS_IT_FOR_MIGRATION) oiter->it_iter.it_for_migration = 1; - if (vos_cont->vc_pool->vp_sysdb) + if (cont->vc_pool->vp_sysdb) oiter->it_iter.it_for_sysdb = 1; - switch (type) { - default: - D_ERROR("unknown iterator type %d.\n", type); - rc = -DER_INVAL; - goto failed; + return 0; +} - case VOS_ITER_DKEY: - rc = nested_dkey_iter_init(oiter, info); - if (rc != 0) - goto failed; +static inline void +nested_prep_common_abort(struct vos_obj_iter *oiter) +{ + vos_ilog_fetch_finish(&oiter->it_ilog_info); + D_FREE(oiter); +} + +static int +vos_obj_iter_dkey_nested_prep(vos_iter_type_t type, struct vos_iter_info *info, + struct vos_iterator **iter_pp) +{ + struct vos_obj_iter *oiter; + int rc = 0; + + if (type != VOS_ITER_DKEY) { + D_ERROR("Unexpected type: %d\n", type); + return -DER_INVAL; + } + + rc = nested_prep_common_init(vos_hdl2cont(info->ii_hdl), &oiter, info); + if (rc != 0) + return rc; + + rc = dkey_nested_iter_init(oiter, info); + if (rc == 0) { + *iter_pp = &oiter->it_iter; + return 0; + } + + nested_prep_common_abort(oiter); + return rc; +} + +static int +vos_obj_iter_akey_nested_prep(vos_iter_type_t type, struct vos_iter_info *info, + struct vos_iterator **iter_pp) +{ + struct vos_object *obj = info->ii_obj; + struct vos_obj_iter *oiter; + daos_handle_t toh; + int rc = 0; + + if (type != VOS_ITER_AKEY) { + D_ERROR("Unexpected type: %d\n", type); + return -DER_INVAL; + } + + rc = nested_prep_common_init(obj->obj_cont, &oiter, info); + + oiter->it_obj = obj; + + if (info->ii_fake_akey_flag) { + /** In this case, we already opened the subtree so just store it + * in the iterator handle for future use. + */ + vos_ilog_copy_info(&oiter->it_ilog_info, info->ii_ilog_info); + oiter->it_hdl = info->ii_tree_hdl; + oiter->it_flags |= info->ii_fake_akey_flag; + oiter->it_fake_akey = 0; goto success; - case VOS_ITER_AKEY: - case VOS_ITER_SINGLE: - rc = dbtree_open_inplace_ex(info->ii_btr, info->ii_uma, - vos_cont2hdl(obj->obj_cont), - vos_obj2pool(obj), &toh); - if (rc) { - D_DEBUG(DB_TRACE, "Failed to open tree for iterator:" - " rc = "DF_RC"\n", DP_RC(rc)); - goto failed; - } - rc = dbtree_iter_prepare(toh, BTR_ITER_EMBEDDED, - &oiter->it_hdl); - break; + } - case VOS_ITER_RECX: - vos_evt_desc_cbs_init(&cbs, vos_obj2pool(obj), - vos_cont2hdl(obj->obj_cont)); - rc = evt_open(info->ii_evt, info->ii_uma, &cbs, &toh); - if (rc) { - D_DEBUG(DB_TRACE, "Failed to open tree for iterator:" - " rc = "DF_RC"\n", DP_RC(rc)); - goto failed; - } - recx2filter(&filter, &info->ii_recx); - filter.fr_epr.epr_lo = oiter->it_epr.epr_lo; - filter.fr_epr.epr_hi = oiter->it_iter.it_bound; - filter.fr_epoch = oiter->it_epr.epr_hi; - filter.fr_punch_epc = oiter->it_punched.pr_epc; - filter.fr_punch_minor_epc = oiter->it_punched.pr_minor_epc; - options = recx_get_flags(oiter); - rc = evt_iter_prepare(toh, options, &filter, &oiter->it_hdl); - break; + rc = dbtree_open_inplace_ex(info->ii_btr, info->ii_uma, vos_cont2hdl(obj->obj_cont), + vos_obj2pool(obj), &toh); + if (rc) { + D_DEBUG(DB_TRACE, + "Failed to open tree for iterator:" + " rc = " DF_RC "\n", + DP_RC(rc)); + goto failed; } - key_tree_release(toh, type == VOS_ITER_RECX); + rc = dbtree_iter_prepare(toh, BTR_ITER_EMBEDDED, &oiter->it_hdl); + + key_tree_release(toh, false); if (rc != 0) { D_DEBUG(DB_TRACE, "Failed to prepare iterator: rc = "DF_RC"\n", @@ -1911,9 +2011,122 @@ vos_obj_iter_nested_prep(vos_iter_type_t type, struct vos_iter_info *info, success: *iter_pp = &oiter->it_iter; return 0; + failed: - vos_ilog_fetch_finish(&oiter->it_ilog_info); - D_FREE(oiter); + nested_prep_common_abort(oiter); + return rc; +} + +static int +vos_obj_iter_sv_nested_prep(vos_iter_type_t type, struct vos_iter_info *info, + struct vos_iterator **iter_pp) +{ + struct vos_object *obj = info->ii_obj; + struct vos_obj_iter *oiter; + daos_handle_t toh; + int rc = 0; + + if (type != VOS_ITER_SINGLE) { + D_ERROR("Unexpected type: %d\n", type); + return -DER_INVAL; + } + + rc = nested_prep_common_init(obj->obj_cont, &oiter, info); + + oiter->it_obj = obj; + if (info->ii_fake_akey_flag) { + D_ASSERTF(info->ii_fake_akey_flag == VOS_IT_DKEY_SV, "Invalid value for flag: %x\n", + info->ii_fake_akey_flag); + toh = info->ii_tree_hdl; + goto prepare; + } + + rc = dbtree_open_inplace_ex(info->ii_btr, info->ii_uma, vos_cont2hdl(obj->obj_cont), + vos_obj2pool(obj), &toh); + if (rc) { + D_DEBUG(DB_TRACE, + "Failed to open tree for iterator:" + " rc = " DF_RC "\n", + DP_RC(rc)); + goto failed; + } +prepare: + rc = dbtree_iter_prepare(toh, BTR_ITER_EMBEDDED, &oiter->it_hdl); + + if (info->ii_fake_akey_flag == 0) + key_tree_release(toh, false); + + if (rc != 0) { + D_DEBUG(DB_TRACE, "Failed to prepare iterator: rc = " DF_RC "\n", DP_RC(rc)); + goto failed; + } + + *iter_pp = &oiter->it_iter; + return 0; + +failed: + nested_prep_common_abort(oiter); + return rc; +} + +static int +vos_obj_iter_ev_nested_prep(vos_iter_type_t type, struct vos_iter_info *info, + struct vos_iterator **iter_pp) +{ + struct vos_object *obj = info->ii_obj; + struct vos_obj_iter *oiter; + struct evt_desc_cbs cbs; + struct evt_filter filter = {0}; + daos_handle_t toh; + int rc = 0; + uint32_t options; + + if (type != VOS_ITER_RECX) { + D_ERROR("Unexpected type: %d\n", type); + return -DER_INVAL; + } + + rc = nested_prep_common_init(obj->obj_cont, &oiter, info); + + oiter->it_obj = obj; + + if (info->ii_fake_akey_flag) { + D_ASSERTF(info->ii_fake_akey_flag == VOS_IT_DKEY_EV, "Invalid value for flag: %x\n", + info->ii_fake_akey_flag); + toh = info->ii_tree_hdl; + goto prepare; + } + + vos_evt_desc_cbs_init(&cbs, vos_obj2pool(obj), vos_cont2hdl(obj->obj_cont)); + rc = evt_open(info->ii_evt, info->ii_uma, &cbs, &toh); + if (rc) { + D_DEBUG(DB_TRACE, + "Failed to open tree for iterator:" + " rc = " DF_RC "\n", + DP_RC(rc)); + goto failed; + } +prepare: + recx2filter(&filter, &info->ii_recx); + filter.fr_epr.epr_lo = oiter->it_epr.epr_lo; + filter.fr_epr.epr_hi = oiter->it_iter.it_bound; + filter.fr_epoch = oiter->it_epr.epr_hi; + filter.fr_punch_epc = oiter->it_punched.pr_epc; + filter.fr_punch_minor_epc = oiter->it_punched.pr_minor_epc; + options = recx_get_flags(oiter); + rc = evt_iter_prepare(toh, options, &filter, &oiter->it_hdl); + if (info->ii_fake_akey_flag == 0) + key_tree_release(toh, type == VOS_ITER_RECX); + + if (rc != 0) { + D_DEBUG(DB_TRACE, "Failed to prepare iterator: rc = " DF_RC "\n", DP_RC(rc)); + goto failed; + } + + *iter_pp = &oiter->it_iter; + return 0; +failed: + nested_prep_common_abort(oiter); return rc; } @@ -1935,6 +2148,13 @@ vos_obj_iter_fini(struct vos_iterator *iter) case VOS_ITER_DKEY: case VOS_ITER_AKEY: + if (is_fake_iter(oiter)) { + /** In fake akey iterator, we use the subtree handle, so + * release it here. + */ + key_tree_release(oiter->it_hdl, fake_iter_child_is_array(oiter)); + break; + } case VOS_ITER_SINGLE: rc = dbtree_iter_finish(oiter->it_hdl); break; @@ -1960,29 +2180,146 @@ vos_obj_iter_fini(struct vos_iterator *iter) } int -vos_obj_iter_probe(struct vos_iterator *iter, daos_anchor_t *anchor, uint32_t flags) +vos_obj_dkey_iter_probe(struct vos_iterator *iter, daos_anchor_t *anchor, uint32_t flags) { struct vos_obj_iter *oiter = vos_iter2oiter(iter); - switch (iter->it_type) { - default: - D_ASSERT(0); - return -DER_INVAL; + D_ASSERTF(iter->it_type == VOS_ITER_DKEY, "type is %d\n", iter->it_type); - case VOS_ITER_DKEY: - case VOS_ITER_AKEY: - return key_iter_probe(oiter, anchor, flags); + return key_iter_probe(oiter, anchor, flags); +} - case VOS_ITER_SINGLE: - return singv_iter_probe(oiter, anchor, flags); +int +vos_obj_akey_iter_probe(struct vos_iterator *iter, daos_anchor_t *anchor, uint32_t flags) +{ + struct vos_obj_iter *oiter = vos_iter2oiter(iter); - case VOS_ITER_RECX: - return recx_iter_probe(oiter, anchor); + D_ASSERTF(iter->it_type == VOS_ITER_AKEY, "type is %d\n", iter->it_type); + + if (is_fake_iter(oiter)) { + if (vos_anchor_is_zero(anchor) || (flags & VOS_ITER_PROBE_NEXT) == 0) { + oiter->it_fake_akey = '0'; + return 0; + } + /** Indicate we are done iterating */ + oiter->it_fake_akey = 0; + return -DER_NONEXIST; } + + return key_iter_probe(oiter, anchor, flags); +} + +int +vos_obj_sv_iter_probe(struct vos_iterator *iter, daos_anchor_t *anchor, uint32_t flags) +{ + struct vos_obj_iter *oiter = vos_iter2oiter(iter); + + D_ASSERTF(iter->it_type == VOS_ITER_SINGLE, "type is %d\n", iter->it_type); + + return singv_iter_probe(oiter, anchor, flags); +} + +int +vos_obj_ev_iter_probe(struct vos_iterator *iter, daos_anchor_t *anchor, uint32_t flags) +{ + struct vos_obj_iter *oiter = vos_iter2oiter(iter); + + D_ASSERTF(iter->it_type == VOS_ITER_RECX, "type is %d\n", iter->it_type); + + return recx_iter_probe(oiter, anchor); +} + +static int +vos_obj_dkey_iter_next(struct vos_iterator *iter, daos_anchor_t *anchor) +{ + struct vos_obj_iter *oiter = vos_iter2oiter(iter); + + D_ASSERTF(iter->it_type == VOS_ITER_DKEY, "type is %d\n", iter->it_type); + + return key_iter_next(oiter, anchor); +} + +static int +vos_obj_akey_iter_next(struct vos_iterator *iter, daos_anchor_t *anchor) +{ + struct vos_obj_iter *oiter = vos_iter2oiter(iter); + + D_ASSERTF(iter->it_type == VOS_ITER_AKEY, "type is %d\n", iter->it_type); + if (is_fake_iter(oiter)) { + /** Indicate we are done iterating */ + oiter->it_fake_akey = 0; + return -DER_NONEXIST; + } + + return key_iter_next(oiter, anchor); } static int -vos_obj_iter_next(struct vos_iterator *iter, daos_anchor_t *anchor) +vos_obj_sv_iter_next(struct vos_iterator *iter, daos_anchor_t *anchor) +{ + struct vos_obj_iter *oiter = vos_iter2oiter(iter); + + D_ASSERTF(iter->it_type == VOS_ITER_SINGLE, "type is %d\n", iter->it_type); + return singv_iter_next(oiter); +} + +static int +vos_obj_ev_iter_next(struct vos_iterator *iter, daos_anchor_t *anchor) +{ + struct vos_obj_iter *oiter = vos_iter2oiter(iter); + + D_ASSERTF(iter->it_type == VOS_ITER_RECX, "type is %d\n", iter->it_type); + return recx_iter_next(oiter); +} + +static int +vos_obj_dkey_iter_fetch(struct vos_iterator *iter, vos_iter_entry_t *it_entry, + daos_anchor_t *anchor) +{ + struct vos_obj_iter *oiter = vos_iter2oiter(iter); + + D_ASSERTF(iter->it_type == VOS_ITER_DKEY, "type is %d\n", iter->it_type); + + return key_iter_fetch(oiter, it_entry, anchor, false, 0); +} + +static int +vos_obj_akey_iter_fetch(struct vos_iterator *iter, vos_iter_entry_t *it_entry, + daos_anchor_t *anchor) +{ + struct vos_obj_iter *oiter = vos_iter2oiter(iter); + + D_ASSERTF(iter->it_type == VOS_ITER_AKEY, "type is %d\n", iter->it_type); + if (is_fake_iter(oiter)) { + D_ASSERTF(oiter->it_fake_akey == '0', "Must probe before fetch"); + if (anchor != NULL) + vos_fake_anchor_create(anchor); + if (fake_iter_child_is_array(oiter)) + it_entry->ie_child_type = VOS_ITER_RECX; + else + it_entry->ie_child_type = VOS_ITER_SINGLE; + it_entry->ie_vis_flags = VOS_VIS_FLAG_VISIBLE; + if (oiter->it_ilog_info.ii_create == 0) { + /* The key has no visible subtrees so mark it covered */ + it_entry->ie_vis_flags = VOS_VIS_FLAG_COVERED; + } + + it_entry->ie_epoch = oiter->it_epr.epr_hi; + it_entry->ie_punch = oiter->it_ilog_info.ii_next_punch; + it_entry->ie_obj_punch = oiter->it_obj->obj_ilog_info.ii_next_punch; + /** Use the dkey for this */ + vos_ilog_last_update(&oiter->it_dkey_krec->kr_ilog, VOS_TS_TYPE_DKEY, + &it_entry->ie_last_update, !!oiter->it_iter.it_for_sysdb); + d_iov_set(&it_entry->ie_key, &oiter->it_fake_akey, sizeof(oiter->it_fake_akey)); + + return 0; + } + + return key_iter_fetch(oiter, it_entry, anchor, false, 0); +} + +static int +vos_obj_sv_iter_fetch(struct vos_iterator *iter, vos_iter_entry_t *it_entry, daos_anchor_t *anchor) { struct vos_obj_iter *oiter = vos_iter2oiter(iter); @@ -1993,19 +2330,18 @@ vos_obj_iter_next(struct vos_iterator *iter, daos_anchor_t *anchor) case VOS_ITER_DKEY: case VOS_ITER_AKEY: - return key_iter_next(oiter, anchor); + return key_iter_fetch(oiter, it_entry, anchor, false, 0); case VOS_ITER_SINGLE: - return singv_iter_next(oiter); + return singv_iter_fetch(oiter, it_entry, anchor); case VOS_ITER_RECX: - return recx_iter_next(oiter); + return recx_iter_fetch(oiter, it_entry, anchor); } } static int -vos_obj_iter_fetch(struct vos_iterator *iter, vos_iter_entry_t *it_entry, - daos_anchor_t *anchor) +vos_obj_ev_iter_fetch(struct vos_iterator *iter, vos_iter_entry_t *it_entry, daos_anchor_t *anchor) { struct vos_obj_iter *oiter = vos_iter2oiter(iter); @@ -2246,6 +2582,8 @@ vos_obj_iter_process(struct vos_iterator *iter, vos_iter_proc_op_t op, return -DER_INVAL; case VOS_ITER_DKEY: case VOS_ITER_AKEY: + if (is_fake_iter(oiter)) + return 0; case VOS_ITER_SINGLE: return obj_iter_delete(oiter, args); case VOS_ITER_RECX: @@ -2284,17 +2622,56 @@ vos_obj_iter_empty(struct vos_iterator *iter) } } -struct vos_iter_ops vos_obj_iter_ops = { - .iop_prepare = vos_obj_iter_prep, - .iop_nested_tree_fetch = vos_obj_iter_nested_tree_fetch, - .iop_nested_prepare = vos_obj_iter_nested_prep, - .iop_finish = vos_obj_iter_fini, - .iop_probe = vos_obj_iter_probe, - .iop_next = vos_obj_iter_next, - .iop_fetch = vos_obj_iter_fetch, - .iop_copy = vos_obj_iter_copy, - .iop_process = vos_obj_iter_process, - .iop_empty = vos_obj_iter_empty, +struct vos_iter_ops vos_obj_dkey_iter_ops = { + .iop_prepare = vos_obj_iter_prep, + .iop_nested_tree_fetch = vos_obj_iter_dkey_nested_tree_fetch, + .iop_nested_prepare = vos_obj_iter_dkey_nested_prep, + .iop_finish = vos_obj_iter_fini, + .iop_probe = vos_obj_dkey_iter_probe, + .iop_next = vos_obj_dkey_iter_next, + .iop_fetch = vos_obj_dkey_iter_fetch, + .iop_copy = vos_obj_iter_copy, + .iop_process = vos_obj_iter_process, + .iop_empty = vos_obj_iter_empty, +}; + +struct vos_iter_ops vos_obj_akey_iter_ops = { + .iop_prepare = vos_obj_iter_prep, + .iop_nested_tree_fetch = vos_obj_iter_akey_nested_tree_fetch, + .iop_nested_prepare = vos_obj_iter_akey_nested_prep, + .iop_finish = vos_obj_iter_fini, + .iop_probe = vos_obj_akey_iter_probe, + .iop_next = vos_obj_akey_iter_next, + .iop_fetch = vos_obj_akey_iter_fetch, + .iop_copy = vos_obj_iter_copy, + .iop_process = vos_obj_iter_process, + .iop_empty = vos_obj_iter_empty, +}; + +struct vos_iter_ops vos_obj_sv_iter_ops = { + .iop_prepare = vos_obj_iter_prep, + .iop_nested_tree_fetch = vos_obj_iter_invalid_nested_tree_fetch, + .iop_nested_prepare = vos_obj_iter_sv_nested_prep, + .iop_finish = vos_obj_iter_fini, + .iop_probe = vos_obj_sv_iter_probe, + .iop_next = vos_obj_sv_iter_next, + .iop_fetch = vos_obj_sv_iter_fetch, + .iop_copy = vos_obj_iter_copy, + .iop_process = vos_obj_iter_process, + .iop_empty = vos_obj_iter_empty, +}; + +struct vos_iter_ops vos_obj_ev_iter_ops = { + .iop_prepare = vos_obj_iter_prep, + .iop_nested_tree_fetch = vos_obj_iter_invalid_nested_tree_fetch, + .iop_nested_prepare = vos_obj_iter_ev_nested_prep, + .iop_finish = vos_obj_iter_fini, + .iop_probe = vos_obj_ev_iter_probe, + .iop_next = vos_obj_ev_iter_next, + .iop_fetch = vos_obj_ev_iter_fetch, + .iop_copy = vos_obj_iter_copy, + .iop_process = vos_obj_iter_process, + .iop_empty = vos_obj_iter_empty, }; /** * @} vos_obj_iters diff --git a/src/vos/vos_query.c b/src/vos/vos_query.c index b7007ec6b877..621053d09fec 100644 --- a/src/vos/vos_query.c +++ b/src/vos/vos_query.c @@ -503,6 +503,9 @@ open_and_query_key(struct open_query *query, daos_key_t *key, toh = &query->qt_akey_toh; to_open = query->qt_akey_root; tclass = VOS_BTR_AKEY; + + if (query->qt_flags & VOS_FLAT_DKEY) + return 0; } if (daos_handle_is_valid(*toh)) { @@ -548,13 +551,17 @@ open_and_query_key(struct open_query *query, daos_key_t *key, return rc; } - if (tree_type == VOS_GET_DKEY) { + if (tree_type == VOS_GET_DKEY && (rbund.rb_krec->kr_bmap & KREC_BF_FLAT) == 0) { query->qt_akey_root = &rbund.rb_krec->kr_btr; } else if ((rbund.rb_krec->kr_bmap & KREC_BF_EVT) == 0) { if (query->qt_flags & VOS_GET_RECX) return -DER_NONEXIST; } else { query->qt_recx_root = &rbund.rb_krec->kr_evt; + if (tree_type == VOS_GET_DKEY) + query->qt_flags |= VOS_FLAT_DKEY; + else + query->qt_flags &= ~VOS_FLAT_DKEY; } return 0; diff --git a/src/vos/vos_tree.c b/src/vos/vos_tree.c index ed8703973f3e..8fc5639ccdaf 100644 --- a/src/vos/vos_tree.c +++ b/src/vos/vos_tree.c @@ -34,7 +34,8 @@ struct vos_btr_attr { btr_ops_t *ta_ops; }; -static struct vos_btr_attr *obj_tree_find_attr(unsigned tree_class); +static struct vos_btr_attr * +obj_tree_find_attr(unsigned tree_class, int flags); static struct vos_svt_key * iov2svt_key(d_iov_t *key_iov) @@ -806,7 +807,13 @@ tree_open_create(struct vos_object *obj, enum vos_tree_class tclass, int flags, int unexpected_flag; int rc = 0; - if (flags & SUBTR_EVT) { + vos_evt_desc_cbs_init(&cbs, pool, coh); + if ((krec->kr_bmap & (KREC_BF_BTR | KREC_BF_EVT)) == 0) + goto create; + + /** If subtree is already created, it could have been created by an older pool version + * so if the dkey is not flat, we need to use KREC_BF_BTR here */ + if (flags & SUBTR_EVT && (tclass == VOS_BTR_AKEY || (krec->kr_bmap & KREC_BF_FLAT))) { expected_flag = KREC_BF_EVT; unexpected_flag = KREC_BF_BTR; } else { @@ -825,20 +832,16 @@ tree_open_create(struct vos_object *obj, enum vos_tree_class tclass, int flags, goto out; } - vos_evt_desc_cbs_init(&cbs, pool, coh); - if (krec->kr_bmap & expected_flag) { - if (flags & SUBTR_EVT) { - rc = evt_open(&krec->kr_evt, uma, &cbs, sub_toh); - } else { - rc = dbtree_open_inplace_ex(&krec->kr_btr, uma, coh, - pool, sub_toh); - } - if (rc != 0) - D_ERROR("Failed to open tree: "DF_RC"\n", DP_RC(rc)); - - goto out; + if (flags & SUBTR_EVT) { + rc = evt_open(&krec->kr_evt, uma, &cbs, sub_toh); + } else { + rc = dbtree_open_inplace_ex(&krec->kr_btr, uma, coh, pool, sub_toh); } + if (rc != 0) + D_ERROR("Failed to open tree: " DF_RC "\n", DP_RC(rc)); + goto out; +create: if ((flags & SUBTR_CREATE) == 0) { /** This can happen if application does a punch first before any * updates. Simply return -DER_NONEXIST in such case. @@ -847,6 +850,12 @@ tree_open_create(struct vos_object *obj, enum vos_tree_class tclass, int flags, goto out; } + if (flags & SUBTR_EVT) { + expected_flag = KREC_BF_EVT; + } else { + expected_flag = KREC_BF_BTR; + } + if (!created) { rc = umem_tx_add_ptr(vos_obj2umm(obj), krec, sizeof(*krec)); @@ -881,7 +890,7 @@ tree_open_create(struct vos_object *obj, enum vos_tree_class tclass, int flags, tree_feats |= VOS_KEY_CMP_LEXICAL_SET; } - ta = obj_tree_find_attr(tclass); + ta = obj_tree_find_attr(tclass, flags); D_DEBUG(DB_TRACE, "Create dbtree %s feats 0x"DF_X64"\n", ta->ta_name, tree_feats); @@ -900,6 +909,8 @@ tree_open_create(struct vos_object *obj, enum vos_tree_class tclass, int flags, * levels, only the tree_feats version is used. */ krec->kr_bmap |= expected_flag; + if (flags & SUBTR_FLAT) + krec->kr_bmap |= KREC_BF_FLAT; out: return rc; } @@ -933,8 +944,14 @@ key_tree_prepare(struct vos_object *obj, daos_handle_t toh, *krecp = NULL; D_DEBUG(DB_TRACE, "prepare tree, flags=%x, tclass=%d\n", flags, tclass); - if (tclass != VOS_BTR_AKEY && (flags & SUBTR_EVT)) - D_GOTO(out, rc = -DER_INVAL); + if (flags & SUBTR_EVT) { + if (tclass != VOS_BTR_AKEY && (flags & SUBTR_FLAT) == 0) { + D_ERROR("SUBTR_EVT flag passed with invalid type or flags: tclass = %x, " + "flags = %x\n", + tclass, flags); + D_GOTO(out, rc = -DER_INVAL); + } + } tree_rec_bundle2iov(&rbund, &riov); rbund.rb_off = UMOFF_NULL; @@ -1214,7 +1231,7 @@ obj_tree_register(void) /** find the attributes of the subtree of @tree_class */ static struct vos_btr_attr * -obj_tree_find_attr(unsigned tree_class) +obj_tree_find_attr(unsigned tree_class, int flags) { int i; @@ -1228,8 +1245,10 @@ obj_tree_find_attr(unsigned tree_class) break; case VOS_BTR_DKEY: - /* TODO: change it to VOS_BTR_AKEY while adding akey support */ - tree_class = VOS_BTR_AKEY; + if (flags & SUBTR_FLAT) + tree_class = VOS_BTR_SINGV; + else + tree_class = VOS_BTR_AKEY; break; }