From 3790bf7474f77b237309b0083a0eb11b1bce43ba Mon Sep 17 00:00:00 2001 From: Mohamad Chaarawi Date: Wed, 21 Jun 2023 16:02:46 +0000 Subject: [PATCH] DAOS-13175 dfs: print progress of dfs mwc checker Every 30 seconds print how many objects have been scanned in the dfs namespace or marked in the OIT. This informs users that there is some progress being done by the checker and it's not hung in case of a large namespace. Required-githooks: true Signed-off-by: Mohamad Chaarawi --- src/client/dfs/dfs.c | 99 +++++++++++++++++++++++++++++++--------- utils/node_local_test.py | 8 ++-- 2 files changed, 81 insertions(+), 26 deletions(-) diff --git a/src/client/dfs/dfs.c b/src/client/dfs/dfs.c index b114db75b1e..3bd68e9e5bd 100644 --- a/src/client/dfs/dfs.c +++ b/src/client/dfs/dfs.c @@ -6454,6 +6454,7 @@ dfs_dir_anchor_set(dfs_obj_t *obj, const char name[], daos_anchor_t *anchor) #define DFS_ITER_NR 128 #define DFS_ITER_DKEY_BUF (DFS_ITER_NR * sizeof(uint64_t)) #define DFS_ITER_ENTRY_BUF (DFS_ITER_NR * DFS_MAX_NAME) +#define DFS_ELAPSED_TIME 30 struct dfs_oit_args { daos_handle_t oit; @@ -6461,6 +6462,9 @@ struct dfs_oit_args { uint64_t snap_epoch; uint64_t skipped; uint64_t failed; + time_t start_time; + time_t print_time; + uint64_t num_scanned; }; static int @@ -6556,8 +6560,19 @@ oit_mark_cb(dfs_t *dfs, dfs_obj_t *parent, const char name[], void *args) daos_obj_id_t oid; d_iov_t marker; bool mark_data = true; + struct timespec current_time; int rc; + rc = clock_gettime(CLOCK_REALTIME, ¤t_time); + if (rc) + return errno; + oit_args->num_scanned ++; + if (current_time.tv_sec - oit_args->print_time >= DFS_ELAPSED_TIME) { + D_PRINT("DFS checker: Scanned "DF_U64" files/directories (runtime: "DF_U64" sec)\n", + oit_args->num_scanned, current_time.tv_sec - oit_args->start_time); + oit_args->print_time = current_time.tv_sec; + } + /** open the entry name and get the oid */ rc = dfs_lookup_rel(dfs, parent, name, O_RDONLY, &obj, NULL, NULL); if (rc) { @@ -6703,12 +6718,24 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char * d_iov_t marker; bool mark_data = true; daos_epoch_range_t epr; - struct timespec now; + struct timespec now, current_time; uid_t uid = geteuid(); gid_t gid = getegid(); unsigned int co_flags = DAOS_COO_EX; + char now_name[24]; + struct tm *now_tm; + daos_size_t len; int rc, rc2; + rc = clock_gettime(CLOCK_REALTIME, &now); + if (rc) + return errno; + now_tm = localtime(&now.tv_sec); + len = strftime(now_name, sizeof(now_name), "%Y-%m-%d-%H:%M:%S", now_tm); + if (len == 0) + return EINVAL; + D_PRINT("DFS checker: Start (%s)\n", now_name); + if (flags & DFS_CHECK_RELINK && flags & DFS_CHECK_REMOVE) { D_ERROR("can't request remove and link to l+f at the same time\n"); return EINVAL; @@ -6729,6 +6756,7 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char * D_GOTO(out_cont, rc); } + D_PRINT("DFS checker: Create OIT table\n"); /** create snapshot for OIT */ rc = daos_cont_create_snap_opt(coh, &snap_epoch, NULL, DAOS_SNAP_OPT_CR | DAOS_SNAP_OPT_OIT, NULL); @@ -6742,6 +6770,8 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char * D_GOTO(out_dfs, rc = ENOMEM); oit_args->flags = flags; oit_args->snap_epoch = snap_epoch; + oit_args->start_time = now.tv_sec; + oit_args->print_time = now.tv_sec; /** Open OIT table */ rc = daos_oit_open(coh, snap_epoch, &oit_args->oit, NULL); @@ -6758,10 +6788,11 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char * D_GOTO(out_oit, rc = daos_der2errno(rc)); } rc = daos_oit_mark(oit_args->oit, dfs->root.oid, &marker, NULL); - if (rc) { + if (rc && rc != -DER_NONEXIST) { D_ERROR("Failed to mark ROOT OID in OIT: "DF_RC"\n", DP_RC(rc)); D_GOTO(out_oit, rc = daos_der2errno(rc)); } + rc = 0; if (flags & DFS_CHECK_VERIFY) { rc = daos_obj_verify(coh, dfs->super_oid, snap_epoch); @@ -6791,6 +6822,8 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char * } } + D_PRINT("DFS checker: Iterating namespace and marking objects\n"); + oit_args->num_scanned = 2; /** iterate through the namespace and mark OITs starting from the root object */ while (!daos_anchor_is_eof(&anchor)) { rc = dfs_iterate(dfs, &dfs->root, &anchor, &nr_entries, DFS_MAX_NAME * nr_entries, @@ -6803,14 +6836,14 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char * nr_entries = DFS_ITER_NR; } - /** Create lost+found directory and properly link unmarked oids there. */ - if (flags & DFS_CHECK_RELINK) { - char now_name[24]; - - rc = clock_gettime(CLOCK_REALTIME, &now); - if (rc) - D_GOTO(out_oit, rc = errno); + rc = clock_gettime(CLOCK_REALTIME, ¤t_time); + if (rc) + D_GOTO(out_oit, rc = errno); + D_PRINT("DFS checker: marked "DF_U64" files/directories (runtime: "DF_U64" sec))\n", + oit_args->num_scanned, current_time.tv_sec - oit_args->start_time); + /** Create lost+found directory to link unmarked oids there. */ + if (flags & DFS_CHECK_RELINK) { rc = dfs_open(dfs, NULL, "lost+found", S_IFDIR | 0755, O_CREAT | O_RDWR, 0, 0, NULL, &lf); if (rc) { @@ -6819,21 +6852,15 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char * } if (name == NULL) { - struct tm *now_tm; - size_t len; /* * Create a directory with current timestamp in l+f where leaked oids will * be linked in this run. */ - now_tm = localtime(&now.tv_sec); - len = strftime(now_name, sizeof(now_name), "%Y-%m-%d-%H:%M:%S", now_tm); - if (len == 0) { - D_ERROR("Invalid time format\n"); - D_GOTO(out_lf1, rc = EINVAL); - } - D_PRINT("Leaked OIDs will be inserted in /lost+found/%s\n", now_name); + D_PRINT("DFS checker: Leaked OIDs will be inserted in /lost+found/%s\n", + now_name); } else { - D_PRINT("Leaked OIDs will be inserted in /lost+found/%s\n", name); + D_PRINT("DFS checker: Leaked OIDs will be inserted in /lost+found/%s\n", + name); } rc = dfs_open(dfs, lf, name ? name : now_name, S_IFDIR | 0755, @@ -6865,6 +6892,8 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char * * Pass 1: check directories only and descend to mark all oids in the namespace of each dir. * Pass 2: relink remaining oids in the L+F root that are unmarked still after first pass. */ + D_PRINT("DFS checker: Checking unmarked OIDs (Pass 1)\n"); + oit_args->num_scanned = 0; memset(&anchor, 0, sizeof(anchor)); /** Start Pass 1 */ while (!daos_anchor_is_eof(&anchor)) { @@ -6875,6 +6904,16 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char * D_GOTO(out_lf2, rc = daos_der2errno(rc)); } + clock_gettime(CLOCK_REALTIME, ¤t_time); + if (rc) + D_GOTO(out_lf2, rc = errno); + oit_args->num_scanned += nr_entries; + if (current_time.tv_sec - oit_args->print_time >= DFS_ELAPSED_TIME) { + D_PRINT("DFS checker: Checked "DF_U64" objects (runtime: "DF_U64" sec)\n", + oit_args->num_scanned, current_time.tv_sec - oit_args->start_time); + oit_args->print_time = current_time.tv_sec; + } + for (i = 0; i < nr_entries; i++) { if (flags & DFS_CHECK_RELINK) { enum daos_otype_t otype = daos_obj_id2type(oids[i]); @@ -6935,6 +6974,8 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char * if (!(flags & DFS_CHECK_RELINK)) goto done; + D_PRINT("DFS checker: Checking unmarked OIDs (Pass 2)\n"); + oit_args->num_scanned = 0; memset(&anchor, 0, sizeof(anchor)); while (!daos_anchor_is_eof(&anchor)) { nr_entries = DFS_ITER_NR; @@ -6944,11 +6985,20 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char * D_GOTO(out_lf2, rc = daos_der2errno(rc)); } + clock_gettime(CLOCK_REALTIME, ¤t_time); + if (rc) + D_GOTO(out_lf2, rc = errno); + oit_args->num_scanned += nr_entries; + if (current_time.tv_sec - oit_args->print_time >= DFS_ELAPSED_TIME) { + D_PRINT("DFS checker: Checked "DF_U64" objects (runtime: "DF_U64" sec)\n", + oit_args->num_scanned, current_time.tv_sec - oit_args->start_time); + oit_args->print_time = current_time.tv_sec; + } + for (i = 0; i < nr_entries; i++) { struct dfs_entry entry = {0}; enum daos_otype_t otype = daos_obj_id2type(oids[i]); char oid_name[DFS_MAX_NAME + 1]; - daos_size_t len; if (flags & DFS_CHECK_PRINT) D_PRINT("oid["DF_U64"]: "DF_OID"\n", unmarked_entries, @@ -7017,8 +7067,12 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char * } done: - if (flags & DFS_CHECK_PRINT) - D_PRINT("Number of Leaked OIDs in Namespace = "DF_U64"\n", unmarked_entries); + rc = clock_gettime(CLOCK_REALTIME, ¤t_time); + if (rc) + D_GOTO(out_lf2, rc = errno); + D_PRINT("DFS checker: Done! (runtime: "DF_U64" sec)\n", + current_time.tv_sec - oit_args->start_time); + D_PRINT("DFS checker: Number of leaked OIDs in namespace = "DF_U64"\n", unmarked_entries); if (flags & DFS_CHECK_VERIFY) { if (oit_args->failed) { D_ERROR(""DF_U64" OIDs failed data consistency check!\n", oit_args->failed); @@ -7058,6 +7112,7 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char * rc2 = daos_cont_close(coh, NULL); if (rc == 0) rc = daos_der2errno(rc2); + return rc; } diff --git a/utils/node_local_test.py b/utils/node_local_test.py index db674bfe36e..f341e837510 100755 --- a/utils/node_local_test.py +++ b/utils/node_local_test.py @@ -3548,7 +3548,7 @@ def test_daos_fs_check(self): assert rc.returncode == 0 output = rc.stdout.decode('utf-8') line = output.splitlines() - if line[-1] != 'Number of Leaked OIDs in Namespace = 2': + if line[-1] != 'DFS checker: Number of leaked OIDs in namespace = 2': raise NLTestFail('Wrong number of Leaked OIDs') # run again to check nothing is detected @@ -3558,7 +3558,7 @@ def test_daos_fs_check(self): assert rc.returncode == 0 output = rc.stdout.decode('utf-8') line = output.splitlines() - if line[-1] != 'Number of Leaked OIDs in Namespace = 0': + if line[-1] != 'DFS checker: Number of leaked OIDs in namespace = 0': raise NLTestFail('Wrong number of Leaked OIDs') # remount dfuse @@ -3613,7 +3613,7 @@ def test_daos_fs_check(self): assert rc.returncode == 0 output = rc.stdout.decode('utf-8') line = output.splitlines() - if line[-1] != 'Number of Leaked OIDs in Namespace = 4': + if line[-1] != 'DFS checker: Number of leaked OIDs in namespace = 4': raise NLTestFail('Wrong number of Leaked OIDs') # run again to check nothing is detected @@ -3623,7 +3623,7 @@ def test_daos_fs_check(self): assert rc.returncode == 0 output = rc.stdout.decode('utf-8') line = output.splitlines() - if line[-1] != 'Number of Leaked OIDs in Namespace = 0': + if line[-1] != 'DFS checker: Number of leaked OIDs in namespace = 0': raise NLTestFail('Wrong number of Leaked OIDs') # remount dfuse