Skip to content

Commit

Permalink
DAOS-13175 dfs: print progress of dfs mwc checker
Browse files Browse the repository at this point in the history
Every 30 seconds print how many objects have been scanned in the dfs namespace or marked in the OIT. This informs users that there is some progress being done by the checker and it's not hung in case of a large namespace.

Required-githooks: true

Signed-off-by: Mohamad Chaarawi <mohamad.chaarawi@intel.com>
  • Loading branch information
mchaarawi committed Jun 21, 2023
1 parent a3c57e5 commit 3790bf7
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 26 deletions.
99 changes: 77 additions & 22 deletions src/client/dfs/dfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -6454,13 +6454,17 @@ dfs_dir_anchor_set(dfs_obj_t *obj, const char name[], daos_anchor_t *anchor)
#define DFS_ITER_NR 128
#define DFS_ITER_DKEY_BUF (DFS_ITER_NR * sizeof(uint64_t))
#define DFS_ITER_ENTRY_BUF (DFS_ITER_NR * DFS_MAX_NAME)
#define DFS_ELAPSED_TIME 30

struct dfs_oit_args {
daos_handle_t oit;
uint64_t flags;
uint64_t snap_epoch;
uint64_t skipped;
uint64_t failed;
time_t start_time;
time_t print_time;
uint64_t num_scanned;
};

static int
Expand Down Expand Up @@ -6556,8 +6560,19 @@ oit_mark_cb(dfs_t *dfs, dfs_obj_t *parent, const char name[], void *args)
daos_obj_id_t oid;
d_iov_t marker;
bool mark_data = true;
struct timespec current_time;
int rc;

rc = clock_gettime(CLOCK_REALTIME, &current_time);
if (rc)
return errno;
oit_args->num_scanned ++;
if (current_time.tv_sec - oit_args->print_time >= DFS_ELAPSED_TIME) {
D_PRINT("DFS checker: Scanned "DF_U64" files/directories (runtime: "DF_U64" sec)\n",
oit_args->num_scanned, current_time.tv_sec - oit_args->start_time);
oit_args->print_time = current_time.tv_sec;
}

/** open the entry name and get the oid */
rc = dfs_lookup_rel(dfs, parent, name, O_RDONLY, &obj, NULL, NULL);
if (rc) {
Expand Down Expand Up @@ -6703,12 +6718,24 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
d_iov_t marker;
bool mark_data = true;
daos_epoch_range_t epr;
struct timespec now;
struct timespec now, current_time;
uid_t uid = geteuid();
gid_t gid = getegid();
unsigned int co_flags = DAOS_COO_EX;
char now_name[24];
struct tm *now_tm;
daos_size_t len;
int rc, rc2;

rc = clock_gettime(CLOCK_REALTIME, &now);
if (rc)
return errno;
now_tm = localtime(&now.tv_sec);
len = strftime(now_name, sizeof(now_name), "%Y-%m-%d-%H:%M:%S", now_tm);
if (len == 0)
return EINVAL;
D_PRINT("DFS checker: Start (%s)\n", now_name);

if (flags & DFS_CHECK_RELINK && flags & DFS_CHECK_REMOVE) {
D_ERROR("can't request remove and link to l+f at the same time\n");
return EINVAL;
Expand All @@ -6729,6 +6756,7 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
D_GOTO(out_cont, rc);
}

D_PRINT("DFS checker: Create OIT table\n");
/** create snapshot for OIT */
rc = daos_cont_create_snap_opt(coh, &snap_epoch, NULL, DAOS_SNAP_OPT_CR | DAOS_SNAP_OPT_OIT,
NULL);
Expand All @@ -6742,6 +6770,8 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
D_GOTO(out_dfs, rc = ENOMEM);
oit_args->flags = flags;
oit_args->snap_epoch = snap_epoch;
oit_args->start_time = now.tv_sec;
oit_args->print_time = now.tv_sec;

/** Open OIT table */
rc = daos_oit_open(coh, snap_epoch, &oit_args->oit, NULL);
Expand All @@ -6758,10 +6788,11 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
D_GOTO(out_oit, rc = daos_der2errno(rc));
}
rc = daos_oit_mark(oit_args->oit, dfs->root.oid, &marker, NULL);
if (rc) {
if (rc && rc != -DER_NONEXIST) {
D_ERROR("Failed to mark ROOT OID in OIT: "DF_RC"\n", DP_RC(rc));
D_GOTO(out_oit, rc = daos_der2errno(rc));
}
rc = 0;

if (flags & DFS_CHECK_VERIFY) {
rc = daos_obj_verify(coh, dfs->super_oid, snap_epoch);
Expand Down Expand Up @@ -6791,6 +6822,8 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
}
}

D_PRINT("DFS checker: Iterating namespace and marking objects\n");
oit_args->num_scanned = 2;
/** iterate through the namespace and mark OITs starting from the root object */
while (!daos_anchor_is_eof(&anchor)) {
rc = dfs_iterate(dfs, &dfs->root, &anchor, &nr_entries, DFS_MAX_NAME * nr_entries,
Expand All @@ -6803,14 +6836,14 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
nr_entries = DFS_ITER_NR;
}

/** Create lost+found directory and properly link unmarked oids there. */
if (flags & DFS_CHECK_RELINK) {
char now_name[24];

rc = clock_gettime(CLOCK_REALTIME, &now);
if (rc)
D_GOTO(out_oit, rc = errno);
rc = clock_gettime(CLOCK_REALTIME, &current_time);
if (rc)
D_GOTO(out_oit, rc = errno);
D_PRINT("DFS checker: marked "DF_U64" files/directories (runtime: "DF_U64" sec))\n",
oit_args->num_scanned, current_time.tv_sec - oit_args->start_time);

/** Create lost+found directory to link unmarked oids there. */
if (flags & DFS_CHECK_RELINK) {
rc = dfs_open(dfs, NULL, "lost+found", S_IFDIR | 0755, O_CREAT | O_RDWR, 0, 0, NULL,
&lf);
if (rc) {
Expand All @@ -6819,21 +6852,15 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
}

if (name == NULL) {
struct tm *now_tm;
size_t len;
/*
* Create a directory with current timestamp in l+f where leaked oids will
* be linked in this run.
*/
now_tm = localtime(&now.tv_sec);
len = strftime(now_name, sizeof(now_name), "%Y-%m-%d-%H:%M:%S", now_tm);
if (len == 0) {
D_ERROR("Invalid time format\n");
D_GOTO(out_lf1, rc = EINVAL);
}
D_PRINT("Leaked OIDs will be inserted in /lost+found/%s\n", now_name);
D_PRINT("DFS checker: Leaked OIDs will be inserted in /lost+found/%s\n",
now_name);
} else {
D_PRINT("Leaked OIDs will be inserted in /lost+found/%s\n", name);
D_PRINT("DFS checker: Leaked OIDs will be inserted in /lost+found/%s\n",
name);
}

rc = dfs_open(dfs, lf, name ? name : now_name, S_IFDIR | 0755,
Expand Down Expand Up @@ -6865,6 +6892,8 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
* Pass 1: check directories only and descend to mark all oids in the namespace of each dir.
* Pass 2: relink remaining oids in the L+F root that are unmarked still after first pass.
*/
D_PRINT("DFS checker: Checking unmarked OIDs (Pass 1)\n");
oit_args->num_scanned = 0;
memset(&anchor, 0, sizeof(anchor));
/** Start Pass 1 */
while (!daos_anchor_is_eof(&anchor)) {
Expand All @@ -6875,6 +6904,16 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
D_GOTO(out_lf2, rc = daos_der2errno(rc));
}

clock_gettime(CLOCK_REALTIME, &current_time);
if (rc)
D_GOTO(out_lf2, rc = errno);
oit_args->num_scanned += nr_entries;
if (current_time.tv_sec - oit_args->print_time >= DFS_ELAPSED_TIME) {
D_PRINT("DFS checker: Checked "DF_U64" objects (runtime: "DF_U64" sec)\n",
oit_args->num_scanned, current_time.tv_sec - oit_args->start_time);
oit_args->print_time = current_time.tv_sec;
}

for (i = 0; i < nr_entries; i++) {
if (flags & DFS_CHECK_RELINK) {
enum daos_otype_t otype = daos_obj_id2type(oids[i]);
Expand Down Expand Up @@ -6935,6 +6974,8 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
if (!(flags & DFS_CHECK_RELINK))
goto done;

D_PRINT("DFS checker: Checking unmarked OIDs (Pass 2)\n");
oit_args->num_scanned = 0;
memset(&anchor, 0, sizeof(anchor));
while (!daos_anchor_is_eof(&anchor)) {
nr_entries = DFS_ITER_NR;
Expand All @@ -6944,11 +6985,20 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
D_GOTO(out_lf2, rc = daos_der2errno(rc));
}

clock_gettime(CLOCK_REALTIME, &current_time);
if (rc)
D_GOTO(out_lf2, rc = errno);
oit_args->num_scanned += nr_entries;
if (current_time.tv_sec - oit_args->print_time >= DFS_ELAPSED_TIME) {
D_PRINT("DFS checker: Checked "DF_U64" objects (runtime: "DF_U64" sec)\n",
oit_args->num_scanned, current_time.tv_sec - oit_args->start_time);
oit_args->print_time = current_time.tv_sec;
}

for (i = 0; i < nr_entries; i++) {
struct dfs_entry entry = {0};
enum daos_otype_t otype = daos_obj_id2type(oids[i]);
char oid_name[DFS_MAX_NAME + 1];
daos_size_t len;

if (flags & DFS_CHECK_PRINT)
D_PRINT("oid["DF_U64"]: "DF_OID"\n", unmarked_entries,
Expand Down Expand Up @@ -7017,8 +7067,12 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
}

done:
if (flags & DFS_CHECK_PRINT)
D_PRINT("Number of Leaked OIDs in Namespace = "DF_U64"\n", unmarked_entries);
rc = clock_gettime(CLOCK_REALTIME, &current_time);
if (rc)
D_GOTO(out_lf2, rc = errno);
D_PRINT("DFS checker: Done! (runtime: "DF_U64" sec)\n",
current_time.tv_sec - oit_args->start_time);
D_PRINT("DFS checker: Number of leaked OIDs in namespace = "DF_U64"\n", unmarked_entries);
if (flags & DFS_CHECK_VERIFY) {
if (oit_args->failed) {
D_ERROR(""DF_U64" OIDs failed data consistency check!\n", oit_args->failed);
Expand Down Expand Up @@ -7058,6 +7112,7 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
rc2 = daos_cont_close(coh, NULL);
if (rc == 0)
rc = daos_der2errno(rc2);

return rc;
}

Expand Down
8 changes: 4 additions & 4 deletions utils/node_local_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3548,7 +3548,7 @@ def test_daos_fs_check(self):
assert rc.returncode == 0
output = rc.stdout.decode('utf-8')
line = output.splitlines()
if line[-1] != 'Number of Leaked OIDs in Namespace = 2':
if line[-1] != 'DFS checker: Number of leaked OIDs in namespace = 2':
raise NLTestFail('Wrong number of Leaked OIDs')

# run again to check nothing is detected
Expand All @@ -3558,7 +3558,7 @@ def test_daos_fs_check(self):
assert rc.returncode == 0
output = rc.stdout.decode('utf-8')
line = output.splitlines()
if line[-1] != 'Number of Leaked OIDs in Namespace = 0':
if line[-1] != 'DFS checker: Number of leaked OIDs in namespace = 0':
raise NLTestFail('Wrong number of Leaked OIDs')

# remount dfuse
Expand Down Expand Up @@ -3613,7 +3613,7 @@ def test_daos_fs_check(self):
assert rc.returncode == 0
output = rc.stdout.decode('utf-8')
line = output.splitlines()
if line[-1] != 'Number of Leaked OIDs in Namespace = 4':
if line[-1] != 'DFS checker: Number of leaked OIDs in namespace = 4':
raise NLTestFail('Wrong number of Leaked OIDs')

# run again to check nothing is detected
Expand All @@ -3623,7 +3623,7 @@ def test_daos_fs_check(self):
assert rc.returncode == 0
output = rc.stdout.decode('utf-8')
line = output.splitlines()
if line[-1] != 'Number of Leaked OIDs in Namespace = 0':
if line[-1] != 'DFS checker: Number of leaked OIDs in namespace = 0':
raise NLTestFail('Wrong number of Leaked OIDs')

# remount dfuse
Expand Down

0 comments on commit 3790bf7

Please sign in to comment.