Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-13175 dfs: print progress of dfs mwc checker #12470

Merged
merged 1 commit into from
Jun 26, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 77 additions & 22 deletions src/client/dfs/dfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -6454,13 +6454,17 @@ dfs_dir_anchor_set(dfs_obj_t *obj, const char name[], daos_anchor_t *anchor)
/** Number of entries requested per iteration call by the checker loops (nr_entries is reset to this). */
#define DFS_ITER_NR 128
/** Size in bytes of a buffer holding DFS_ITER_NR 64-bit values; NOTE(review): usage not visible here, presumably dkey enumeration - confirm. */
#define DFS_ITER_DKEY_BUF (DFS_ITER_NR * sizeof(uint64_t))
/** Size in bytes of a buffer with room for DFS_ITER_NR names of DFS_MAX_NAME bytes each. */
#define DFS_ITER_ENTRY_BUF (DFS_ITER_NR * DFS_MAX_NAME)
/** Minimum number of seconds between two progress messages printed by the DFS checker. */
#define DFS_ELAPSED_TIME 30

/**
 * State shared between dfs_cont_check() and the oit_mark_cb() iteration callback while the
 * DFS checker walks the namespace and marks objects in the Object ID Table (OIT).
 */
struct dfs_oit_args {
/** OIT handle, opened with daos_oit_open() and passed to daos_oit_mark() */
daos_handle_t oit;
/** DFS_CHECK_* flags as passed by the caller of dfs_cont_check() */
uint64_t flags;
/** epoch of the snapshot created for the OIT */
uint64_t snap_epoch;
/** NOTE(review): not referenced in the visible code; presumably a count of skipped objects - confirm */
uint64_t skipped;
/** number of OIDs reported as having failed the data consistency check (DFS_CHECK_VERIFY) */
uint64_t failed;
/** time (seconds, CLOCK_REALTIME) at which the checker started; used to report total runtime */
time_t start_time;
/** time (seconds) of the last progress message; compared against DFS_ELAPSED_TIME */
time_t print_time;
/** running count of entries/objects processed in the current phase; reset before each pass */
uint64_t num_scanned;
};

static int
Expand Down Expand Up @@ -6556,8 +6560,19 @@ oit_mark_cb(dfs_t *dfs, dfs_obj_t *parent, const char name[], void *args)
daos_obj_id_t oid;
d_iov_t marker;
bool mark_data = true;
struct timespec current_time;
int rc;

rc = clock_gettime(CLOCK_REALTIME, &current_time);
if (rc)
return errno;
oit_args->num_scanned ++;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nit] oit_args->num_scanned++;

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure. I can fix that in another Coverity PR I have.

if (current_time.tv_sec - oit_args->print_time >= DFS_ELAPSED_TIME) {
D_PRINT("DFS checker: Scanned "DF_U64" files/directories (runtime: "DF_U64" sec)\n",
oit_args->num_scanned, current_time.tv_sec - oit_args->start_time);
oit_args->print_time = current_time.tv_sec;
}

/** open the entry name and get the oid */
rc = dfs_lookup_rel(dfs, parent, name, O_RDONLY, &obj, NULL, NULL);
if (rc) {
Expand Down Expand Up @@ -6703,12 +6718,24 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
d_iov_t marker;
bool mark_data = true;
daos_epoch_range_t epr;
struct timespec now;
struct timespec now, current_time;
uid_t uid = geteuid();
gid_t gid = getegid();
unsigned int co_flags = DAOS_COO_EX;
char now_name[24];
struct tm *now_tm;
daos_size_t len;
int rc, rc2;

rc = clock_gettime(CLOCK_REALTIME, &now);
if (rc)
return errno;
now_tm = localtime(&now.tv_sec);
len = strftime(now_name, sizeof(now_name), "%Y-%m-%d-%H:%M:%S", now_tm);
if (len == 0)
return EINVAL;
D_PRINT("DFS checker: Start (%s)\n", now_name);

if (flags & DFS_CHECK_RELINK && flags & DFS_CHECK_REMOVE) {
D_ERROR("can't request remove and link to l+f at the same time\n");
return EINVAL;
Expand All @@ -6729,6 +6756,7 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
D_GOTO(out_cont, rc);
}

D_PRINT("DFS checker: Create OIT table\n");
/** create snapshot for OIT */
rc = daos_cont_create_snap_opt(coh, &snap_epoch, NULL, DAOS_SNAP_OPT_CR | DAOS_SNAP_OPT_OIT,
NULL);
Expand All @@ -6742,6 +6770,8 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
D_GOTO(out_dfs, rc = ENOMEM);
oit_args->flags = flags;
oit_args->snap_epoch = snap_epoch;
oit_args->start_time = now.tv_sec;
oit_args->print_time = now.tv_sec;

/** Open OIT table */
rc = daos_oit_open(coh, snap_epoch, &oit_args->oit, NULL);
Expand All @@ -6758,10 +6788,11 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
D_GOTO(out_oit, rc = daos_der2errno(rc));
}
rc = daos_oit_mark(oit_args->oit, dfs->root.oid, &marker, NULL);
if (rc) {
if (rc && rc != -DER_NONEXIST) {
D_ERROR("Failed to mark ROOT OID in OIT: "DF_RC"\n", DP_RC(rc));
D_GOTO(out_oit, rc = daos_der2errno(rc));
}
rc = 0;

if (flags & DFS_CHECK_VERIFY) {
rc = daos_obj_verify(coh, dfs->super_oid, snap_epoch);
Expand Down Expand Up @@ -6791,6 +6822,8 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
}
}

D_PRINT("DFS checker: Iterating namespace and marking objects\n");
oit_args->num_scanned = 2;
/** iterate through the namespace and mark OITs starting from the root object */
while (!daos_anchor_is_eof(&anchor)) {
rc = dfs_iterate(dfs, &dfs->root, &anchor, &nr_entries, DFS_MAX_NAME * nr_entries,
Expand All @@ -6803,14 +6836,14 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
nr_entries = DFS_ITER_NR;
}

/** Create lost+found directory and properly link unmarked oids there. */
if (flags & DFS_CHECK_RELINK) {
char now_name[24];

rc = clock_gettime(CLOCK_REALTIME, &now);
if (rc)
D_GOTO(out_oit, rc = errno);
rc = clock_gettime(CLOCK_REALTIME, &current_time);
if (rc)
D_GOTO(out_oit, rc = errno);
D_PRINT("DFS checker: marked "DF_U64" files/directories (runtime: "DF_U64" sec))\n",
oit_args->num_scanned, current_time.tv_sec - oit_args->start_time);

/** Create lost+found directory to link unmarked oids there. */
if (flags & DFS_CHECK_RELINK) {
rc = dfs_open(dfs, NULL, "lost+found", S_IFDIR | 0755, O_CREAT | O_RDWR, 0, 0, NULL,
&lf);
if (rc) {
Expand All @@ -6819,21 +6852,15 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
}

if (name == NULL) {
struct tm *now_tm;
size_t len;
/*
* Create a directory with current timestamp in l+f where leaked oids will
* be linked in this run.
*/
now_tm = localtime(&now.tv_sec);
len = strftime(now_name, sizeof(now_name), "%Y-%m-%d-%H:%M:%S", now_tm);
if (len == 0) {
D_ERROR("Invalid time format\n");
D_GOTO(out_lf1, rc = EINVAL);
}
D_PRINT("Leaked OIDs will be inserted in /lost+found/%s\n", now_name);
D_PRINT("DFS checker: Leaked OIDs will be inserted in /lost+found/%s\n",
now_name);
} else {
D_PRINT("Leaked OIDs will be inserted in /lost+found/%s\n", name);
D_PRINT("DFS checker: Leaked OIDs will be inserted in /lost+found/%s\n",
name);
}

rc = dfs_open(dfs, lf, name ? name : now_name, S_IFDIR | 0755,
Expand Down Expand Up @@ -6865,6 +6892,8 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
* Pass 1: check directories only and descend to mark all oids in the namespace of each dir.
* Pass 2: relink remaining oids in the L+F root that are unmarked still after first pass.
*/
D_PRINT("DFS checker: Checking unmarked OIDs (Pass 1)\n");
oit_args->num_scanned = 0;
memset(&anchor, 0, sizeof(anchor));
/** Start Pass 1 */
while (!daos_anchor_is_eof(&anchor)) {
Expand All @@ -6875,6 +6904,16 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
D_GOTO(out_lf2, rc = daos_der2errno(rc));
}

clock_gettime(CLOCK_REALTIME, &current_time);
if (rc)
D_GOTO(out_lf2, rc = errno);
oit_args->num_scanned += nr_entries;
if (current_time.tv_sec - oit_args->print_time >= DFS_ELAPSED_TIME) {
D_PRINT("DFS checker: Checked "DF_U64" objects (runtime: "DF_U64" sec)\n",
oit_args->num_scanned, current_time.tv_sec - oit_args->start_time);
oit_args->print_time = current_time.tv_sec;
}

for (i = 0; i < nr_entries; i++) {
if (flags & DFS_CHECK_RELINK) {
enum daos_otype_t otype = daos_obj_id2type(oids[i]);
Expand Down Expand Up @@ -6935,6 +6974,8 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
if (!(flags & DFS_CHECK_RELINK))
goto done;

D_PRINT("DFS checker: Checking unmarked OIDs (Pass 2)\n");
oit_args->num_scanned = 0;
memset(&anchor, 0, sizeof(anchor));
while (!daos_anchor_is_eof(&anchor)) {
nr_entries = DFS_ITER_NR;
Expand All @@ -6944,11 +6985,20 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
D_GOTO(out_lf2, rc = daos_der2errno(rc));
}

clock_gettime(CLOCK_REALTIME, &current_time);
if (rc)
D_GOTO(out_lf2, rc = errno);
oit_args->num_scanned += nr_entries;
if (current_time.tv_sec - oit_args->print_time >= DFS_ELAPSED_TIME) {
D_PRINT("DFS checker: Checked "DF_U64" objects (runtime: "DF_U64" sec)\n",
oit_args->num_scanned, current_time.tv_sec - oit_args->start_time);
oit_args->print_time = current_time.tv_sec;
}

for (i = 0; i < nr_entries; i++) {
struct dfs_entry entry = {0};
enum daos_otype_t otype = daos_obj_id2type(oids[i]);
char oid_name[DFS_MAX_NAME + 1];
daos_size_t len;

if (flags & DFS_CHECK_PRINT)
D_PRINT("oid["DF_U64"]: "DF_OID"\n", unmarked_entries,
Expand Down Expand Up @@ -7017,8 +7067,12 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
}

done:
if (flags & DFS_CHECK_PRINT)
D_PRINT("Number of Leaked OIDs in Namespace = "DF_U64"\n", unmarked_entries);
rc = clock_gettime(CLOCK_REALTIME, &current_time);
if (rc)
D_GOTO(out_lf2, rc = errno);
D_PRINT("DFS checker: Done! (runtime: "DF_U64" sec)\n",
current_time.tv_sec - oit_args->start_time);
D_PRINT("DFS checker: Number of leaked OIDs in namespace = "DF_U64"\n", unmarked_entries);
if (flags & DFS_CHECK_VERIFY) {
if (oit_args->failed) {
D_ERROR(""DF_U64" OIDs failed data consistency check!\n", oit_args->failed);
Expand Down Expand Up @@ -7058,6 +7112,7 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
rc2 = daos_cont_close(coh, NULL);
if (rc == 0)
rc = daos_der2errno(rc2);

return rc;
}

Expand Down
8 changes: 4 additions & 4 deletions utils/node_local_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3548,7 +3548,7 @@ def test_daos_fs_check(self):
assert rc.returncode == 0
output = rc.stdout.decode('utf-8')
line = output.splitlines()
if line[-1] != 'Number of Leaked OIDs in Namespace = 2':
if line[-1] != 'DFS checker: Number of leaked OIDs in namespace = 2':
raise NLTestFail('Wrong number of Leaked OIDs')

# run again to check nothing is detected
Expand All @@ -3558,7 +3558,7 @@ def test_daos_fs_check(self):
assert rc.returncode == 0
output = rc.stdout.decode('utf-8')
line = output.splitlines()
if line[-1] != 'Number of Leaked OIDs in Namespace = 0':
if line[-1] != 'DFS checker: Number of leaked OIDs in namespace = 0':
raise NLTestFail('Wrong number of Leaked OIDs')

# remount dfuse
Expand Down Expand Up @@ -3613,7 +3613,7 @@ def test_daos_fs_check(self):
assert rc.returncode == 0
output = rc.stdout.decode('utf-8')
line = output.splitlines()
if line[-1] != 'Number of Leaked OIDs in Namespace = 4':
if line[-1] != 'DFS checker: Number of leaked OIDs in namespace = 4':
raise NLTestFail('Wrong number of Leaked OIDs')

# run again to check nothing is detected
Expand All @@ -3623,7 +3623,7 @@ def test_daos_fs_check(self):
assert rc.returncode == 0
output = rc.stdout.decode('utf-8')
line = output.splitlines()
if line[-1] != 'Number of Leaked OIDs in Namespace = 0':
if line[-1] != 'DFS checker: Number of leaked OIDs in namespace = 0':
raise NLTestFail('Wrong number of Leaked OIDs')

# remount dfuse
Expand Down