Skip to content

Commit

Permalink
DAOS-13175 dfs: print progress of dfs mwc checker
Browse files Browse the repository at this point in the history
Every 30 seconds, print how many objects have been scanned in the DFS namespace or checked in the OIT. This lets users see that the checker is making progress and is not hung when the namespace is large.

Required-githooks: true

Signed-off-by: Mohamad Chaarawi <mohamad.chaarawi@intel.com>
  • Loading branch information
mchaarawi committed Jun 21, 2023
1 parent a3c57e5 commit 1b34850
Showing 1 changed file with 48 additions and 17 deletions.
65 changes: 48 additions & 17 deletions src/client/dfs/dfs.c
Original file line number Diff line number Diff line change
Expand Up @@ -6454,13 +6454,16 @@ dfs_dir_anchor_set(dfs_obj_t *obj, const char name[], daos_anchor_t *anchor)
/** Number of entries requested per batch; dfs_cont_check() resets nr_entries to this value. */
#define DFS_ITER_NR 128
/** Size in bytes of DFS_ITER_NR uint64_t values (presumably a dkey buffer -- confirm at use site). */
#define DFS_ITER_DKEY_BUF (DFS_ITER_NR * sizeof(uint64_t))
/** Size in bytes needed for DFS_ITER_NR names of DFS_MAX_NAME bytes each. */
#define DFS_ITER_ENTRY_BUF (DFS_ITER_NR * DFS_MAX_NAME)
/** Minimum number of seconds between two progress messages printed by the container checker. */
#define DFS_ELAPSED_TIME 30

/** State shared between dfs_cont_check() and the oit_mark_cb() iteration callback. */
struct dfs_oit_args {
/** OIT handle filled in by daos_oit_open() and used to mark / list objects. */
daos_handle_t oit;
/** Copy of the DFS_CHECK_* flags passed to dfs_cont_check(). */
uint64_t flags;
/** Epoch of the snapshot created for the OIT. */
uint64_t snap_epoch;
/** NOTE(review): presumably a count of skipped entries; not used in the visible code -- confirm. */
uint64_t skipped;
/** NOTE(review): presumably a count of failed entries; not used in the visible code -- confirm. */
uint64_t failed;
/**
 * Time (seconds) of the last progress report. Initialized when the check starts and reset
 * each time a progress message is printed, so messages are DFS_ELAPSED_TIME seconds apart.
 */
time_t start_time;
/**
 * Progress counter reported in the periodic messages: incremented once per oit_mark_cb()
 * call while iterating the namespace, then reset and advanced by the number of listed
 * entries while checking unmarked OIDs.
 */
uint64_t num_scanned;
};

static int
Expand Down Expand Up @@ -6556,8 +6559,17 @@ oit_mark_cb(dfs_t *dfs, dfs_obj_t *parent, const char name[], void *args)
daos_obj_id_t oid;
d_iov_t marker;
bool mark_data = true;
struct timespec current_time;
int rc;

clock_gettime(CLOCK_REALTIME, &current_time);
oit_args->num_scanned ++;
if (current_time.tv_sec - oit_args->start_time >= DFS_ELAPSED_TIME) {
D_PRINT("DFS container check: Scanned "DF_U64" files/directories in namespace\n",
oit_args->num_scanned);
oit_args->start_time = current_time.tv_sec;
}

/** open the entry name and get the oid */
rc = dfs_lookup_rel(dfs, parent, name, O_RDONLY, &obj, NULL, NULL);
if (rc) {
Expand Down Expand Up @@ -6707,8 +6719,20 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
uid_t uid = geteuid();
gid_t gid = getegid();
unsigned int co_flags = DAOS_COO_EX;
char now_name[24];
struct tm *now_tm;
daos_size_t len;
int rc, rc2;

rc = clock_gettime(CLOCK_REALTIME, &now);
if (rc)
return errno;
now_tm = localtime(&now.tv_sec);
len = strftime(now_name, sizeof(now_name), "%Y-%m-%d-%H:%M:%S", now_tm);
if (len == 0)
return EINVAL;
D_PRINT("DFS container check: Start (%s)\n", now_name);

if (flags & DFS_CHECK_RELINK && flags & DFS_CHECK_REMOVE) {
D_ERROR("can't request remove and link to l+f at the same time\n");
return EINVAL;
Expand All @@ -6729,6 +6753,7 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
D_GOTO(out_cont, rc);
}

D_PRINT("DFS container check: Create OIT table\n");
/** create snapshot for OIT */
rc = daos_cont_create_snap_opt(coh, &snap_epoch, NULL, DAOS_SNAP_OPT_CR | DAOS_SNAP_OPT_OIT,
NULL);
Expand All @@ -6742,6 +6767,7 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
D_GOTO(out_dfs, rc = ENOMEM);
oit_args->flags = flags;
oit_args->snap_epoch = snap_epoch;
oit_args->start_time = now.tv_sec;

/** Open OIT table */
rc = daos_oit_open(coh, snap_epoch, &oit_args->oit, NULL);
Expand All @@ -6758,10 +6784,11 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
D_GOTO(out_oit, rc = daos_der2errno(rc));
}
rc = daos_oit_mark(oit_args->oit, dfs->root.oid, &marker, NULL);
if (rc) {
if (rc && rc != -DER_NONEXIST) {
D_ERROR("Failed to mark ROOT OID in OIT: "DF_RC"\n", DP_RC(rc));
D_GOTO(out_oit, rc = daos_der2errno(rc));
}
rc = 0;

if (flags & DFS_CHECK_VERIFY) {
rc = daos_obj_verify(coh, dfs->super_oid, snap_epoch);
Expand Down Expand Up @@ -6791,6 +6818,8 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
}
}

D_PRINT("DFS container check: Iterating namespace and marking objects\n");
oit_args->num_scanned = 2;
/** iterate through the namespace and mark OITs starting from the root object */
while (!daos_anchor_is_eof(&anchor)) {
rc = dfs_iterate(dfs, &dfs->root, &anchor, &nr_entries, DFS_MAX_NAME * nr_entries,
Expand All @@ -6802,15 +6831,11 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *

nr_entries = DFS_ITER_NR;
}
D_PRINT("DFS container check: Done (Marked "DF_U64" files/directories)\n",
oit_args->num_scanned);

/** Create lost+found directory and properly link unmarked oids there. */
/** Create lost+found directory to link unmarked oids there. */
if (flags & DFS_CHECK_RELINK) {
char now_name[24];

rc = clock_gettime(CLOCK_REALTIME, &now);
if (rc)
D_GOTO(out_oit, rc = errno);

rc = dfs_open(dfs, NULL, "lost+found", S_IFDIR | 0755, O_CREAT | O_RDWR, 0, 0, NULL,
&lf);
if (rc) {
Expand All @@ -6819,18 +6844,10 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
}

if (name == NULL) {
struct tm *now_tm;
size_t len;
/*
* Create a directory with current timestamp in l+f where leaked oids will
* be linked in this run.
*/
now_tm = localtime(&now.tv_sec);
len = strftime(now_name, sizeof(now_name), "%Y-%m-%d-%H:%M:%S", now_tm);
if (len == 0) {
D_ERROR("Invalid time format\n");
D_GOTO(out_lf1, rc = EINVAL);
}
D_PRINT("Leaked OIDs will be inserted in /lost+found/%s\n", now_name);
} else {
D_PRINT("Leaked OIDs will be inserted in /lost+found/%s\n", name);
Expand Down Expand Up @@ -6865,16 +6882,28 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
* Pass 1: check directories only and descend to mark all oids in the namespace of each dir.
* Pass 2: relink remaining oids in the L+F root that are unmarked still after first pass.
*/
D_PRINT("DFS container check: Checking unmarked OIDs (Pass 1)\n");
oit_args->num_scanned = 0;
memset(&anchor, 0, sizeof(anchor));
/** Start Pass 1 */
while (!daos_anchor_is_eof(&anchor)) {
struct timespec current_time;

nr_entries = DFS_ITER_NR;
rc = daos_oit_list_unmarked(oit_args->oit, oids, &nr_entries, &anchor, NULL);
if (rc) {
D_ERROR("daos_oit_list_unmarked() failed: "DF_RC"\n", DP_RC(rc));
D_GOTO(out_lf2, rc = daos_der2errno(rc));
}

clock_gettime(CLOCK_REALTIME, &current_time);
oit_args->num_scanned += nr_entries;
if (current_time.tv_sec - oit_args->start_time >= DFS_ELAPSED_TIME) {
D_PRINT("DFS container check: Checked "DF_U64" unmarked objects in OIT\n",
oit_args->num_scanned);
oit_args->start_time = current_time.tv_sec;
}

for (i = 0; i < nr_entries; i++) {
if (flags & DFS_CHECK_RELINK) {
enum daos_otype_t otype = daos_obj_id2type(oids[i]);
Expand Down Expand Up @@ -6935,6 +6964,7 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
if (!(flags & DFS_CHECK_RELINK))
goto done;

D_PRINT("DFS container check: Checking unmarked OIDs (Pass 2)\n");
memset(&anchor, 0, sizeof(anchor));
while (!daos_anchor_is_eof(&anchor)) {
nr_entries = DFS_ITER_NR;
Expand All @@ -6948,7 +6978,6 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
struct dfs_entry entry = {0};
enum daos_otype_t otype = daos_obj_id2type(oids[i]);
char oid_name[DFS_MAX_NAME + 1];
daos_size_t len;

if (flags & DFS_CHECK_PRINT)
D_PRINT("oid["DF_U64"]: "DF_OID"\n", unmarked_entries,
Expand Down Expand Up @@ -7058,6 +7087,8 @@ dfs_cont_check(daos_handle_t poh, const char *cont, uint64_t flags, const char *
rc2 = daos_cont_close(coh, NULL);
if (rc == 0)
rc = daos_der2errno(rc2);

D_PRINT("DFS container check: Done!\n");
return rc;
}

Expand Down

0 comments on commit 1b34850

Please sign in to comment.