Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-14445 dfuse: Add test for eviction of in-use container. #14139

Draft
wants to merge 27 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
9d2fa58
DAOS-13490 test: Update valgrind suppressions.
ashleypittman Oct 9, 2023
2dbb8ab
Add a test for container handle evict.
ashleypittman Oct 9, 2023
c676a2b
Merge branch 'master' into amd/dfuse-ch-evict
ashleypittman Oct 10, 2023
182e37f
Fix merge and spelling.
ashleypittman Oct 10, 2023
06de65b
Merge branch 'master' into amd/dfuse-ch-evict
ashleypittman Oct 31, 2023
f24842a
Merge branch 'master' into amd/dfuse-ch-evict
ashleypittman Nov 14, 2023
f8dc011
Add some more testing.
ashleypittman Nov 14, 2023
21ed551
Merge branch 'master' into amd/dfuse-ch-evict
ashleypittman Nov 30, 2023
19e2872
Fix warnings.
ashleypittman Nov 30, 2023
72f934d
Clean up some logging.
ashleypittman Nov 30, 2023
5f2805d
Revise the locking.
ashleypittman Nov 30, 2023
3568c7e
Merge branch 'master' into amd/dfuse-ch-evict
ashleypittman Jan 16, 2024
196d639
Merge branch 'master' into amd/dfuse-ch-evict
ashleypittman Jan 30, 2024
052b807
Back out conflict.
ashleypittman Feb 13, 2024
be6849a
Merge branch 'master' into amd/dfuse-ch-evict
ashleypittman Feb 13, 2024
8efd8ab
Back out change.
ashleypittman Feb 13, 2024
ec1e27d
Fix merge and update test.
ashleypittman Feb 13, 2024
c678195
Merge branch 'master' into amd/dfuse-ch-evict
ashleypittman Feb 26, 2024
ae55f24
Change the lookup fix.
ashleypittman Feb 26, 2024
080c2f1
Merge branch 'master' into amd/dfuse-ch-evict
ashleypittman Mar 18, 2024
9781740
Improve test.
ashleypittman Mar 18, 2024
d4849bd
Merge branch 'master' into amd/dfuse-ch-evict
ashleypittman Apr 9, 2024
9819c11
Merge branch 'master' into amd/dfuse-ch-evict
ashleypittman Apr 11, 2024
9cfd876
Remove debug changes.
ashleypittman Apr 11, 2024
805ec31
Merge branch 'master' into amd/dfuse-ch-evict
ashleypittman Apr 15, 2024
3493b24
Merge branch 'master' into amd/dfuse-ch-evict
ashleypittman Apr 23, 2024
7e8c33b
Merge branch 'master' into amd/dfuse-ch-evict
ashleypittman Sep 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 38 additions & 6 deletions src/client/dfuse/ops/lookup.c
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,12 @@ check_for_uns_ep(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie, ch
}
ie->ie_obj = NULL;

ie->ie_obj = 0;

/* This is where an I/O failure will happen when accessing an idle but evicted container */
rc = dfs_lookup(dfs->dfs_ns, "/", O_RDWR, &ie->ie_obj, NULL, &ie->ie_stat);
if (rc) {
ie->ie_stat.st_ino = dfs->dfs_ino;
if (rc == EINVAL) {
rc = ENOLINK;
DHS_INFO(dfs, rc, "dfs_lookup() failed");
Expand Down Expand Up @@ -247,15 +251,17 @@ check_for_uns_ep(struct dfuse_info *dfuse_info, struct dfuse_inode_entry *ie, ch
}

void
dfuse_cb_lookup(fuse_req_t req, struct dfuse_inode_entry *parent,
const char *name)
dfuse_cb_lookup(fuse_req_t req, struct dfuse_inode_entry *parent, const char *name)
{
struct dfuse_info *dfuse_info = fuse_req_userdata(req);
struct dfuse_inode_entry *ie;
int rc;
char out[DUNS_MAX_XATTR_LEN];
char *outp = &out[0];
daos_size_t attr_len = DUNS_MAX_XATTR_LEN;
bool evict = false;
ino_t pinode = parent->ie_stat.st_ino;
ino_t cinode = 0;

DFUSE_TRA_DEBUG(parent, "Parent:%#lx " DF_DE, parent->ie_stat.st_ino, DP_DE(name));

Expand Down Expand Up @@ -286,14 +292,21 @@ dfuse_cb_lookup(fuse_req_t req, struct dfuse_inode_entry *parent,

dfs_obj2id(ie->ie_obj, &ie->ie_oid);

dfuse_compute_inode(ie->ie_dfs, &ie->ie_oid,
&ie->ie_stat.st_ino);
dfuse_compute_inode(ie->ie_dfs, &ie->ie_oid, &ie->ie_stat.st_ino);

if (S_ISDIR(ie->ie_stat.st_mode) && attr_len) {
rc = check_for_uns_ep(dfuse_info, ie, out, attr_len);
DFUSE_TRA_DEBUG(ie, "check_for_uns_ep() returned %d", rc);
if (rc != 0)
D_GOTO(out_release, rc);
if (rc != 0) {
/* At this point, we know the dentry exists but there's an error, so try and
* evict the dentry afterwards
*/
if (rc == EINVAL || rc == EIO) {
evict = true;
cinode = ie->ie_stat.st_ino;
}
goto out_release;
}
}

dfuse_reply_entry(dfuse_info, ie, NULL, false, req);
Expand All @@ -308,4 +321,23 @@ dfuse_cb_lookup(fuse_req_t req, struct dfuse_inode_entry *parent,
DFUSE_REPLY_NO_ENTRY(parent, req, parent->ie_dfs->dfc_ndentry_timeout);
else
DFUSE_REPLY_ERR_RAW(parent, req, rc);

if (evict) {
D_INFO("Calling inval_entry %#lx " DF_DE " cinode %#lx", pinode, DP_DE(name),
cinode);
rc = fuse_lowlevel_notify_inval_entry(dfuse_info->di_session, pinode, name,
strnlen(name, NAME_MAX));
if (rc && rc != -ENOENT)
DS_ERROR(-rc, "inval_entry() failed");

if (cinode != 0) {
rc = fuse_lowlevel_notify_inval_inode(dfuse_info->di_session, cinode, 0, 0);
if (rc && rc != -ENOENT)
DS_ERROR(-rc, "inval_inode() cinode %#lx failed", cinode);
}

rc = fuse_lowlevel_notify_inval_inode(dfuse_info->di_session, pinode, 0, 0);
if (rc && rc != -ENOENT)
DS_ERROR(-rc, "inval_inode() pinode %#lx failed", pinode);
}
}
2 changes: 1 addition & 1 deletion src/client/dfuse/ops/opendir.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2016-2023 Intel Corporation.
* (C) Copyright 2016-2024 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down
2 changes: 1 addition & 1 deletion src/include/daos/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -639,7 +639,6 @@ daos_der2errno(int err)
case -DER_NOTYPE:
case -DER_NOSCHEMA:
case -DER_NOLOCAL:
case -DER_NO_HDL:
case -DER_IO_INVAL: return EINVAL;
case -DER_KEY2BIG:
case -DER_REC2BIG: return E2BIG;
Expand All @@ -654,6 +653,7 @@ daos_der2errno(int err)
case -DER_EQ_BUSY: return EBUSY;
case -DER_AGAIN: return EAGAIN;
case -DER_PROTO: return EPROTO;
case -DER_NO_HDL:
case -DER_IO: return EIO;
case -DER_CANCELED: return ECANCELED;
case -DER_OVERFLOW: return EOVERFLOW;
Expand Down
247 changes: 245 additions & 2 deletions utils/node_local_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4814,7 +4814,8 @@ def sizeof_fmt(num, suffix='B'):
lto.hide_fi_calls = skip_fi

if ignore_einval:
lto.skip_suffixes.append(': 22 (Invalid argument)')
# lto.skip_suffixes.append(': 22 (Invalid argument)')
lto.skip_suffixes.append(': 5 (Input/output error)')
lto.skip_suffixes.append(" DER_NO_HDL(-1002): 'Invalid handle'")

if ignore_busy:
Expand Down Expand Up @@ -4968,6 +4969,242 @@ def run_dfuse(server, conf):
return fatal_errors.errors


def run_evict_test(server):
    """Run dfuse, do some I/O and then evict the container to see what happens.

    In this variant no file is held open across the eviction: the test file is written and
    closed first, and only afterwards is new I/O (dd) attempted against the evicted container.

    Create a container which will be persistent, then create a new container within that to
    test on.

    <root>                                      first container
    <root>/intermediate_dir
    <root>/intermediate_dir/subcont             second (evicted) container
    <root>/intermediate_dir/subcont/test_dir
    <root>/intermediate_dir/subcont/test_dir/testfile   written and closed before eviction
    <root>/intermediate_dir/subcont/test_dir/dd_file    opened after eviction

    Args:
        server: the test server object; supplies the configuration and the test pool.

    Returns:
        False, which the caller passes to fatal_errors.add_result().
    """
    pool = server.get_test_pool_obj()

    container = create_cont(server.conf, pool=pool, label='evict_cont', ctype='POSIX')

    # NOTE(review): this second fetch of the pool object looks redundant; confirm that
    # get_test_pool_obj() has no side effects before removing it.
    pool = server.get_test_pool_obj()
    dfuse = DFuse(server, server.conf, container=container, caching=False)
    dfuse.start()

    p_path = join(dfuse.dir, "intermediate_dir")
    os.mkdir(p_path)
    cont_path = join(p_path, 'subcont')

    sub_cont = create_cont(server.conf, pool=pool, path=cont_path)

    test_path = join(cont_path, "test_dir")

    os.mkdir(test_path)

    # Sample the inode number of the test dir in the sub container, and check it's accessible.
    sub_stat = os.stat(test_path)
    dfuse_stat = dfuse.check_usage(ino=sub_stat.st_ino)
    print(dfuse_stat)

    # The file is only needed for this one write, so let the context manager close it even if
    # the write raises.
    with open(join(test_path, 'testfile'), 'wb', buffering=0) as fd:
        fd.write(b'hello')

    # Evict the container.
    run_daos_cmd(server.conf, ['container', 'evict', '--all', pool.id(), sub_cont.id()])

    # The evict can take some time to come into effect.
    time.sleep(20)

    # The exit status of dd is deliberately not checked (check=False).
    subprocess.run(['dd', 'if=/dev/zero', 'bs=16k', 'count=64',  # nosec
                    f'of={join(test_path, "dd_file")}'], check=False)

    time.sleep(2)

    print(os.stat(cont_path))

    # Now evict the whole new container.  This will cause dfuse to flush everything and then
    # unmount the container.
    # Any missing refs in dfuse should cause an assertion, any extra refs should cause the number
    # of inodes to be incorrect.
    dfuse.evict_and_wait([p_path])

    # Now check there is only the root inode, everything else should be disconnected/closed.
    dfuse.check_usage(inodes=1, open_files=1, pools=1, containers=1)

    dfuse.stop(ignore_einval=True)
    return False


def run_evict_test_fd(server):
    """Run dfuse, do some I/O and then evict the container to see what happens.

    In this variant a file is held open across the eviction, and a write through that open
    file afterwards is required to fail with EIO.

    Create a container which will be persistent, then create a new container within that to
    test on.

    <root>                                      first container
    <root>/intermediate_dir
    <root>/intermediate_dir/subcont             second (evicted) container
    <root>/intermediate_dir/subcont/test_dir
    <root>/intermediate_dir/subcont/test_dir/testfile   file with I/O errors

    Args:
        server: the test server object; supplies the configuration and the test pool.

    Returns:
        False, which the caller passes to fatal_errors.add_result().

    Raises:
        AssertionError: if the write after eviction unexpectedly succeeds.
        OSError: if the write after eviction fails with an errno other than EIO.
    """
    pool = server.get_test_pool_obj()

    container = create_cont(server.conf, pool=pool, label='evict_cont', ctype='POSIX')

    # NOTE(review): this second fetch of the pool object looks redundant; confirm that
    # get_test_pool_obj() has no side effects before removing it.
    pool = server.get_test_pool_obj()
    dfuse = DFuse(server, server.conf, container=container, caching=False)
    dfuse.start()

    p_path = join(dfuse.dir, "intermediate_dir")
    os.mkdir(p_path)
    cont_path = join(p_path, 'subcont')

    sub_cont = create_cont(server.conf, pool=pool, path=cont_path)

    test_path = join(cont_path, "test_dir")

    os.mkdir(test_path)

    # Sample the inode number of the test dir in the sub container, and check it's accessible.
    sub_stat = os.stat(test_path)
    dfuse_stat = dfuse.check_usage(ino=sub_stat.st_ino)
    print(dfuse_stat)

    # The file has to stay open across the eviction, so a context manager cannot be used here.
    # pylint: disable-next=consider-using-with
    fd = open(join(test_path, 'testfile'), 'wb', buffering=0)
    fd.write(b'hello')

    # Evict the container.
    run_daos_cmd(server.conf, ['container', 'evict', '--all', pool.id(), sub_cont.id()])

    # The evict can take some time to come into effect.
    time.sleep(20)

    # The write must fail with EIO.  Raise explicitly rather than using a bare assert so the
    # check is not stripped when Python runs with -O, and so that a failure explains itself.
    try:
        fd.write(b'world')
    except OSError as error:
        if error.errno != errno.EIO:
            raise
    else:
        raise AssertionError('Write to evicted container should have failed with EIO')

    fd.close()

    time.sleep(2)

    # Try a lookup again without any open files to trigger an inval_dentry in dfuse.
    print(os.stat(cont_path))

    # Now evict the whole new container.  This will cause dfuse to flush everything and then
    # unmount the container.
    # Any missing refs in dfuse should cause an assertion, any extra refs should cause the number
    # of inodes to be incorrect.
    dfuse.evict_and_wait([p_path])

    # Now check there is only the root inode, everything else should be disconnected/closed.
    dfuse.check_usage(inodes=1, open_files=1, pools=1, containers=1)

    dfuse.stop(ignore_einval=True)
    return False


def run_evict_test_fd_dd(server):
    """Run dfuse, do some I/O and then evict the container to see what happens.

    In this variant a file is held open across the eviction, a write through it is required to
    fail with EIO, dd is then run against a new file while the first is still open, and finally
    a stat of the container path is required to fail with EIO.  After that the inode sampled
    before the eviction must stop being reported as resident by dfuse.

    Create a container which will be persistent, then create a new container within that to
    test on.

    <root>                                      first container
    <root>/intermediate_dir
    <root>/intermediate_dir/subcont             second (evicted) container
    <root>/intermediate_dir/subcont/test_dir
    <root>/intermediate_dir/subcont/test_dir/testfile   file with I/O errors
    <root>/intermediate_dir/subcont/test_dir/dd_file    opened after eviction

    Args:
        server: the test server object; supplies the configuration and the test pool.

    Returns:
        False, which the caller passes to fatal_errors.add_result().

    Raises:
        AssertionError: if an operation expected to fail succeeds, or if the sampled inode is
            still resident after three checks.
        OSError: if an expected failure carries an errno other than EIO.
    """
    pool = server.get_test_pool_obj()

    container = create_cont(server.conf, pool=pool, label='evict_cont', ctype='POSIX')

    # NOTE(review): this second fetch of the pool object looks redundant; confirm that
    # get_test_pool_obj() has no side effects before removing it.
    pool = server.get_test_pool_obj()
    dfuse = DFuse(server, server.conf, container=container, caching=False)
    dfuse.start()

    p_path = join(dfuse.dir, "intermediate_dir")
    os.mkdir(p_path)
    cont_path = join(p_path, 'subcont')

    sub_cont = create_cont(server.conf, pool=pool, path=cont_path)

    test_path = join(cont_path, "test_dir")

    os.mkdir(test_path)

    # Sample the inode number of the test dir in the sub container, and check it's accessible.
    sub_stat = os.stat(test_path)
    dfuse_stat = dfuse.check_usage(ino=sub_stat.st_ino)
    print(dfuse_stat)

    # The file has to stay open across the eviction, so a context manager cannot be used here.
    # pylint: disable-next=consider-using-with
    fd = open(join(test_path, 'testfile'), 'wb', buffering=0)
    fd.write(b'hello')

    # Evict the container.
    run_daos_cmd(server.conf, ['container', 'evict', '--all', pool.id(), sub_cont.id()])

    # The evict can take some time to come into effect.
    time.sleep(20)

    # The write must fail with EIO.  Raise explicitly rather than using a bare assert so the
    # check is not stripped when Python runs with -O, and so that a failure explains itself.
    try:
        fd.write(b'world')
    except OSError as error:
        if error.errno != errno.EIO:
            raise
    else:
        raise AssertionError('Write to evicted container should have failed with EIO')

    # The exit status of dd is deliberately not checked (check=False).
    subprocess.run(['dd', 'if=/dev/zero', 'bs=16k', 'count=64',  # nosec
                    f'of={join(test_path, "dd_file")}'], check=False)

    fd.close()

    time.sleep(2)

    # Try a lookup again without any open files to trigger an inval_dentry in dfuse.
    try:
        print(os.stat(cont_path))
    except OSError as error:
        if error.errno != errno.EIO:
            raise
    else:
        raise AssertionError('Stat of evicted container should have failed with EIO')

    # Re-sample the sub-container to see if it's been evicted.  This does not access it but queries
    # the mount.  The logic here is that dfuse should have received an I/O error and therefore
    # evicted the inode.
    # Check up to three times, sleeping one second between checks.
    for remaining in range(2, -1, -1):
        dfuse_stat = dfuse.check_usage(ino=sub_stat.st_ino)
        print(dfuse_stat)
        if dfuse_stat['resident'] is False:
            print('Path has been evicted')
            break
        if remaining == 0:
            message = f'Path with inode 0x{sub_stat.st_ino:x} should have been evicted'
            print(message)
            raise AssertionError(message)
        time.sleep(1)

    # Now evict the whole new container.  This will cause dfuse to flush everything and then
    # unmount the container.
    # Any missing refs in dfuse should cause an assertion, any extra refs should cause the number
    # of inodes to be incorrect.
    dfuse.evict_and_wait([p_path])

    # Now check there is only the root inode, everything else should be disconnected/closed.
    dfuse.check_usage(inodes=1, open_files=1, pools=1, containers=1)

    dfuse.stop(ignore_einval=True)
    return False


def run_in_fg(server, conf, args):
"""Run dfuse in the foreground.

Expand Down Expand Up @@ -6348,6 +6585,7 @@ def run(wf, args):
else:
with DaosServer(conf, test_class='first', wf=wf_server,
fatal_errors=fatal_errors) as server:

if args.mode == 'launch':
run_in_fg(server, conf, args)
elif args.mode == 'overlay':
Expand All @@ -6362,6 +6600,10 @@ def run(wf, args):
test_pydaos_kv(server, conf)
test_pydaos_kv_obj_class(server, conf)
fatal_errors.add_result(server.set_fi())
elif args.mode == 'evict-test':
fatal_errors.add_result(run_evict_test(server))
fatal_errors.add_result(run_evict_test_fd(server))
fatal_errors.add_result(run_evict_test_fd_dd(server))
elif args.test == 'all':
fatal_errors.add_result(run_posix_tests(server, conf))
elif args.test:
Expand Down Expand Up @@ -6498,7 +6740,8 @@ def main():
parser.add_argument('--log-usage-save')
parser.add_argument('--dtx', action='store_true')
parser.add_argument('--test', help="Use '--test list' for list")
parser.add_argument('mode', nargs='*')
parser.add_argument('mode', nargs='*', choices=['fi', 'all', 'overlay', 'set-fi', 'launch',
'evict-test', []])
args = parser.parse_args()

if args.server_fi:
Expand Down
Loading