Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-11283 test: Enhance network failure test with IO. #10133

Draft
wants to merge 6 commits into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 109 additions & 9 deletions src/tests/ftest/deployment/network_failure.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
"""

Check failure on line 1 in src/tests/ftest/deployment/network_failure.py

View workflow job for this annotation

GitHub Actions / Python isort

Imports are incorrectly sorted and/or formatted.
(C) Copyright 2022-2024 Intel Corporation.

SPDX-License-Identifier: BSD-2-Clause-Patent
"""
import os
import time
import threading
from collections import defaultdict

from ClusterShell.NodeSet import NodeSet
Expand Down Expand Up @@ -151,9 +152,42 @@
self.log.info("One or more servers crashed. Check system query again.")

return False

def verify_network_failure(self, ior_namespace, container_namespace):
"""Verify network failure can be recovered with some user interventions with DAOS.

Check warning on line 155 in src/tests/ftest/deployment/network_failure.py

View workflow job for this annotation

GitHub Actions / Flake8 check

W293 blank line contains whitespace

Check warning on line 155 in src/tests/ftest/deployment/network_failure.py

View workflow job for this annotation

GitHub Actions / Pylint check

trailing-whitespace, Trailing whitespace
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(style) trailing whitespace

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change

def bring_network_interface_up(self, test_env, wait_time=60):
    """Bring the network interface up.

    Args:
        test_env (str): test environment. "ci" updates the interface through
            update_network_interface(); any other value only logs the command that has
            to be run manually and then waits.
        wait_time (int, optional): seconds to sleep after logging the manual command.
            Defaults to 60.

    Returns:
        list: the errors list handed to update_network_interface(); it is returned
            empty when the manual path is taken.

    """
    errors = []
    if test_env == "ci":
        # wolf
        # NOTE(review): update_network_interface is not defined or imported in the
        # visible part of this file (flake8 F821 / pylint undefined-variable) - confirm
        # the import exists before merging.
        update_network_interface(
            interface=self.interface, state="up", hosts=self.network_down_host,
            errors=errors)
    else:
        # Aurora. Manually run the command.
        command = f"sudo ip link set {self.interface} up"
        self.log.debug("## Call %s on %s", command, self.network_down_host)
        # Presumably leaves time for the command to be run by hand - TODO confirm.
        time.sleep(wait_time)
    return errors

Check warning on line 171 in src/tests/ftest/deployment/network_failure.py

View workflow job for this annotation

GitHub Actions / Flake8 check

W293 blank line contains whitespace

Check warning on line 171 in src/tests/ftest/deployment/network_failure.py

View workflow job for this annotation

GitHub Actions / Pylint check

trailing-whitespace, Trailing whitespace
def bring_network_interface_down(self, test_env, wait_time=60):
    """Bring the network interface down.

    Args:
        test_env (str): test environment. "ci" updates the interface through
            update_network_interface(); any other value only logs the command that has
            to be run manually and then waits.
        wait_time (int, optional): seconds to sleep after logging the manual command.
            Defaults to 60.

    Returns:
        list: the errors list handed to update_network_interface(); it is returned
            empty when the manual path is taken.

    """
    errors = []
    if test_env == "ci":
        # wolf
        # NOTE(review): update_network_interface is not defined or imported in the
        # visible part of this file (flake8 F821 / pylint undefined-variable) - confirm
        # the import exists before merging.
        update_network_interface(
            interface=self.interface, state="down", hosts=self.network_down_host,
            errors=errors)
    else:
        # Aurora. Manually run the command.
        # The logged command must take the link down; it previously said "up".
        command = f"sudo ip link set {self.interface} down"
        self.log.debug("## Call %s on %s", command, self.network_down_host)
        # Presumably leaves time for the command to be run by hand - TODO confirm.
        time.sleep(wait_time)
    return errors

def verify_network_failure(self, ior_namespace, container_namespace, with_io=False):
"""Verify network failure can be recovered with some user interventions with
DAOS.

1. Create a pool and a container. Create a container with or without redundancy
factor based on container_namespace.
Expand Down Expand Up @@ -194,24 +228,57 @@
self.interface = NetworkInterface(
self.server_managers[0].get_config_value("fabric_iface"), self.network_down_host,
self.update_nic)

# Test code

Check warning on line 232 in src/tests/ftest/deployment/network_failure.py

View workflow job for this annotation

GitHub Actions / Flake8 check

W291 trailing whitespace

Check warning on line 232 in src/tests/ftest/deployment/network_failure.py

View workflow job for this annotation

GitHub Actions / Pylint check

trailing-whitespace, Trailing whitespace
# Create IP address - Hostname mapping by calling "hostname -i" on every server
# node.
ip_to_host = self.create_ip_to_host()
# Using dmg system query output and ip_to_host, create Hostname - Ranks mapping.
host_to_ranks = self.create_host_to_ranks(
ip_to_host=ip_to_host, system_query_members=members)

Check failure on line 238 in src/tests/ftest/deployment/network_failure.py

View workflow job for this annotation

GitHub Actions / Flake8 check

F821 undefined name 'members'

Check failure on line 238 in src/tests/ftest/deployment/network_failure.py

View workflow job for this annotation

GitHub Actions / Pylint check

undefined-variable, Undefined variable 'members'
rank_value = {i for i in host_to_ranks if host_to_ranks[i]==self.network_down_host}

Check failure on line 239 in src/tests/ftest/deployment/network_failure.py

View workflow job for this annotation

GitHub Actions / Flake8 check

E225 missing whitespace around operator
self.register_cleanup(self.interface.restore, logger=self.log)
self.log.info("interface to update = %s", self.interface)
errors.extend(self.interface.bring_down(self.log))

# 3. Run IOR with given object class. It should fail.
threads = []
# For non-IO testing, bring the network interface down now.
if with_io is False:
errors.extend(self.interface.bring_down(self.log))

# 3. Run IOR with given object class. It should fail.
self.log_step("Expect IOR to fail with the down network interface.")
job_num = 1
ior_results = {}
file_name = "test_file_1"
# IOR will not work, so we'll be waiting for the Mpirun timeout.
self.run_ior_report_error(
job_num=job_num, results=ior_results, file_name="test_file_1",
pool=self.pool, container=self.container[0], namespace=ior_namespace,
timeout=10)
threads.append(threading.Thread(target=self.run_ior_report_error,
kwargs={"job_num": job_num,
"results": ior_results,
"file_name": file_name,
"pool": self.pool,
"container": self.container[0],
"namespace": ior_namespace}))
# Launch the IOR threads and wait for IOR to write some data
for thrd in threads:
self.log.info("Thread : %s", thrd)
thrd.start()
time.sleep(5)

# For I/O testing, bring the network interface down after starting IOR.
if with_io is True:
errors = self.bring_network_interface_down(self.test_env)

# Wait to finish the threads
for thrd in threads:
thrd.join()
self.log.info(ior_results)

# 4. Bring up the network interface.
self.log_step("Bring up the network interface.")
errors.extend(self.interface.bring_up(self.log))
self.log.info("Sleeping for 20 seconds after network is up")
time.sleep(20)

# 5. Restart DAOS with dmg.
self.log_step("Restart DAOS with dmg.")
Expand All @@ -220,8 +287,9 @@
dmg_cmd = self.get_dmg_command()
# For debugging.
dmg_cmd.system_query()
stop_rank = rank_value
self.log.info("Call dmg system stop")
dmg_cmd.system_stop()
dmg_cmd.system_stop(ranks=stop_rank, force=True)
self.log.info("Call dmg system start")
dmg_cmd.system_start()

Expand Down Expand Up @@ -293,6 +361,22 @@
ior_namespace="/run/ior_with_rp/*",
container_namespace="/run/container_with_rf/*")

def test_network_failure_with_rp_io(self):
    """Jira ID: DAOS-10003

    Test network failure during IO with redundancy factor and RP_2G1 object class. See
    verify_network_failure() for test steps.

    :avocado: tags=all,full_regression
    :avocado: tags=hw,medium
    :avocado: tags=deployment,network_failure
    :avocado: tags=network_failure_with_rp_io
    """
    # with_io=True selects the path that takes the interface down after IOR has started.
    self.verify_network_failure(
        ior_namespace="/run/ior_with_rp/*",
        container_namespace="/run/container_with_rf/*",
        with_io=True)

def test_network_failure_with_ec(self):
"""Jira ID: DAOS-10003.

Expand All @@ -308,6 +392,22 @@
ior_namespace="/run/ior_with_ec/*",
container_namespace="/run/container_with_rf/*")

def test_network_failure_with_ec_io(self):
    """Jira ID: DAOS-10003.

    Test network failure during IO with redundancy factor and EC_8P2 object class. See
    verify_network_failure() for test steps.

    :avocado: tags=all,full_regression
    :avocado: tags=hw,medium
    :avocado: tags=deployment,network_failure,rebuild
    :avocado: tags=NetworkFailureTest,test_network_failure_with_ec_io
    """
    # with_io=True selects the path that takes the interface down after IOR has started.
    self.verify_network_failure(
        ior_namespace="/run/ior_with_ec/*",
        container_namespace="/run/container_with_rf/*",
        with_io=True)

def test_network_failure_isolation(self):
"""Jira ID: DAOS-10003.

Expand Down
Loading