Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-11283 test: Enhance network failure test with IO. #10133

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 57 additions & 15 deletions src/tests/ftest/deployment/network_failure.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""
import os
import time
import threading
from ClusterShell.NodeSet import NodeSet

from ior_test_base import IorTestBase
Expand Down Expand Up @@ -112,8 +113,22 @@ def create_ip_to_host(self):
self.log.info("hostname -i results = %s", results)

return {result["stdout"][0]: NodeSet(str(result["hosts"])) for result in results}

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(style) trailing whitespace

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change

def bring_network_interface_down(self):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(pylint-missing-function-docstring) Missing function or method docstring

errors = []
if self.test_env == "ci":
# wolf
update_network_interface(
interface=self.interface, state="down", hosts=self.network_down_host,
errors=errors)
else:
# Aurora. Manually run the command.
command = f"sudo ip link set {self.interface} down"
self.log.debug("## Call %s on %s", command, self.network_down_host)
time.sleep(20)


def verify_network_failure(self, ior_namespace, container_namespace):
def verify_network_failure(self, ior_namespace, container_namespace, with_io=False):
"""Verify network failure can be recovered with some user interventions with
DAOS.

Expand Down Expand Up @@ -155,25 +170,36 @@ def verify_network_failure(self, ior_namespace, container_namespace):
"fabric_iface", "/run/server_config/servers/0/*")
self.log.info("interface to update = %s", self.interface)

if self.test_env == "ci":
# wolf
update_network_interface(
interface=self.interface, state="down", hosts=self.network_down_host,
errors=errors)
else:
# Aurora. Manually run the command.
command = f"sudo ip link set {self.interface} down"
self.log.debug("## Call %s on %s", command, self.network_down_host)
time.sleep(20)
# For non-IO testing, bring the network interface down now.
if with_io is False:
self.bring_network_interface_down()

# 3. Run IOR with given object class. It should fail.
threads = []
job_num = 1
ior_results = {}
file_name = "test_file_1"
# IOR will not work, so we'll be waiting for the Mpirun timeout.
self.run_ior_report_error(
job_num=job_num, results=ior_results, file_name="test_file_1",
pool=self.pool, container=self.container[0], namespace=ior_namespace,
timeout=10)
threads.append(threading.Thread(target=self.run_ior_report_error,
kwargs={"job_num": job_num,
"results": ior_results,
"file_name": file_name,
"pool": self.pool,
"container": self.container[0],
"namespace": ior_namespace}))
# Launch the IOR threads and wait for IOR to write some data
for thrd in threads:
self.log.info("Thread : %s", thrd)
thrd.start()
time.sleep(5)

# For I/O testing, bring the network interface after starting IOR.
if with_io is True:
self.bring_network_interface_down()

# Wait to finish the threads
for thrd in threads:
thrd.join()
self.log.info(ior_results)

# 4. Bring up the network interface.
Expand Down Expand Up @@ -262,6 +288,22 @@ def test_network_failure_with_rp(self):
ior_namespace="/run/ior_with_rp/*",
container_namespace="/run/container_with_rf/*")

def test_network_failure_with_rp_io(self):
"""Jira ID: DAOS-10003

Test rank failure with redundancy factor and RP_2G1 object class. See
verify_rank_failure() for test steps.

:avocado: tags=all,full_regression
:avocado: tags=hw,medium
:avocado: tags=deployment,network_failure
:avocado: tags=network_failure_with_rp_io
"""
self.verify_network_failure(
ior_namespace="/run/ior_with_rp/*",
container_namespace="/run/container_with_rf/*",
with_io=True)

def test_network_failure_with_ec(self):
"""Jira ID: DAOS-10003

Expand Down