Skip to content

Commit

Permalink
DAOS-16298 test: improve get_clush_command timeout (#15113)
Browse files: browse the repository at this point in the history
Make the timeout in get_clush_command apply to each host individually instead of to all hosts collectively.

Signed-off-by: Dalton Bohning <dalton.bohning@intel.com>
  • Loading branch information
daltonbohning committed Sep 26, 2024
1 parent 2a006ad commit 173d862
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 13 deletions.
14 changes: 6 additions & 8 deletions src/tests/ftest/recovery/ddb.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,23 +90,21 @@ def copy_remote_to_local(remote_file_path, test_dir, remote):
# Use clush --rcopy to copy the file from the remote server node to the local test
# node. clush will append .<server_hostname> to the file when copying.
args = "--rcopy {} --dest {}".format(remote_file_path, test_dir)
clush_command = get_clush_command(hosts=remote, args=args)
clush_command = get_clush_command(hosts=remote, args=args, timeout=60)
try:
run_command(command=clush_command)
run_command(command=clush_command, timeout=None)
except DaosTestError as error:
print("ERROR: Copying {} from {}: {}".format(remote_file_path, remote, error))
raise error
raise DaosTestError(
f"ERROR: Copying {remote_file_path} from {remote}: {error}") from error

# Remove the appended .<server_hostname> from the copied file.
current_file_path = "".join([remote_file_path, ".", remote])
mv_command = "mv {} {}".format(current_file_path, remote_file_path)
try:
run_command(command=mv_command)
except DaosTestError as error:
print(
"ERROR: Moving {} to {}: {}".format(
current_file_path, remote_file_path, error))
raise error
raise DaosTestError(
f"ERROR: Moving {current_file_path} to {remote_file_path}: {error}") from error


class DdbTest(RecoveryTestBase):
Expand Down
5 changes: 3 additions & 2 deletions src/tests/ftest/slurm_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,9 @@ def start_munge(self, user):
non_control = self.nodes.difference(self.control)
self.log.debug('Copying the munge key to %s', non_control)
command = get_clush_command(
non_control, args=f"-B -S -v --copy {self.MUNGE_KEY} --dest {self.MUNGE_KEY}")
result = run_remote(self.log, self.control, command)
non_control, args=f"-B -S -v --copy {self.MUNGE_KEY} --dest {self.MUNGE_KEY}",
timeout=60)
result = run_remote(self.log, self.control, command, timeout=None)
if not result.passed:
raise SlurmSetupException(f'Error creating munge key on {result.failed_hosts}')

Expand Down
14 changes: 11 additions & 3 deletions src/tests/ftest/util/run_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,8 @@ def log_result_data(log, data):
log.debug("%s%s", " " * indent, line)


def get_clush_command(hosts, args=None, command="", command_env=None, command_sudo=False):
def get_clush_command(hosts, args=None, command="", command_env=None, command_sudo=False,
timeout=None, fanout=None):
"""Get the clush command with optional sudo arguments.
Args:
Expand All @@ -355,14 +356,21 @@ def get_clush_command(hosts, args=None, command="", command_env=None, command_su
command_env (EnvironmentVariables, optional): environment variables to export with the
command. Defaults to None.
sudo (bool, optional): whether to run the command with sudo privileges. Defaults to False.
timeout (int, optional): number of seconds to wait for the command to complete.
Defaults to None.
fanout (int, optional): fanout to use. Default uses the max of the
clush default (64) or available cores
Returns:
str: the clush command
"""
cmd_list = ["clush"]
if fanout is None:
fanout = max(64, len(os.sched_getaffinity(0)))
cmd_list = ["clush", "-f", str(fanout), "-w", str(hosts)]
if timeout is not None:
cmd_list.extend(["-u", str(timeout)])
if args:
cmd_list.append(args)
cmd_list.extend(["-w", str(hosts)])
# If ever needed, this is how to disable host key checking:
# cmd_list.extend(["-o", "-oStrictHostKeyChecking=no"])
cmd_list.append(command_as_user(command, "root" if command_sudo else "", command_env))
Expand Down

0 comments on commit 173d862

Please sign in to comment.