From 173d8627c2a25e144062b2efd50e4ec5b915cda4 Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Thu, 26 Sep 2024 12:41:20 -0700 Subject: [PATCH] DAOS-16298 test: improve get_clush_command timeout (#15113) Make timeout in get_clush_command per host instead of for all hosts. Signed-off-by: Dalton Bohning --- src/tests/ftest/recovery/ddb.py | 14 ++++++-------- src/tests/ftest/slurm_setup.py | 5 +++-- src/tests/ftest/util/run_utils.py | 14 +++++++++++--- 3 files changed, 20 insertions(+), 13 deletions(-) diff --git a/src/tests/ftest/recovery/ddb.py b/src/tests/ftest/recovery/ddb.py index 04df3184984..25e7223e0fa 100644 --- a/src/tests/ftest/recovery/ddb.py +++ b/src/tests/ftest/recovery/ddb.py @@ -90,12 +90,12 @@ def copy_remote_to_local(remote_file_path, test_dir, remote): # Use clush --rcopy to copy the file from the remote server node to the local test # node. clush will append . to the file when copying. args = "--rcopy {} --dest {}".format(remote_file_path, test_dir) - clush_command = get_clush_command(hosts=remote, args=args) + clush_command = get_clush_command(hosts=remote, args=args, timeout=60) try: - run_command(command=clush_command) + run_command(command=clush_command, timeout=None) except DaosTestError as error: - print("ERROR: Copying {} from {}: {}".format(remote_file_path, remote, error)) - raise error + raise DaosTestError( + f"ERROR: Copying {remote_file_path} from {remote}: {error}") from error # Remove the appended . from the copied file. current_file_path = "".join([remote_file_path, ".", remote]) @@ -103,10 +103,8 @@ def copy_remote_to_local(remote_file_path, test_dir, remote): try: run_command(command=mv_command) except DaosTestError as error: - print( - "ERROR: Moving {} to {}: {}".format( - current_file_path, remote_file_path, error)) - raise error + raise DaosTestError( + f"ERROR: Moving {current_file_path} to {remote_file_path}: {error}") from error class DdbTest(RecoveryTestBase): diff --git a/src/tests/ftest/slurm_setup.py b/src/tests/ftest/slurm_setup.py index 0c3d300d5ff..00e95c6e128 100755 --- a/src/tests/ftest/slurm_setup.py +++ b/src/tests/ftest/slurm_setup.py @@ -145,8 +145,9 @@ def start_munge(self, user): non_control = self.nodes.difference(self.control) self.log.debug('Copying the munge key to %s', non_control) command = get_clush_command( - non_control, args=f"-B -S -v --copy {self.MUNGE_KEY} --dest {self.MUNGE_KEY}") - result = run_remote(self.log, self.control, command) + non_control, args=f"-B -S -v --copy {self.MUNGE_KEY} --dest {self.MUNGE_KEY}", + timeout=60) + result = run_remote(self.log, self.control, command, timeout=None) if not result.passed: raise SlurmSetupException(f'Error creating munge key on {result.failed_hosts}') diff --git a/src/tests/ftest/util/run_utils.py b/src/tests/ftest/util/run_utils.py index 2f9d33b07c5..f4893558fb0 100644 --- a/src/tests/ftest/util/run_utils.py +++ b/src/tests/ftest/util/run_utils.py @@ -345,7 +345,8 @@ def log_result_data(log, data): log.debug("%s%s", " " * indent, line) -def get_clush_command(hosts, args=None, command="", command_env=None, command_sudo=False): +def get_clush_command(hosts, args=None, command="", command_env=None, command_sudo=False, + timeout=None, fanout=None): """Get the clush command with optional sudo arguments. Args: @@ -355,14 +356,21 @@ def get_clush_command(hosts, args=None, command="", command_env=None, command_su command_env (EnvironmentVariables, optional): environment variables to export with the command. Defaults to None. sudo (bool, optional): whether to run the command with sudo privileges. Defaults to False. + timeout (int, optional): number of seconds to wait for the command to complete. + Defaults to None. + fanout (int, optional): fanout to use. Default uses the max of the + clush default (64) or available cores Returns: str: the clush command """ - cmd_list = ["clush"] + if fanout is None: + fanout = max(64, len(os.sched_getaffinity(0))) + cmd_list = ["clush", "-f", str(fanout), "-w", str(hosts)] + if timeout is not None: + cmd_list.extend(["-u", str(timeout)]) if args: cmd_list.append(args) - cmd_list.extend(["-w", str(hosts)]) # If ever needed, this is how to disable host key checking: # cmd_list.extend(["-o", "-oStrictHostKeyChecking=no"]) cmd_list.append(command_as_user(command, "root" if command_sudo else "", command_env))